R/clusterer_farm.R

Defines functions local_hierarchical_clusterer global_hierarchical_clusterer get_hierarchical_clusters get_longevity_cut_height process_dendrograms get_agglomerative_dendrogram

Documented in global_hierarchical_clusterer local_hierarchical_clusterer

###########################################################################
# CLUSTERER FARM
# here we nuture young clusterers to maturity
# included clustering methods that can be used with mapper
###########################################################################

# hierarchical clustering -------------------------------------------------

#' Grow A Dendrogram
#'
#' Perform agglomerative clustering on a single distance matrix.
#'
#' @param dist A distance matrix. Needs to be a `dist` object.
#' @param method A string to pass to [hclust] to determine clustering method.
#'
#' @return A dendrogram generated by `fastcluster`.
#'
#' @noRd
get_agglomerative_dendrogram <- function(dist, method) {
  if (!(inherits(dist, "dist")) & (any(is.na(dist)))) {
    return(vector())
  } else if (!(inherits(dist, "dist"))) {
    res = list(1)
    names(res) = dist
    return(res)
  } else {
    return(fastcluster::hclust(dist, method))
  }
}

#' Grow Dendrograms
#'
#' Cut many dendrograms at specified cut heights.
#'
#' @param dends A `list` of dendrograms to be cut.
#' @param cut_heights A list of cut heights which correspond to each dendrogram in `dends`.
#' @return A list of named vectors (one per dendrogram) whose names are data point names and whose values are cluster labels.
#' @noRd
process_dendrograms <- function(dends, cut_heights) {
  # in case we get a single dendrogram and not a list of them
  if (inherits(dends, "hclust")) {
    return(cutree(dends, h = cut_heights))
  }

  snipped_dends = mapply(cutree,
                         dends,
                         h = cut_heights,
                         SIMPLIFY = FALSE)
  return(snipped_dends)
}

#' Lengthy Cut Height Finder
#'
#' Find the the longest-lived hierarchy of a dendrogram.
#'
#' @param dend A dendrogram.
#' @param max_height The maximum height of the dendrogram; if this is not provided the last merge height of the input dendrogram will be used, which will make cutting to one cluster impossible!
#'
#' @return The point 5% above the merge height with the longest time to the next merge point.
#' This value may not be the "ideal" cut height! This is just a heuristic.
#' @noRd
get_longevity_cut_height <- function(dend, max_height = max(cophenetic(dend))) {
  # TODO remove all the duplicate code lol
  heights = append(sort(unique(cophenetic(dend))), max_height) # merge heights of dendrogram

  branch_lengths = diff(heights) # differences are branch lengths

  tallest_branch_height = max(branch_lengths)
  tallest_branch_id = which(branch_lengths == tallest_branch_height)
  cutval = heights[tallest_branch_id] + .05*tallest_branch_height

  if (length(cutval) > 1) {
    cutval = sample(cutval, 1)
  }

  return(cutval)
}

#' Do Much Clustering
#'
#' Perform hierarchical clustering on multiple distance matrices.
#'
#' @param dist_mats A list of distance matrices to be used for clustering. Must be a `list` of `dist` objects.
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do. Can be `single`, `complete`, etc.
#' @param cut_height A global cut height. If not specified or negative, dendrograms will be cut individually.
#'
#' @return A `list` containing named vectors (one per dendrogram), whose names are data point names and whose values are cluster labels (integers).
#' @noRd
get_hierarchical_clusters <- function(dist_mats, method, cut_height = -1) {
  # do agglomerative clustering on each patch
  dends = lapply(dist_mats, get_agglomerative_dendrogram, method)

  # find heights for each dendrogram
  max_dists = sapply(dist_mats, max)

  # remove trivial heights
  nonzero_max_dists = max_dists[max_dists != 0]

  # we would like to only cut non-trivial dendrograms
  real_dends = dends[lapply(dends, length) > 1]
  imposter_dends = dends[lapply(dends, length) == 1]

  # if a global cut height was not supplied, calculate cut heights for each dendrogram
  if (cut_height < 0) {
    cut_heights = mapply(get_longevity_cut_height, real_dends, max_dists)
  # otherwise, use with uniform cut heights
  } else {
    cut_heights = rep(cut_height, length(max_dists))
  }

  # cut nontrival dendrograms and get cluster assignments
  processed_dends = process_dendrograms(real_dends, cut_heights)

  if (typeof(processed_dends) != "list") {
    names = rownames(processed_dends)
    processed_dends = list(unlist(as.list(processed_dends)))
    names(processed_dends[[1]]) = names
  }

  # combine nontrival and trivial clusterings and return results
  if (length(imposter_dends) != 0) {
    return(append(processed_dends, sapply(imposter_dends, function(x)
      list(unlist(x))))) # LMAO what is this
  } else {
    return(processed_dends)
  }
}

#' Global Longevity Clusterer
#'
#' Create a dude to perform hierarchical clustering in a global context using the [hclust] package.
#'
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do.
#' @param dists The global distance matrix on which to run clustering to determine a global cutting height.
#' @param cut_height The cutting height at which you want all dendrograms to be cut. If this is not specified then the clusterer will use a cut height 5% above the merge point preceding the tallest branch in the global dendrogram.
#'
#' @returns A function that inputs a list of distance matrices and returns a list containing one vector per matrix, whose element names are data point names and whose values are cluster labels (relative to each matrix).
#' @details This clusterer cuts all dendrograms it is given at a uniform cutting height, defaulting to a heuristic if necessary.
#' @export
#'
#' @examples
#' data = data.frame(x = sapply(1:100, function(x) cos(x)), y = sapply(1:100, function(x) sin(x)))
#' projx = data$x
#' names(projx) = row.names(data)
#'
#' dists = dist(data)
#'
#' num_bins = 10
#' percent_overlap = 25
#'
#' cover = create_width_balanced_cover(min(projx), max(projx), num_bins, percent_overlap)
#'
#' create_1D_mapper_object(data, dists, projx, cover, global_hierarchical_clusterer("mcquitty", dists))
global_hierarchical_clusterer <- function(method, dists, cut_height = -1) {
  # do hierarchical clustering on entire dataset
  global_linkage = get_agglomerative_dendrogram(as.dist(dists), method)

  # each dendrogram will be normalized to this height
  max_dist = max(dists)

  # use tallest branch if a cut height is not specified
  if (cut_height < 0) {
    cut_height = get_longevity_cut_height(global_linkage, max_dist)
  }


  # return clusterer which can accept patches from mapper
  return(function(dist_mats) get_hierarchical_clusters(dist_mats, method, cut_height = cut_height))
}

#' Local Longevity Clusterer
#'
#' Create a dude to perform hierarchical clustering in a local context using the [hclust] package.
#'
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do.
#'
#' @returns A function that inputs a list of distance matrices and returns a list containing one vector per matrix, whose element names are data point names and whose values are cluster labels (within each patch).
#' @details This clusterer determines cutting heights for dendrograms by cutting them individually, 5% above the merge point with the longest unbroken gap until the next merge point.
#' @export
#'
#' @examples
#' data = data.frame(x = sapply(1:100, function(x) cos(x)), y = sapply(1:100, function(x) sin(x)))
#' projx = data$x
#' names(projx) = row.names(data)
#'
#' dists = dist(data)
#'
#' num_bins = 10
#' percent_overlap = 25
#'
#' cover = create_width_balanced_cover(min(projx), max(projx), num_bins, percent_overlap)
#'
#' create_1D_mapper_object(data, dists, projx, cover, local_hierarchical_clusterer("mcquitty"))
local_hierarchical_clusterer <- function(method) {
  # clusterer which can accept patches from mapper
  return(function(dist_mats) get_hierarchical_clusters(dist_mats, method))
}

Try the mappeR package in your browser

Any scripts or data that you put into this service are public.

mappeR documentation built on June 29, 2025, 1:07 a.m.