R/clusterer_farm.R

Defines functions hierarchical_clusterer get_hierarchical_clusters process_dendrograms cut_dendrogram get_tallest_branch run_link subset_dists

Documented in cut_dendrogram get_hierarchical_clusters get_tallest_branch hierarchical_clusterer process_dendrograms run_link subset_dists

###########################################################################
# CLUSTERER FARM
# here we nuture young clusterers to maturity
# included clustering methods that can be used with mapper
###########################################################################

#' Subset a distance matrix
#'
#' @param bin A list of names of data points.
#' @param dists A distance matrix for data points in the bin, possibly including extra points.
#'
#' @return A distance matrix for only the data points in the input bin.
subset_dists <- function(bin, dists) {
  bin_length = length(bin)
  if (bin_length == 0) {
    return(NA)
  } else if (bin_length == 1) {
    return(bin)
  } else {
    res = as.dist(as.matrix(dists)[bin, bin]) # this is how it's done in the usedist package
    return(res)
  }
}

# hierarchical clustering -------------------------------------------------

#' Perform agglomerative clustering on a single distance matrix.
#'
#' @param dist A distance matrix.
#' @param method A string to pass to [hclust] to determine clustering method.
#'
#' @return A dendrogram generated by `fastcluster`.
run_link <- function(dist, method) {
  if (!(inherits(dist, "dist")) & (any(is.na(dist)))) {
    return(vector())
  } else if (!(inherits(dist, "dist"))) {
    res = list(1)
    names(res) = dist
    return(res)
  } else {
    return(fastcluster::hclust(dist, method))
  }
}

#' Find the tallest branch of a dendrogram
#'
#' @param dend A single dendrogram.
#'
#' @return The height of the tallest branch (longest time between merge heights) of the input dendrogram.
get_tallest_branch <- function(dend) {
  heights = sort(unique(cophenetic(dend)))
  if (length(heights) <= 1) {
    return(max(heights))
  }
  branch_lengths = diff(heights)
  return(max(branch_lengths))
}

#' Cut a dendrogram in context
#'
#' @param dend A single dendrogram.
#' @param threshold A mininum tallest branch value.
#'
#' @return A named vector whose names are data point names and whose values are cluster labels.
#' @details The number of clusters is determined to be 1 if the tallest branch of the dendrogram is less than the threshold, or if the index of dispersion (standard deviation squared divided by mean) of the branch heights is below 0.015. Otherwise, we cut at the longest branch of the dendrogram to determine the number of clusters.
cut_dendrogram <- function(dend, threshold) {
  # TODO remove all the duplicate code lol
  heights = sort(unique(cophenetic(dend))) # merge heights of dendrogram

  if (length(heights) <= 2) {
    if (max(heights) < threshold) {
      return(cutree(dend, k = 1))
    } else {
      return(cutree(dend, k = 2))
    }
  }

  branch_lengths = diff(heights) # differences are branch lengths

  tallest_branch_height = max(branch_lengths)
  tallest_branch_id = which(branch_lengths == tallest_branch_height)
  cutval = (heights[tallest_branch_id] + heights[tallest_branch_id + 1]) / 2 # midpoint of tallest branch

  if (length(cutval) > 1) {
    cutval = sample(cutval, 1)
  }

  # one cluster condition: dendrogram has no sufficiently tall branches
  thresholdcondition = tallest_branch_height < threshold

  # one cluster condition: lengths of branches are not well-dispersed
  indexofdispersion = sd(heights) ^ 2 / mean(heights)
  dispersioncondition = indexofdispersion < .015

  if (thresholdcondition | dispersioncondition) {
    # abline(h = max(heights), lty = 2, col = "red")
    return(cutree(dend, k = 1))
  } else {
    # abline(h = cutval, lty = 2, col = "red")
    return(cutree(dend, h = cutval))
  }
}

#' Cut many dendrograms in context
#'
#' @param dends A list of dendrograms to be cut.
#' @param semi_local_clustering Whether you want clustering to happen in a semi-local (entire dataset visible) or strictly local (only current level set visible) context. Defaults to semi-local.
#'
#' @return A list of named vectors (one per dendrogram) whose names are data point names and whose values are cluster labels.
#' @details This function uses a value of 10 percent of the tallest branch across dendrograms as a threshold for [cut_dendrogram].
process_dendrograms <- function(dends, semi_local_clustering = TRUE) {
  if (inherits(dends, "hclust")) {
    return(cut_dendrogram(dends, 0))
  }

  tallest_branches = sapply(dends, get_tallest_branch)
  biggest_branch_length = max(tallest_branches)
  threshold = ifelse(semi_local_clustering, biggest_branch_length * .1, 0)

  snipped_dends = mapply(cut_dendrogram,
                         dend = dends, SIMPLIFY = FALSE,
                         MoreArgs = list(threshold = threshold))
  return(snipped_dends)
}

#' Perform single-linkage hierarchical clustering and process dendrograms in a semi-global context.
#'
#' @param dist_mats A list of distance matrices to be used for clustering.
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do.
#'
#' @return A list containing named vectors (one per dendrogram), whose names are data point names and whose values are cluster labels.
get_hierarchical_clusters <- function(dist_mats, method) {
  # do agglomerative clustering on distance matrices
  dends = lapply(dist_mats, run_link, method)

  # we would like to cut non-trivial dendrograms to determine number of clusters
  real_dends = dends[lapply(dends, length) > 1]
  imposter_dends = dends[lapply(dends, length) == 1]

  # cut nontrival dendrograms and get clusters
  processed_dends = process_dendrograms(real_dends, TRUE)

  # combine nontrival and trivial clusterings and return results
  if (length(imposter_dends) != 0) {
    return(append(processed_dends, sapply(imposter_dends, function(x)
      list(unlist(x))))) # LMAO what is this
  } else {
    return(processed_dends)
  }
}

#' Create a little dude to perform hierarchical clustering in a semi-global context using the [hclust] package.
#'
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do.
#'
#' @returns A function that inputs a list of distance matrices and returns a list containing one vector per bin, whose element names are data point names and whose values are cluster labels (within each bin).
#' @details This clusterer determines cutting heights for bin dendrograms generated by [hclust] by first considering the tallest branches across all dendrograms; if all branch heights of a given dendrogram are below a threshold (10 percent of the global tallest), that dendrogram will be considered to describe a single cluster. Additionally, if the index of dispersion of the branch heights of a dendrogram are below 0.015, we will also consider it as describing a single cluster. If neither of these are true, then we will cut the dendrogram at its longest branch.
#' @export
#'
#' @examples
#' data = data.frame(x = sapply(1:100, function(x) cos(x)), y = sapply(1:100, function(x) sin(x)))
#' projx = data$x
#'
#' num_bins = 10
#' percent_overlap = 25
#'
#' cover = create_width_balanced_cover(min(projx), max(projx), num_bins, percent_overlap)
#'
#' create_1D_mapper_object(data, dist(data), projx, cover, hierarchical_clusterer("mcquitty"))
hierarchical_clusterer <- function(method) {
  return(function(dist_mats) get_hierarchical_clusters(dist_mats, method))
}

Try the mappeR package in your browser

Any scripts or data that you put into this service are public.

mappeR documentation built on April 3, 2025, 6:19 p.m.