Nothing
###########################################################################
# CLUSTERER FARM
# here we nuture young clusterers to maturity
# included clustering methods that can be used with mapper
###########################################################################
# hierarchical clustering -------------------------------------------------
#' Grow A Dendrogram
#'
#' Perform agglomerative clustering on a single distance matrix.
#'
#' @param dist A distance matrix. Needs to be a `dist` object.
#' @param method A string to pass to [hclust] to determine clustering method.
#'
#' @return A dendrogram generated by `fastcluster`.
#'
#' @noRd
get_agglomerative_dendrogram <- function(dist, method) {
if (!(inherits(dist, "dist")) & (any(is.na(dist)))) {
return(vector())
} else if (!(inherits(dist, "dist"))) {
res = list(1)
names(res) = dist
return(res)
} else {
return(fastcluster::hclust(dist, method))
}
}
#' Grow Dendrograms
#'
#' Cut many dendrograms at specified cut heights.
#'
#' @param dends A `list` of dendrograms to be cut.
#' @param cut_heights A list of cut heights which correspond to each dendrogram in `dends`.
#' @return A list of named vectors (one per dendrogram) whose names are data point names and whose values are cluster labels.
#' @noRd
process_dendrograms <- function(dends, cut_heights) {
# in case we get a single dendrogram and not a list of them
if (inherits(dends, "hclust")) {
return(cutree(dends, h = cut_heights))
}
snipped_dends = mapply(cutree,
dends,
h = cut_heights,
SIMPLIFY = FALSE)
return(snipped_dends)
}
#' Lengthy Cut Height Finder
#'
#' Find the the longest-lived hierarchy of a dendrogram.
#'
#' @param dend A dendrogram.
#' @param max_height The maximum height of the dendrogram; if this is not provided the last merge height of the input dendrogram will be used, which will make cutting to one cluster impossible!
#'
#' @return The point 5% above the merge height with the longest time to the next merge point.
#' This value may not be the "ideal" cut height! This is just a heuristic.
#' @noRd
get_longevity_cut_height <- function(dend, max_height = max(cophenetic(dend))) {
# TODO remove all the duplicate code lol
heights = append(sort(unique(cophenetic(dend))), max_height) # merge heights of dendrogram
branch_lengths = diff(heights) # differences are branch lengths
tallest_branch_height = max(branch_lengths)
tallest_branch_id = which(branch_lengths == tallest_branch_height)
cutval = heights[tallest_branch_id] + .05*tallest_branch_height
if (length(cutval) > 1) {
cutval = sample(cutval, 1)
}
return(cutval)
}
#' Do Much Clustering
#'
#' Perform hierarchical clustering on multiple distance matrices.
#'
#' @param dist_mats A list of distance matrices to be used for clustering. Must be a `list` of `dist` objects.
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do. Can be `single`, `complete`, etc.
#' @param cut_height A global cut height. If not specified or negative, dendrograms will be cut individually.
#'
#' @return A `list` containing named vectors (one per dendrogram), whose names are data point names and whose values are cluster labels (integers).
#' @noRd
get_hierarchical_clusters <- function(dist_mats, method, cut_height = -1) {
# do agglomerative clustering on each patch
dends = lapply(dist_mats, get_agglomerative_dendrogram, method)
# find heights for each dendrogram
max_dists = sapply(dist_mats, max)
# remove trivial heights
nonzero_max_dists = max_dists[max_dists != 0]
# we would like to only cut non-trivial dendrograms
real_dends = dends[lapply(dends, length) > 1]
imposter_dends = dends[lapply(dends, length) == 1]
# if a global cut height was not supplied, calculate cut heights for each dendrogram
if (cut_height < 0) {
cut_heights = mapply(get_longevity_cut_height, real_dends, max_dists)
# otherwise, use with uniform cut heights
} else {
cut_heights = rep(cut_height, length(max_dists))
}
# cut nontrival dendrograms and get cluster assignments
processed_dends = process_dendrograms(real_dends, cut_heights)
if (typeof(processed_dends) != "list") {
names = rownames(processed_dends)
processed_dends = list(unlist(as.list(processed_dends)))
names(processed_dends[[1]]) = names
}
# combine nontrival and trivial clusterings and return results
if (length(imposter_dends) != 0) {
return(append(processed_dends, sapply(imposter_dends, function(x)
list(unlist(x))))) # LMAO what is this
} else {
return(processed_dends)
}
}
#' Global Longevity Clusterer
#'
#' Create a dude to perform hierarchical clustering in a global context using the [hclust] package.
#'
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do.
#' @param dists The global distance matrix on which to run clustering to determine a global cutting height.
#' @param cut_height The cutting height at which you want all dendrograms to be cut. If this is not specified then the clusterer will use a cut height 5% above the merge point preceding the tallest branch in the global dendrogram.
#'
#' @returns A function that inputs a list of distance matrices and returns a list containing one vector per matrix, whose element names are data point names and whose values are cluster labels (relative to each matrix).
#' @details This clusterer cuts all dendrograms it is given at a uniform cutting height, defaulting to a heuristic if necessary.
#' @export
#'
#' @examples
#' data = data.frame(x = sapply(1:100, function(x) cos(x)), y = sapply(1:100, function(x) sin(x)))
#' projx = data$x
#' names(projx) = row.names(data)
#'
#' dists = dist(data)
#'
#' num_bins = 10
#' percent_overlap = 25
#'
#' cover = create_width_balanced_cover(min(projx), max(projx), num_bins, percent_overlap)
#'
#' create_1D_mapper_object(data, dists, projx, cover, global_hierarchical_clusterer("mcquitty", dists))
global_hierarchical_clusterer <- function(method, dists, cut_height = -1) {
# do hierarchical clustering on entire dataset
global_linkage = get_agglomerative_dendrogram(as.dist(dists), method)
# each dendrogram will be normalized to this height
max_dist = max(dists)
# use tallest branch if a cut height is not specified
if (cut_height < 0) {
cut_height = get_longevity_cut_height(global_linkage, max_dist)
}
# return clusterer which can accept patches from mapper
return(function(dist_mats) get_hierarchical_clusters(dist_mats, method, cut_height = cut_height))
}
#' Local Longevity Clusterer
#'
#' Create a dude to perform hierarchical clustering in a local context using the [hclust] package.
#'
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do.
#'
#' @returns A function that inputs a list of distance matrices and returns a list containing one vector per matrix, whose element names are data point names and whose values are cluster labels (within each patch).
#' @details This clusterer determines cutting heights for dendrograms by cutting them individually, 5% above the merge point with the longest unbroken gap until the next merge point.
#' @export
#'
#' @examples
#' data = data.frame(x = sapply(1:100, function(x) cos(x)), y = sapply(1:100, function(x) sin(x)))
#' projx = data$x
#' names(projx) = row.names(data)
#'
#' dists = dist(data)
#'
#' num_bins = 10
#' percent_overlap = 25
#'
#' cover = create_width_balanced_cover(min(projx), max(projx), num_bins, percent_overlap)
#'
#' create_1D_mapper_object(data, dists, projx, cover, local_hierarchical_clusterer("mcquitty"))
local_hierarchical_clusterer <- function(method) {
# clusterer which can accept patches from mapper
return(function(dist_mats) get_hierarchical_clusters(dist_mats, method))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.