Nothing
###########################################################################
# CLUSTERER FARM
# here we nuture young clusterers to maturity
# included clustering methods that can be used with mapper
###########################################################################
#' Subset a distance matrix
#'
#' @param bin A list of names of data points.
#' @param dists A distance matrix for data points in the bin, possibly including extra points.
#'
#' @return A distance matrix for only the data points in the input bin.
subset_dists <- function(bin, dists) {
bin_length = length(bin)
if (bin_length == 0) {
return(NA)
} else if (bin_length == 1) {
return(bin)
} else {
res = as.dist(as.matrix(dists)[bin, bin]) # this is how it's done in the usedist package
return(res)
}
}
# hierarchical clustering -------------------------------------------------
#' Perform agglomerative clustering on a single distance matrix.
#'
#' @param dist A distance matrix.
#' @param method A string to pass to [hclust] to determine clustering method.
#'
#' @return A dendrogram generated by `fastcluster`.
run_link <- function(dist, method) {
if (!(inherits(dist, "dist")) & (any(is.na(dist)))) {
return(vector())
} else if (!(inherits(dist, "dist"))) {
res = list(1)
names(res) = dist
return(res)
} else {
return(fastcluster::hclust(dist, method))
}
}
#' Find the tallest branch of a dendrogram
#'
#' @param dend A single dendrogram.
#'
#' @return The height of the tallest branch (longest time between merge heights) of the input dendrogram.
get_tallest_branch <- function(dend) {
heights = sort(unique(cophenetic(dend)))
if (length(heights) <= 1) {
return(max(heights))
}
branch_lengths = diff(heights)
return(max(branch_lengths))
}
#' Cut a dendrogram in context
#'
#' @param dend A single dendrogram.
#' @param threshold A mininum tallest branch value.
#'
#' @return A named vector whose names are data point names and whose values are cluster labels.
#' @details The number of clusters is determined to be 1 if the tallest branch of the dendrogram is less than the threshold, or if the index of dispersion (standard deviation squared divided by mean) of the branch heights is below 0.015. Otherwise, we cut at the longest branch of the dendrogram to determine the number of clusters.
cut_dendrogram <- function(dend, threshold) {
# TODO remove all the duplicate code lol
heights = sort(unique(cophenetic(dend))) # merge heights of dendrogram
if (length(heights) <= 2) {
if (max(heights) < threshold) {
return(cutree(dend, k = 1))
} else {
return(cutree(dend, k = 2))
}
}
branch_lengths = diff(heights) # differences are branch lengths
tallest_branch_height = max(branch_lengths)
tallest_branch_id = which(branch_lengths == tallest_branch_height)
cutval = (heights[tallest_branch_id] + heights[tallest_branch_id + 1]) / 2 # midpoint of tallest branch
if (length(cutval) > 1) {
cutval = sample(cutval, 1)
}
# one cluster condition: dendrogram has no sufficiently tall branches
thresholdcondition = tallest_branch_height < threshold
# one cluster condition: lengths of branches are not well-dispersed
indexofdispersion = sd(heights) ^ 2 / mean(heights)
dispersioncondition = indexofdispersion < .015
if (thresholdcondition | dispersioncondition) {
# abline(h = max(heights), lty = 2, col = "red")
return(cutree(dend, k = 1))
} else {
# abline(h = cutval, lty = 2, col = "red")
return(cutree(dend, h = cutval))
}
}
#' Cut many dendrograms in context
#'
#' @param dends A list of dendrograms to be cut.
#' @param semi_local_clustering Whether you want clustering to happen in a semi-local (entire dataset visible) or strictly local (only current level set visible) context. Defaults to semi-local.
#'
#' @return A list of named vectors (one per dendrogram) whose names are data point names and whose values are cluster labels.
#' @details This function uses a value of 10 percent of the tallest branch across dendrograms as a threshold for [cut_dendrogram].
process_dendrograms <- function(dends, semi_local_clustering = TRUE) {
if (inherits(dends, "hclust")) {
return(cut_dendrogram(dends, 0))
}
tallest_branches = sapply(dends, get_tallest_branch)
biggest_branch_length = max(tallest_branches)
threshold = ifelse(semi_local_clustering, biggest_branch_length * .1, 0)
snipped_dends = mapply(cut_dendrogram,
dend = dends, SIMPLIFY = FALSE,
MoreArgs = list(threshold = threshold))
return(snipped_dends)
}
#' Perform single-linkage hierarchical clustering and process dendrograms in a semi-global context.
#'
#' @param dist_mats A list of distance matrices to be used for clustering.
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do.
#'
#' @return A list containing named vectors (one per dendrogram), whose names are data point names and whose values are cluster labels.
get_hierarchical_clusters <- function(dist_mats, method) {
# do agglomerative clustering on distance matrices
dends = lapply(dist_mats, run_link, method)
# we would like to cut non-trivial dendrograms to determine number of clusters
real_dends = dends[lapply(dends, length) > 1]
imposter_dends = dends[lapply(dends, length) == 1]
# cut nontrival dendrograms and get clusters
processed_dends = process_dendrograms(real_dends, TRUE)
# combine nontrival and trivial clusterings and return results
if (length(imposter_dends) != 0) {
return(append(processed_dends, sapply(imposter_dends, function(x)
list(unlist(x))))) # LMAO what is this
} else {
return(processed_dends)
}
}
#' Create a little dude to perform hierarchical clustering in a semi-global context using the [hclust] package.
#'
#' @param method A string to pass to [hclust] to tell it what kind of clustering to do.
#'
#' @returns A function that inputs a list of distance matrices and returns a list containing one vector per bin, whose element names are data point names and whose values are cluster labels (within each bin).
#' @details This clusterer determines cutting heights for bin dendrograms generated by [hclust] by first considering the tallest branches across all dendrograms; if all branch heights of a given dendrogram are below a threshold (10 percent of the global tallest), that dendrogram will be considered to describe a single cluster. Additionally, if the index of dispersion of the branch heights of a dendrogram are below 0.015, we will also consider it as describing a single cluster. If neither of these are true, then we will cut the dendrogram at its longest branch.
#' @export
#'
#' @examples
#' data = data.frame(x = sapply(1:100, function(x) cos(x)), y = sapply(1:100, function(x) sin(x)))
#' projx = data$x
#'
#' num_bins = 10
#' percent_overlap = 25
#'
#' cover = create_width_balanced_cover(min(projx), max(projx), num_bins, percent_overlap)
#'
#' create_1D_mapper_object(data, dist(data), projx, cover, hierarchical_clusterer("mcquitty"))
hierarchical_clusterer <- function(method) {
return(function(dist_mats) get_hierarchical_clusters(dist_mats, method))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.