Nothing
#' Reorder or Group observations based on hierarchical clustering
#'
#' @description
#' `r lifecycle::badge('stable')`
#'
#' This function aligns observations within the layout according to a
#' hierarchical clustering tree, enabling reordering or grouping of elements
#' based on clustering results.
#'
#' @param data A matrix-like object. By default, it inherits from the layout
#' `matrix`.
#' @inheritParams align
#' @inheritParams hclust2
#' @param reorder_dendrogram A single boolean value indicating whether to
#' reorder the dendrogram based on the means. Alternatively, you can provide a
#' custom function that accepts an [`hclust`][stats::hclust] object and the data
#' used to generate the tree, returning either an [`hclust`][stats::hclust] or
#' [`dendrogram`][stats::as.dendrogram] object. Default is `FALSE`.
#' @param reorder_group A single boolean value, indicates whether we should do
#' Hierarchical Clustering between groups, only used when previous groups have
#' been established. Default: `FALSE`.
#' @param k An integer scalar indicates the desired number of groups.
#' @param h A numeric scalar indicates heights where the tree should be cut.
#' @param cutree A function used to cut the [`hclust`][stats::hclust] tree. It
#' should accept four arguments: the [`hclust`][stats::hclust] tree object,
#' `distance` (only applicable when `method` is a string or a function for
#' performing hierarchical clustering), `k` (the number of clusters), and `h`
#' (the height at which to cut the tree). By default,
#' [`cutree()`][stats::cutree()] is used.
#' @inheritSection align Discrete Axis Alignment
#' @seealso [`hclust2()`]
#' @examples
#' # align_hclust won't add a dendrogram
#' ggheatmap(matrix(rnorm(81), nrow = 9)) +
#' anno_top() +
#' align_hclust(k = 3L)
#' @export
align_hclust <- function(distance = "euclidean",
method = "complete",
use_missing = "pairwise.complete.obs",
reorder_dendrogram = FALSE,
reorder_group = FALSE,
k = NULL, h = NULL, cutree = NULL,
data = NULL, active = NULL) {
assert_active(active)
active <- update_active(active, new_active(use = FALSE))
.align_hclust(
align = AlignHclust,
distance = distance,
method = method,
use_missing = use_missing,
reorder_dendrogram = reorder_dendrogram,
reorder_group = reorder_group,
k = k, h = h, cutree = cutree, data = data, active = active
)
}
.align_hclust <- function(align, ..., plot = NULL,
distance = "euclidean",
method = "complete",
use_missing = "pairwise.complete.obs",
reorder_dendrogram = FALSE,
reorder_group = FALSE,
k = NULL, h = NULL, cutree = NULL,
data = NULL, schemes = NULL, active = NULL,
call = caller_call()) {
reorder_dendrogram <- allow_lambda(reorder_dendrogram)
if (!rlang::is_bool(reorder_dendrogram) &&
!is.null(reorder_dendrogram) &&
!is.function(reorder_dendrogram)) {
cli_abort(
"{.arg reorder_dendrogram} must be a single boolean value or a function",
call = call
)
}
assert_number_whole(k, allow_null = TRUE, call = call)
assert_number_decimal(h, allow_null = TRUE, call = call)
assert_bool(reorder_group, call = call)
cutree <- allow_lambda(cutree)
assert_(cutree, is.function, "a function", allow_null = TRUE, call = call)
if (inherits(method, "hclust")) {
if (vec_size(.subset2(method, "order")) == 0L) {
cli_abort("{.cls hclust} defined in {.arg method} cannot be empty",
call = call
)
}
} else if (inherits(method, "dendrogram")) {
if (stats::nobs(method) == 0L) {
cli_abort(
"{.cls dendrogram} defined in {.arg method} cannot be empty",
call = call
)
}
}
if (isTRUE(reorder_dendrogram)) {
reorder_dendrogram <- function(tree, data) {
if (!inherits(tree, "dendrogram")) {
tree <- stats::as.dendrogram(tree)
}
reorder(x = tree, wts = rowMeans(data), agglo.FUN = mean)
}
} else if (is.function(reorder_dendrogram)) {
user_reorder <- reorder_dendrogram
reorder_dendrogram <- function(tree, data) {
# we ensure, what we input for user is a `hclust` object.
if (!inherits(tree, "hclust")) tree <- stats::as.hclust(tree)
ans <- user_reorder(tree, data)
if (!inherits(ans, "hclust") &&
!inherits(ans, "dendrogram")) {
cli_abort(
"{.fn reorder_dendrogram} must return a {.cls hclust} or {.cls dendrogram} object",
call = call
)
}
ans
}
}
align(
align = align,
distance = distance, method = method,
use_missing = use_missing,
reorder_dendrogram = reorder_dendrogram,
reorder_group = reorder_group,
k = k, h = h, cutree = cutree,
active = active,
..., # additional fields to be added, used by align_dendro
schemes = schemes %||% default_schemes(),
data = data,
plot = plot,
call = call
)
}
#' @importFrom ggplot2 ggproto aes
AlignHclust <- ggproto("AlignHclust", CraftAlign,
interact_layout = function(self, layout) {
if (inherits(self$method, "hclust") ||
inherits(self$method, "dendrogram")) {
layout <- ggproto_parent(CraftAlign, self)$interact_layout(layout)
if (inherits(self$method, "hclust")) {
nobs <- vec_size(.subset2(self$method, "order"))
} else {
nobs <- stats::nobs(self$method)
}
if (is.null(layout_nobs <- .subset2(layout@design, "nobs"))) {
layout@design["nobs"] <- list(nobs)
} else {
assert_mismatch_nobs(self, layout_nobs, nobs, arg = "method")
}
} else {
layout <- ggproto_parent(AlignOrder2, self)$interact_layout(layout)
}
# initialize the internal parameters
self$multiple_tree <- FALSE
self$height <- NULL
self$panel <- NULL
layout
},
compute = function(self, panel, index) {
if (!is.null(self$data) && vec_size(self$data) < 2L) {
cli_abort(c(
"Cannot do Hierarchical Clustering",
i = "must have >= 2 observations to cluster"
), call = self$call)
}
# if the old panel exist, we do sub-clustering
if (!is.null(panel) && is.null(self$k) && is.null(self$h) &&
is.null(self$cutree)) {
# in this way, we prevent sub-clustering
if (inherits(self$method, "hclust") ||
inherits(self$method, "dendrogram")) {
cli_abort(
"{.arg method} cannot be a {.cls hclust} or {.cls dendrogram} when previous layout panel groups exist",
call = self$call
)
}
children <- vector("list", nlevels(panel))
names(children) <- levels(panel)
labels <- vec_names(self$data)
# we do clustering within each group ---------------
for (g in levels(panel)) {
idx <- which(panel == g)
gdata <- vec_slice(self$data, idx)
if (vec_size(gdata) == 1L) {
children[[g]] <- tree_one_node(idx, .subset(labels, idx))
} else {
child <- stats::as.dendrogram(hclust2(
gdata,
distance = self$distance,
method = self$method,
use_missing = self$use_missing
))
# we restore the actual index of the original matrix
child <- stats::dendrapply(child, function(x) {
if (stats::is.leaf(x)) {
ans <- .subset(idx, x)
attributes(ans) <- attributes(x)
ans
} else {
x
}
})
if (is.function(self$reorder_dendrogram)) {
child <- self$reorder_dendrogram(child, gdata)
}
children[[g]] <- child
}
}
return(children) # can be a list of `dendrogram` or `hclust` or mix
}
hclust2(self$data, self$distance, self$method, self$use_missing)
},
#' @importFrom stats order.dendrogram
align = function(self, panel, index) {
statistics <- self$statistics
if (!is.null(panel) && is.null(self$k) && is.null(self$h) &&
is.null(self$cutree)) {
# reordering the dendrogram ------------------------
if (nlevels(panel) > 1L && self$reorder_group) {
parent_levels <- levels(panel)
parent_data <- t(sapply(parent_levels, function(g) {
colMeans(vec_slice(self$data, panel == g), na.rm = TRUE)
}))
rownames(parent_data) <- parent_levels
parent <- hclust2(
parent_data,
distance = self$distance,
method = self$method,
use_missing = self$use_missing
)
# reorder parent based on the parent tree
if (is.function(self$reorder_dendrogram)) {
parent <- self$reorder_dendrogram(parent, parent_data)
}
# we always ensure the parent is a dendrogram
# since we'll call `merge_dendrogram()` which requires a
# dendrogram
parent <- stats::as.dendrogram(parent)
panel <- factor(panel, parent_levels[order.dendrogram(parent)])
# we don't cutree, so we won't draw the height line
# self$draw_params$height <- attr(ans, "cutoff_height")
} else {
parent <- NULL
}
# merge children tree ------------------------------
if (nlevels(panel) == 1L) {
statistics <- .subset2(statistics, 1L)
} else if (isTRUE(self$merge_dendro)) {
# we have a function named merge_dendrogram(), so we use
# `merge_dendro` as the argument name
# `merge_dendrogram` will follow the order of the parent
statistics <- lapply(statistics, stats::as.dendrogram)
statistics <- merge_dendrogram(parent, statistics)
} else {
# if no parent tree, and we havn't merged the tree
# we must manually reorder the dendrogram
if (!is.null(parent)) {
statistics <- .subset(statistics, levels(panel))
}
self$multiple_tree <- TRUE
}
} else {
# hclust2() will attach the distance used
distance <- attr(statistics, "distance")
if (is.function(self$reorder_dendrogram)) {
statistics <- self$reorder_dendrogram(statistics, self$data)
}
if (!is.null(self$k) || !is.null(self$h) || !is.null(self$cutree)) {
if (is.null(cutree <- self$cutree)) {
cutree <- function(tree, dist, k, h) {
if (!is.null(k)) {
stats::cutree(tree, k = k)
} else {
stats::cutree(tree, h = h)
}
}
}
# we need `hclust` object to cutree
statistics <- stats::as.hclust(statistics)
panel <- cutree(statistics, distance, self$k, self$h)
# For `cutree`, we always respect the height user specified
# For user defined function, we always calculate
# height from the number of `panels`
if (isTRUE(self$plot_cut_height)) {
self$height <- self$h %||% cutree_k_to_h(
statistics, vec_unique_count(panel)
)
}
}
}
# save the modified `statistics`
self$statistics <- statistics
if (self$multiple_tree) {
index <- unlist(lapply(statistics, order2), FALSE, FALSE)
} else {
index <- order2(statistics)
}
# reorder panel factor levels to following the dendrogram order
if (!is.null(panel)) {
panel <- factor(panel, unique(panel[index]))
# save panel information, in case of user change it
self$panel <- panel
}
list(panel, index)
},
summary_align = function(self, ...) {
c(TRUE, !is.null(self$k) || !is.null(self$h) || !is.null(self$cutree))
}
)
#' Generate Tree Structures with Hierarchical Clustering
#'
#' @param matrix A numeric matrix, or data frame.
#' @param distance A string of distance measure to be used. This must be one of
#' `"euclidean"`, `"maximum"`, `"manhattan"`, `"canberra"`, `"binary"` or
#' `"minkowski"`. Correlation coefficient can be also used, including
#' `"pearson"`, `"spearman"` or `"kendall"`. In this way, `1 - cor` will be used
#' as the distance. In addition, you can also provide a [`dist`][stats::dist]
#' object directly or a function return a [`dist`][stats::dist] object. Use
#' `NULL`, if you don't want to calculate the distance.
#' @param method A string of the agglomeration method to be used. This should be
#' (an unambiguous abbreviation of) one of `"ward.D"`, `"ward.D2"`, `"single"`,
#' `"complete"`, `"average"` (= UPGMA), `"mcquitty"` (= WPGMA), `"median"` (=
#' WPGMC) or `"centroid"` (= UPGMC). You can also provide a function which
#' accepts the calculated distance (or the input matrix if `distance` is `NULL`)
#' and returns a [`hclust`][stats::hclust] object. Alternative, you can supply
#' an object which can be coerced to [`hclust`][stats::hclust].
#' @param use_missing An optional character string giving a method for computing
#' covariances in the presence of missing values. This must be (an abbreviation
#' of) one of the strings `"everything"`, `"all.obs"`, `"complete.obs"`,
#' `"na.or.complete"`, or `"pairwise.complete.obs"`. Only used when `distance`
#' is a correlation coefficient string.
#' @seealso
#' - [cor()][stats::cor]
#' - [dist()][stats::dist]
#' - [hclust()][stats::hclust]
#' @examples
#' hclust2(dist(USArrests), method = "ward.D")
#' @return A [hclust][stats::hclust] object.
#' @importFrom rlang is_string try_fetch
#' @export
hclust2 <- function(matrix, distance = "euclidean", method = "complete",
use_missing = "pairwise.complete.obs") {
method <- allow_lambda(method)
if (!is_string(method) && !is.function(method)) {
ans <- try_fetch(
stats::as.hclust(method),
error = function(cnd) {
cli_abort(paste(
"{.arg method} can only be a {.cls string},",
"{.cls function} or an object which can be coerced to",
"{.cls hclust}."
), parent = cnd)
}
)
return(ans)
}
if (is.null(distance)) {
d <- matrix
} else {
d <- make_dist(matrix, distance, use_missing)
}
if (is_string(method)) {
ans <- stats::hclust(d, method = method)
} else if (is.function(method)) {
ans <- method(d)
ans <- try_fetch(
stats::as.hclust(ans),
error = function(cnd) {
cli_abort(paste(
"{.arg method} must return an object which",
"can be coerced to {.cls hclust}"
), parent = cnd)
}
)
}
if (!is.null(distance)) attr(ans, "distance") <- d
ans
}
#' @importFrom rlang arg_match0
make_dist <- function(matrix, distance, use_missing,
arg = caller_arg(distance), call = caller_call()) {
distance <- allow_lambda(distance)
if (is_string(distance)) {
distance <- arg_match0(distance, c(
"euclidean", "maximum", "manhattan", "canberra",
"binary", "minkowski", "pearson", "spearman", "kendall"
), arg_nm = arg, error_call = call)
d <- switch(distance,
euclidean = ,
maximum = ,
manhattan = ,
canberra = ,
binary = ,
minkowski = stats::dist(matrix, method = distance),
pearson = ,
spearman = ,
kendall = stats::as.dist(
1 - stats::cor(t(matrix), use = use_missing, method = distance)
),
cli_abort("Unsupported {.arg {arg}} specified", call = call)
)
} else if (is.function(distance)) {
if (!inherits(d <- distance(matrix), "dist")) {
cli_abort(
"{.arg {arg}} must return a {.cls dist} object",
call = call
)
}
} else if (inherits(distance, "dist")) {
d <- distance
} else {
cli_abort(paste(
"{.arg {arg}} can only be a {.cls string}, {.cls dist}",
"object, or a {.cls function} return {.cls dist}"
), call = call)
}
d
}
cutree_k_to_h <- function(tree, k) {
if (is.null(n1 <- nrow(tree$merge)) || n1 < 1) {
cli_abort("invalid {.arg tree} ({.field merge} component)")
}
n <- n1 + 1
if (is.unsorted(tree$height)) {
cli_abort(
"the 'height' component of 'tree' is not sorted (increasingly)"
)
}
mean(tree$height[c(n - k, n - k + 1L)])
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.