Nothing
#' K-modes
#'
#' Combine clustering results using K-modes.
#'
#' Combine clustering results generated using different algorithms and different
#' data perturbations by k-modes. This method is the categorical data analog of
#' k-means clustering. Complete cases are needed: i.e. no `NA`s. If the matrix
#' contains `NA`s those are imputed by majority voting (after class relabeling).
#'
#' @param E a matrix of clusterings with number of rows equal to the number of
#' cases to be clustered, number of columns equal to the clustering obtained
#' by different resampling of the data, and the third dimension are the
#' different algorithms. Matrix may already be two-dimensional.
#' @param is.relabelled logical; if `FALSE` the data will be relabelled using
#' the first clustering as the reference.
#' @param seed random seed for reproducibility
#' @return a vector of cluster assignments based on k-modes
#' @family consensus functions
#' @author Aline Talhouk
#' @references Luo, H., Kong, F., & Li, Y. (2006, August). Combining multiple
#' clusterings via k-modes algorithm. In International Conference on Advanced
#' Data Mining and Applications (pp. 308-315). Springer, Berlin, Heidelberg.
#' @export
#' @examples
#' data(hgsc)
#' dat <- hgsc[1:100, 1:50]
#' cc <- consensus_cluster(dat, nk = 4, reps = 6, algorithms = "pam", progress =
#' FALSE)
#' table(k_modes(cc[, , 1, 1, drop = FALSE], is.relabelled = FALSE))
k_modes <- function(E, is.relabelled = TRUE, seed = 1) {
# Flatten and fill in remaining missing entries with majority voting
mv <- majority_voting(E, is.relabelled = is.relabelled)
flat_E <- flatten_E(E, is.relabelled = is.relabelled) %>%
purrr::array_branch(margin = 1) %>%
purrr::map2(mv, ~ .x %>%
magrittr::inset(is.na(.), .y)) %>%
do.call(rbind, .) %>%
as.data.frame()
# k-modes clustering if there are multiple columns of assignments
if (ncol(flat_E) > 1) {
set.seed(seed)
k_modes <- klaR::kmodes(flat_E,
modes = max(purrr::map_int(flat_E,
dplyr::n_distinct)))
return(k_modes$cluster)
} else {
return(flat_E)
}
}
#' Majority voting
#'
#' Combine clustering results using majority voting.
#'
#' Combine clustering results generated using different algorithms and different
#' data perturbations by majority voting. The class of a sample is the cluster
#' label which was selected most often across algorithms and subsamples.
#'
#' @inheritParams k_modes
#' @return a vector of cluster assignments based on majority voting
#' @family consensus functions
#' @author Aline Talhouk
#' @export
#' @examples
#' data(hgsc)
#' dat <- hgsc[1:100, 1:50]
#' cc <- consensus_cluster(dat, nk = 4, reps = 6, algorithms = "pam", progress =
#' FALSE)
#' table(majority_voting(cc[, , 1, 1, drop = FALSE], is.relabelled = FALSE))
majority_voting <- function(E, is.relabelled = TRUE) {
# Flatten (and relabel) E then find most common element in every row
E %>%
flatten_E(is.relabelled = is.relabelled) %>%
apply(1, function(x) as.numeric(names(which.max(table(x)))))
}
#' Cluster-based Similarity Partitioning Algorithm (CSPA)
#'
#' Performs hierarchical clustering on a stack of consensus matrices to obtain
#' consensus class labels.
#'
#' @param E is an array of clustering results.
#' @param k number of clusters
#' @return cluster assignments for the consensus class
#' @family consensus functions
#' @author Derek Chiu
#' @references Strehl, A., & Ghosh, J. (2002). Cluster ensembles---a knowledge
#' reuse framework for combining multiple partitions. Journal of machine
#' learning research, 3(Dec), 583-617.
#' @export
#' @examples
#' data(hgsc)
#' dat <- hgsc[1:100, 1:50]
#' x <- consensus_cluster(dat, nk = 4, reps = 4, algorithms = c("hc", "diana"),
#' progress = FALSE)
#' CSPA(x, k = 4)
CSPA <- function(E, k) {
assertthat::assert_that(k %in% dimnames(E)[[4]])
consensus_combine(E, element = "matrix") %>%
magrittr::extract2(as.character(k)) %>%
Reduce(`+`, .) %>%
magrittr::divide_by(dim(E)[3]) %>%
stats::dist() %>%
hc(k = k)
}
#' Linkage Clustering Ensemble
#'
#' Generate a cluster assignment from a CTS, SRS, or ASRS similarity matrix.
#'
#' @param E is an array of clustering results. An error is thrown if there are
#' missing values. [impute_missing()] can be used beforehand.
#' @param k requested number of clusters
#' @param dc decay constant for CTS, SRS, or ASRS matrix
#' @param R number of repetitions for SRS matrix
#' @param sim.mat similarity matrix; choices are "cts", "srs", "asrs".
#' @family consensus functions
#' @author Johnson Liu
#' @return a vector containing the cluster assignment from either the CTS, SRS,
#' or ASRS similarity matrices
#' @export
#' @examples
#' data(hgsc)
#' dat <- hgsc[1:100, 1:50]
#' x <- consensus_cluster(dat, nk = 4, reps = 4, algorithms = c("km", "hc"),
#' progress = FALSE)
#' \dontrun{
#' LCE(E = x, k = 4, sim.mat = "asrs")
#' }
#'
#' x <- apply(x, 2:4, impute_knn, data = dat, seed = 1)
#' x_imputed <- impute_missing(x, dat, nk = 4)
#' LCE(E = x_imputed, k = 4, sim.mat = "cts")
LCE <- function(E, k, dc = 0.8, R = 10, sim.mat = c("cts", "srs", "asrs")) {
assertthat::assert_that(is.array(E), dc >= 0 && dc <= 1)
# Check that the Cluster matrix is complete otherwise return Error
if (anyNA(E)) stop("'E' must be complete for LCE algorithm.")
S <- switch(match.arg(sim.mat, c("cts", "srs", "asrs")),
cts = cts(E = E, dc = dc),
srs = srs(E = E, dc = dc, R = R),
asrs = asrs(E = E, dc = dc))
hc(stats::dist(S), k)
}
#' Latent Class Analysis
#'
#' Combine clustering results using latent class analysis.
#'
#' @inheritParams k_modes
#' @return a vector of cluster assignments based on LCA
#' @family consensus functions
#' @author Derek Chiu
#' @export
#' @examplesIf rlang::is_installed("poLCA")
#' data(hgsc)
#' dat <- hgsc[1:100, 1:50]
#' cc <- consensus_cluster(dat, nk = 4, reps = 6, algorithms = "pam", progress =
#' FALSE)
#' table(LCA(cc[, , 1, 1, drop = FALSE], is.relabelled = FALSE))
LCA <- function(E, is.relabelled = TRUE, seed = 1) {
if (!requireNamespace("poLCA", quietly = TRUE)) {
stop("Package \"poLCA\" is needed. Please install it.",
call. = FALSE)
} else {
if (!"package:MASS" %in% search()) attachNamespace("MASS")
flat_E <- E %>%
flatten_E(is.relabelled = is.relabelled) %>%
as.data.frame()
res <- paste(names(flat_E), collapse = ", ")
f <- stats::as.formula(paste0("cbind(", res, ") ~ 1"))
set.seed(seed)
M <- poLCA::poLCA(
formula = f,
data = flat_E,
nclass = max(flat_E, na.rm = TRUE),
verbose = FALSE,
nrep = 5,
na.rm = FALSE
)
M[["predclass"]]
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.