Nothing
#' Evaluate, trim, and reweigh algorithms
#'
#' Evaluates algorithms on internal/external validation indices. Poor performing
#' algorithms can be trimmed from the ensemble. The remaining algorithms can be
#' given weights before use in consensus functions.
#'
#' This function always returns internal indices. If `ref.cl` is not `NULL`,
#' external indices are additionally shown. Relevant graphical displays are also
#' outputted. Algorithms are ranked across internal indices using Rank
#' Aggregation. Only the top `n` algorithms are kept, the rest are trimmed.
#'
#' @param data data matrix with rows as samples and columns as variables
#' @param cons.cl matrix of cluster assignments from consensus functions such as
#' `kmodes` and `majority_voting`
#' @param ref.cl reference class
#' @param k.method determines the method to choose k when no reference class is
#' given. When `ref.cl` is not `NULL`, k is the number of distinct classes of
#' `ref.cl`. Otherwise the input from `k.method` chooses k. The default is to
#' use the PAC to choose the best k(s). Specifying an integer as a
#' user-desired k will override the best k chosen by PAC. Finally, specifying
#' "all" will produce consensus results for all k. The "all" method is
#' implicitly performed when there is only one k used.
#' @param plot logical; if `TRUE`, `graph_all` is called
#' @param trim logical; if `TRUE`, algorithms that score low on internal indices
#' will be trimmed out
#' @param reweigh logical; if `TRUE`, after trimming out poor performing
#' algorithms, each algorithm is reweighed depending on its internal indices.
#' @param n an integer specifying the top `n` algorithms to keep after trimming
#' off the poor performing ones using Rank Aggregation. If the total number of
#' algorithms is less than `n` no trimming is done.
#' @inheritParams consensus_combine
#' @inheritParams PAC
#' @return `consensus_evaluate` returns a list with the following elements
#' * `k`: if `ref.cl` is not `NULL`, this is the number of distinct classes
#' in the reference; otherwise the chosen `k` is determined by the one giving
#' the largest mean PAC across algorithms
#' * `pac`: a data frame showing the PAC for each combination of algorithm
#' and cluster size
#' * `ii`: a list of data frames for all k showing internal evaluation
#' indices
#' * `ei`: a data frame showing external evaluation indices for `k`
#' * `trim.obj`: A list with 4 elements
#' * `alg.keep`: algorithms kept
#' * `alg.remove`: algorithms removed
#' * `rank.matrix`: a matrix of ranked algorithms for every internal
#' evaluation index
#' * `top.list`: final order of ranked algorithms
#' * `E.new`: A new version of a `consensus_cluster` data object
#' @export
#' @examplesIf rlang::is_installed("apcluster")
#' # Consensus clustering for multiple algorithms
#' set.seed(911)
#' x <- matrix(rnorm(500), ncol = 10)
#' CC <- consensus_cluster(x, nk = 3:4, reps = 10, algorithms = c("ap", "km"),
#' progress = FALSE)
#'
#' # Evaluate algorithms on internal/external indices and trim algorithms:
#' # remove those ranking low on internal indices
#' set.seed(1)
#' ref.cl <- sample(1:4, 50, replace = TRUE)
#' z <- consensus_evaluate(x, CC, ref.cl = ref.cl, n = 1, trim = TRUE)
#' str(z, max.level = 2)
consensus_evaluate <- function(data, ..., cons.cl = NULL, ref.cl = NULL,
k.method = NULL, plot = FALSE, trim = FALSE,
reweigh = FALSE, n = 5, lower = 0, upper = 1) {
# Assertions
if (!is.null(ref.cl))
assertthat::assert_that(is.integer(ref.cl), nrow(data) == length(ref.cl))
# Extract classes and matrices separately
E <- abind::abind(list(...), along = 3)
cl.mat <- consensus_combine(E, element = "class") %>%
purrr::map(as.data.frame)
cons.mat <- consensus_combine(E, element = "matrix")
# Calculate PAC and choose k
pac <- cons.mat %>%
purrr::modify_depth(2, PAC, lower = lower, upper = upper) %>%
purrr::map_df(data.frame, .id = "k")
k <- choose_k(ref.cl, k.method, pac)
# If matrix of cluster assignments from cons.funs given, cbind to cl.mat
an <- dimnames(E)[[3]]
if (!is.null(cons.cl)) {
assertthat::assert_that(is.matrix(cons.cl))
cl.mat <- purrr::map(cl.mat, cbind, cons.cl)
an <- c(an, colnames(cons.cl))
}
# Internal indices
ii <- cl.mat %>% purrr::map(ivi_table, data = data)
# External indices if reference is given
ei <- ref.cl %>%
purrr::when(
!is.null(.) ~ cl.mat %>%
magrittr::extract(match(k, names(.))) %>% # for chosen k only
purrr::map(evi_table, ref.cl = ref.cl),
TRUE ~ NULL
)
# Only trim if more than one algorithm and trim is specified
if (dim(E)[3] > 1 & trim) {
trim.obj <- purrr::map(k, consensus_trim, E = E, ii = ii,
k.method = k.method, reweigh = reweigh, n = n) %>%
purrr::transpose() %>%
purrr::map_at(c("alg.keep", "alg.remove"),
~ unlist(unique(.x)))
} else {
trim.obj <- list(alg.keep = an,
alg.remove = character(0),
rank.matrix = list(NULL),
top.list = list(NULL),
E.new = list(E))
}
# Reorder ii (and ei if not NULL) by top.list order if trimmed
if (all(purrr::map_lgl(trim.obj$top.list, ~ !is.null(.x)))) {
ii <- purrr::map2(ii, trim.obj$top.list,
~ dplyr::arrange(.x, match(.y, Algorithms)))
if (!is.null(ei))
ei <- purrr::map2(ei, trim.obj$top.list,
~ dplyr::arrange(.x, match(.y, Algorithms)))
}
if (plot) graph_all(E) # Graph all plotting functions
dplyr::lst(k, pac, ii, ei, trim.obj)
}
#' @param E consensus object from `consensus_evaluate`
#' @param ii internal indices object from `consensus_evaluate`
#' @param k chosen value(s) of k from `consensus_evaluate`
#' @noRd
consensus_trim <- function(E, ii, k, k.method, reweigh, n) {
# Extract ii only for chosen k
k <- as.character(k)
ii <- ii[[k]]
alg.all <- as.character(ii$Algorithms)
# Rank algorithms on internal indices, store rank matrix and top alg list
rank.obj <- consensus_rank(ii, n)
rank.matrix <- rank.obj$rank.matrix
top.list <- rank.obj$top.list
# Algorithms to keep and remove based on top-ranked list
alg.keep <- top.list %>% purrr::when(is.null(.) ~ alg.all,
TRUE ~ .[seq_len(n)])
alg.remove <- as.character(alg.all[!(alg.all %in% alg.keep)])
E.new <- E[, , alg.keep, k, drop = FALSE]
# Reweigh only if specified, more than 1 algorithm is kept, trimming done
if (reweigh && length(alg.keep) > 1 && !is.null(top.list)) {
E.new <- consensus_reweigh(E.new, rank.obj, alg.keep, alg.all)
}
# If k.method is to select "all", need to add suffixes to algorithms
if (!is.null(k.method) && k.method == "all") {
alg.keep <- paste_k(alg.keep, k)
alg.remove <- alg.remove %>%
purrr::when(length(.) > 0 ~ paste_k(., k), TRUE ~ .)
dimnames(E.new)[[3]] <- paste_k(dimnames(E.new)[[3]], k)
}
dplyr::lst(alg.keep, alg.remove, rank.matrix, top.list, E.new)
}
#' Rank based on internal validity indices
#' @noRd
consensus_rank <- function(ii, n) {
# Extract internal indices and remove NaN idx
ii.cc <- ii %>%
magrittr::extract(!names(.) %in% c("Algorithms") &
purrr::map_lgl(., ~ all(!is.nan(.x))))
# Which algorithm is the best for each index?
bests <- purrr::imap_int(ii.cc, ~ {
switch(
.y,
calinski_harabasz = which.max(.x),
dunn = which.max(.x),
gamma = which.max(.x),
c_index = which.min(.x),
davies_bouldin = which.min(.x),
sd = which.min(.x),
s_dbw = which.min(.x),
silhouette = which.max(.x),
Compactness = which.min(.x),
Connectivity = which.min(.x)
)
})
max.bests <- ii.cc %>%
magrittr::extract(purrr::map_int(., which.max) == bests) %>%
magrittr::multiply_by(-1)
min.bests <- ii.cc %>%
magrittr::extract(purrr::map_int(., which.min) == bests)
# Determine trimmed ensemble using rank aggregation
if (nrow(ii) <= n) {
rank.matrix <- top.list <- NULL
} else {
rank.matrix <- cbind(max.bests, min.bests) %>%
scale(center = FALSE, scale = TRUE) %>%
as.data.frame() %>%
purrr::map_dfc(~ ii$Algorithms[order(.x, sample(length(.x)))]) %>%
t()
top.list <- RankAggreg::RankAggreg(rank.matrix, ncol(rank.matrix),
method = "GA", verbose = FALSE)$top.list
}
dplyr::lst(max.bests, min.bests, rank.matrix, top.list)
}
#' Reweigh the algorithms in the ensemble if some were trimmed out
#' @noRd
consensus_reweigh <- function(E.new, rank.obj, alg.keep, alg.all) {
# Create multiples of each algorithm proportion to weight
multiples <- rank.obj %>%
magrittr::extract(c("max.bests", "min.bests")) %>%
dplyr::bind_cols() %>% # Recombine internal validity indices
dplyr::select_if(~ !any(is.infinite(.))) %>% # Remove indices with Inf
magrittr::extract(match(alg.keep, alg.all), ) %>% # Extract algs to keep
as.matrix() %>% # Cannot calculate proportions on data.frame
prop.table(2) %>% # Proportion for each index
rowMeans() %>% # Average proportion across indices
magrittr::multiply_by(100) %>% # Percentage
round(0) %>% # Round to closest whole number
magrittr::divide_by(Reduce(`gcd`, .)) %>% # Divide by gcd to minimize copies
purrr::set_names(alg.keep) # reassign names of algs to keep
# Generate multiples for each algorithm, updating dimnames 3rd dimension
E.new %>%
purrr::array_branch(c(3, 4)) %>%
purrr::map2(multiples, ~ rep(list(.x), .y)) %>%
purrr::map(abind::abind, along = 3) %>%
abind::abind(along = 3) %>%
abind::abind(along = 4) %>%
`dimnames<-`(append(dimnames(E.new)[-3],
list(purrr::flatten_chr(
purrr::imap(multiples, ~ rep(.y, .x))
)),
2))
}
#' Table of internal validity indices for each algorithm
#' @param cl.df data frame of cluster assignments for each algorithm
#' @param data data frame with rows as samples and columns as variables
#' @noRd
ivi_table <- function(cl.df, data) {
ndata <- apply(data, 2, function(x) as.numeric(as.character(x)))
data.frame(
Algorithms = colnames(cl.df),
calinski_harabasz = cl.df %>% purrr::map_dbl(clusterSim::index.G1, x = ndata),
dunn = cl.df %>% purrr::map_dbl(clValid::dunn, distance = stats::dist(ndata)),
gamma = cl.df %>% purrr::map_dbl(clusterSim::index.G2, d = stats::dist(ndata)),
c_index = cl.df %>% purrr::map_dbl(clusterSim::index.C, d = stats::dist(ndata)),
davies_bouldin = cl.df %>% purrr::map_dbl(~ clusterSim::index.DB(x = ndata, cl = ., q = 1)[["DB"]]),
sd = cl.df %>% purrr::map_dbl(SD, data = ndata),
s_dbw = cl.df %>% purrr::map_dbl(S_Dbw, data = ndata),
silhouette = cl.df %>% purrr::map_dbl(clusterSim::index.S, d = stats::dist(ndata)),
Compactness = cl.df %>% purrr::map_dbl(compactness, data = data),
Connectivity = cl.df %>% purrr::map_dbl(
~ suppressWarnings(clValid::connectivity(Data = ndata, clusters = .)))
) %>%
dplyr::mutate_all(list(~ structure(., names = colnames(cl.df))))
}
#' Table of external validity indices for each algorithm
#' @param cl.df data frame of cluster assignments for each algorithm
#' @param ref.cl reference class
#' @noRd
evi_table <- function(cl.df, ref.cl) {
data.frame(
Algorithms = colnames(cl.df),
hubert = cl.df %>% purrr::map_dbl(ev_hubert, cl2 = ref.cl),
jaccard = cl.df %>% purrr::map_dbl(ev_jaccard, cl2 = ref.cl),
mcnemar = cl.df %>% purrr::map_dbl(ev_mcnemar, cl2 = ref.cl),
rand = cl.df %>% purrr::map_dbl(ev_rand, cl2 = ref.cl),
NMI = cl.df %>% purrr::map_dbl(ev_nmi, ref.lab = ref.cl)
) %>%
dplyr::inner_join(
cl.df %>%
purrr::map_dfr(ev_confmat, ref.lab = ref.cl, .id = "Algorithms") %>%
dplyr::select(-.data$.estimator) %>%
tidyr::pivot_wider(names_from = ".metric", values_from = ".estimate"),
by = "Algorithms"
) %>%
dplyr::mutate_all(list(~ structure(., names = colnames(cl.df))))
}
#' Choose k using PAC
#' @noRd
choose_k <- function(ref.cl, k.method, pac) {
# If reference given, k is number of distinct classes
if (!is.null(ref.cl)) {
k <- dplyr::n_distinct(ref.cl)
# Otherwise k is chosen using the following methods
} else if (is.null(k.method)) {
k <- pac %>%
magrittr::use_series("k") %>%
magrittr::extract(pac[, -1, drop = FALSE] %>%
apply(2, which.min) %>%
unlist() %>%
table() %>%
magrittr::extract(magrittr::is_in(., max(.))) %>%
names() %>%
as.numeric()) %>%
as.integer() %>%
min()
} else if (any(nrow(pac) == 1 | k.method == "all")) {
k <- pac$k
} else if (purrr::is_scalar_integer(k.method) ||
purrr::is_scalar_double(k.method)) {
k <- k.method
} else {
stop("Invalid input. Check documentation for possible options.")
}
as.integer(k)
}
#' Add cluster size suffix to algorithms when k.method == "all"
#' @noRd
paste_k <- function(a, k) {
paste0(a, " k=", k)
}
#' Recursively find the greater common divisor of two numbers
#' @noRd
gcd <- function(x, y) {
r <- x %% y
ifelse(r, gcd(y, r), y)
}
#' Compactness Measure
#'
#' Compute the compactness validity index for a clustering result.
#'
#' This index is agnostic to any reference clustering results, calculating
#' cluster performance on the basis of compactness and separability. Smaller
#' values indicate a better clustering structure.
#'
#' @param data a dataset with rows as observations, columns as variables
#' @param labels a vector of cluster labels from a clustering result
#' @return the compactness score
#' @author Derek Chiu
#' @references MATLAB function `valid_compactness` by Simon Garrett in
#' LinkCluE
#' @export
#'
#' @examples
#' set.seed(1)
#' E <- matrix(rep(sample(1:4, 1000, replace = TRUE)), nrow = 100, byrow =
#' FALSE)
#' set.seed(1)
#' dat <- as.data.frame(matrix(runif(1000, -10, 10), nrow = 100, byrow = FALSE))
#' compactness(dat, E[, 1])
compactness <- function(data, labels) {
assertthat::assert_that(is.data.frame(data) || is.matrix(data),
length(labels) == nrow(data))
C <- sort(unique(labels))
cp <- 0
for (i in seq_along(C)) {
ind <- which(labels == C[i])
nk <- length(ind)
if (nk > 1) {
sum_d <- sum(stats::dist(data[ind, ], method = "euclidean"))
cp <- cp + (nk * (sum_d / (nk * (nk - 1) / 2)))
}
}
cp / length(labels)
}
#' SD validity index
#' @noRd
SD <- function(data, labels) {
scatt <- clv::clv.Scatt(data, labels)
dis <- clv::clv.Dis(cluster.center = scatt[["cluster.center"]])
alfa <- length(unique(labels))
clv::clv.SD(scatt[["Scatt"]], dis, alfa)
}
#' SDbw valdiity index
#' @noRd
S_Dbw <- function(data, labels) {
scatt <- clv::clv.Scatt(data, labels)
dens <- clv::clv.DensBw(data, labels, scatt, dist = "correlation")
clv::clv.SDbw(scatt[["Scatt"]], dens)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.