#' Dispersion Separability Criterion
#'
#' Used by TCGA Batch Effects Viewer
#' \url{https://bioinformatics.mdanderson.org/public-software/tcga-batch-effects/}.
#' Based on \url{https://www.jmlr.org/papers/v5/dy04a.html}.
#' Can be used to measure batch effects.
#'
#' @param data_matrix numeric matrix with samples on columns
#' @param batch_label categorical variable; must be a vector with one element
#' per sample
#'
#' @return Returns the DSC of \code{data_matrix} with respect to
#' \code{batch_label} as a scalar value
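#'
#' @examples
#' # A minimal sketch with simulated data (not from the package): 50 features
#' # and 20 samples in two batches, where batch "b" is given a mean shift.
#' set.seed(1)
#' dat <- matrix(
#'   rnorm(50 * 20),
#'   nrow = 50,
#'   dimnames = list(NULL, paste0("s", 1:20)))
#' batch <- rep(c("a", "b"), each = 10)
#' dat[, batch == "b"] <- dat[, batch == "b"] + 1
#' # Larger values indicate stronger separation between batches
#' DSC(dat, batch)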
#' @export
DSC <- function(data_matrix, batch_label) {
if (is.null(names(batch_label))) names(batch_label) <- colnames(data_matrix)
n <- table(batch_label)
p <- n / sum(n)
# Traces of within-batch covariance matrices
Sw <- sapply(
split(names(batch_label), batch_label),
function(x) sum(apply(data_matrix[, x, drop = FALSE], 1, var)))
Dw <- sqrt(sum(Sw * p[names(Sw)], na.rm = TRUE))
mu <- lapply(
split(names(batch_label), batch_label),
function(x) apply(data_matrix[, x, drop = FALSE], 1, mean))
# Global mean, computed as the proportion-weighted average of the batch means
# (marginally more efficient than M <- apply(data_matrix, 1, mean))
M <- rep(0, nrow(data_matrix))
for (i in names(mu)) {
M <- M + p[[i]] * mu[[i]]
}
Sb <- c()
for (i in names(mu)) {
Sb[i] <- sum((mu[[i]] - M)**2)
}
Db <- sqrt(sum(Sb * p[names(Sb)]))
return(Db/Dw)
}
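#' Silhouette with undersized clusters reassigned
#'
#' Internal helper: every sample belonging to a cluster smaller than
#' \code{min_size} (as a fraction of all samples) is reassigned to the cluster
#' of its nearest neighbour among samples in sufficiently large clusters,
#' after which silhouette widths are computed with
#' \code{\link[cluster]{silhouette}}.
#'
#' @param x vector of cluster labels
#' @param diss dissimilarities between samples, as accepted by
#'   \code{\link[cluster]{silhouette}}
#' @param min_size minimum cluster size as a fraction of the number of samples
#'
#' @return A silhouette object, or a \code{data.frame} with an \code{NA}
#'   \code{sil_width} column if the silhouette cannot be computed.
#' @keywords internal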
#' @importFrom cluster silhouette
silhouette_adjusted <- function(
x,
diss,
min_size = 0.05
) {
csize <- table(x)
under_size <- which(csize/length(x) < min_size)
diss_main <- as.matrix(diss)[,!x %in% names(csize)[under_size]]
x_main <- x[!x %in% names(csize)[under_size]]
out <- x
for (i in under_size) {
out[x == names(csize)[i]] <- x_main[
apply(
diss_main[x == names(csize)[i], , drop = FALSE],
1,
which.min
)
]
}
silh <- cluster::silhouette(out, diss)
if (length(silh) == 1 && is.na(silh)) {
silh <- data.frame(sil_width = NA)
}
return(silh)
}
#' Clustering stability evaluation
#'
#' Performs stability analysis on cross-validated clusterings.
#'
#' The default settings work with the 'clusters' output of
#' \code{\link{subsample_clustering_evaluation}}.
#'
#' @param clusters clustering \code{data.frame} such as returned by
#' \code{\link{subsample_clustering_evaluation}}
#' @param by vector of column names that identify a clustering setting; these
#' columns are kept in the output
#' @param parallel number of threads
#' @param reference_fold fold number that corresponds to reference which other
#' folds are compared against, inferred from input by default
#' @param ... extra arguments are ignored
#'
#' @return Returns a \code{data.frame} where each row corresponds to clustering
#' stability with respect to kept column variables
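#'
#' @examples
#' \dontrun{
#' # A hedged sketch with a hand-made toy 'clusters' table; the intended input
#' # is the 'clusters' output of subsample_clustering_evaluation. Fold 3 acts
#' # as the reference fold because it never appears in cv_index.
#' toy <- data.frame(
#'   id = rep(paste0("s", 1:10), 3),
#'   cluster = rep(c(1, 1, 2, 2, 1, 2, 1, 2, 1, 2), 3),
#'   fold = rep(1:3, each = 10),
#'   cv_index = rep(rep(1:2, each = 5), 3),
#'   k = 2,
#'   m = "kmeans")
#' stability_evaluation(toy, parallel = 1)
#' }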
#' @export
#' @importFrom foreach foreach %dopar%
#' @importFrom data.table data.table is.data.table rbindlist setDT setDTthreads
stability_evaluation <- function(
clusters,
by = c("datname", "drname", "run", "k", "m"),
#by2 = c("fold"),
parallel = 1,
reference_fold = NULL,
...
) {
reference_fold <- get_reference_fold(clusters, reference_fold)
by2 <- c("fold")
# Function to be applied for each result
f1 <- function(clust, clustref) {
if (length(clust) != length(clustref)) {
stop("Number of given references does not match number of given inputs.")
}
if (length(clust) > 0) {
jsc <- c()
nmi <- c()
ari <- c()
for (i in 1:length(clust)) {
jsc[i] <- jaccard_similarity(clust[[i]], clustref[[i]])
nmi[i] <- igraph::compare(
clust[[i]],
clustref[[i]],
method = "nmi"
)
ari[i] <- igraph::compare(
clust[[i]],
clustref[[i]],
method = "adjusted.rand"
)
}
} else {
jsc <- NA
nmi <- NA
ari <- NA
}
return(list(jsc = jsc, nmi = nmi, ari = ari))
}
# Function to be applied for method combinations in clustering table
f2 <- function(x, ref_i) {
data.table::setDTthreads(1)
ref <- x[fold == ref_i,]
colnames(ref)[colnames(ref) == "cluster"] <- "reference_cluster"
ref$fold <- NULL
nonref <- x[fold != ref_i,]
if ("cv_index" %in% colnames(x)) {
nonref$test_ind <- nonref$cv_index == nonref$fold
} else {
# Assume all samples are from the training set
nonref$test_ind <- FALSE
}
ref_cols <- c("id", by2[by2 != "fold"], "reference_cluster")
if (is(ref, "data.table")) {
nonref <- merge(nonref, ref[, ..ref_cols], by = c("id", by2[by2 != "fold"]))
} else {
nonref <- merge(nonref, ref[, ref_cols], by = c("id", by2[by2 != "fold"]))
}
if (any(!nonref$test_ind)) {
if(is(nonref, "data.table")) {
train_nonref <- split(
nonref$cluster[!nonref$test_ind],
nonref[!nonref$test_ind, ..by2])
train_ref <- split(
nonref$reference_cluster[!nonref$test_ind],
nonref[!nonref$test_ind, ..by2])
train_res <- f1(train_nonref, train_ref)
} else {
train_nonref <- split(
nonref$cluster[!nonref$test_ind],
nonref[!nonref$test_ind, by2])
train_ref <- split(
nonref$reference_cluster[!nonref$test_ind],
nonref[!nonref$test_ind, by2])
train_res <- f1(train_nonref, train_ref)
}
} else {
train_res <- list()
}
if (any(nonref$test_ind)) {
if(is(nonref, "data.table")) {
test_ref <- split(
nonref$cluster[nonref$test_ind],
nonref[nonref$test_ind, ..by2])
test_nonref <- split(
nonref$reference_cluster[nonref$test_ind],
nonref[nonref$test_ind, ..by2])
test_res <- f1(test_nonref, test_ref)
} else {
test_ref <- split(
nonref$cluster[nonref$test_ind],
nonref[nonref$test_ind, by2])
test_nonref <- split(
nonref$reference_cluster[nonref$test_ind],
nonref[nonref$test_ind, by2])
test_res <- f1(test_nonref, test_ref)
}
} else {
test_res <- list()
}
out_f2 <- data.table::data.table(
fold = names(train_ref),
train_jsc = train_res$jsc,
train_nmi = train_res$nmi,
train_ari = train_res$ari,
test_jsc = test_res$jsc,
test_nmi = test_res$nmi,
test_ari = test_res$ari)
return(out_f2)
}
by <- by[by %in% colnames(clusters)]
temp_list <- split_by_safe(clusters, by)
parallel_clust <- setup_parallelization(parallel)
stability <- tryCatch(
foreach(
temp = temp_list,
.combine = function(...) data.table::rbindlist(list(...)),
#.export = c(),
#.packages = c("data.table", "igraph"),
.multicombine = TRUE,
.maxcombine = max(length(temp_list), 2)
) %dopar% {
out <- tryCatch({
out <- f2(temp, reference_fold)
for (j in by) {
out[[j]] <- temp[[j]][1]
}
out}, error = function(e) {warning(e); return(NULL)})
out
},
finally = close_parallel_cluster(parallel_clust))
return(as.data.frame(stability))
}
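# Infer the reference fold: the fold label in 'clusters' that does not appear
# in cv_index.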
get_reference_fold <- function(
clusters,
reference_fold = NULL
) {
if (is.null(reference_fold)) {
if (!"cv_index" %in% colnames(clusters)) {
stop("Please define the reference fold number.")
} else {
all_folds <- unique(clusters$fold)
non_ref_fold_inds <- all_folds %in% unique(clusters$cv_index)
reference_fold <- all_folds[!non_ref_fold_inds]
}
}
return(reference_fold)
}
#' @describeIn stability_evaluation Stability evaluation with Proportion of
#' Ambiguously Clustered pairs (PAC)
#'
#' @export
stability_evaluation_pac <- function(
clusters,
by = c("datname", "drname", "k", "m"),
parallel = 1,
reference_fold = NULL,
...
) {
by <- by[by %in% colnames(clusters)]
dicer_package <- requireNamespace("diceR", quietly = TRUE)
if (!dicer_package) stop("Using PAC requires the optional package 'diceR'.")
if (is.null(reference_fold)) {
if (!"cv_index" %in% colnames(clusters)) {
warning("Could not determine the reference fold number.")
reference_fold <- c()
} else {
reference_fold <- unique(clusters[["fold"]])
reference_fold <- reference_fold[!reference_fold %in% unique(clusters$cv_index)]
}
if (inherits(clusters, "data.table")) {
nonref_ind <- !clusters[,fold] %in% reference_fold
clusters <- clusters[nonref_ind]
} else {
clusters <- clusters[!(clusters[["fold"]] %in% reference_fold),]
}
}
temp_list <- split_by_safe(clusters, by)
by2 <- c("fold", "run")
by2 <- by2[by2 %in% colnames(clusters)]
parallel_clust <- setup_parallelization(parallel)
stability_pac <- tryCatch(foreach(
temp = temp_list,
.combine = function(...) data.table::rbindlist(list(...)),
#.export = c("reference_fold"),
#.packages = c("data.table", "diceR"),
.multicombine = TRUE,
.maxcombine = max(length(temp_list), 2)) %dopar% {
uids <- unique(temp[,id])
samples <- 1:length(uids)
names(samples) <- uids
clusti <- split(temp, by = by2)
prototype <- rep(NA, length(samples))
clusti <- lapply(
clusti,
function(x) {
prototype[samples[x[,id]]] <- x[,cluster]
return(prototype)
}
)
conmat <- diceR::consensus_matrix(Reduce(cbind, clusti))
paci <- diceR::PAC(conmat)
data.frame(as.data.frame(temp)[1, by], PAC = paci)
#data.frame(temp[1, by], PAC = paci)
}, finally = close_parallel_cluster(parallel_clust))
return(stability_pac)
}
#' Feature batch-effect analysis
#'
#' Combines three batch-effect estimators for numeric features. \cr
#' Can also be used for other purposes, e.g., as a multivariate association index.
#'
#' @param x data matrix, samples on columns
#' @param class vector of categorical values, or a data.frame or list whose
#' columns are categorical variables
#' @param n_pc_max maximum number of principal components to analyze
#' @param ... extra arguments are ignored
#'
#' @return Returns a \code{list} containing the following elements:
#' \describe{
#' \item{associations}{
#' \code{data.frame} summary of \code{\link[kBET]{batch_sil}},
#' \code{\link[kBET]{pcRegression}} (\code{$maxR2}) and
#' \code{\link{DSC}} computed for each column of \code{class}
#' }
#' \item{eigenvalues}{
#' PC eigenvalues and percentage of explained variance from
#' \code{\link[FactoMineR]{PCA}}
#' }
#' \item{pc_wise_silhouettes}{
#' for each column of \code{class}, the average silhouette width computed
#' separately on each principal component
#' }
#' }
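#'
#' @examples
#' # A minimal sketch with simulated data (not from the package): 20 features
#' # and 30 samples with a two-level batch variable shifted in batch "b".
#' set.seed(1)
#' x <- matrix(
#'   rnorm(20 * 30),
#'   nrow = 20,
#'   dimnames = list(paste0("g", 1:20), paste0("s", 1:30)))
#' batch <- rep(c("a", "b"), each = 15)
#' x[, batch == "b"] <- x[, batch == "b"] + 1
#' res <- feature_associations(x, data.frame(batch = batch), n_pc_max = 5)
#' res$associations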
#' @export
#' @importFrom FactoMineR PCA
#' @importFrom kBET batch_sil
#' @importFrom kBET pcRegression
feature_associations <- function(
x,
class,
n_pc_max = 10,
...
) {
if (!is(class, "list") & is.null(dim(class))) {
class <- data.frame(class = as.character(class))
}
class <- lapply(class, as.factor)
class <- class[sapply(class, function(x) length(levels(x)))>1]
pca_silh <- c()
pca_reg <- list()
DSC_res <- c()
dat_pca <- FactoMineR::PCA(
t(x),
scale.unit = FALSE,
ncp = min(c(n_pc_max, dim(x))),
graph = FALSE)
for (i in seq_along(class)) {
# PCA based
pca_silh[i] <- kBET::batch_sil(
list(x = dat_pca$ind$coord[!is.na(class[[i]]),]),
class[[i]][!is.na(class[[i]])],
nPCs = min(c(n_pc_max, dim(x))))
pca_reg[[i]] <- kBET::pcRegression(
list(
x = dat_pca$ind$coord[!is.na(class[[i]]),],
sdev = sqrt(dat_pca$eig[,"eigenvalue"])
),
class[[i]][!is.na(class[[i]])],
n_top = min(c(n_pc_max, dim(x))))
# Other
DSC_res[i] <- DSC(
x[,!is.na(class[[i]])],
class[[i]][!is.na(class[[i]])])
}
if (is.null(names(class))) names(class) <- 1:length(class)
out <- data.frame(
class = names(class),
PC_silhouette = pca_silh,
PC_R2_max = sapply(pca_reg, function(a) a[["maxR2"]]),
DSC = DSC_res)
npc <- min(c(n_pc_max, dim(x)))
pc_wise_silhouettes <- list()
for (i in names(class)) {
pc_wise_silhouettes[[i]] <- sapply(
1:npc,
function(j) {
mean(
cluster::silhouette(
as.integer(class[[i]][!is.na(class[[i]])]),
dist(dat_pca$ind$coord[!is.na(class[[i]]), j, drop = FALSE])
)[,"sil_width"]
)
}
)
}
return(list(
associations = out,
eigenvalues = dat_pca$eig,
pc_wise_silhouettes = pc_wise_silhouettes))
}
#' Association analysis for clustering results
#'
#' @param clusters data.frame with columns "id" and "cluster".
#' @param association_data data.frame with association variables; row names
#' should match "id" in \code{clusters}.
#'
#' @return A data.frame of NMI, ARI and chi-squared test p-values for
#' categorical variables, as well as analysis of variance and Kruskal-Wallis
#' test p-values for continuous variables.
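#'
#' @examples
#' # A minimal sketch with toy data (not from the package): 12 samples in
#' # three clusters, one categorical and one numeric association variable.
#' set.seed(1)
#' toy_clusters <- data.frame(id = paste0("s", 1:12), cluster = rep(1:3, each = 4))
#' toy_assoc <- data.frame(
#'   group = factor(rep(c("x", "y"), 6)),
#'   value = rnorm(12),
#'   row.names = toy_clusters$id)
#' cluster_associations(toy_clusters, toy_assoc)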
#' @export
cluster_associations <- function(
clusters,
association_data
) {
association_data_matched <- association_data[
match(clusters$id, rownames(association_data)),
, drop = FALSE]
out <- list()
for (i in 1:ncol(association_data)) {
association_var <- association_data_matched[,i]
nna_ind <- which(!is.na(association_var))
n_samples <- length(nna_ind)
valid <- n_samples > 1
if (valid) {
n_categories <- length(unique(association_var[nna_ind]))
n_clusters <- length(unique(clusters$cluster[nna_ind]))
valid <- n_categories > 1 &
n_clusters > 1
}
if(valid) {
if (is(association_var, "character") || is(association_var, "factor")) {
valid <- n_categories < n_samples &
n_clusters < n_samples
if (valid) {
if (is(association_var, "factor")) {
clean_var <- droplevels(association_var[nna_ind])
} else {
clean_var <- association_var[nna_ind]
}
out[[i]] <- data.frame(
nmi = igraph::compare(
clusters$cluster[nna_ind],
clean_var,
method = "nmi"),
ari = igraph::compare(
clusters$cluster[nna_ind],
clean_var,
method = "adjusted.rand"))
temp <- tryCatch(
suppressWarnings(
chisq.test(clusters$cluster[nna_ind], clean_var)
), error = function(e) NULL)
if (is.null(temp)) {
out[[i]]$chisq.p <- NA
} else {
out[[i]]$chisq.p <- temp$p.value
}
colnames(out[[i]]) <- paste0(
colnames(association_data)[i], ".", colnames(out[[i]]))
}
} else if (is(association_var, "numeric") || is(association_var, "integer")) {
# Treat cluster labels as groups rather than as a numeric covariate
anova_res <- stats::aov(
association_var[nna_ind] ~ factor(clusters$cluster[nna_ind]))
kruskal_res <- stats::kruskal.test(
association_var[nna_ind],
clusters$cluster[nna_ind])
out[[i]] <- data.frame(
aov.p = summary(anova_res)[[1]][1,"Pr(>F)"],
kruskal.p = kruskal_res$p.value)
colnames(out[[i]]) <- paste0(
colnames(association_data)[i], ".", colnames(out[[i]]))
} else {
warning(paste0(
"Unknown data type in association data column ",
colnames(association_data)[i], "."))
}
}
}
return(Reduce("cbind", out[!sapply(out, is.null)]))
}
#' Cross-validated cluster association analysis
#'
#' Runs \code{\link{cluster_associations}} in parallel.
#'
#' Assumes that a parallel backend has been registered for \code{foreach}.
#'
#' @param clusters A data.table or data.frame with clustering information.
#' @param association_data A data.frame with association variables (categoric
#' or numeric).
#' @param by Column names that identify a single clustering result.
#' @param parallel Number of parallel threads.
#' @param ... Extra arguments are ignored.
#'
#' @return \code{data.table} of association results for each clustering result
#' @export
subsample_association_analysis <- function(
clusters,
association_data,
by = c("run", "fold", "datname", "drname", "k", "m"),
parallel = 1,
...
) {
by <- by[by %in% colnames(clusters)]
clust_list <- split_by_safe(clusters, by)
parallel_clust <- setup_parallelization(parallel)
out <- tryCatch(
foreach(
clust = clust_list,
.combine = function(...) data.table::rbindlist(list(...), fill = TRUE),
#.export = c("cluster_associations"),
#.packages = c(),
.multicombine = TRUE,
.maxcombine = max(length(clust_list), 2)
) %dopar% {
cluster_assoc <- cluster_associations(clust, association_data)
if (!is.null(cluster_assoc)) {
cluster_assoc <- data.frame(as.data.frame(clust)[1,by], cluster_assoc)
}
cluster_assoc
},
finally = close_parallel_cluster(parallel_clust)
)
return(out)
}
#' Gene module score
#'
#' Metric based on gene module eigengene correlations.
#'
#' @param x A \code{data.frame} with columns "id" and "cluster".
#' @param module_eigs Gene module eigengenes for each sample (samples x modules).
#' @param module_cor_threshold Absolute correlation threshold above which a
#' cluster is counted as correlated with a module.
#' @param module_nan.substitute Value substituted for a module's score when no
#' clusters are correlated with it (avoids division by zero).
#' @param ... Extra arguments are ignored.
#'
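#' @return A single numeric value: the average over modules of a score that is
#'   highest when a module's eigengene correlates (beyond
#'   \code{module_cor_threshold}) with exactly one cluster per correlation sign,
#'   and \code{module_nan.substitute} when it correlates with none.
#'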
#' @export
#' @examples library(COPS)
#'
#' # Generate module eigen genes with WGCNA
#' gene_correlation <- cor(t(ad_ge_micro_zscore), method = "spearman")
#' adj <- abs(gene_correlation)^2
#' TOM <- unsigned_TOM_matrix(adj)
#' geneTree <- flashClust::flashClust(as.dist(1 - TOM), method="average")
#' dynamicMods <- dynamicTreeCut::cutreeDynamic(
#' dendro = geneTree,
#' method="tree",
#' minClusterSize = 20
#' )
#'
#' MEs <- gene_module_eigengenes(
#' ad_ge_micro_zscore[dynamicMods != 0,],
#' dynamicMods[dynamicMods != 0]
#' )
#'
#' # Compute the module score for a given clustering result
#' clust <- cutree(
#' hclust(
#' as.dist(1-cor(ad_ge_micro_zscore, method = "spearman")),
#' method = "average"
#' ),
#' k = 3
#' )
#' clust <- data.frame(id = names(clust), cluster = clust)
#'
#' score <- gene_module_score(clust, MEs)
#'
#' # Within full pipeline
#' res <- COPS(ad_ge_micro_zscore,
#' association_data = ad_studies,
#' parallel = 1, nruns = 2, nfolds = 5,
#' dimred_methods = c("pca", "umap"),
#' cluster_methods = c("hierarchical", "kmeans"),
#' distance_metric = "euclidean",
#' n_clusters = 2:4,
#' module_eigs = MEs)
#'
#' scores <- scoring(res, wsum = Silhouette - Module_score, summarise = TRUE)
gene_module_score <- function(
x,
module_eigs,
module_cor_threshold = 0.3,
module_nan.substitute = 0,
...
) {
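# Correlate each module eigengene (column of module_eigs) with the indicator
# vector of each cluster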
clust_cor <- lapply(
as.data.frame(module_eigs[x$id,]),
function(a) sapply(
unique(x$cluster),
function(b) cor(a, x$cluster == b)
)
)
clust_cor_mat <- Reduce("rbind", clust_cor)
clust_cor_mat_pos <- clust_cor_mat > module_cor_threshold
clust_cor_mat_neg <- clust_cor_mat < -module_cor_threshold
score <- apply(clust_cor_mat_pos, 1, function(a) min(1, sum(a)))
score <- score + apply(clust_cor_mat_neg, 1, function(a) min(1, sum(a)))
score <- score / apply(clust_cor_mat_pos + clust_cor_mat_neg, 1, sum)
score[is.nan(score)] <- module_nan.substitute
score <- mean(score, na.rm = TRUE)
return(score)
}
#' Gene module correlation score for multiple clustering results
#'
#' Runs \code{\link{gene_module_score}} in parallel.
#'
#' Assumes that a parallel backend has been registered for \code{foreach}.
#'
#' @param clusters A data.table or data.frame with clustering information.
#' @param by Column names that identify a single clustering result.
#' @param parallel Number of parallel threads.
#' @param ... Extra arguments are passed to \code{\link{gene_module_score}}.
#'
#' @return \code{data.table} with a \code{Module_score} for each clustering result
#' @export
subsample_module_evaluation <- function(
clusters,
by = c("run", "fold", "datname", "drname", "k", "m"),
parallel = 1,
...
) {
clust_list <- split_by_safe(clusters, by)
parallel_clust <- setup_parallelization(parallel)
out <- tryCatch(
foreach(
clust = clust_list,
.combine = function(...) data.table::rbindlist(list(...)),
#.export = c("gene_module_score"),
#.packages = c(),
.multicombine = TRUE,
.maxcombine = max(length(clust_list), 2)
) %dopar% {
gm_score <- gene_module_score(x = clust, ...)
gm_score <- data.frame(
as.data.frame(clust)[1,by],
Module_score = gm_score
)
gm_score
},
finally = close_parallel_cluster(parallel_clust)
)
return(out)
}