#' @name benchmark.clustering
#' @aliases benchmark.clustering
#'
#' @title Benchmarks the cluster assignments
#'
#' @description Calculates supervised (ARI and NMI) and unsupervised (ASW, Dunn Index, and Connectivity) benchmarking metrics for cluster assignments. The supplied assays, clustering data frames, and reductions used for the distance calculations are iterated through.
#'
#' @param object IBRAP S4 class object
#' @param assay Character. String(s) indicating which assay(s) to use
#' @param clustering Character. The names of the cluster assignment dataframes to use
#' @param reduction Character. Which reduction(s) within the assay should be supplied for distance calculations
#' @param n.dims Numerical. How many dimensions of the reduction should be used for the distance calculations; either a single number or a range (e.g. 1:2). Default = 2
#' @param dist.method Character. Which distance method should be used, options: 'euclidean', 'maximum', 'manhattan', 'canberra', 'binary', 'minkowski'. Default = 'euclidean'
#' @param ground.truth.column Character. If available, supply the column in the object metadata that contains ground truth labels, i.e. true cell type labels. If this is not supplied, only unsupervised metrics will be calculated. Default = NULL
#' @param threads Numerical. How many threads to use when computing the distance matrix (passed to parallelDist::parDist). Default = 1
#' @param verbose Logical. Should system information be printed. Default = FALSE
#' @param seed Numeric. What should the seed be set as. Default = 1234
#'
#' @return IBRAP S4 class object containing the benchmarking scores for the supplied cluster assignments in each assay's benchmark_results slot
#'
#' @examples
#'
#' # without ground truth labels
#'
#' object <- benchmark.clustering(object = object, assay = c('SCT', 'SCRAN', 'SCANPY'),
#' clustering = c("pca_harmony_nn:louvain"),
#' reduction = c('pca_harmony_umap'),
#' n.dims = 1:2)
#'
#' # With ground truth labels
#'
#' object <- benchmark.clustering(object = object, assay = c('SCT', 'SCRAN', 'SCANPY'),
#' clustering = c("pca_harmony_nn.v1:louvain"),
#' reduction = c('pca_harmony_umap'),
#' n.dims = 1:2, ground.truth.column = 'celltypes')
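#'
#' # A sketch of how the stored scores could be inspected afterwards, assuming the
#' # assay and clustering names used above; results are written to the assay's
#' # benchmark_results slot under '<clustering>_benchmarked':
#'
#' # object@methods$SCT@benchmark_results$clustering[['pca_harmony_nn.v1:louvain_benchmarked']]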
#'
#' @export
benchmark.clustering <- function(object,
assay,
clustering,
reduction,
n.dims = 2,
dist.method='euclidean',
ground.truth.column=NULL,
threads=1,
verbose=FALSE,
seed=1234) {
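# check that the supplied object, assays, clusterings and reductions exist before any computation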
if(!is(object = object, class2 = 'IBRAP')) {
stop('object must be of class IBRAP \n')
}
if(!is.character(assay)) {
stop('assay must be character string(s) \n')
}
for(x in assay) {
if(!x %in% names(object@methods)) {
stop(paste0('assay: ', x, ' does not exist\n'))
}
}
if(!is.character(clustering)) {
stop('clustering must be character string(s)\n')
}
for(l in assay) {
for(x in clustering) {
if(!x %in% names(object@methods[[l]]@cluster_assignments)) {
stop(paste0('clustering: ', x, ' is not present in assay: ', l, '\n'))
}
}
for(x in reduction) {
for(i in assay) {
if(!x %in% names(c(object@methods[[i]]@computational_reductions,
object@methods[[i]]@visualisation_reductions,
object@methods[[i]]@integration_reductions))) {
stop(paste0('reduction: ', x, ' does not exist\n'))
}
}
}
if(!is.numeric(n.dims)) {
stop('n.dims must be numerical\n')
}
if(!is.character(dist.method)) {
stop('dist.method must be a character string\n')
}
reduction.list <- list()
red.names <- c(names(object@methods[[l]]@computational_reductions),
names(object@methods[[l]]@integration_reductions),
names(object@methods[[l]]@visualisation_reductions))
for(i in red.names) {
if(i %in% names(object@methods[[l]]@computational_reductions)) {
reduction.list[[i]] <- object@methods[[l]]@computational_reductions[[i]]
}
if(i %in% names(object@methods[[l]]@integration_reductions)) {
reduction.list[[i]] <- object@methods[[l]]@integration_reductions[[i]]
}
if(i %in% names(object@methods[[l]]@visualisation_reductions)) {
reduction.list[[i]] <- object@methods[[l]]@visualisation_reductions[[i]]
}
}
for(r in reduction) {
if(!r %in% names(reduction.list)) {
stop('reductions could not be found\n')
}
}
if(!is.logical(verbose)) {
stop('verbose should be logical, TRUE/FALSE \n')
}
if(!is.numeric(seed)) {
stop('seed should be numerical \n')
}
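# seed both the R and the Python (reticulate) random number generators for reproducibility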
set.seed(seed = seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
reticulate::py_set_seed(seed, disable_hash_randomization = TRUE)
count <- 1
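# keep only the requested reductions, in the order they were supplied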
reduction.list <- reduction.list[reduction]
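# each clustering data frame is benchmarked against the reduction supplied at the same position in 'reduction'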
for(k in clustering) {
if(isTRUE(verbose)) {
cat(crayon::cyan(paste0(Sys.time(), ': benchmarking for assay: ', l, ' cluster dataframe: ', k, '\n')))
}
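# subset the matched reduction to the requested number of dimensions for the distance calculation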
reduction_sub <- reduction.list[[reduction[count]]][,1:max(n.dims)]
count <- count + 1
clusters <- object@methods[[l]]@cluster_assignments[[k]]
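# drop cluster columns containing a single group, the metrics below are undefined for one cluster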
for(p in colnames(clusters)) {
if(length(unique(clusters[,p])) <= 1) {
if(isTRUE(verbose)) {
cat(crayon::cyan(paste0(Sys.time(), ': cluster column: ', p, ' contains only 1 cluster group, omitting now\n')))
}
clusters[,p] <- NULL
}
}
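# pairwise distance matrix in the reduced space, shared by all unsupervised metrics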
dist.matrix <- parallelDist::parDist(x = reduction_sub, method = dist.method, threads = threads)
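# average silhouette width (ASW): mean silhouette across cells, ranges from -1 to 1, higher is better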
sil.results <- data.frame(average_silhouette=NA)
for (v in colnames(clusters)[1:length(colnames(clusters))]) {
if(isTRUE(verbose)) {
cat(crayon::cyan(paste0(Sys.time(), ': calculating silhouette for ', v, '\n')))
}
tmp <- cluster::silhouette(x = as.numeric(x = as.factor(x = clusters[,v])), dist = dist.matrix)
average <- sum(tmp[,3])/length(tmp[,3])
sil.results[v,] <- average
}
sil.results <- sil.results[complete.cases(sil.results),]
max.AS <- max(sil.results)
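# Dunn index: smallest between-cluster distance divided by the largest within-cluster diameter, higher is better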
dunn.results <- data.frame(dunn.index=NA)
for (p in colnames(clusters)[1:length(colnames(clusters))]){
if(isTRUE(verbose)) {
cat(crayon::cyan(paste0(Sys.time(), ': calculating dunn index for ', p, '\n')))
}
dunn.results[p,] <- clValid::dunn(distance = dist.matrix,
clusters = as.numeric(x = as.factor(x = clusters[,p])))
}
dunn.results <- dunn.results[complete.cases(dunn.results),]
max.dunn <- max(dunn.results)
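# connectivity: degree to which nearest neighbours share a cluster assignment, lower is better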
conn.results <- data.frame(connectivity = NA)
for (p in colnames(clusters)[1:length(colnames(clusters))]) {
if(isTRUE(verbose)) {
cat(crayon::cyan(paste0(Sys.time(), ": calculating connectivity for ", p, "\n")))
}
conn.results[p, ] <- clValid::connectivity(distance = dist.matrix, clusters = as.numeric(x = as.factor(x = clusters[, p])))
}
conn.results <- conn.results[complete.cases(conn.results),]
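# record the number of clusters produced by each assignment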
cluster.results <- data.frame(N_CLUSTERS=NA)
for (p in colnames(clusters)[1:length(colnames(clusters))]){
if(isTRUE(verbose)) {
cat(crayon::cyan(paste0(Sys.time(), ': counting clusters for ', p, '\n')))
}
cluster.results[p,] <- length(unique(clusters[,p]))
}
cluster.results <- cluster.results[complete.cases(cluster.results),]
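# supervised metrics (ARI and NMI) are only calculated when ground truth labels are supplied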
if(!is.null(ground.truth.column)) {
ground.truth <- object@sample_metadata[,ground.truth.column]
ARI.results <- data.frame(ARI=NA)
for (p in colnames(clusters)[1:length(colnames(clusters))]){
if(isTRUE(verbose)) {
cat(crayon::cyan(paste0(Sys.time(), ': calculating ARI for ', p, '\n')))
}
ARI.results[p,] <- mclust::adjustedRandIndex(x = clusters[,p], y = ground.truth)
}
ARI.results <- ARI.results[complete.cases(ARI.results),]
NMI.results <- data.frame(NMI=NA)
for (p in colnames(clusters)[1:length(colnames(clusters))]) {
if(isTRUE(verbose)) {
cat(crayon::cyan(paste0(Sys.time(), ': calculating NMI for ', p, '\n')))
}
NMI.results[p,] <- aricode::NMI(c1 = clusters[,p], c2 = ground.truth)
}
NMI.results <- NMI.results[complete.cases(NMI.results),]
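# combine the metrics into one data frame and store it under '<clustering>_benchmarked' in the assay's benchmark_results slot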
results <- cbind(sil.results, dunn.results, conn.results, ARI.results, NMI.results,cluster.results)
rownames(results) <- colnames(clusters)
colnames(results) <- c('ASW',
'DUNN_INDEX',
'CONNECTIVITY',
'ARI',
'NMI',
'N_CLUSTERS')
object@methods[[l]]@benchmark_results$clustering[[paste0(k,'_benchmarked')]] <- as.data.frame(results)
} else {
results <- cbind(sil.results, dunn.results, conn.results,cluster.results)
rownames(results) <- colnames(clusters)
colnames(results) <- c('ASW', 'DUNN_INDEX', 'CONNECTIVITY','N_CLUSTERS')
object@methods[[l]]@benchmark_results$clustering[[paste0(k,'_benchmarked')]] <- as.data.frame(results)
}
}
}
if(isTRUE(verbose)) {
cat(crayon::cyan(paste0(Sys.time(), ': completed calculating benchmarking metrics \n')))
}
return(object)
}