####################
# Authorship of each function:
#
# Jessica N. Torres: runAllIterations, runForeachLoop, combineForeachOutput, getFinalPartition, calculatePAC, PlotCDF, PlotTracking
# Marcos M. Prunello: selectSubsample, connectivityVector, vectToMatr, PlotHeatmaps, ConsensusStatsAndPlots, consensusClustering
####################
## Fix for NOTEs from R CMD check
if(getRversion() >= "2.15.1") {
utils::globalVariables(c('K_Cl', 'ItemClusterBelongs', 'consensus', 'k', 'delta', 'idx', 'i'))
}
#' The consensusClustering function
#'
#' Runs Consensus Clustering for class discovery and clustering validation.
#'
#' @param dataMatrix matrix or data frame with data to cluster, samples/items in the columns and features in the rows.
#' @param K vector of integers representing numbeer of clusters to evaluate. It can be of length 1 and it does not need to consist of consecutive integers. For example, either of \code{K = 4}, \code{K = 2:5} or \code{K = c(5, 10, 15)} would work.
#' @param nIters number of iterations (bootstrap samples).
#' @param propSamples proportion of items to sample in each bootstrap sample.
#' @param clusterAlgorithm algorithm to perform the clustering, for the moment only K-means is available.
#' @param verbose logical, print progress messages to screen. During the bootstrap iterations, a report to monitor the progress is created in \code{pathOutput}.
#' @param seed numerical value to set random seed for reproducible results. It uses \code{doRNG} package to guarantee reproducible results even when running in parallel.
#' @param saveResults logical indicating if the output should be saved as an .rds file in the directory \code{pathOutput}.
#' @param pathOutput directory for output files and iterations progress report, the current working directory by default.
#' @param finalLinkage heirarchical linkage method for producing a final classification with the consensus indexes generated by the bootstrap samples.
#' @param PACLowerLim lower limit for the interval of ambiguous clustering used for calculating PAC score, belongs to the interval (0, 1).
#' @param PACUpperLim upper limit for the interval of ambiguous clustering used for calculating PAC score, belongs to the interval (0, 1).
#' @param plotHeatmaps character string indicating which heatmaps should be produced: "consensus" (only heatmap of the consensus indexes), "data" (only heatmap of input data set), "both" (default), or "no" (no plot is produced).
#' @param plotSave character string indicating the format the plot to be saved as files in directory \code{pathOutput}. Default is "no", the plots are not saved, but printed to the screen. Other options are: "pdf", "bmp", "png", "ps".
#' @param showDendrogram logical indicating if dendrograms should be plotted in the heatmaps (defaults to TRUE).
#' @param showSamplesNames logical indicating if sample names should be displayed in the plots (defaults to TRUE).
#' @param showFeaturesNames logical indicating if features names should be displayed in the plots (defaults to TRUE).
#' @param plotCDF logical indicating if the plot for the Cumulative Distribution Function (CDF) of the consensus indexes and for the relative change under the CDF should be produced. The second plot is not produced if \code{length(K) == 1}, since there is no comparison to be made. If \code{plotCDF == TRUE}, a vector with the area under the CDF curve for each K is returned.
#' @param plotTracking logical indicating if the plot with the tracking of samples through different values of K should be produced. No tracking plot is produced if \code{length(K) == 1}, since there is no tracking to be done.
#' @param consensusStats logical indicating if consensus statistics should be computed.
#' @param consensusStatsPlots logical indicating if plots of consensus statistics should be produced (only considered if \code{consensusStats == TRUE}).
#'
#' @details
#' Consensus Clustering is a revised tool for implementing the methodology for class discovery and clustering validation, based off of 2003 Monti's paper.
#' This method is used to find a consensus assignment across multiple runs of a clustering approach, allowing one to assess and
#' validate the stability of the discovered clusters empirically. The objective of this method is to identify robust clusters
#' in the context of genomic data, but is applicable for any unsupervised learning task.
#'
#' This function is parallelizad under the unifying paradigm, so it will automatically detect clusters or cores registered by the user before hand or will run sequentially if no parallel capabilities are available. Reproducible results are guaranteed when running in parallel if a \code{seed} is provided, through the use of the \code{doRNG} package.
#'
#' @return A list with the results of the consensus clustering. The first elements of the list correspond to each value of K evaluated, each
#' one containing:
#' \itemize{
#' \item \code{consensusTree}: final heirarchical tree based on the matrix of consensus indexes after running all the iterations of the clustering.
#' \item \code{consensusClass}: vector with the final cluster assignment for each sample.
#' \item \code{consensusVector}: vector of consensus indexes for each pair of samples. The consensus index is the proportion of times that a pair of samples
#' was clustered together in the same group, out of the total number times they were in the same bootstrap sample.
#' }
#' The following elements of the list returned are:
#' \itemize{
#' \item \code{PACscores}: vector with the Proportion of Ambiguous Clustering score (PAC) for each value of K evaluated. The PAC score
#' is the fraction of sample pairs with consensus index values falling in the intermediate sub-interval (PACLowerLim, PACUpperLim), by default (0.1, 0.9).
#' Lower PAC score is indicative of robustness.
#' \item \code{colorTracking}: list with clusters and samples color assignment for each K. If \code{length(K) > 1} colors are assigned tracking samples
#' across different values of K, letting the user track the history of clusters relative to earlier clusters.
#' }
#' The list returned may also include the following elements if the correspondent arguments were set to TRUE:
#' \itemize{
#' \item \code{areaUnderCDF}: if \code{plotCDF == TRUE}, vector with area under the CDF curve for each K, see \link{PlotCDF}.
#' \item \code{consensusStats}: if \code{consensusStats == TRUE}, list with consensus statistics, see \link{ConsensusStatsAndPlots}.
#' }
#'
#' @keywords robust_clustering consensus unsupervised_learning class_discovery
#' @export
#' @examples
#' mat <- matrix(rnorm(10*6), 10, 6)
#' result <- consensusClustering(mat, nIters = 5)
#'
consensusClustering <- function(dataMatrix,
K = 2:3,
nIters = 30,
propSamples = 0.8,
clusterAlgorithm = "Kmeans",
verbose = TRUE,
seed = NULL,
saveResults = FALSE,
pathOutput = "",
finalLinkage = "average",
PACLowerLim = 0.1,
PACUpperLim = 0.9,
plotHeatmaps = c("both", "consensus", "data", "no"),
plotSave = c("no", "pdf", "bmp", "png", "ps"),
showDendrogram = TRUE,
showSamplesNames = TRUE,
showFeaturesNames = TRUE,
plotCDF = TRUE,
plotTracking = TRUE,
consensusStats = TRUE,
consensusStatsPlots = TRUE) {
# Step 0: validating arguments
stopifnot(
class(dataMatrix) %in% c("matrix", "data.frame"),
all(sapply(c(verbose, saveResults, showDendrogram, showSamplesNames, showFeaturesNames, plotCDF, plotTracking, consensusStats, consensusStatsPlots), class) == "logical"),
all(sapply(c(clusterAlgorithm, pathOutput, finalLinkage, plotHeatmaps, plotSave), class) == "character"),
all(sapply(c(K, nIters, propSamples, PACLowerLim, PACUpperLim), class) == "numeric"),
all(K > 1),
length(nIters) == 1 && nIters > 0,
length(propSamples) == 1 && propSamples >= 0 && propSamples <= 1,
length(PACLowerLim) == 1 && length(PACUpperLim) == 1 && all(c(PACLowerLim, PACUpperLim) >= 0) && all(c(PACLowerLim, PACUpperLim) <= 1),
all(plotHeatmaps %in% c("both", "consensus", "data", "no")),
all(plotSave %in% c("no", "pdf", "bmp", "png", "ps"))
)
# Step 1: run all consensus clustering iterations and get consensus indexes
consensusVector <- runAllIterations(dataMatrix, sort(K), nIters, propSamples, clusterAlgorithm, pathOutput, verbose, seed)
# Step 2: use consensus indexes to generate final partition for each k
results <- getFinalPartition(consensusVector, sort(K), ncol(dataMatrix), colnames(dataMatrix), finalLinkage, pathOutput, verbose)
# Step 3: calculate PAC scores
results$PACscores <- calculatePAC(results, PACLowerLim, PACUpperLim)
# Step 4: do color assignment to clusters and samples tracking samples across different values of K
results$colorTracking <- trackColors(results)
# Step 5: make heatmaps
plotHeatmaps <- match.arg(plotHeatmaps)
if (plotHeatmaps != "no") {
PlotHeatmaps(results, plotHeatmaps, plotSave, pathOutput, showDendrogram, showSamplesNames, showFeaturesNames, dataMatrix)
}
# Step 6: plot Cumlative Distribution Function of consensus indexes
if (plotCDF) results$areaUnderCDF <- PlotCDF(results, plotSave, pathOutput, PACLowerLim, PACUpperLim)
# Step 7: tracking plot (only if we have more than one value of K to compare)
if (plotTracking) PlotTracking(results, showSamplesNames, plotSave, pathOutput)
# Step 8: calculate consensus statistics and plots
if (consensusStats) results$consensusStats <- ConsensusStatsAndPlots(results, consensusStatsPlots, plotSave, pathOutput)
# Step 9: save results
if (saveResults) saveRDS(results, file = paste0(pathOutput, "results.rds"))
return(results)
}
#' The runAllIterations function
#'
#' Internal. Runs all the iterations of consensus clustering.
#' @param dataMatrix matrix or data frame with data to cluster, samples/items in the columns and features in the rows.
#' @param K vector of integers representing numbeer of clusters to evaluate. It can be of length 1 and it does not need to consist of consecutive integers. For example, either of \code{K = 4}, \code{K = 2:5} or \code{K = c(5, 10, 15)} would work.
#' @param nIters number of iterations (bootstrap samples).
#' @param propSamples proportion of items to sample in each bootstrap sample.
#' @param clusterAlgorithm algorithm to perform the clustering, for the moment only K-means is available.
#' @param pathOutput directory for output files and iterations progress report, the current working directory by default.
#' @param verbose logical, print progress messages to screen. During the bootstrap iterations, a report to monitor the progress is created in \code{pathOutput}.
#' @param seed numerical value to set random seed for reproducible results. It uses \code{doRNG} package to guarantee reproducible results even when running in parallel.
#' @return Vector with consensus index for each pair of samples.
#' @keywords internal
#'
runAllIterations <- function(dataMatrix, K, nIters, propSamples, clusterAlgorithm, pathOutput, verbose, seed) {
# Step 1: Perform all the iterations of the consensus clustering
if (verbose) {
# File for monitoring progress (can't cat or print inside foreach loops)
writeLines(c(""), paste0(pathOutput, "progress.txt"))
sink(paste0(pathOutput, "progress.txt"), append = TRUE)
}
tmpResults <- runForeachLoop(nIters, propSamples, dataMatrix, K, clusterAlgorithm, verbose, seed)
if (verbose) sink()
jointOccurrenceVector <- tmpResults[[1]]
consensusVector <- tmpResults[[2]]
# Step 2: calculate consensus indexes
consensusVector <- lapply(consensusVector, function(x) {
x <- x / jointOccurrenceVector
x[jointOccurrenceVector == 0] <- 0 # handle NaN is the pair was never seen together
x
})
return(consensusVector)
}
#' The runForeachLoop function
#'
#' Internal. Wrapper function to call foreach, to keep the code tidy.
#' @param nIters number of iterations (bootstrap samples).
#' @param propSamples proportion of items to sample in each bootstrap sample.
#' @param dataMatrix matrix or data frame with data to cluster, samples/items in the columns and features in the rows.
#' @param K vector of integers representing numbeer of clusters to evaluate. It can be of length 1 and it does not need to consist of consecutive integers. For example, either of \code{K = 4}, \code{K = 2:5} or \code{K = c(5, 10, 15)} would work.
#' @param clusterAlgorithm algorithm to perform the clustering, for the moment only K-means is available.
#' @param verbose logical, print progress messages to screen. During the bootstrap iterations, a report to monitor the progress is created in \code{pathOutput}.
#' @param seed numerical value to set random seed for reproducible results. It uses \code{doRNG} package to guarantee reproducible results even when running in parallel.
#' @return A list with the jointOccurrenceVector and consensusVector.
#' @importFrom doRNG %dorng%
#' @keywords internal
#'
runForeachLoop <- function(nIters, propSamples, dataMatrix, K, clusterAlgorithm, verbose, seed) {
n <- ncol(dataMatrix)
if (!is.null(seed)) set.seed(seed)
tmpResults <- foreach::foreach (i = seq(nIters), .combine="combineForeachOutput", .inorder = FALSE, .export = c("selectSubsample", "connectivityVector")) %dorng% {
if (verbose) cat("Starting random subsample", i, "\n")
# Get bootstrap subsample
selectedItems <- selectSubsample(n, propSamples)
subDataMatrix <- dataMatrix[, selectedItems$selectedSamples]
# jointOccurrenceVector is a vector with an element for each pair of samples, and it accumulates the number of bootstrap samples where a particular pair was cluster together
# connectivityVector is designed to take as input a current jointOccurrenceVector and update it with a new bootstrap samples adding 1s where necessary
# But since we are running in parallel and the combination is done later, the input is always a vector of zeros.
# consensusVector has a vector for each K with the number of times each pair was clustered in the same cluster
# Similarly, connectivityVector can update it, but here we just start with zero vectors in each iteration and combine the results later
jointOccurrenceVector <- connectivityVector(selectedItems$selectedSamples, rep(1, n * propSamples), numeric(n * (n + 1) / 2), n)
consensusVector <- rep(list(numeric(n*(n + 1)/2)), length(K))
# Evaluate each k in K
for (idx in 1:length(K)) {
k <- K[idx]
if (verbose) cat(" subsample", i, "; k =", k, "\n")
if (clusterAlgorithm == "Kmeans") {
clusterResult <- kmeans(t(subDataMatrix), k)$cluster
} else {
stop("Invalid clusterAlgorithm argument.")
}
# Update count of pairs clustered together
consensusVector[[idx]] <- connectivityVector(selectedItems$selectedSamples, clusterResult, consensusVector[[idx]], n)
}
# Return objects
list(jointOccurrenceVector, consensusVector)
}
return(list(jointOccurrenceVector = tmpResults[[1]], consensusVector = tmpResults[[2]]))
}
#' The selectSubsample function
#'
#' Internal. Select the samples for each bootstrap iteration.
#' Although it is only one line, we put this task in its own function to easily add more functionality in the future, like also subsetting the features, or giving different weights for the samples
#' @param nSamples number of total items
#' @param propSamples proportion of items to be sampled
#' @return A list in with a vector with the indices of selected samples.
#' @keywords internal
#'
selectSubsample <- function(nSamples, propSamples) {
selectedSamples <- sort(sample(nSamples, propSamples * nSamples))
return(list(selectedSamples = selectedSamples)) # A list to add more things in the future
}
#' The connectivityVector function
#'
#' Internal. Given a vector indicating the samples involved (samplesID) and a vector indicating to which group they belong (groupID),
#' this function adds 1 to the positions in connectVector corresponding to pair of samples that belong to the same group.
#' We use this to: 1) update jointOccurrenceVector adding 1 to the positions corresponding to each pair of samples present in the current bootstrap subsample, to indicate that they have the chance to be clustered together later
#' 2) update consensusVector adding 1 to the positions corresponding to pairs of samples that were clustered together
#' @param sampleID vector indicating which are the samples being considered
#' @param groupID vector with group indexes. Positions corresponding to pair of samples present in sampleID with the same groupID will be increased in 1.
#' @param connectVector Current vector of counts to be updated
#' @param n total number of samples
#' @return Updated vector of counts
#' @keywords internal
#'
connectivityVector <- function(sampleID, groupID, connectVector, n){
names(groupID) <- sampleID
listID <- lapply(unique(groupID), function(x) sampleID[groupID == x]) # list with samples in each groupID
for (i in seq(listID)) {
# All the pairs of samples in the same group (also including as a pair a sample with itself)
if (length(listID[[i]]) > 1) {
pairsID <- rbind(t(combn(listID[[i]], 2)), matrix(rep(listID[[i]], 2), ncol = 2))
} else {
pairsID <- matrix(rep(listID[[i]], 2), ncol = 2)
}
# Find corresponding position for each pair in connectVector
pairsPos <- n * (pairsID[, 1] - 1) + pairsID[, 2] - sapply(pairsID[, 1], function(x) sum(seq(x-1)) * (x>1))
# Update
connectVector[pairsPos] <- connectVector[pairsPos] + 1
}
return(connectVector)
}
#' The combineForeachOutput function
#'
#' Internal. Function to combine output from foreach iterations.
#' @param a output from one foreach loop
#' @param b output from another foreach loop
#' @keywords internal
#'
combineForeachOutput <- function(a, b) {
res1 <- a[[1]] + b[[1]]
len <- length(a[[2]])
res2 <- vector(mode = "list", len)
for (j in seq(len)) {
res2[[j]] <- a[[2]][[j]] + b[[2]][[j]]
}
return(list(res1, res2))
}
#' The getFinalPartition function
#'
#' Internal. Takes the consensus vector for each K and performs hierachical clustering to get the final k clusters.
#' @param consensusVector list a vector of counts of how many times each pair of samples have been clustered together so far, for each value of K.
#' @param K vector of integers representing numbeer of clusters to evaluate.
#' @param n number of samples in input data matrix
#' @param sampleNames vector with samples names (colnames of input data)
#' @param finalLinkage heirarchical linkage method for producing a final classification with the consensus indexes
#' @param pathOutput directory for output files
#' @param verbose logical, print progress messages to screen
#' @return A list with one element for each value of K, each containing:
#' \itemize{
#' \item \code{consensusTree}: final heirarchical tree based on the matrix of consensus indexes after running all the iterations of the clustering.
#' \item \code{consensusClass}: vector with the final cluster assignment for each sample.
#' \item \code{consensusVector}: vector of consensus indexes for each pair of samples. The consensus index is the proportion of times that a pair of samples
#' was clustered together in the same group, out of the total number times they were in the same bootstrap sample.
#' }
#' @importFrom foreach %dopar%
#' @keywords internal
#'
getFinalPartition <- function(consensusVector, K, n, sampleNames, finalLinkage, pathOutput, verbose) {
lengthK <- length(K)
if (verbose) sink(paste0(pathOutput, "progress.txt"), append = TRUE) # writing in progress file because foreach does not cat or print
results <- foreach::foreach(idx = 1:length(K), .export = c("vectToMatr")) %dopar% {
k <- K[idx]
if (verbose) cat("Getting final partition for k =", k, ".\n")
distMatrix <- as.dist(1 - vectToMatr(consensusVector[[idx]], n)) # Create distance matrix from consensus indexes
consensusTree <- hclust(distMatrix, method = finalLinkage)
consensusClass <- cutree(consensusTree, k)
names(consensusClass) <- sampleNames
list(consensusTree = consensusTree, consensusClass = consensusClass, consensusVector = consensusVector[[idx]])
}
if (verbose) sink()
names(results) <- paste0("K_", K)
return(results)
}
#' The vectToMatr function
#'
#' Internal. Takes a vector and converts it into a symmetric matrix.
#' @param v input vector.
#' @param N number of columns in output matrix.
#' @return A symmetric matrix.
#' @keywords internal
#'
vectToMatr <- function(v, N) {
m <- matrix(NA, N, N)
m[lower.tri(m, diag = T)] <- v
m <- t(m)
m[lower.tri(m, diag = T)] <- v
return(m)
}
#' The calculatePAC function
#'
#' Internal. Calculates the Proportion of Ambiguous Clustering (PAC) for each K, according to Senbabaogly (2014).
#' @param results output from getFinalPartition function.
#' @param lowerLim lower limit for the interval of ambiguous clustering used for calculating PAC score, belongs to the interval (0, 1).
#' @param upperLim upper limit for the interval of ambiguous clustering used for calculating PAC score, belongs to the interval (0, 1).
#' @return A data frame with PAC scores for each K.
#' @details
#' The CDF plot shows the cumulative distribution functions of the consensus indexes for all pairs of samples for each k (indicated by colors). The empirical CDF plot holds the cumulative distribution function (CDF) values on the y and the consensus index values on the x-axis. In the CDF curve, the lower left portion represents sample pairs rarely clustered together, the upper right portion represents those almost always clustered together, whereas the middle portion represents those with occasional co-assignments in different clustering runs. The CDF curves show a flat middle segment for the true K, suggesting that very few sample pairs are ambiguous when K is correctly inferred.
#' The PAC score can be used to quantify this characteristic. The Proportion of Ambiguous Clustering (PAC) is the fraction of sample pairs that hold consensus index values within a given sub-interval (x1, x2) in [0,1] (usually, x1 = 0.1 and x2 = 0.9). The CDF values correspond to the fraction of sample pairs with a consensus index values less or equal to the value 'c'. The PAC is then calculated by CDF(x2) - CDF(x1), optimal K should present a low PAC score.
#' @references Senbabaoglu, Y et al (2014) Critical limitations of consensus clustering in class discovery. \emph{Scientific Reports}, \strong{4}, Article number 6207.
calculatePAC <- function(results, lowerLim = 0.1, upperLim = 0.9) {
K <- as.numeric(sapply(grep("K_", names(results), value = TRUE), function(x) unlist(strsplit(x, "_"))[2]))
consensusData <- data.frame(sapply(results[1:length(K)], function(x) x$consensusVector))
colnames(consensusData) <- K
PACscores <- apply(consensusData, 2, function(x) ecdf(x)(upperLim) - ecdf(x)(lowerLim))
names(PACscores) <- paste0("K_", K)
return(PACscores)
}
#' The trackColors function
#'
#' Internal. Wrapper function to call function \code{\link{setClusterColors}} from \code{ConsensusClusterPlus} package. Does color assignment to clusters and samples tracking samples across different values of K.
#' @param results output from getFinalPartition function.
#' @return A list with the colors assigned to each cluster and sample for each value of K. This colors are assigned tracking the samples trying to preserve same of groups of samples with the same color through different values of K.
#' @keywords internal
#'
trackColors <- function(results) {
K <- as.numeric(sapply(grep("K_", names(results), value = TRUE), function(x) unlist(strsplit(x, "_"))[2]))
# See how to replace this by something that get automatically k well distinguished colors, for K not limited, possibly very big
listColors <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928",
"#bd18ea", "#2ef4ca", "#f4cced", "#f4cc03", "#05188a", "#e5a25a", "#06f106", "#85848f", "#000000",
"#076f25", "#93cd7f", "#4d0776", "#ffffff", "yellow")
# For each k, assign colors for the clusters, keeping track of the partition of the samples through the different values of k
colorTracking <- vector("list", length(K))
names(colorTracking) <- paste0("K_", K)
for (idx in 1:length(K)) {
if (idx == 1) {
colorTracking[[idx]] <- setClusterColors(NULL, results[[idx]]$consensusClass, listColors, list(), K)
} else {
colorTracking[[idx]] <- setClusterColors(results[[idx - 1]]$consensusClass, results[[idx]]$consensusClass, listColors, colorTracking[[idx - 1]], K)
}
}
return(colorTracking)
}
#' The setClusterColors function
#'
#' Internal. Sets common color of clusters between different K. Taken from the package ConsensusClusteringPlus, we only modified some variable names.
#' @param previousConsensusClass vector of cluster classifications for the previous value of K.
#' @param consensusClass vector of cluster classifications for the current value of K.
#' @param listColors list of colors to be used.
#' @param colorTracking list for storing the colors assigned for the current value of K.
#' @param K vector of integers representing numbeer of clusters to evaluate, minimum 2.
#' @return A list with colors assignments for current value of K.
#' @keywords internal
#' @note Function taken from ConsensusClusterPlus package.
#' @references Wilkerson M and Hayes D (2010) ConsensusClusterPlus: a class discovery tool with confidence assessments and item tracking. \emph{Bioinformatics}, \strong{26}, 1572-1573.
#'
setClusterColors <- function(previousConsensusClass, consensusClass, listColors, colorTracking, K) {
newColors <- c()
if (length(colorTracking) == 0) {
# first value of K
newColors <- listColors[consensusClass]
colori <- K[1]
} else {
newColors <- rep(NULL, length(consensusClass))
colori <- colorTracking[[2]]
mo <- table(previousConsensusClass, consensusClass)
m <- mo / apply(mo, 1, sum)
for (tci in 1:ncol(m)) { # for each cluster
maxC <- max(m[, tci])
pci <- which(m[, tci] == maxC)
if (sum(m[, tci] == maxC) == 1 & max(m[pci, ]) == maxC & sum(m[pci, ] == maxC) == 1) {
# if new column maximum is unique, same cell is row maximum and is also unique
# Note: the greatest of the prior clusters' members are the greatest in a current cluster's members.
newColors[which(consensusClass == tci)] <- unique(colorTracking[[1]][which(previousConsensusClass == pci)]) # one value
} else { #add new color
colori <- colori + 1
newColors[which(consensusClass == tci)] <- listColors[colori]
}
}
}
return(list(newColors, colori, unique(newColors)))
}
#' The PlotHeatmaps function
#'
#' Plots heatmaps of the consensus indexes and original data matrix.
#' @param results output from consensusClustering function.
#' @param plotHeatmaps character string indicating which heatmaps should be produced: "consensus" (only heatmap of the consensus indexes), "data" (only heatmap of input data set), or "both" (default).
#' @param plotSave character string indicating the format the plot to be saved as files in directory \code{pathOutput}. Default is "no", the plots are not saved, but printed to the screen. Other options are: "pdf", "bmp", "png", "ps".
#' @param pathOutput directory for plot files
#' @param showDendrogram logical indicating if dendrograms should be plotted in the heatmaps (defaults to TRUE).
#' @param showSamplesNames logical indicating if sample names should be displayed in the plots (defaults to TRUE).
#' @param showFeaturesNames logical indicating if features names should be displayed in the plots (defaults to TRUE).
#' @param dataMatrix matrix with data to cluster, samples/items in the columns and features in the rows, only needed if plotHeatmaps is "data" or "both".
#' @keywords external
#' @export
#'
#' @details
#' The consensus heatmap displays the matrix of consensus indexes for all pairs of samples. The consensus index is the proportion of times that a pair of samples was clustered together in the same group, out of the total number times they were in the same bootstrap sample. We expect to observe high consensus indexes for samples that were clustered together and low for samples in finally assigned in different clusters. The Consensus heatmap is a visual aid allowing for the consensus clustering membership to be depicted in an easy to understand approach. Cluster membership are highlighted by the colored rectangles between the dendrogram (top) and heatmap according to the legend within the graphic. This layout is beneficial to the user to help compare a clusters' membership count in the context of their consensus. Ideally we aim for high consensus in the blocks along the diagonal (samples in the same cluster) and low consensus outside the diagonal.
#' @examples
#' mat <- matrix(rnorm(10*6), 10, 6)
#' result <- consensusClustering(mat, K=2:3, nIters = 5, plotHeatmaps = "no")
#' PlotHeatmaps(result, plotHeatmaps = "both", dataMatrix = mat)
#'
PlotHeatmaps <- function(results,
plotHeatmaps = c("both", "consensus", "data"),
plotSave = c("no", "pdf", "bmp", "png", "ps"),
pathOutput = "",
showDendrogram = TRUE,
showSamplesNames = TRUE,
showFeaturesNames = TRUE,
dataMatrix = NULL) {
stopifnot(
class(results) == "list",
all(sapply(c(showDendrogram, showSamplesNames, showFeaturesNames), class) == "logical"),
all(sapply(c(pathOutput, plotHeatmaps, plotSave), class) == "character"),
class(dataMatrix) %in% c("NULL", "matrix", "data.frame"),
all(plotHeatmaps %in% c("both", "consensus", "data")),
all(plotSave %in% c("no", "pdf", "bmp", "png", "ps"))
)
plotHeatmaps <- match.arg(plotHeatmaps)
K <- as.numeric(sapply(grep("K_", names(results), value = TRUE), function(x) unlist(strsplit(x, "_"))[2]))
# Initialize device if we save the plot
plotSave <- match.arg(plotSave)
if (plotSave == "pdf") {
pdf(onefile = TRUE, paste0(pathOutput, "Heatmaps.pdf"))
} else if (plotSave == "bmp") {
bitmap(paste0(pathOutput, "Heatmaps%03d.png"))
} else if (plotSave == "png") {
png(paste0(pathOutput, "Heatmaps%03d.png"), type="cairo")
} else if (plotSave == "ps") {
postscript(onefile = FALSE, paste0(pathOutput, "Heatmaps%03d.ps"))
}
# Heatmap for consensus indexes
if (plotHeatmaps == "consensus" || plotHeatmaps == "both") {
# Define colors
colHeatmapConsensus <- circlize::colorRamp2(0:1, c("white", "blue"))
for (k in K) {
K_k <- paste0("K_", k)
# Create symmetric consensus matrix
consensusMatrix <- vectToMatr(results[[K_k]]$consensusVector, length(results[[1]]$consensusClass))
# Provide sample names
if (showSamplesNames) colnames(consensusMatrix) = names(results[[K_k]]$consensusClass)
# Heatmap indicating clusters
clusterCols <- structure(results$colorTracking[[K_k]][[3]], names = unique(results[[K_k]]$consensusClass))
heatmapCluster <- ComplexHeatmap::Heatmap(results[[K_k]]$consensusClass,
name = "Cluster",
col = clusterCols,
cluster_rows = results[[K_k]]$consensusTree,
show_row_dend = FALSE,
show_row_names = FALSE,
width = grid::unit(0.5, "cm"))
clusterAnnotationOnColumns <- ComplexHeatmap::HeatmapAnnotation(df = data.frame(clID = results[[K_k]]$consensusClass),
col = list(clID = clusterCols),
show_legend = FALSE,
name = "colAnnot")
heatmapConsensus <- ComplexHeatmap::Heatmap(consensusMatrix,
col = colHeatmapConsensus,
name = "consensusMatrix",
heatmap_legend_param = list(title = "Consensus", color_bar = "continuous"),
column_title = "Samples",
column_title_side = "top",
row_title = "Samples",
row_title_side = "left",
cluster_columns = results[[K_k]]$consensusTree,
cluster_rows = results[[K_k]]$consensusTree,
show_row_names = FALSE,
show_column_names = showSamplesNames,
show_column_dend = showDendrogram,
show_row_dend = FALSE,
top_annotation = clusterAnnotationOnColumns)
ComplexHeatmap::draw(ComplexHeatmap::add_heatmap(heatmapCluster, heatmapConsensus),
row_title = "Samples",
gap = grid::unit(0.005, "npc"),
column_title = sprintf("Consensus Heatmap for K = %d", k))
# Separate clusters with lines
consClass <- results[[K_k]]$consensusClass[results[[K_k]]$consensusTree$order]
linesAt <- cumsum(table(consClass)[as.character(unique(consClass))])[-length(unique(consClass))] / length(consClass)
# vertical lines
ComplexHeatmap::decorate_heatmap_body("consensusMatrix", {
for (i in 1:length(linesAt)) {
grid::grid.lines(c(linesAt[i], linesAt[i]), c(0, 1), gp = grid::gpar(lty = 1, lwd = 2))
}
})
ComplexHeatmap::decorate_annotation("clID", {
for (i in 1:length(linesAt)) {
grid::grid.lines(c(linesAt[i], linesAt[i]), c(0, 1), gp = grid::gpar(lty = 1, lwd = 2))
}
})
# horizontal lines
linesAt = abs(1 - linesAt)
for (annot in c("consensusMatrix", "Cluster")) {
ComplexHeatmap::decorate_heatmap_body(annot, {
for (i in 1:length(linesAt)) {
grid::grid.lines(c(0, 1), c(linesAt[i], linesAt[i]), gp = grid::gpar(lty = 1, lwd = 2))
}
})
}
}
}
# Heatmap for original data, need to provide dataMatrix as argument for this function
if (plotHeatmaps == "data" || plotHeatmaps == "both") {
if (is.null(dataMatrix)) {
message("Heatmaps for the input data cannot be plot if dataMatrix is not provided.")
} else {
for (k in K) {
K_k <- paste0("K_", k)
clusterCols <- structure(results$colorTracking[[K_k]][[3]], names = unique(results[[K_k]]$consensusClass))
clusterAnnotationOnColumns <- ComplexHeatmap::HeatmapAnnotation(df = data.frame(clID = results[[K_k]]$consensusClass),
col = list(clID = clusterCols),
show_legend = FALSE,
name = "colAnnot")
heatmapData <- ComplexHeatmap::Heatmap(dataMatrix,
name = "heatmapData",
heatmap_legend_param = list(title = "Input Data", color_bar = "continuous"),
column_title = "Samples",
column_title_side = "top",
row_title = "Features",
row_dend_side = "left",
cluster_columns = results[[K_k]]$consensusTree,
cluster_rows = TRUE,
show_column_names = showSamplesNames,
show_column_dend = showDendrogram,
show_row_names = showFeaturesNames,
row_names_side = "right",
show_row_dend = showDendrogram,
top_annotation = clusterAnnotationOnColumns)
ComplexHeatmap::draw(heatmapData, column_title = sprintf("Heatmap for input data, K = %d", k))
# Separate clusters with lines
consClass <- results[[K_k]]$consensusClass[results[[K_k]]$consensusTree$order]
linesAt <- cumsum(table(consClass)[as.character(unique(consClass))])[-length(unique(consClass))] / length(consClass)
# vertical lines
ComplexHeatmap::decorate_heatmap_body("heatmapData", {
for (i in 1:length(linesAt)) {
grid::grid.lines(c(linesAt[i], linesAt[i]), c(0, 1), gp = grid::gpar(lty = 1, lwd = 2))
}
})
ComplexHeatmap::decorate_annotation("clID", {
for (i in 1:length(linesAt)) {
grid::grid.lines(c(linesAt[i], linesAt[i]), c(0, 1), gp = grid::gpar(lty = 1, lwd = 2))
}
})
}
}
}
# Close device
if (plotSave != "no") dev.off()
}
#' The PlotCDF function
#'
#' Plots the Cumulative Distribution Function (CDF) of the consensus indexes for assessment of the optimal number K,
#' while displaying PAC scores in the legend. Also plots the relative change under the CDF curve, only if
#' results were produced for more than one value of K, so that the comparison can be made.
#' @param results output from consensusClustering function.
#' @param plotSave character string indicating the format the plot to be saved as files in directory \code{pathOutput}. Default is "no", the plots are not saved, but printed to the screen. Other options are: "pdf", "bmp", "png", "ps".
#' @param pathOutput directory for saving plots if \code{plotSave == TRUE}, defaults to current working directory.
#' @param PACLowerLim lower limit for the interval of ambiguous clustering used for calculating PAC score.
#' @param PACUpperLim upper limit for the interval of ambiguous clustering used for calculating PAC score.
#' @return A vector with the area under the CDF curve for each value of K (note, not the relative change under the area).
#' @details
#' The \strong{CDF plot} shows the cumulative distribution functions of the consensus indexes for all pairs of samples for each k (indicated by colors). The empirical CDF plot holds the cumulative distribution function (CDF) values on the y and the consensus index values on the x-axis. In the CDF curve, the lower left portion represents sample pairs rarely clustered together, the upper right portion represents those almost always clustered together, whereas the middle portion represents those with occasional co-assignments in different clustering runs. Tthe CDF curves show a flat middle segment for the true K, suggesting that very few sample pairs are ambiguous when K is correctly inferred.
#'
#' The \strong{PAC score} can be used to quantify this characteristic. The Proportion of Ambiguous Clustering (PAC) is the fraction of sample pairs that hold consensus index values within a given sub-interval (x1, x2) in [0,1] (usually, x1 = 0.1 and x2 = 0.9). The CDF values correspond to the fraction of sample pairs with a consensus index values less or equal to the value 'c'. The PAC is then calculated by CDF(x2) - CDF(x1), optimal K should present a low PAC score.
#'
#' The difference between the two CDFs can be partially summarized by measuring the area under the two curves. The \strong{plot of the relative change in the area under the CDF curve} allows a user to determine the relative increase in consensus and determine the value of K at which there is no appreciable increase.
#'
#' The relative change in the area under the CDF curve for a given value \code{k} is defined as (area(k') - area(k))/area(k)
#' where k' is the consecutive value to \code{k} in the input vector \code{K}. For the first value in the vector \code{K},
#' the relative change is just the area under the curve since there is no previous value to compute the relative change. The vector \code{K} provided
#' by the user is always sorted internally, so even if it does not contain consecutive integers, the relative change under the CDF curve
#' between one value of \code{k} to the next one still can be meaningful.
#'
#' @keywords external
#' @export
#' @examples
#' mat <- matrix(rnorm(10*6), 10, 6)
#' result <- consensusClustering(mat, K=2:3, nIters = 5, plotCDF = FALSE)
#' PlotCDF(result)
#'
PlotCDF <- function(results,
plotSave = c("no", "pdf", "bmp", "png", "ps"),
pathOutput = "",
PACLowerLim = 0.1,
PACUpperLim = 0.9) {
stopifnot(
class(results) == "list",
all(sapply(c(pathOutput, plotSave), class) == "character"),
all(sapply(c(PACLowerLim, PACUpperLim), class) == "numeric"),
length(PACLowerLim) == 1 && length(PACUpperLim) == 1 && all(c(PACLowerLim, PACUpperLim) >= 0) && all(c(PACLowerLim, PACUpperLim) <= 1),
all(plotSave %in% c("no", "pdf", "bmp", "png", "ps"))
)
K <- as.numeric(sapply(grep("K_", names(results), value = TRUE), function(x) unlist(strsplit(x, "_"))[2]))
# Initialize device if we save the plot
plotSave <- match.arg(plotSave)
if (plotSave == "pdf") {
pdf(onefile = TRUE, paste0(pathOutput, "CDFPlot.pdf"))
} else if (plotSave == "bmp") {
bitmap(paste0(pathOutput, "CDFPlot.png"))
} else if (plotSave == "png") {
png(paste0(pathOutput, "CDFPlot.png"), type="cairo")
} else if (plotSave == "ps") {
postscript(onefile = FALSE, paste0(pathOutput, "CDFPlot.ps"))
}
consensusData <- data.frame(sapply(results[1:length(K)], function(x) x$consensusVector))
colnames(consensusData) <- K
consensusData <- tidyr::gather_(consensusData, key_col = "k", value_col = "consensus", gather_cols = colnames(consensusData))
cdfplot <- ggplot2::ggplot(consensusData, ggplot2::aes(x = consensus, colour = k)) +
ggplot2::ggtitle("Cumulative Distribution Function of Consensus Indexes") +
ggplot2::stat_ecdf(size = 1.5) +
ggplot2::scale_x_continuous(name = "Consensus Index", breaks = seq(0, 1, 0.2)) +
ggplot2::scale_y_continuous(name = "CDF", breaks = seq(0, 1, 0.2)) +
ggplot2::scale_colour_discrete(name = "PAC score", labels = paste0("K = ", K, ": ", round(results$PACscores, 3))) +
ggplot2::geom_vline(xintercept = c(PACLowerLim, PACUpperLim), linetype = "dotted")
print(cdfplot)
# Close device
if (plotSave != "no") dev.off()
# Calculate area under the CDF curve
auc <- sapply(split(consensusData, consensusData$k), function(x) {
xvalues <- seq(0, 1, by = 0.01)
yvalues <- ecdf(x$consensus)(xvalues)
sum(diff(xvalues) * (head(yvalues, -1) + tail(yvalues, -1))) / 2
})
names(auc) <- paste0("K_", K)
# Plot change in area under curve (first value remains the same, the relative differences)
# Only if we have more than one K
if (length(K) > 1) {
# Initialize device if we save the plot
if (plotSave == "pdf") {
pdf(onefile = TRUE, paste0(pathOutput, "areaUnderCDFPlot.pdf"))
} else if (plotSave == "bmp") {
bitmap(paste0(pathOutput, "areaUnderCDFPlot.png"))
} else if (plotSave == "png") {
png(paste0(pathOutput, "areaUnderCDFPlot.png"), type="cairo")
} else if (plotSave == "ps") {
postscript(onefile = FALSE, paste0(pathOutput, "areaUnderCDFPlot.ps"))
}
deltaAUC <- data.frame(delta = c(auc[1], diff(auc)) / c(1, auc[-1]), K = K)
deltaPlot <- ggplot2::ggplot(deltaAUC, ggplot2::aes(x = K, y = delta)) +
ggplot2::geom_point(size = 3) +
ggplot2::geom_line() +
ggplot2::scale_x_continuous(breaks = K) +
ggplot2::ylab("Relative change in area under CDF curve") +
ggplot2::ggtitle("Delta Area Plot")
print(deltaPlot)
# Close device
if (plotSave != "no") dev.off()
}
return(areaUnderCDF = auc)
}
#' The PlotTracking function
#'
#' This graphic shows the cluster assignment of items (columns) for each k (rows) by color. The colors correspond to the colors of the consensus matrix class asssignments.
#' This function only produces the plot if the results were produced for more than one value of K, so that the comparison and tracking can be done.
#' @param results output from consensusClustering function.
#' @param showSamplesNames logical indicating if the plot should display sample names or not.
#' @param plotSave character string indicating the format the plot to be saved in a file. Default is "no", the plot is not saved, but printed to the screen.
#' @param pathOutput directory for saving the plot if \code{plotSave == TRUE}, defaults to current working directory
#' @details
#' This graphic shows the cluster assignment of items (columns) for each k (rows). The color asssignments corresponds to the
#' consensus matrix class. This plot provides a view of item cluster membership across different k and enables users to track
#' the history of clusters relative to earlier clusters. For items that change clusters often (changing colors within a column)
#' are indicative of unstable membership and thus unstable/unreliable clusters where as items that do not frequently change colors
#' shows a strong membership and stable cluster asssignment.
#' @keywords external
#' @export
#' @examples
#' mat <- matrix(rnorm(10*6), 10, 6)
#' result <- consensusClustering(mat, nIters = 5, plotTracking = FALSE)
#' PlotTracking(result)
#'
PlotTracking <- function(results,
showSamplesNames = TRUE,
plotSave = c("no", "pdf", "bmp", "png", "ps"),
pathOutput = "") {
stopifnot(
class(results) == "list",
class(showSamplesNames) == "logical",
all(sapply(c(pathOutput, plotSave), class) == "character"),
all(plotSave %in% c("no", "pdf", "bmp", "png", "ps"))
)
K <- as.numeric(sapply(grep("K_", names(results), value = TRUE), function(x) unlist(strsplit(x, "_"))[2]))
if (length(K) == 1) {
cat("results provided were produced for only one value of K, no traking can be done.\n")
} else {
colorMatrix <- t(sapply(results$colorTracking, function(x) x[[1]]))
rownames(colorMatrix) <- K
colnames(colorMatrix) <- names(results[[1]]$consensusClass)
colorMatrix <- colorMatrix[, results[[paste0("K_", max(K))]]$consensusTree$order]
# Initialize device if we save the plot
plotSave <- match.arg(plotSave)
if (plotSave == "pdf") {
pdf(onefile = TRUE, file = paste0(pathOutput, "TrackingPlot.pdf"))
} else if (plotSave == "bmp") {
bitmap(file = paste0(pathOutput, "TrackingPlot.png"))
} else if (plotSave == "png") {
png(paste0(pathOutput, "TrackingPlot.png"), type="cairo")
} else if (plotSave == "ps") {
postscript(onefile = FALSE, file = paste0(pathOutput, "TrackingPlot.ps"))
}
heatmapTracking <- ComplexHeatmap::Heatmap(colorMatrix,
name = "heatmapTracking",
rect_gp = grid::gpar(type = "none"),
cell_fun = function(j, i, x, y, width, height, fill) {
grid::grid.rect(x = x, y = y, width = width, height = height, gp = grid::gpar(col = NA, fill = colorMatrix[i, j]))
},
cluster_rows = FALSE,
cluster_columns = FALSE,
show_row_names = TRUE,
row_names_side = "left",
show_column_names = showSamplesNames,
show_heatmap_legend = FALSE,
column_title = "Samples",
column_title_side = "bottom",
row_title = "K",
row_title_side = "left")
ComplexHeatmap::draw(heatmapTracking, column_title = "Tracking Plot")
# Close device
if (plotSave != "no") dev.off()
}
}
#' The ConsensusStatsAndPlots function
#'
#' Calculates cluster consensus and item consensus (with their plots), and intra and inter cluster consensus summary.
#' @param results output from consensusClustering function.
#' @param plots logical indicating whether the plots should be produced. Plots are produced if the results have been produced for more than a single value of K (\code{length(K) > 1}).
#' @param plotSave character string indicating the format the plot to be saved in a file. Default is \code{"no"}, the plot is not saved, but printed to the screen.
#' @param pathOutput directory for saving plots if \code{plotSave == TRUE}, defaults to current working directory.
#' @return A list with the following elements:
#' \itemize{
#' \item \code{clusterConsensus}: average consensus index between all pairs of items belonging to the same cluster, for each K.
#' \item \code{itemConsensus}: average consensus index between item i and all the (other) items in cluster cl, for all i and cl, for each K.
#' \item \code{intraInterCons}: matrix with intra consensus statistic (the mean of all cluster consensus for each K) and inter consensus statistic (mean of all item consensus between an item and all clusters to which the item does not belong, for each K).
#' }
#' @details
#' The \strong{Cluster Consensus Plot} highlights the mean pairwise consensus values between a cluster's members for each k. The color scheme
#' follows all previous graphs and sample are stacked bars grouped by K value on the horizontal x-axis. High values show that the clusters hold
#' high stability and likewise low values highlights a clusters instability.
#' In the \strong{Item Consensus Plots}, each stacked bar is a sample. Item-consensus values are indicated by the heights of the colored portion of the bars (using the tracking color scheme). This plot provides a view of item-consensus across all other clusters at a given k. As Wilkerman (2010) explains, with this plot it is possible to see if a sample is a very "pure" member of a cluster or if it shares high consensus to multiple clusters (large rectangles in a column of multiple colors), suggesting that it is an unstable member.
#' @keywords external
#' @export
#' @examples
#' mat <- matrix(rnorm(10*6), 10, 6)
#' result <- consensusClustering(mat, nIters = 5, consensusStatsPlots = FALSE)
#' ConsensusStatsAndPlots(result)
#'
ConsensusStatsAndPlots <- function(results,
plots = TRUE,
plotSave = c("no", "pdf", "bmp", "png", "ps"),
pathOutput = "") {
stopifnot(
class(results) == "list",
class(plots) == "logical",
all(sapply(c(pathOutput, plotSave), class) == "character"),
all(plotSave %in% c("no", "pdf", "bmp", "png", "ps"))
)
K <- as.numeric(sapply(grep("K_", names(results), value = TRUE), function(x) unlist(strsplit(x, "_"))[2]))
n <- length(results[[1]]$consensusClass)
# The consensus vector includes the consensus of each sample with itself (1), so we need to romove the positions that would correspond to the diagonal elements in the consensus matrix
removeIdx <- numeric(n)
removeIdx[1] <- length(results[[1]]$consensusVector)
for (i in 2:n) removeIdx[i] <- removeIdx[i - 1] - i
# Function to calculate cluster consensus m(k) and item consensus m_i(k) for one of the values of K, i.e, using one of the elements in the list results
singleKstats <- function(res, removeIdx) {
n <- length(res$consensusClass)
clusters <- unique(res$consensusClass)
# Holder for cluster consensus
mk <- matrix(c(clusters, rep(NA, length(clusters))), ncol = 2, dimnames = list(NULL, c("cluster", "mk")))
# Holder for item consensus
mik <- NULL
for (cl in 1:length(clusters)) {
cluster <- clusters[cl]
# Items in this cluster
items <- which(res$consensusClass == cluster)
names(items) <- NULL
Nk <- length(items)
# Calculate cluster consensus: m(k) (equation 3 Monti's paper), the average consensus index between all pairs of items belonging to the same cluster
if (length(items) > 1) {
# All the pairs of samples in this cluster (also including as a pair a sample with itself)
pairsID <- rbind(t(combn(items, 2)), matrix(rep(items, 2), ncol = 2))
# Remove the pair of each sample with itself
pairsID <- pairsID[pairsID[, 1] != pairsID[, 2], , drop = F]
# Find corresponding position for each pair in consensusVector
pairsPos <- n * (pairsID[, 1] - 1) + pairsID[, 2] - sapply(pairsID[, 1], function(x) sum(seq(x-1)) * (x>1))
mk[cl, 2] <- sum(res$consensusVector[pairsPos]) / length(pairsPos) # length(pairsPos) = Nk(Nk-1)/2
} else {
# if there's only one sample in the cluster, assume m(k) = 0
mk[cl, 2] <- 0
}
# Calculate item consensus: m_i(k) (equation 4 Monti's paper), the average consensus index between item ei and all the (other) items in cluster cl, for all ei and cl
# Here we calculate the item consensus for all items i with respect this particular cluster
mikTmp <- sapply(seq(n), function(x) {
otherItems <- setdiff(items, x) # items in this cluster, excluding x if x is in the cluster
if (length(otherItems) > 0) { # if this cluster has only 1 item, and it is equal to 0, then we don't have otherItems and mik = 0
pairsID <- matrix(c(rep(x, length(otherItems)), otherItems), ncol = 2) # All the pairs between this sample x and the other samples in the cluster
pairsPos <- n * (pairsID[, 1] - 1) + pairsID[, 2] - sapply(pairsID[, 1], function(y) sum(seq(y - 1)) * (x > 1))
sum(res$consensusVector[pairsPos]) / length(pairsPos) # length(pairsPos) = ifelse(x %in% items, Nk - 1, Nk)
} else {
0
}
})
mik <- rbind(mik, cbind(rep(cluster, length(mikTmp)), seq(n), mikTmp))
}
colnames(mik) <- c("cluster", "item", "mik")
return(list(mk, mik))
}
# Apply that function on all the values of K
res <- lapply(results[1:length(K)], function(x) singleKstats(x, removeIdx))
# Merge cluster consensus mk
if (length(K) == 1) {
mk <- structure(list(cbind(K = rep(K, nrow(res[[1]][[1]])), res[[1]][[1]])), names = paste0("K_", K))
} else {
mk <- mapply(function(x, y) cbind(K = rep(y, nrow(x[[1]])), x[[1]]), res, K)
}
mk <- do.call(rbind, mk)
mk <- data.frame(mk)
# Merge item consensus mik
if (length(K) == 1) {
mik <- structure(list(cbind(K = rep(K, nrow(res[[1]][[2]])), res[[1]][[2]])), names = paste0("K_", K))
} else {
mik <- mapply(function(x, y) cbind(K = rep(y, nrow(x[[2]])), x[[2]]), res, K)
}
mik <- do.call(rbind, mik)
mik <- data.frame(mik)
# Calculate intra consensus, as the mean of all cluster consensus m(k)
intraConsensus <- structure(tapply(mk[, "mk"], mk[, "K"], mean), names = paste0("K_", K))
# Calculate inter consensus
# We already have the item consensus mik, which is the mean consensus between an item i and all the (other) items in a particular cluster k
# For each item i, we'll take the mean of the mik with all k except for the cluster in which i belongs
# Then, we take the mean of those values across all items. That is going to be the interConsensus for a given K
interConsensus <-
sapply(split(mik, mik$K), function(x) { # for each K
k <- x$K[1]
mean_mik_between_item_i_and_other_clusters <-
mapply(function(mat, cluster) { # for each item
mat <- mat[mat$cluster != cluster, , drop = F] # exclude the cluster in which item i belongs
mean(mat$mik)
},
split(x, x$item), results[[paste0("K_", k)]]$consensusClass) # mat has all mik for each item i in a list, cluster is a vector with cluster in which item i belongs
mean(mean_mik_between_item_i_and_other_clusters)
})
names(interConsensus) <- paste0("K_", K)
intraInterCons <- cbind(intraConsensus, interConsensus)
# Plots (only if we have more than one value in K)
if (length(K) > 1) {
if (plots) {
plotSave <- match.arg(plotSave)
# Cluster consensus plot
if (plotSave == "pdf") {
pdf(onefile = TRUE, width = 21, file = paste0(pathOutput, "ClusterConsensusPlot.pdf"))
} else if (plotSave == "bmp") {
bitmap(file = paste0(pathOutput, "ClusterConsensusPlot.png"), width = 21)
} else if (plotSave == "png") {
png(paste0(pathOutput, "ClusterConsensusPlot.png"), type="cairo", width = 480 * 3)
} else if (plotSave == "ps") {
postscript(onefile = FALSE, file = paste0(pathOutput, "ClusterConsensusPlot.ps"), width = 21, height = 7)
}
# Using the tracking color scheme
mk2 <- mk
mk2$Klabel <- factor(mk2$K)
levels(mk2$Klabel) <- paste0("K=", levels(mk2$Klabel))
mk2$cluster <- factor(mk2$cluster)
mk2$color <- do.call(c, sapply(results$colorTracking, function(x) x[[3]]))
mk2$K_Cl <- paste0("K", mk2$K, "-C", mk2$cluster)
clusterConsensusPlot <-
ggplot2::ggplot(mk2, ggplot2::aes(x = K_Cl, y = mk, fill = K_Cl)) +
ggplot2::geom_bar(stat = "identity", position = ggplot2::position_dodge(), width = 1) +
ggplot2::facet_grid(. ~ Klabel, scale = "free_x", space = "free") +
ggplot2::ggtitle("Cluster Consensus Plot") +
ggplot2::scale_x_discrete(name = "Cluster", labels = structure(sapply(unique(mk2$K_Cl), function(y) unlist(strsplit(y, "-C"))[2]), names = unique(mk2$K_Cl))) + # provide appropiate labels, other wise we see the K_Cl labels
ggplot2::scale_y_continuous(name = "Cluster consensus", limits = 0:1, breaks = seq(0, 1, 0.2)) +
ggplot2::scale_fill_manual(values = mk2$color, guide = FALSE)
print(clusterConsensusPlot)
# Close device
if (plotSave != "no") dev.off()
# Item consensus plot
mik2 <- mik
# Add column indicating k-item combination, and then look to which cluster the item belongs
mik2$KItem <- paste0("K", mik2$K, "-I", mik2$item)
consClass <- c()
for (k in K) consClass <- c(consClass, results[[paste0("K_", k)]]$consensusClass)
names(consClass) <- NULL
consClass <- data.frame(consClass, row.names = paste0("K", rep(K, each = n), "-I", rep(seq(n), length(K))))
mik2$itemBelongsToCluster <- paste("Cluster", consClass[mik2$KItem, "consClass"])
# Add column indicating itemBelongsToCluster-item combination
mik2$ItemClusterBelongs <- paste0("I", mik2$item, "-CB", mik2$itemBelongsToCluster)
head(mik2)
# Add column of clusters labeled
mik2$clusterLab <- paste("Cluster", mik2$cluster)
# Add column indicating K-cluster combination
mik2$K_Cl <- paste0("K", mik2$K, "-C", mik2$cluster)
# Make plot for each K
itemConsPlots <- lapply(split(mik2, mik2$K), function (submix) {
ggplot2::ggplot(submix, ggplot2::aes(x = ItemClusterBelongs, y = mik, fill = K_Cl)) +
ggplot2::geom_bar(stat = "identity", width = 1) +
ggplot2::facet_grid(. ~ itemBelongsToCluster, scale = "free_x", space = "free") +
ggplot2::scale_x_discrete(name = "Samples") +
ggplot2::scale_y_continuous(name = "Sum of item consensus") +
ggplot2::scale_fill_manual(name = "Cluster", labels = sort(unique(submix$cluster)), values = structure(mk2$color, names = mk2$K_Cl)) +
ggplot2::theme(axis.ticks = ggplot2::element_blank(), axis.text.x = ggplot2::element_blank()) +
ggplot2::ggtitle(paste0("Item Consensus Plot - K = ", submix$K[1]))
})
# Initialize device if we save the plot
if (plotSave == "pdf") {
pdf(onefile = TRUE, width = 21, file = paste0(pathOutput, "ItemConsensusPlot.pdf"))
} else if (plotSave == "bmp") {
bitmap(file = paste0(pathOutput, "ItemConsensusPlot%03d.png"), width = 21)
} else if (plotSave == "png") {
png(paste0(pathOutput, "ItemConsensusPlot%03d.png"), type="cairo", width = 480 * 3)
} else if (plotSave == "ps") {
postscript(onefile = FALSE, file = paste0(pathOutput, "ItemConsensusPlot%03d.ps"), width = 21, height = 7)
}
# Print plots
for (i in 1:length(itemConsPlots)) print(itemConsPlots[[i]])
# Close device
if (plotSave != "no") dev.off()
}
}
return(list(clusterConsensus = mk, itemConsensus = mik, intraInterCons = intraInterCons))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.