#rmarkdown::render("clusterExperimentTutorial.Rmd")
knitr::opts_chunk$set(fig.align="center", cache=TRUE, cache.path = "R_cache/", fig.path="R_figure/", fig.width=6,fig.height=6,error=TRUE,autodep=TRUE,out.width="600px",out.height="600px")
knitr::dep_auto()
library(diagram)
library(clusterExperiment)
diagSize<-"600px"
drawClusterAll <- function(highlight=NULL, Layer2Options=NULL,plotOptions=TRUE,Layer2OnlyIfUsed=TRUE){
    plotLayer2<-length(Layer2Options)>0 | !Layer2OnlyIfUsed
    box.cex<-0.5
    boxRadx<-function(st){
        rad<-strwidth(paste(" ",st," ",sep=""),units="user",cex=box.cex)/2
        pmax(rad,rep(0.03,length(rad)))
    }
    diagram::openplotmat(main = "", xlim =c(0,1), ylim = c(0,1)) #so have plot open to determine string width
    Layer1Names<-c("clusterFunction","subsample", "sequential")
    Layer1Options<-list("clusterFunction"=c("tight", "hierarchical01", "pam", "kmeans"), "subsample"=c("subsample.FALSE","subsample.TRUE"), "sequential"=c("sequential.FALSE","sequential.TRUE"))

    Layer2Names<-c("mainClusterArgs","subsampleArgs","seqArgs")
    Layer2NamesSimple<-c("mainClusterArgs","subsampleArgs","seqArgs")
    #x,y,box.size,box.prop,box.type
    makeDf<-function(x,y,box.size,box.prop,box.type){data.frame(x=x,y=y,box.size=box.size,box.prop=box.prop,box.type=box.type,stringsAsFactors =FALSE)}
    posMatrix<-makeDf(x=0.5,y=0.95,box.size=0.06,box.prop=0.4,box.type="diamond")
    names<-c("clusterSingle")

    values<-"ellipse"
    options<-"rect"
    Layer1Pos<-list("clusterFunction"=makeDf(0.25, 0.77,boxRadx("clusterFunction"),0.4,options),"subsample"=makeDf(0.50,0.77,0.06,0.4,options), "sequential"=makeDf(0.75, 0.77,0.06,0.4,options))
    Layer1OptionsPos<-list(
        "clusterFunction"=list(
            "tight"=makeDf( 0.081, 0.59,boxRadx("tight"),0.4,values), "hierarchical01"=makeDf(0.162, 0.59,boxRadx("hierarchical01"),0.4,values), 
            "pam"=makeDf(0.243, 0.59,boxRadx("pam"),0.4,values), "kmeans"=makeDf(0.324, 0.59,boxRadx("kmeans"),0.4,values)), 
        "subsample"=list("subsample.FALSE"=makeDf(0.45, 0.59,0.03,0.4,values),"subsample.TRUE"=makeDf(0.55, 0.59,0.03,0.4,values)), 
        "sequential"=list("sequential.FALSE"=makeDf(0.77, 0.59,0.03,0.4,values),"sequential.TRUE"=makeDf(0.88, 0.59,0.03,0.4,values))
        )
    Layer2Pos<-list("mainClusterArgs"=makeDf(0.2,0.44,boxRadx(Layer2Names[1]),0.4,options), 
        "subsampleArgs"=makeDf(0.5,.44,boxRadx(Layer2Names[2]),0.4,options), 
        "seqArgs"=makeDf(0.825, 0.44,boxRadx(Layer2Names[3]),0.4,options))

        #remove Layer2 if not used
    if(any(!names(Layer2Options) %in% Layer2NamesSimple)) warning("some names of the Layer2Options list are not valid")
    Layer2Options<-Layer2Options[names(Layer2Options) %in% Layer2NamesSimple]
    if(Layer2OnlyIfUsed) {
        Layer2Names<-Layer2Names[Layer2NamesSimple %in% names(Layer2Options)]
        Layer2Pos<-Layer2Pos[names(Layer2Pos)%in%names(Layer2Options)]
    }

    posMatrix<-rbind(posMatrix,do.call("rbind",Layer1Pos))
    names<-c(names,Layer1Names)
    if(plotOptions){
        posMatrix<-rbind(posMatrix,do.call("rbind",unlist(Layer1OptionsPos,recursive=FALSE)))
        names<-c(names,unlist(Layer1Options))
    }
    if(plotLayer2){
        posMatrix<-rbind(posMatrix,do.call("rbind",Layer2Pos))
        names<-c(names,Layer2Names)
    }
    posMatrix$boxCol="black"
    if(!is.null(highlight)){
        if(any(!highlight %in% names)) warning("some highlight values not matching node names and will be ignored")
        highlight<-highlight[highlight %in% names]
        m<-match(highlight,names)
        posMatrix$boxCol[m]<-"red"
    }
    if(length(Layer2Options)>0){
        Layer2OptionsPos<-lapply(names(Layer2Options),function(nam){
            opts<-Layer2Options[[nam]]
            ys<-0.44-cumsum(rep(0.05,length(opts)))
             data.frame(x=Layer2Pos[[nam]][,"x"], y=ys, box.size=boxRadx(opts),box.prop=0.4,box.type=options,boxCol="red")
        })
        posMatrix<-rbind(posMatrix,do.call(rbind,Layer2OptionsPos))
        names<-c(names,unlist(Layer2Options))
    }



    #square coefficient matrix, specifying the links (rows=to, cols=from)
    M<-matrix("0",nrow=length(names), ncol=length(names))
    rownames(M)<-names
    colnames(M)<-names

    M["clusterSingle",Layer1Names]<-""
    if(plotOptions) lapply(Layer1Names,function(x){M[x,Layer1Options[[x]] ]<<-""})
        # if(2 %in% LayersToPlot){
    #   M["clusterSingle",Layer2Names]<-""
    # }
    # lapply(names(Layer2Names),function(x){
#       opts<-Layer1Options[[x]]
#       if("TRUE" %in% opts) opts<-"TRUE"
#       M[opts,Layer2Names[[x]] ]<<-1
#   })

#   print(M)
    names<-sapply(strsplit(names,"[.]"),function(x){if(length(x)>1) x[2] else x[1]})
    par(mar=c(1, 1, 1, 1))
    transY<-posMatrix[,"y"]-(min(posMatrix[,"y"])-0.02)
    transY<-transY/(max(transY)+0.02)
    diagram::plotmat(M, pos = cbind(posMatrix[,c("x")],transY), name = names, absent="0",lwd = 1, box.lwd = 2, box.cex = box.cex, box.size = posMatrix[,"box.size"], box.type = posMatrix[,"box.type"], box.prop = posMatrix[,"box.prop"], curve = 0, main="", arr.type="triangle", arr.width=0, arr.length=0, box.lcol=posMatrix[,"boxCol"],add=TRUE)

}

Introduction

The goal of this package is to allow the user to try many different clustering algorithms in one package structure, while implementing common post-processing steps unrelated to the clustering algorithm (e.g. consensus clustering). The package also provides tools for performing differential expression analysis between the clusters to find important genes. Finally, the package provides visualization tools.

Basic overview of clustering routines

The package encodes many common practices that are shared across clustering algorithms, like subsampling the data, computing silhouete width, sequential clustering procedures, and so forth. It allows the user to specify the underlying clustering algorithm, as well as its parameters.

There are two main user functions for clustering, clusterSingle and clusterMany. clusterSingle is the wrapper function that calls the underlying clustering functions. clusterMany is a convenience function that implements clusterSingle across combinations of parameter choices. Such parameter choices can be the clustering method, whether to subsample the data, or the specific parameters of the clustering algorithm (e.g. the number of clusters in kmeans).

A common workflow could be to apply clusterMany to the data to get a large collection of clusterings, each from clustering based on different parameters. To try to find a clustering presumably robust to the parameter choices, clusterExperiment provides a function combineMany that tries to find a unifying clustering of the samples.

We find that many methods for choosing the appropriate number of clusters for methods like k-means err on the side of smaller number of clusters. However, we find in practice that we tend to prefer to err on finding many clusters and then merging them based on examining the data (and many of the default parameters are set for finding many clusters). We provide the function clusterHclust to perform hierarchical clustering of the resulting clusters and the function mergeClusters to follow that hiearchy and merge the clusters based on the percentage of significantly different genes found in the comparisons of different arms of the hiearchy.

Finding related genes

A common practice after determining clusters is to perform differential gene expression analysis in order to find genes that show the greatest differences amongst the clusters. We would stress that this is purely an exploratory technique, and any p-values that result from this analysis are not valid, in the sense that they are likely to be inflated. This is because the same data was used to define the clusters and to perform differential expression analysis.

Since this is a common task, we provide the function getBestFeatures to perform various kinds of differential expression analysis between the clusters. A common F-statistic between groups can be chosen. However, we find that it is far more informative to do pairwise comparisons between clusters, or one cluster against all, in order to find genes that are specific to a particular cluster. An option for all of these choices is provided in the getBestFeatures function. The getBestFeatures function uses the DE analysis provided by the limma package.

In addition, the getBestFeatures function provides an option to do use the "voom" correction in the limma package to account for the mean-variance relationship that is common in count data. Unlike edgeR or DESeq, the voom correction does not require a count matrix, and therefore can be used on FPKM or TPM entries, as well as normalized data.

Visualization

We provide a visualization to compare many clusterings of the same data in the function plotClusters. We also provide some functions that are wrappers for common visualization tasks in clustering gene expression data. We provide an heatmap function, plotHeatmap that is an interface to the aheatmap function in NMF package. See below for detail on the plotHeatmap function.

Data example

We will work with a simulated gene expression data set with 300 samples and 3 hypothetical genes, which we will simply label as growth, immune, and structural genes. The expression values will be drawn from different normal distributions. Hence, our data matrix has dimensions $300 x 3$. In this data set, we know a priori that there are k = 5 clusters (1-100, 101-125, 126-175, 176-225, 226-300). There are 100 samples in the first cluster, 25 in the second cluster, 50 samples in the third cluster, 50 samples in the fourth cluster, and 75 samples in the fifth cluster. Let us first visualize the clusters.

library(scatterplot3d)
set.seed(1)
n = c(100, 25, 50, 50, 75)
growth_m = c(0, 2, -2, 0, 4)
growth_sd = c(0.5, 1, 1, 0.5, 0.5)
immune_m = c(0, -2, -3, 4, 0)
immune_sd = c(0.5, 1, 0.5, 0.5, 1)
structural_m = c(3, 0, 2, -1, -2)
structural_sd = c(1, 1, 1, 0.7, 1)
data = data.frame(n=n, growth_m=growth_m, growth_sd, immune_m, immune_sd, structural_m, structural_sd)

get_expression = function(data){
  n = data[1]
  growth_m = data[2]
  growth_sd = data[3]
  immune_m = data[4]
  immune_sd = data[5]
  structural_m = data[6]
  structural_sd = data[7]
  growth = rnorm(n, growth_m, growth_sd)
  immune = rnorm(n, immune_m, immune_sd)
  structural = rnorm(n, structural_m, structural_sd)
  expression = data.frame(growth=growth, immune=immune, structural=structural)
  return(expression)
}

expressions = apply(data, 1, get_expression)
expressions = do.call(rbind, expressions)
expressions$cluster = c(rep("1", 100), rep("2", 25), rep("3", 50), rep("4", 50), rep("5", 75))
expressions$pcolor[expressions$cluster=="1"] = "red"
expressions$pcolor[expressions$cluster=="2"] = "blue"
expressions$pcolor[expressions$cluster=="3"] = "darkgreen"
expressions$pcolor[expressions$cluster=="4"] = "black"
expressions$pcolor[expressions$cluster=="5"] = "darkorange"

# if (!file.exists("data")){
#   dir.create("data")
# }
#
#save(expressions,file="data/expressions.Rda")
par(las=1)
with(expressions, {plot = scatterplot3d(growth, immune, structural, color = pcolor, main="Scatterplot of Cell Gene Expression")})
legend("topleft", c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5"), fill=c("red", "blue", "darkgreen", "black", "darkorange"))

Clustering with clusterSingle

Let us start with a walkthrough of clusterSingle. clusterSingle is the primary function for performing the primary clustering tasks. Each of these tasks has an underlying (user-accessible) function and clusterSingle is a wrapper around these tasks:

The arguments of clusterSingle indicate which of these tasks should be performed and allow the user to pass options for each of these tasks (that are passed to the downstream functions). The options are documented in the more detailed documentation vignette, but here we run through some common examples.

Here are the argument calls for clusterSingle:

clusterSingle(x, subsample = TRUE, sequential = FALSE, clusterFunction = c("tight", "hierarchical01", "pam", "kmeans"), mainClusterArgs = NULL, subsampleArgs = NULL, seqArgs = NULL)

The primary arguments clusterFunction,subsample, and sequential govern each of these tasks:

We can visualize arguments and the values they can take on as a diagram:

drawClusterAll()

The remaining arguments (mainClusterArgs, subsampleArgs, and seqArgs) are for passing additional options to each of these tasks, and we will show examples of how they are used.

Example: PAM Clustering

Let us use the function clusterSingle to perform simple Partition Around Medoids (PAM) clustering on our expression data set with k=5. In this expression matrix, rows are samples and columns are features.

To do standard pam clustering, we set clusterFunction="pam". The remaining tasks (subsampling and sequential) will not be performed, so we set subsample=FALSE and sequential=FALSE, which we can visualize as:

drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"))

However, notice that doing this will return an error:

simpleCluster = clusterSingle(expressions, subsample=FALSE, sequential=FALSE, clusterFunction="pam")

This is because even to perform simple PAM, the PAM algorithm needs additional parameters to be set, namely 'k', the number of groups.

However, there is no argument in clusterSingle for this. Because it is specific to a particular choice of clustering function, we don't want to clutter up clusterSingle with all the possible options that might be needed downstream for a particularly task. Instead, clusterSingle lets you pass such arguments to the function that actually does the task of clustering a distance matrix, clusterD via the mainClusterArgs option. So the user doesn't need to call clusterD explicitly, but often will need to pass an argument to clusterD.

In this case we need to pass the argument 'k'. Specifically, we need to create a list with an element of name "k" that takes on the value, say 5, for our k.

drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"),Layer2Options=list("mainClusterArgs"="k"),plotOptions=TRUE)
library(clusterExperiment)
library(cluster)
#load("data/expressions.Rda")
expression = expressions[, 1:3]
simpleCluster = clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", mainClusterArgs=list('k'=5))
table(simpleCluster$clustering)

The result is a one-element list with a vector of cluster assignments for each sample.

Let us compare this result with the result from calling pam with k=5 from the cluster package.

pamCluster = pam(dist(expression), 5)
table(clusterExperiment=simpleCluster$clustering, cluster=pamCluster$clustering)
table(clusterExperiment=simpleCluster$clustering, true=expressions$cluster)

We have now verified that these two function calls perform the same task. Incidentally, if we know the true number of clusters, a simple PAM procedure leads to good performances in this toy example. This is not surprising, since the clusters are simulated with Gaussian noise, in low dimensions.

More options for PAM clustering

There are other options we can pick to change the clustering of the distance matrix. A full documentation of possible options can be found in the help files of clusterD. Some of the useful options associated with PAM are the following common tasks:

We can pass these arguments to clusterD, again as a list:

drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"),Layer2Options=list("mainClusterArgs"=c("k","findBestK","removeSil","kRange")),plotOptions=TRUE)
Cluster<-clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", mainClusterArgs=list(findBestK=TRUE, removeSil=TRUE, kRange=2:10))

table(true=expressions$cluster, PAM=Cluster$clustering)

Note that, even in this toy example, it is not easy to choose the correct number of clusters, and a simple procedure based on comparing average silhouette width may not always lead to good results.

Note: at the current time, if we are not going to do the subsample task (subsample=FALSE), then clusterFunction must be pam because the other clustering options for clusterFunction assume a 0-1 distance matrix based on subsampling. If there is no subsampling, clusterSingle makes the distance matrix by default with dist(x) and therefore must be used with pam, as this is the only implemented method for distance matrices.

Subsampling

We can add the additional task to our clustering workflow to subsample the samples and cluster the resampled data. This is accomplished by setting subsample=TRUE. In this case, clusterSingle will first subsample the rows of the data matrix, cluster the subsamples for a specified clustering algorithm, and return a clustering co-occurance matrix.

Then, clusterSingle will cluster this co-occurance matrix using another specified algorithm, which can be the same as the algorithm used in clustering the subsampled data, but doesn't have to be. Indeed, several of the cluster algorithm options (tight and hierarchical) are designed to cluster 0-1 co-occurance matrices from subsampling.

In this demonstration, let us cluster the subsamples using kmeans with k=5, then cluster the co-occurance matrix using the tight clustering algorithm. Now we have options we want to set for the subsampling portion of the task.

drawClusterAll(highlight = c("tight", "subsample.TRUE", "sequential.FALSE"),Layer2Options=list("subsampleArgs"=c("k","clusterFunction")),plotOptions=TRUE)
set.seed(212421)
Cluster<-clusterSingle(expression, subsample=TRUE, sequential=FALSE, clusterFunction="tight", subsampleArgs=list("k"=5, clusterFunction="kmeans"))

table(true=expressions$cluster, subsample=Cluster$clustering)

As expected, the subsampling procedure will return more clusters than a simple PAM. In addition, some observations may be not clustered (value of -1), because the proportion of times that they get assign to the same cluster core across subsamplings does not pass the specified threshold. Note that the choice of k is now not directly linked to the output labels: although we selected k=5 for the subsampling step, the final result contains 8 clusters.

DR: we should probably show a heatmap of the subsampling proportions to make this point clearer

Sequential

The last additional task is to sequentially remove the best cluster, and then recluster the remaining samples. This is performed using sequential=TRUE.

Again, we need to pass arguments to control how this is done, as a list to the argument seqArgs. A necessary argument k0 must be passed to seqArgs. What this argument means depends on what other choices are being made, but if we also set subsample=TRUE (recommended), then k0 is the value of k passed to the subsampling clustering method at the first iteration of sequential algorithm.

This time we will use the hierarchical method for clustering the 0-1 co-occurance matrix.

drawClusterAll(highlight = c("hierarchical01", "subsample.TRUE", "sequential.TRUE"),Layer2Options=list("subsampleArgs"=c("clusterFunction"),seqArgs=c("k0","verbose")),plotOptions=TRUE)
Cluster<-clusterSingle(expression, subsample=TRUE, sequential=TRUE, clusterFunction="hierarchical01", seqArgs=list(k0=5, verbose = FALSE), subsampleArgs = list(clusterFunction="kmeans"))

Running clusterSingle with sequential=TRUE will return a list of the three elements clustering, clusterInfo, and whyStop. clusterInfo is a matrix of information regarding the algorithm behavior for each cluster (the starting and stopping k set for subsampling clustering for each cluster, and the number of iterations for each cluster). whyStop is a character string explaining what triggered the algorithm to stop.

table(Cluster$clustering)
Cluster$clusteringInfo
Cluster$whyStop

table(true=expressions$cluster, clusterExperiment=Cluster$clustering)

This procedure leads to many more clusters, many of which are very small size.

DR: don't we have a min size parameter to remove clusters with too few observations? Also, here would be a good place to talk about merging. Perhaps merging before finding markers?

Finding Marker Genes

The function getBestFeatures calls limma on input data to determine the gene features most associated with the found clusters. The minimal required input is the expression data and a vector of the (numeric) cluster labels. The method ignores samples in clusters marked by "-1" ("-1" always signifies samples not clustered). By default, getBestFeatures finds significant genes based on a F-test between the clusters, but the option type can provide other measures and is the most common one to be changed by the user.

In this demonstration, clustering will first be performed with basic PAM with k=4 and removing negative silhouette widths. We use the simulated data found in the package simData.

getBestFeatures will be called to determine the top features associated with every pairwise comparison between the clusters. By default, topTable in limma only returns the top 10, which in this context means 10 per each of the pairwise comparisons.

data(simData)
cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, mainClusterArgs=list(k=4,removeSil=TRUE))
pairsAll<-getBestFeatures(cl$clustering,simData,type="Pairs")

head(pairsAll)

The column Contrast in the output signifies the pairwise difference (or contrast) for which the information on the row comes from. IndexInOriginal matches the gene to its index in the original dataset. The other columns are provided by topTable in limma (see documentation therein).

getBestFeatures accepts arguments to limma's function topTable to decide which genes should be returned (and in what order). In particular, we can set an adjusted p-value cutoff for each contrast, and set number to control the number of genes returned for each contrast. By setting number to be the length of all genes, and p.value=0.05, we can return all genes for each contrast that have adjusted p-values less than 0.05.

data(simData)
cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, mainClusterArgs=list(k=4,removeSil=TRUE))
pairsAll2<-getBestFeatures(cl$clustering,simData,type="Pairs",p.value=0.05,number=ncol(simData))
table(pairsAll2$Contrast)

Notice that the same genes can be replicated across different contrasts:

nrow(pairsAll2)
length(unique(pairsAll2$Gene))

Note that by using contrasts, we are making use of all of the data, not just the samples in the particular cluster pairs being compared. This means the variance is estimated with all the samples.

Heatmaps of Marker Genes

One way to visualize samples with only the features found by getBestFeatures is to use the plotHeatmap function. The plotHeatmap function minimally requires two arguments - a vector of cluster assignments and the data matrix. plotHeatmap uses the heatmap function aheatmap in the package NMF, but provides an interface that is useful for the standard plots desired for evaluating the clustering output. In particular, it is designed to draw the cluster assignments of the samples on the top, where different colors denote different cluster assignments; lack of assignment, denoted by "-1", are automatically colored white. In the code example below, clusterData is set to the data matrix to define the color scale. Currently, plotHeatmap internally transforms the input data (i.e. argument heatData) to the log scale; this will be fixed to be an option in future versions, but currently, passing of already logged data is not supported and log-scale data should be exponentiated for use with plotHeatmap.

dataMatrix = simData[, unique(pairsAll2$IndexInOriginal)]
cluster = cl$clustering
plotHeatmap(cluster, exp(dataMatrix), eps=0, clusterData=dataMatrix)

Additional covariates can also be plotted by giving a matrix of values to annCol; unlike aheatmap, plotHeatmap will assume that such values should be converted to factors unless told otherwise by the argument whAnnCont.

plotHeatmap(cluster, exp(dataMatrix), eps=0, clusterData=dataMatrix, annCol = data.frame(clusterExperiment=cluster, TrueClusters=trueCluster))

Another feature of plotHeatmap is that a different dataset can be used for determining the clustering than for determining the color-scale image of the heatmap. This can be useful, for example, for clustering on the normalized data which is not on the count-scale, but showing the original count-scale in the color-scale which is more interpretable.

Clustering Clusters

It can be useful to create heatmaps that keep the clusters together. However, arbitrarily ordered clusters will not make compelling heatmaps either. The function clusterHclust will perform hierarchical clustering of the cluster mediods, and provide a dendrogram that will order the samples according to this clustering of the clusters.

clDendro<-clusterHclust(dataMatrix,cl$clustering,full=FALSE)
plot(clDendro)

More useful, however, is to use this in conjunction with plotHeatmap, in which case we want the individual samples in the dendrogram, not just the clusters. Notice that full=TRUE is necessary to get such a dendrogram. Then we give the data as heatData to plotHeatmap, indicating that it should be used for creating the color-image of the data, but provide our dendrogram to clusterData to indicate it will provide the clustering of the data.

clDendroFull<-clusterHclust(dataMatrix,cl$clustering,full=TRUE,unassigned="outgroup")
plotHeatmap(cl$clustering,heatData=exp(dataMatrix),eps=0,clusterData=clDendroFull,annCol = data.frame(clusterExperiment=cluster, TrueClusters=trueCluster))

Notice that the unassigned samples ("-1") are chosen in clusterHclust to be assigned to an outgroup in the dendrogram based on the argument unassigned.

One could also choose to remove them, making sure that the input data matrix to plotHeatmap doesn't contain these samples.

clDendroFull<-clusterHclust(dataMatrix,cl$clustering,full=TRUE,unassigned="remove")
whRm<-which(cl$clustering>0)
plotHeatmap(cl$clustering[whRm],heatData=exp(dataMatrix)[whRm,],eps=0,clusterData=clDendroFull,annCol = data.frame(clusterExperiment=cluster[whRm], TrueClusters=trueCluster[whRm]))

Hierarchical Contrasts and Merging clusters

We can also perform tests based on this hierarchy of clusters. Specifically, we can test for each node of the hierarchy, whether the mean of the set of clusters to the right split of the node is equal to the mean on the left split. We do this by creating a contrast of the corresponding means and testing it for each gene via getBestFeatures, where the type is set to "Dendro" and we provide the dendrogram created by clusterHclust (but setting full=FALSE so that individual samples are not in the dendrogram).

dendroGenes<-getBestFeatures(cl$clustering,simData,type="Dendro",p.value=0.05,number=ncol(simData),dendro=clDendro)
head(dendroGenes)

The output is similar to before, but now there are two identifiers for the contrast: one is ContrastName which gives the Node being compared, and the other is Contrast which is the actual mathematical statement of the mean comparison between clusters that is being performed.

We can use this basic idea to merge clusters based on what percentage of genes are found differentially expressed. The mergeClusters function performs such contrasts for each node, and then calculates the proportion of significant hypotheses. There are several methods implemented, but the simplest is just the proportion found by calculating the proportion of DE genes at a given False Discovery Rate thresshold (using the Benjamini-Hochberg procedure).

mergeOut<-mergeClusters(dat=simCount, cl=cl$clustering, mergeMethod = c("adjP"), cutoff = 0.1, plotType = "mergeMethod",isCount=TRUE)
table(Meged_clusters=mergeOut$cl,Original_clusters=cl$clustering)
print(mergeOut$propDE)

The function optionally creates a plot showing the dendrogram and what clusters were merged. mergeClusters invisibly returns information, including the merged clusters (cl), and a matrix of different estimates based on different methods for computing the proportion (propDE).

Running and Comparing Many Cluster Choices (clusterMany)

drawCompareChoices = function(arguments=NULL){
if (!is.null(arguments)){
  if (arguments == "plotClusters"){colargs = c(rep("black", 6), "red", "black")}else{colargs = c(rep("black", 7), "red")}
  }else{colargs = rep("black", 8)}
par(mar=c(1, 4, 2, 2))
names = c("clusterMany", "clusterSingle", "clusterSingle", "...", "clusterSingle", "output", "plotClusters", "combineMany")
M = matrix(0, nrow=8, ncol=8, byrow=TRUE)
M[2, 1] <- M[3, 1] <- M[4, 1] <- M[5, 1] <- M[6, 2] <- M[6, 3] <- M[6, 4] <- M[6, 5] <- M[7, 6] <- M[8, 6] <- ""
diagram::plotmat(M, pos=c(1, 4, 1, 2), name=names, lwd=1, box.lwd=2, box.cex=0.5, box.size=0.08, box.type=c(rep("square", 5), "diamond", rep("square", 2)), box.prop=0.2, curve=0, main="clusterMany Workflow", arr.type="triangle", arr.width=0, arr.length=0, box.lcol=colargs)
par(mar=c(5.1, 4.1, 4.1, 2.1))
}

clusterMany is a function for running many different combinations of parameters or different datasets. All combinations given by the user are run through clusterSingle (though the function does some (minimal) checking to remove nonsensical combinations). The calls to clusterSingle can also be parallelized across multiple cores (by setting the argument ncores to be greater than 1).

After clusterMany is run, the function plotClusters can be used to visualize the many different cluster assignments that resulted, and combineMany can be used to determine a consensus clustering across the parameter combinations. Finally, mergeClusters, described above, can be used to determine whether some clusters in the consensus clustering should be merged.

drawCompareChoices(arguments = "plotClusters")

Compare Choices

In this example, the combination of parameters we are testing are: (i) the range of k values for clustering; (ii) whether removeSil is TRUE or FALSE; and (iii) whether pam or kmeans is used.

Before running the clustering on these combinations, it is a good idea to first inspect what parameter combinations will be run by calling clusterMany and setting run=FALSE. The resulting output is a matrix where each column corresponds to a parameter setting and each row is a combination of parameters that will be sent to clusterSingle.

parameters = clusterMany(expressions, ks=c(3:7), clusterMethod=c("pam", "kmeans"), run=FALSE)
head(parameters)
dim(parameters)

Note: The output can be fed back into clusterMany. For example if some combinations are not of interest, those rows could be removed and then the resulting matrix given as an argument.

Let us now run clusterMany on the original simulated dataset.

result = clusterMany(expressions, ks=c(3:7), clusterMethod=c("pam"), subsample=FALSE, removeSil = c(TRUE, FALSE))
head(result$clMat)

With run=TRUE, clusterMany will return a list with elements clMat and clusterInfo. clMat is the main object of interest and will be a matrix with each row representing a sample and each column the cluster assignments from a particular combination of parameters. clusterInfo will only be informative when sequential=TRUE, in which case it will be list with information regarding clustering result.

Evaluating the many clusters

To visualize the several clusters created by clusterMany, we run plotClusters to visualize the assignments. plotClusters minimally takes as its argument a matrix of cluster assignments where each row represents a sample and each column a clustering. The result of plotClusters is a color representation of the clusters, where each column of the plot is a sample, and the rows are the different clusterings (note this is the inverse of the input clustering matrix so as to conserve space). Within each row/clustering, the samples are color coded as to their cluster assignment. Furthermore, plotClusters tries to order the samples so that the clusters are assigned colors that make clusters composed of similar samples get the same cluster color.

par(mar=c(2, 8, 1, 1))
plotClusters(result$clMat)

Notice that the unassigned ("-1") entries are always color-coded white.

The alignment (and color assignments) created by plotClusters are invisibly returned and can be used for other purposes (see help documentation). The alignment of the clusters is highly dependent on the order of the clusters, with the alignment starting with the first cluster in the first column, then the second aligned to the first, and so forth. Different orderings of the clusters can greatly change the visualizations.

To find a consensus cluster across many different clusterings of the same data combineMany can be used. The minimal input to combineMany is a cluster matrix with samples as rows and different clustering assignments for each column.

As an example, let us find the consensus cluster from the output of clusterMany.

r <-combineMany(result$clMat)
table(r$clustering)
plotClusters(cbind(FINAL=r$clustering,result$clMat))

Let's look at the options we chose here. To determine whether samples should be clustered together, the parameter proportion is used to set the proportion of co-occurance we want as a cutoff. If we want to be strict, we can determine clusters only if they are always together in any clustering (proportion=1, the default). Samples that are unassigned in greater than propUnassigned will always be assigned a value of -1 (i.e. will also be unassigned in the final clustering). And the option minSize forces all resulting cluster to have a minimal size equal to minSize.

r <-combineMany(result$clMat, proportion=.7,minSize=10)
table(r$clustering)
plotClusters(cbind(FINAL=r$clustering,result$clMat))

We can also visualize the co-occurance matrix along with the final clusters, which is returned with the output of combineMany (if proportion<1)

plotHeatmap(clusterVector=r$clustering,heatData=exp(r$percentageShared),eps=0)


Bioconductor-mirror/clusterExperiment documentation built on Aug. 2, 2017, 4:28 p.m.