#rmarkdown::render("clusterExperimentTutorial.Rmd") knitr::opts_chunk$set(fig.align="center", cache=TRUE, cache.path = "R_cache/", fig.path="R_figure/", fig.width=6,fig.height=6,error=TRUE,autodep=TRUE,out.width="600px",out.height="600px") knitr::dep_auto() library(diagram) library(clusterExperiment)

diagSize<-"600px" drawClusterAll <- function(highlight=NULL, Layer2Options=NULL,plotOptions=TRUE,Layer2OnlyIfUsed=TRUE){ plotLayer2<-length(Layer2Options)>0 | !Layer2OnlyIfUsed box.cex<-0.5 boxRadx<-function(st){ rad<-strwidth(paste(" ",st," ",sep=""),units="user",cex=box.cex)/2 pmax(rad,rep(0.03,length(rad))) } diagram::openplotmat(main = "", xlim =c(0,1), ylim = c(0,1)) #so have plot open to determine string width Layer1Names<-c("clusterFunction","subsample", "sequential") Layer1Options<-list("clusterFunction"=c("tight", "hierarchical01", "pam", "kmeans"), "subsample"=c("subsample.FALSE","subsample.TRUE"), "sequential"=c("sequential.FALSE","sequential.TRUE")) Layer2Names<-c("mainClusterArgs","subsampleArgs","seqArgs") Layer2NamesSimple<-c("mainClusterArgs","subsampleArgs","seqArgs") #x,y,box.size,box.prop,box.type makeDf<-function(x,y,box.size,box.prop,box.type){data.frame(x=x,y=y,box.size=box.size,box.prop=box.prop,box.type=box.type,stringsAsFactors =FALSE)} posMatrix<-makeDf(x=0.5,y=0.95,box.size=0.06,box.prop=0.4,box.type="diamond") names<-c("clusterSingle") values<-"ellipse" options<-"rect" Layer1Pos<-list("clusterFunction"=makeDf(0.25, 0.77,boxRadx("clusterFunction"),0.4,options),"subsample"=makeDf(0.50,0.77,0.06,0.4,options), "sequential"=makeDf(0.75, 0.77,0.06,0.4,options)) Layer1OptionsPos<-list( "clusterFunction"=list( "tight"=makeDf( 0.081, 0.59,boxRadx("tight"),0.4,values), "hierarchical01"=makeDf(0.162, 0.59,boxRadx("hierarchical01"),0.4,values), "pam"=makeDf(0.243, 0.59,boxRadx("pam"),0.4,values), "kmeans"=makeDf(0.324, 0.59,boxRadx("kmeans"),0.4,values)), "subsample"=list("subsample.FALSE"=makeDf(0.45, 0.59,0.03,0.4,values),"subsample.TRUE"=makeDf(0.55, 0.59,0.03,0.4,values)), "sequential"=list("sequential.FALSE"=makeDf(0.77, 0.59,0.03,0.4,values),"sequential.TRUE"=makeDf(0.88, 0.59,0.03,0.4,values)) ) Layer2Pos<-list("mainClusterArgs"=makeDf(0.2,0.44,boxRadx(Layer2Names[1]),0.4,options), "subsampleArgs"=makeDf(0.5,.44,boxRadx(Layer2Names[2]),0.4,options), "seqArgs"=makeDf(0.825, 0.44,boxRadx(Layer2Names[3]),0.4,options)) #remove Layer2 if not used if(any(!names(Layer2Options) %in% Layer2NamesSimple)) warning("some names of the Layer2Options list are not valid") Layer2Options<-Layer2Options[names(Layer2Options) %in% Layer2NamesSimple] if(Layer2OnlyIfUsed) { Layer2Names<-Layer2Names[Layer2NamesSimple %in% names(Layer2Options)] Layer2Pos<-Layer2Pos[names(Layer2Pos)%in%names(Layer2Options)] } posMatrix<-rbind(posMatrix,do.call("rbind",Layer1Pos)) names<-c(names,Layer1Names) if(plotOptions){ posMatrix<-rbind(posMatrix,do.call("rbind",unlist(Layer1OptionsPos,recursive=FALSE))) names<-c(names,unlist(Layer1Options)) } if(plotLayer2){ posMatrix<-rbind(posMatrix,do.call("rbind",Layer2Pos)) names<-c(names,Layer2Names) } posMatrix$boxCol="black" if(!is.null(highlight)){ if(any(!highlight %in% names)) warning("some highlight values not matching node names and will be ignored") highlight<-highlight[highlight %in% names] m<-match(highlight,names) posMatrix$boxCol[m]<-"red" } if(length(Layer2Options)>0){ Layer2OptionsPos<-lapply(names(Layer2Options),function(nam){ opts<-Layer2Options[[nam]] ys<-0.44-cumsum(rep(0.05,length(opts))) data.frame(x=Layer2Pos[[nam]][,"x"], y=ys, box.size=boxRadx(opts),box.prop=0.4,box.type=options,boxCol="red") }) posMatrix<-rbind(posMatrix,do.call(rbind,Layer2OptionsPos)) names<-c(names,unlist(Layer2Options)) } #square coefficient matrix, specifying the links (rows=to, cols=from) M<-matrix("0",nrow=length(names), ncol=length(names)) rownames(M)<-names colnames(M)<-names M["clusterSingle",Layer1Names]<-"" if(plotOptions) lapply(Layer1Names,function(x){M[x,Layer1Options[[x]] ]<<-""}) # if(2 %in% LayersToPlot){ # M["clusterSingle",Layer2Names]<-"" # } # lapply(names(Layer2Names),function(x){ # opts<-Layer1Options[[x]] # if("TRUE" %in% opts) opts<-"TRUE" # M[opts,Layer2Names[[x]] ]<<-1 # }) # print(M) names<-sapply(strsplit(names,"[.]"),function(x){if(length(x)>1) x[2] else x[1]}) par(mar=c(1, 1, 1, 1)) transY<-posMatrix[,"y"]-(min(posMatrix[,"y"])-0.02) transY<-transY/(max(transY)+0.02) diagram::plotmat(M, pos = cbind(posMatrix[,c("x")],transY), name = names, absent="0",lwd = 1, box.lwd = 2, box.cex = box.cex, box.size = posMatrix[,"box.size"], box.type = posMatrix[,"box.type"], box.prop = posMatrix[,"box.prop"], curve = 0, main="", arr.type="triangle", arr.width=0, arr.length=0, box.lcol=posMatrix[,"boxCol"],add=TRUE) }

The goal of this package is to allow the user to try many different clustering algorithms in one package structure, while implementing common post-processing steps unrelated to the clustering algorithm (e.g. consensus clustering). The package also provides tools for performing differential expression analysis between the clusters to find important genes. Finally, the package provides visualization tools.

The package encodes many common practices that are shared across clustering algorithms, like subsampling the data, computing silhouete width, sequential clustering procedures, and so forth. It allows the user to specify the underlying clustering algorithm, as well as its parameters.

There are two main user functions for clustering, `clusterSingle`

and `clusterMany`

. `clusterSingle`

is the wrapper function that calls the underlying clustering functions. `clusterMany`

is a convenience function that implements `clusterSingle`

across combinations of parameter choices. Such parameter choices can be the clustering method, whether to subsample the data, or the specific parameters of the clustering algorithm (e.g. the number of clusters in kmeans).

A common workflow could be to apply `clusterMany`

to the data to get a large collection of clusterings, each from clustering based on different parameters. To try to find a clustering presumably robust to the parameter choices, `clusterExperiment`

provides a function `combineMany`

that tries to find a unifying clustering of the samples.

We find that many methods for choosing the appropriate number of clusters for methods like k-means err on the side of smaller number of clusters. However, we find in practice that we tend to prefer to err on finding many clusters and then merging them based on examining the data (and many of the default parameters are set for finding many clusters). We provide the function `clusterHclust`

to perform hierarchical clustering of the resulting clusters and the function `mergeClusters`

to follow that hiearchy and merge the clusters based on the percentage of significantly different genes found in the comparisons of different arms of the hiearchy.

A common practice after determining clusters is to perform differential gene expression analysis in order to find genes that show the greatest differences amongst the clusters. We would stress that this is purely an exploratory technique, and any p-values that result from this analysis are not valid, in the sense that they are likely to be inflated. This is because the same data was used to define the clusters and to perform differential expression analysis.

Since this is a common task, we provide the function `getBestFeatures`

to perform various kinds of differential expression analysis between the clusters. A common F-statistic between groups can be chosen. However, we find that it is far more informative to do pairwise comparisons between clusters, or one cluster against all, in order to find genes that are specific to a particular cluster. An option for all of these choices is provided in the `getBestFeatures`

function. The `getBestFeatures`

function uses the DE analysis provided by the `limma`

package.

In addition, the `getBestFeatures`

function provides an option to do use the "voom" correction in the `limma`

package to account for the mean-variance relationship that is common in count data. Unlike edgeR or DESeq, the voom correction does not require a count matrix, and therefore can be used on FPKM or TPM entries, as well as normalized data.

We provide a visualization to compare many clusterings of the same data in the function `plotClusters`

. We also provide some functions that are wrappers for common visualization tasks in clustering gene expression data. We provide an heatmap function, `plotHeatmap`

that is an interface to the `aheatmap`

function in `NMF`

package. See below for detail on the `plotHeatmap`

function.

We will work with a simulated gene expression data set with 300 samples and 3 hypothetical genes, which we will simply label as *growth*, *immune*, and *structural* genes. The expression values will be drawn from different normal distributions. Hence, our data matrix has dimensions $300 x 3$. In this data set, we know *a priori* that there are k = 5 clusters (1-100, 101-125, 126-175, 176-225, 226-300). There are 100 samples in the first cluster, 25 in the second cluster, 50 samples in the third cluster, 50 samples in the fourth cluster, and 75 samples in the fifth cluster. Let us first visualize the clusters.

library(scatterplot3d) set.seed(1) n = c(100, 25, 50, 50, 75) growth_m = c(0, 2, -2, 0, 4) growth_sd = c(0.5, 1, 1, 0.5, 0.5) immune_m = c(0, -2, -3, 4, 0) immune_sd = c(0.5, 1, 0.5, 0.5, 1) structural_m = c(3, 0, 2, -1, -2) structural_sd = c(1, 1, 1, 0.7, 1) data = data.frame(n=n, growth_m=growth_m, growth_sd, immune_m, immune_sd, structural_m, structural_sd) get_expression = function(data){ n = data[1] growth_m = data[2] growth_sd = data[3] immune_m = data[4] immune_sd = data[5] structural_m = data[6] structural_sd = data[7] growth = rnorm(n, growth_m, growth_sd) immune = rnorm(n, immune_m, immune_sd) structural = rnorm(n, structural_m, structural_sd) expression = data.frame(growth=growth, immune=immune, structural=structural) return(expression) } expressions = apply(data, 1, get_expression) expressions = do.call(rbind, expressions) expressions$cluster = c(rep("1", 100), rep("2", 25), rep("3", 50), rep("4", 50), rep("5", 75)) expressions$pcolor[expressions$cluster=="1"] = "red" expressions$pcolor[expressions$cluster=="2"] = "blue" expressions$pcolor[expressions$cluster=="3"] = "darkgreen" expressions$pcolor[expressions$cluster=="4"] = "black" expressions$pcolor[expressions$cluster=="5"] = "darkorange" # if (!file.exists("data")){ # dir.create("data") # } # #save(expressions,file="data/expressions.Rda") par(las=1) with(expressions, {plot = scatterplot3d(growth, immune, structural, color = pcolor, main="Scatterplot of Cell Gene Expression")}) legend("topleft", c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5"), fill=c("red", "blue", "darkgreen", "black", "darkorange"))

`clusterSingle`

Let us start with a walkthrough of `clusterSingle`

. `clusterSingle`

is the primary function for performing the primary clustering tasks. Each of these tasks has an underlying (user-accessible) function and `clusterSingle`

is a wrapper around these tasks:

- clustering of a distance matrix (
`clusterD`

) - subsampling the samples, clustering the subsamples, and forming a co-occurance matrix (
`subsampleClustering`

) - sequentially removing the best cluster, and reclustering the remaining clusters (
`seqCluster`

)

The arguments of `clusterSingle`

indicate which of these tasks should be performed and allow the user to pass options for each of these tasks (that are passed to the downstream functions). The options are documented in the more detailed documentation vignette, but here we run through some common examples.

Here are the argument calls for `clusterSingle`

:

clusterSingle(x, subsample = TRUE, sequential = FALSE, clusterFunction = c("tight", "hierarchical01", "pam", "kmeans"), mainClusterArgs = NULL, subsampleArgs = NULL, seqArgs = NULL)

The primary arguments `clusterFunction`

,`subsample`

, and `sequential`

govern each of these tasks:

We can visualize arguments and the values they can take on as a diagram:

drawClusterAll()

The remaining arguments (`mainClusterArgs`

, `subsampleArgs`

, and `seqArgs`

) are for passing additional options to each of these tasks, and we will show examples of how they are used.

Let us use the function `clusterSingle`

to perform simple Partition Around Medoids (PAM) clustering on our expression data set with `k=5`

. In this expression matrix, rows are samples and columns are features.

To do standard pam clustering, we set `clusterFunction="pam"`

. The remaining tasks (subsampling and sequential) will not be performed, so we set `subsample=FALSE`

and `sequential=FALSE`

, which we can visualize as:

drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"))

However, notice that doing this will return an error:

simpleCluster = clusterSingle(expressions, subsample=FALSE, sequential=FALSE, clusterFunction="pam")

This is because even to perform simple PAM, the PAM algorithm needs additional parameters to be set, namely 'k', the number of groups.

However, there is no argument in `clusterSingle`

for this. Because it is specific to a particular choice of clustering function, we don't want to clutter up `clusterSingle`

with all the possible options that might be needed downstream for a particularly task. Instead, `clusterSingle`

lets you pass such arguments to the function that actually does the task of clustering a distance matrix, `clusterD`

via the `mainClusterArgs`

option. So the user doesn't need to call `clusterD`

explicitly, but often will need to pass an argument to `clusterD`

.

In this case we need to pass the argument 'k'. Specifically, we need to create a list with an element of name "k" that takes on the value, say 5, for our k.

drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"),Layer2Options=list("mainClusterArgs"="k"),plotOptions=TRUE)

library(clusterExperiment) library(cluster) #load("data/expressions.Rda") expression = expressions[, 1:3] simpleCluster = clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", mainClusterArgs=list('k'=5)) table(simpleCluster$clustering)

The result is a one-element list with a vector of cluster assignments for each sample.

Let us compare this result with the result from calling `pam`

with `k=5`

from the `cluster`

package.

pamCluster = pam(dist(expression), 5) table(clusterExperiment=simpleCluster$clustering, cluster=pamCluster$clustering) table(clusterExperiment=simpleCluster$clustering, true=expressions$cluster)

We have now verified that these two function calls perform the same task. Incidentally, if we know the true number of clusters, a simple PAM procedure leads to good performances in this toy example. This is not surprising, since the clusters are simulated with Gaussian noise, in low dimensions.

There are other options we can pick to change the clustering of the distance matrix. A full documentation of possible options can be found in the help files of `clusterD`

. Some of the useful options associated with PAM are the following common tasks:

- Finding the best k, based on silhouette width (
`findBestK`

) over a range of possible K (`kRange`

) - Removing samples with low silhouette values (
`removeSil`

). The default cutoff for the minimum silhouette width is 0. This cutoff can be adjusted via the parameter`silCutoff`

.

We can pass these arguments to `clusterD`

, again as a list:

drawClusterAll(highlight = c("pam", "subsample.FALSE", "sequential.FALSE"),Layer2Options=list("mainClusterArgs"=c("k","findBestK","removeSil","kRange")),plotOptions=TRUE)

Cluster<-clusterSingle(expression, subsample=FALSE, sequential=FALSE, clusterFunction="pam", mainClusterArgs=list(findBestK=TRUE, removeSil=TRUE, kRange=2:10)) table(true=expressions$cluster, PAM=Cluster$clustering)

Note that, even in this toy example, it is not easy to choose the correct number of clusters, and a simple procedure based on comparing average silhouette width may not always lead to good results.

**Note:** at the current time, if we are not going to do the subsample task (`subsample=FALSE`

), then `clusterFunction`

*must* be `pam`

because the other clustering options for `clusterFunction`

assume a 0-1 distance matrix based on subsampling. If there is no subsampling, `clusterSingle`

makes the distance matrix by default with `dist(x)`

and therefore must be used with `pam`

, as this is the only implemented method for distance matrices.

We can add the additional task to our clustering workflow to subsample the samples and cluster the resampled data. This is accomplished by setting `subsample=TRUE`

. In this case, `clusterSingle`

will first subsample the rows of the data matrix, cluster the subsamples for a specified clustering algorithm, and return a clustering co-occurance matrix.

Then, `clusterSingle`

will cluster this co-occurance matrix using another specified algorithm, which can be the same as the algorithm used in clustering the subsampled data, but doesn't have to be. Indeed, several of the cluster algorithm options (`tight`

and `hierarchical`

) are designed to cluster 0-1 co-occurance matrices from subsampling.

In this demonstration, let us cluster the subsamples using `kmeans`

with `k=5`

, then cluster the co-occurance matrix using the tight clustering algorithm. Now we have options we want to set for the subsampling portion of the task.

drawClusterAll(highlight = c("tight", "subsample.TRUE", "sequential.FALSE"),Layer2Options=list("subsampleArgs"=c("k","clusterFunction")),plotOptions=TRUE)

set.seed(212421) Cluster<-clusterSingle(expression, subsample=TRUE, sequential=FALSE, clusterFunction="tight", subsampleArgs=list("k"=5, clusterFunction="kmeans")) table(true=expressions$cluster, subsample=Cluster$clustering)

As expected, the subsampling procedure will return more clusters than a simple PAM. In addition, some observations may be not clustered (value of `-1`

), because the proportion of times that they get assign to the same cluster core across subsamplings does not pass the specified threshold. Note that the choice of `k`

is now not directly linked to the output labels: although we selected `k=5`

for the subsampling step, the final result contains 8 clusters.

*DR: we should probably show a heatmap of the subsampling proportions to make this point clearer*

The last additional task is to sequentially remove the best cluster, and then recluster the remaining samples. This is performed using `sequential=TRUE`

.

Again, we need to pass arguments to control how this is done, as a list to the argument `seqArgs`

. A necessary argument `k0`

must be passed to `seqArgs`

. What this argument means depends on what other choices are being made, but if we also set `subsample=TRUE`

(recommended), then `k0`

is the value of k passed to the subsampling clustering method at the first iteration of sequential algorithm.

This time we will use the hierarchical method for clustering the 0-1 co-occurance matrix.

drawClusterAll(highlight = c("hierarchical01", "subsample.TRUE", "sequential.TRUE"),Layer2Options=list("subsampleArgs"=c("clusterFunction"),seqArgs=c("k0","verbose")),plotOptions=TRUE)

Cluster<-clusterSingle(expression, subsample=TRUE, sequential=TRUE, clusterFunction="hierarchical01", seqArgs=list(k0=5, verbose = FALSE), subsampleArgs = list(clusterFunction="kmeans"))

Running `clusterSingle`

with `sequential=TRUE`

will return a list of the three elements `clustering`

, `clusterInfo`

, and `whyStop`

. `clusterInfo`

is a matrix of information regarding the algorithm behavior for each cluster (the starting and stopping k set for subsampling clustering for each cluster, and the number of iterations for each cluster). `whyStop`

is a character string explaining what triggered the algorithm to stop.

table(Cluster$clustering) Cluster$clusteringInfo Cluster$whyStop table(true=expressions$cluster, clusterExperiment=Cluster$clustering)

This procedure leads to many more clusters, many of which are very small size.

*DR: don't we have a min size parameter to remove clusters with too few observations? Also, here would be a good place to talk about merging. Perhaps merging before finding markers?*

The function `getBestFeatures`

calls `limma`

on input data to determine the gene features most associated with the found clusters. The minimal required input is the expression data and a vector of the (numeric) cluster labels. The method ignores samples in clusters marked by "-1" ("-1" always signifies samples not clustered). By default, `getBestFeatures`

finds significant genes based on a F-test between the clusters, but the option `type`

can provide other measures and is the most common one to be changed by the user.

In this demonstration, clustering will first be performed with basic PAM with k=4 and removing negative silhouette widths. We use the simulated data found in the package `simData`

.

`getBestFeatures`

will be called to determine the top features associated with every pairwise comparison between the clusters. By default, `topTable`

in `limma`

only returns the top 10, which in this context means 10 per each of the pairwise comparisons.

data(simData) cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, mainClusterArgs=list(k=4,removeSil=TRUE)) pairsAll<-getBestFeatures(cl$clustering,simData,type="Pairs") head(pairsAll)

The column `Contrast`

in the output signifies the pairwise difference (or contrast) for which the information on the row comes from. `IndexInOriginal`

matches the gene to its index in the original dataset. The other columns are provided by `topTable`

in `limma`

(see documentation therein).

`getBestFeatures`

accepts arguments to `limma`

's function `topTable`

to decide which genes should be returned (and in what order). In particular, we can set an adjusted p-value cutoff for each contrast, and set `number`

to control the number of genes returned for each contrast. By setting `number`

to be the length of all genes, and `p.value=0.05`

, we can return all genes for each contrast that have adjusted p-values less than 0.05.

data(simData) cl = clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, mainClusterArgs=list(k=4,removeSil=TRUE)) pairsAll2<-getBestFeatures(cl$clustering,simData,type="Pairs",p.value=0.05,number=ncol(simData)) table(pairsAll2$Contrast)

Notice that the same genes can be replicated across different contrasts:

nrow(pairsAll2) length(unique(pairsAll2$Gene))

Note that by using contrasts, we are making use of all of the data, not just the samples in the particular cluster pairs being compared. This means the variance is estimated with all the samples.

One way to visualize samples with only the features found by `getBestFeatures`

is to use the `plotHeatmap`

function. The `plotHeatmap`

function minimally requires two arguments - a vector of cluster assignments and the data matrix. `plotHeatmap`

uses the heatmap function `aheatmap`

in the package `NMF`

, but provides an interface that is useful for the standard plots desired for evaluating the clustering output. In particular, it is designed to draw the cluster assignments of the samples on the top, where different colors denote different cluster assignments; lack of assignment, denoted by "-1", are automatically colored white. In the code example below, `clusterData`

is set to the data matrix to define the color scale. Currently, `plotHeatmap`

internally transforms the input data (i.e. argument `heatData`

) to the log scale; this will be fixed to be an option in future versions, but currently, passing of already logged data is not supported and log-scale data should be exponentiated for use with `plotHeatmap`

.

dataMatrix = simData[, unique(pairsAll2$IndexInOriginal)] cluster = cl$clustering plotHeatmap(cluster, exp(dataMatrix), eps=0, clusterData=dataMatrix)

Additional covariates can also be plotted by giving a matrix of values to `annCol`

; unlike `aheatmap`

, `plotHeatmap`

will assume that such values should be converted to factors unless told otherwise by the argument `whAnnCont`

.

plotHeatmap(cluster, exp(dataMatrix), eps=0, clusterData=dataMatrix, annCol = data.frame(clusterExperiment=cluster, TrueClusters=trueCluster))

Another feature of plotHeatmap is that a different dataset can be used for determining the clustering than for determining the color-scale image of the heatmap. This can be useful, for example, for clustering on the normalized data which is not on the count-scale, but showing the original count-scale in the color-scale which is more interpretable.

It can be useful to create heatmaps that keep the clusters together. However, arbitrarily ordered clusters will not make compelling heatmaps either. The function `clusterHclust`

will perform hierarchical clustering of the cluster mediods, and provide a dendrogram that will order the samples according to this clustering of the clusters.

clDendro<-clusterHclust(dataMatrix,cl$clustering,full=FALSE) plot(clDendro)

More useful, however, is to use this in conjunction with `plotHeatmap`

, in which case we want the individual samples in the dendrogram, not just the clusters. Notice that `full=TRUE`

is necessary to get such a dendrogram. Then we give the data as `heatData`

to `plotHeatmap`

, indicating that it should be used for creating the color-image of the data, but provide our dendrogram to `clusterData`

to indicate it will provide the clustering of the data.

clDendroFull<-clusterHclust(dataMatrix,cl$clustering,full=TRUE,unassigned="outgroup") plotHeatmap(cl$clustering,heatData=exp(dataMatrix),eps=0,clusterData=clDendroFull,annCol = data.frame(clusterExperiment=cluster, TrueClusters=trueCluster))

Notice that the unassigned samples ("-1") are chosen in `clusterHclust`

to be assigned to an outgroup in the dendrogram based on the argument `unassigned`

.

One could also choose to remove them, making sure that the input data matrix to `plotHeatmap`

doesn't contain these samples.

clDendroFull<-clusterHclust(dataMatrix,cl$clustering,full=TRUE,unassigned="remove") whRm<-which(cl$clustering>0) plotHeatmap(cl$clustering[whRm],heatData=exp(dataMatrix)[whRm,],eps=0,clusterData=clDendroFull,annCol = data.frame(clusterExperiment=cluster[whRm], TrueClusters=trueCluster[whRm]))

We can also perform tests based on this hierarchy of clusters. Specifically, we can test for each node of the hierarchy, whether the mean of the set of clusters to the right split of the node is equal to the mean on the left split. We do this by creating a contrast of the corresponding means and testing it for each gene via `getBestFeatures`

, where the type is set to `"Dendro"`

and we provide the dendrogram created by `clusterHclust`

(but setting `full=FALSE`

so that individual samples are not in the dendrogram).

dendroGenes<-getBestFeatures(cl$clustering,simData,type="Dendro",p.value=0.05,number=ncol(simData),dendro=clDendro) head(dendroGenes)

The output is similar to before, but now there are two identifiers for the contrast: one is `ContrastName`

which gives the Node being compared, and the other is `Contrast`

which is the actual mathematical statement of the mean comparison between clusters that is being performed.

We can use this basic idea to merge clusters based on what percentage of genes are found differentially expressed. The `mergeClusters`

function performs such contrasts for each node, and then calculates the proportion of significant hypotheses. There are several methods implemented, but the simplest is just the proportion found by calculating the proportion of DE genes at a given False Discovery Rate thresshold (using the Benjamini-Hochberg procedure).

mergeOut<-mergeClusters(dat=simCount, cl=cl$clustering, mergeMethod = c("adjP"), cutoff = 0.1, plotType = "mergeMethod",isCount=TRUE) table(Meged_clusters=mergeOut$cl,Original_clusters=cl$clustering) print(mergeOut$propDE)

The function optionally creates a plot showing the dendrogram and what clusters were merged. `mergeClusters`

invisibly returns information, including the merged clusters (`cl`

), and a matrix of different estimates based on different methods for computing the proportion (`propDE`

).

`clusterMany`

)drawCompareChoices = function(arguments=NULL){ if (!is.null(arguments)){ if (arguments == "plotClusters"){colargs = c(rep("black", 6), "red", "black")}else{colargs = c(rep("black", 7), "red")} }else{colargs = rep("black", 8)} par(mar=c(1, 4, 2, 2)) names = c("clusterMany", "clusterSingle", "clusterSingle", "...", "clusterSingle", "output", "plotClusters", "combineMany") M = matrix(0, nrow=8, ncol=8, byrow=TRUE) M[2, 1] <- M[3, 1] <- M[4, 1] <- M[5, 1] <- M[6, 2] <- M[6, 3] <- M[6, 4] <- M[6, 5] <- M[7, 6] <- M[8, 6] <- "" diagram::plotmat(M, pos=c(1, 4, 1, 2), name=names, lwd=1, box.lwd=2, box.cex=0.5, box.size=0.08, box.type=c(rep("square", 5), "diamond", rep("square", 2)), box.prop=0.2, curve=0, main="clusterMany Workflow", arr.type="triangle", arr.width=0, arr.length=0, box.lcol=colargs) par(mar=c(5.1, 4.1, 4.1, 2.1)) }

`clusterMany`

is a function for running many different combinations of parameters or different datasets. All combinations given by the user are run through `clusterSingle`

(though the function does some (minimal) checking to remove nonsensical combinations).
The calls to `clusterSingle`

can also be parallelized across multiple cores (by setting the argument `ncores`

to be greater than 1).

After `clusterMany`

is run, the function `plotClusters`

can be used to visualize the many different cluster assignments that resulted, and `combineMany`

can be used to determine a consensus clustering across the parameter combinations. Finally, `mergeClusters`

, described above, can be used to determine whether some clusters in the consensus clustering should be merged.

drawCompareChoices(arguments = "plotClusters")

In this example, the combination of parameters we are testing are: (i) the range of k values for clustering; (ii) whether `removeSil`

is TRUE or FALSE; and (iii) whether `pam`

or `kmeans`

is used.

Before running the clustering on these combinations, it is a good idea to first inspect what parameter combinations will be run by calling `clusterMany`

and setting `run=FALSE`

. The resulting output is a matrix where each column corresponds to a parameter setting and each row is a combination of parameters that will be sent to `clusterSingle`

.

parameters = clusterMany(expressions, ks=c(3:7), clusterMethod=c("pam", "kmeans"), run=FALSE) head(parameters) dim(parameters)

**Note:** The output can be fed back into `clusterMany`

. For example if some combinations are not of interest, those rows could be removed and then the resulting matrix given as an argument.

Let us now run `clusterMany`

on the original simulated dataset.

result = clusterMany(expressions, ks=c(3:7), clusterMethod=c("pam"), subsample=FALSE, removeSil = c(TRUE, FALSE)) head(result$clMat)

With `run=TRUE`

, `clusterMany`

will return a list with elements `clMat`

and `clusterInfo`

. `clMat`

is the main object of interest and will be a matrix with each row representing a sample and each column the cluster assignments from a particular combination of parameters. `clusterInfo`

will only be informative when `sequential=TRUE`

, in which case it will be list with information regarding clustering result.

To visualize the several clusters created by `clusterMany`

, we run `plotClusters`

to visualize the assignments. `plotClusters`

minimally takes as its argument a matrix of cluster assignments where each row represents a sample and each column a clustering. The result of `plotClusters`

is a color representation of the clusters, where each column of the plot is a sample, and the rows are the different clusterings (note this is the inverse of the input clustering matrix so as to conserve space). Within each row/clustering, the samples are color coded as to their cluster assignment. Furthermore, `plotClusters`

tries to order the samples so that the clusters are assigned colors that make clusters composed of similar samples get the same cluster color.

par(mar=c(2, 8, 1, 1)) plotClusters(result$clMat)

Notice that the unassigned ("-1") entries are always color-coded white.

The alignment (and color assignments) created by `plotClusters`

are invisibly returned and can be used for other purposes (see help documentation). The alignment of the clusters is highly dependent on the order of the clusters, with the alignment starting with the first cluster in the first column, then the second aligned to the first, and so forth. Different orderings of the clusters can greatly change the visualizations.

To find a consensus cluster across many different clusterings of the same data `combineMany`

can be used. The minimal input to `combineMany`

is a cluster matrix with samples as rows and different clustering assignments for each column.

As an example, let us find the consensus cluster from the output of `clusterMany`

.

r <-combineMany(result$clMat) table(r$clustering) plotClusters(cbind(FINAL=r$clustering,result$clMat))

Let's look at the options we chose here. To determine whether samples should be clustered together, the parameter `proportion`

is used to set the proportion of co-occurance we want as a cutoff. If we want to be strict, we can determine clusters only if they are always together in any clustering (`proportion=1`

, the default). Samples that are unassigned in greater than `propUnassigned`

will always be assigned a value of `-1`

(i.e. will also be unassigned in the final clustering). And the option `minSize`

forces all resulting cluster to have a minimal size equal to `minSize`

.

r <-combineMany(result$clMat, proportion=.7,minSize=10) table(r$clustering) plotClusters(cbind(FINAL=r$clustering,result$clMat))

We can also visualize the co-occurance matrix along with the final clusters, which is returned with the output of `combineMany`

(if `proportion<1`

)

plotHeatmap(clusterVector=r$clustering,heatData=exp(r$percentageShared),eps=0)

Embedding an R snippet on your website

Add the following code to your website.

For more information on customizing the embed code, read Embedding Snippets.