R/makeDendrogram.R

Defines functions .makeSampleDendro .makeClusterDendro

#' @title Make hierarchy of set of clusters
#'
#' @aliases makeDendrogram,ClusterExperiment-method
#' @description Makes a dendrogram of a set of clusters based on hclust on the
#'   medoids of the cluster.
#' @param x data to define the medoids from. Matrix and
#'   \code{\link{ClusterExperiment}} supported.
#' @param cluster A numeric vector with cluster assignments. If x is a
#'   ClusterExperiment object, cluster is automatically the primaryCluster(x).
#'   ``-1'' indicates the sample was not assigned to a cluster.
#' @param unassignedSamples how to handle unassigned samples("-1") ; only
#'   relevant for sample clustering. See details.
#' @param reduceMethod character A character identifying what type of
#'   dimensionality reduction to perform before clustering. Can be either a
#'   value stored in either of reducedDims or filterStats slot or a built-in
#'   diminsionality reduction/filtering. The option "coCluster" will use the
#'   co-Clustering matrix stored in the 'coClustering' slot of the
#'   \code{ClusterExperiment} object
#' @param calculateSample only relevant for \code{matrix} or \code{dist}
#'  version of function. Indicates whether to calculate the sample dendrogram.
#' @param nDims The number of dimensions to keep from \code{reduceMethod}. If
#'   missing calls \code{\link{defaultNDims}}.
#' @param ... for makeDendrogram, if signature \code{matrix}, arguments passed
#'   to hclust; if signature \code{ClusterExperiment} passed to the method for
#'   signature \code{matrix}. For plotDendrogram, passed to
#'   \code{\link{plot.dendrogram}}.
#' @inheritParams clusterSingle
#' @inheritParams reduceFunctions
#' @inheritParams getClusterIndex
#' @details The function returns two dendrograms (as a list if x is a matrix or
#'   in the appropriate slots if x is ClusterExperiment). The cluster dendrogram
#'   is created by applying \code{\link{hclust}} to the medoids of each cluster.
#'   In the sample dendrogram the clusters are again clustered, but now the
#'   samples are also part of the resulting dendrogram. This is done by giving
#'   each sample the value of the medoid of its cluster.
#' @details The argument \code{unassignedSamples} governs what is done with
#'   unassigned samples (defined by a -1 cluster value). If
#'   unassigned=="cluster", then the dendrogram is created by hclust of the
#'   expanded medoid data plus the original unclustered observations. If
#'   \code{unassignedSamples} is "outgroup", then all unassigned samples are put
#'   as an outgroup. If the \code{x} object is a matrix, then
#'   \code{unassignedSamples} can also be "remove", to indicate that samples
#'   with "-1" should be discarded. This is not a permitted option, however,
#'   when \code{x} is a \code{ClusterExperiment} object, because it would return
#'   a dendrogram with less samples than \code{NCOL(x)}, which is not permitted
#'   for the \code{@dendro_samples} slot.
#' @details If any merge information is stored in the input object, it will be
#'   erased by a call to makeDendrogram.
#' @return If x is a matrix, a list with two dendrograms, one in which the
#'   leaves are clusters and one in which the leaves are samples. If x is a
#'   ClusterExperiment object, the dendrograms are saved in the appropriate
#'   slots.
#'
#' @export
#' @seealso makeFilterStats, makeReducedDims
#' @examples
#' data(simData)
#'
#' #create a clustering, for 8 clusters (truth was 3)
#' cl <- clusterSingle(simData, subsample=FALSE,
#' sequential=FALSE, mainClusterArgs=list(clusterFunction="pam", clusterArgs=list(k=8)))
#'
#' #create dendrogram of clusters:
#' hcl <- makeDendrogram(cl)
#' plotDendrogram(hcl)
#' plotDendrogram(hcl, leafType="samples",plotType="colorblock")
#'
#' @name makeDendrogram
#' @rdname makeDendrogram
setMethod(
    f = "makeDendrogram",
    signature = "ClusterExperiment",
    definition = function(x, whichCluster="primaryCluster",reduceMethod="mad",
                          nDims=defaultNDims(x,reduceMethod),filterIgnoresUnassigned=TRUE,
                          unassignedSamples=c("outgroup", "cluster"),
                          whichAssay=1,...)
    {
        passedArgs<-list(...)
        
        checkIgnore<-.depricateArgument(passedArgs=passedArgs,"filterIgnoresUnassigned","ignoreUnassignedVar") #06/2018 added in BioC 3.8
        if(!is.null(checkIgnore)){
            passedArgs<-checkIgnore$passedArgs
            filterIgnoresUnassigned<-checkIgnore$val
        }
        unassignedSamples<-match.arg(unassignedSamples)
        whCl<-getSingleClusterIndex(x,whichCluster,passedArgs)
        cl<-clusterMatrix(x)[,whCl]
        ##erase merge information
        if(!is.na(mergeClusterIndex(x)) || !is.na(x@merge_dendrocluster_index)) x<-.eraseMerge(x)
        
        ########
        ##Transform the data
        ########
        if(length(reduceMethod)>1) stop('makeDendrogram only takes one choice of "reduceMethod" as argument')
        if(reduceMethod!="coCluster"){
            #need to change name of reduceMethod to make it match the
            #clustering information if that option chosen.
            datList<-getReducedData(object=x,whichCluster=whCl,reduceMethod=reduceMethod,
                                    nDims=nDims,filterIgnoresUnassigned=TRUE,  whichAssay=whichAssay,returnValue="list")
            x<-datList$objectUpdate
            dat<-datList$dat
            
            outlist <- do.call("makeDendrogram",c(list(
                x=dat, 
                cluster=cl,calculateSample=TRUE,
                unassignedSamples=unassignedSamples),
                passedArgs))
        }
        else{
            if(is.null(x@coClustering)) stop("Cannot choose 'coCluster' if 'coClustering' slot is empty. Run makeConsensus before running 'makeDendrogram' or choose another option for 'reduceMethod'")
            if(is.null(dimnames(x@coClustering))) stop("This ClusterExperiment object was made with an old version of clusterExperiment and did not give dimnames to the coClustering slot.")
            outlist<-do.call("makeDendrogram",c(list(
                x=as.dist(1-x@coClustering),
                cluster=cl,calculateSample=TRUE,
                unassignedSamples=unassignedSamples),
                passedArgs)) 
        }
        
        #Add clusterNames as Ids to cluster and sample dendrogram.
		x@dendro_clusters <- outlist$clusters
		phylobase::tipLabels(x@dendro_clusters)<-NA #erase any labels of the tips, internal nodes already have the defaults.
		x@dendro_samples <- outlist$samples #labels should have been erased already
        x@dendro_index<-whCl
        ch<-.checkDendrogram(x)
        if(!is.logical(ch)) stop(ch)
        return(x)
})



#' @rdname makeDendrogram
#' @export
setMethod(
    f = "makeDendrogram",
    signature = "dist",
    definition = function(x, cluster,
                          unassignedSamples=c("outgroup", "cluster", "remove"),
                          calculateSample=TRUE,...) {
        unassigned <- match.arg(unassignedSamples)
        if(is.null(attributes(x)$Labels)) {
            attributes(x)$Labels <- .makeSampleNames(seq_len(nSamples))
        }
		if(!all(is.na(suppressWarnings(as.numeric(attributes(x)$Labels ))))){
			warning("Cannot use the attributes(x)$Labels because they are numbers. Making sample names")
			sampleNames<-.makeSampleNames(seq_len(nSamples))
		}
		
        clusterD<-.makeClusterDendro(x,cluster,type="dist",...)  
        fullD<-.makeSampleDendro(x,clusterDendro=clusterD, cl=.convertToNum(cluster), type=c("dist"), unassignedSamples=unassigned,sampleEdgeLength=0,  outbranchLength=1,calculateSample=calculateSample)
        return(list(samples=fullD,clusters=clusterD))
    })

#' @rdname makeDendrogram
#' @export
setMethod(
    f = "makeDendrogram",
    signature = "matrixOrHDF5",
    definition = function(x, cluster,
                          unassignedSamples=c("outgroup", "cluster", "remove"),
                          calculateSample=TRUE,...) {
		
        unassigned <- match.arg(unassignedSamples)
        if(is.null(colnames(x))) {
            colnames(x) <- .makeSampleNames(seq_len(ncol(x)))
        }
		if(!all(is.na(suppressWarnings(as.numeric(colnames(x) ))))){
			warning("Cannot use the colnames(x) because they are numbers. Making sample names")
			sampleNames<-.makeSampleNames(seq_len(ncol(x)))
		}
        clusterD<-.makeClusterDendro(x,cluster,type="mat",...)
        fullD<- .makeSampleDendro(x,clusterDendro=clusterD, cl=.convertToNum(cluster), type=c("mat"), unassignedSamples=unassigned, sampleEdgeLength=0,  outbranchLength=1,calculateSample=calculateSample)
        
        return(list(samples=fullD,clusters=clusterD))
    })

#' @param x a matrix or hdf5 or distance
#' @param cl vector of cluster info
#' @param type whether input x is a matrix or distance.
#' @noRd
.makeClusterDendro<-function(x, cl,type=c("mat","dist"),...){
    type<-match.arg(type)
    if(type=="dist"){
        nSamples<-  attributes(x)$Size
        if(nSamples != length(cl)) {
            stop("cluster must be the same length as the number of samples")
        }
    }
    if(type=="mat"){
        if(ncol(x) != length(cl)) {
            stop("cluster must be the same length as the number of samples")
        }
     }
    
    clNum<-.convertToNum(cl)
    
    #############
    # Cluster dendrogram
    #############
    whKeep <- which(clNum >= 0) #remove -1, -2
    if(length(whKeep) == 0) stop("all samples have clusterIds<0")
    if(length(unique(cl[whKeep]))==1) stop("Only 1 cluster given. Can not make a dendrogram.")
    clFactor <- factor(cl[whKeep])
    
    #each pair of clusters, need to get median of the distance values
    #do a double by, just returning the values as a vector, and then take the median
    if(type=="dist"){
        medoids<-do.call("rbind", by(as.matrix(x)[whKeep,whKeep], clFactor, function(z){
            out<-as.vector(by(t(z),clFactor,function(y){median(as.vector(unlist(y)))}))
            names(out)<-levels(clFactor)
            return(out)
        }))
        diag(medoids)<-0 #incase the median of within is not zero...
        rownames(medoids) <- levels(clFactor)
        colnames(medoids) <- levels(clFactor)
    }
    if(type=="mat"){
        medoids <- do.call("rbind", by(t(x[,whKeep]), clFactor, function(z){apply(z, 2, median)}))
        rownames(medoids) <- levels(clFactor)
    }
    nPerCluster <- table(clFactor)
    clusterD<-if(type=="dist").convertToPhyClasses(stats::hclust(as.dist(medoids),members=nPerCluster,...),returnClass=c("phylo4")) else .convertToPhyClasses(stats::hclust(dist(medoids)^2,members=nPerCluster,...),returnClass=c("phylo4"))
       
   #create data for phylo4d object
   clusterNodes<-phylobase::getNode(clusterD,type="all") 
   clusterIdDendro<-paste("ClusterId",names(clusterNodes),sep="")
   clusterIdDendro[is.na(names(clusterNodes))]<-NA
   
   #make permanent node ids, starting with internal nodes.
   justNodes<-phylobase::getNode(clusterD,type="internal")
   justTips<-phylobase::getNode(clusterD,type="tip")
   nodeId<-rep(NA,length(clusterNodes))  #want tips last, and internal nodes first
   nodeId[match(justNodes,clusterNodes)]<-seq_along(justNodes)
   nodeId[match(justTips,clusterNodes)]<-seq_along(justTips)+length(justNodes)
   if(!all(sort(nodeId)==sort(seq_along(nodeId)))) stop("coding error in giving node names")
   
   data.cl <- data.frame(NodeId = paste("NodeId",nodeId,sep=""), ClusterIdDendro = clusterIdDendro, ClusterIdMerge= rep(NA,length(clusterNodes)),stringsAsFactors=FALSE)
	data.cl$Position<-factor(rep("cluster hierarchy node",nrow(data.cl)), levels=.positionLevels)
	row.names(data.cl)<-as.character(unname(clusterNodes))
    data.cl$Position[phylobase::getNode(clusterD,type="tip")]<-"cluster hierarchy tip"
	  
	#give default node labels:
	phylobase::labels(clusterD)[justNodes]<-data.cl$NodeId[justNodes]
	clusterD<-phylobase::phylo4d(x=clusterD, all.data = data.cl)


	# -- Position: one of either "cluster hierarchy node","cluster hierarchy tip","tip hierarchy","assigned tip","outbranch hierarchy node","unassigned tip","outbranch root"). This column should be internal and validity check that numbers correspond to clustering from @dendro_index. Needs to be a check on "ClusterExperiment", not the "clusterPhylo4d"
    # -- ClusterIdDendro (only for cluster): cluster Id in @dendro_index that corresponds to node (NA otherwise)
    # -- ClusterIdMerge  (only for cluster): cluster Id in @merge_index that corresponds to node (NA otherwise)
    # -- MatchToClusterHier (only for sample tree): for those that are part of the cluster hierarchy, the (permanent) node id from cluster hierarchy -- ie not the name but the number so can always link them. Use this to create checks that the node names are the same, grab the default names, etc. 
    # -- SampleIndex (only for sample tree): index to the columns in the assay
    
    
    return(clusterD)
}



#' @param x a matrix or hdf5 or distance
#' @param cl a clustering of samples in x
#' @param clusterDendro the cluster dendrogram (a phylo4d class)-- will be converted internally to type 'phylo'
#' @param type whether input x is a matrix or distance.
#' @noRd
.makeSampleDendro<-function(x,clusterDendro, cl,type=c("mat","dist"), unassignedSamples=c("remove","outgroup","cluster"),sampleEdgeLength=0, outbranchLength=0,calculateSample=TRUE){
    unassignedSamples<-match.arg(unassignedSamples)
    type<-match.arg(type)
    sampleNames<-if(type=="mat") colnames(x) else attributes(x)$Labels
	
    if(calculateSample){
        whPos<-which(cl>0) #this is copy close to length of n
        if(!is.null(sampleNames) && length(sampleNames)!=length(cl)) stop("sampleNames must be same length as cluster vector")
					#converts internal node id and cluster id to the node, tip labels respectively.
        phyloObj <- .convertToPhyClasses(clusterDendro,"phylo",convertNodes=TRUE,convertTips=TRUE)
        if(!is.ultrametric(phyloObj)) stop("coding error -- the cluster dendrogram is not ultrametric")
        nSamples<-switch(type,"mat"=ncol(x),"dist"=attributes(x)$Size)
        
        ###########################
        ## I. Make outlier tree if needed
        ###########################
        outbranchHclust<-NULL
        whNeg<-which(cl<0) #technically should be able to do with whPos, but a pain
        if(length(whNeg)>0){
	        outNames<- if(!is.null(sampleNames)) sampleNames[whNeg] else paste("OutSample",whNeg)
        	if(unassignedSamples=="outgroup" | length(whPos)==0){
                #--------
                # 1. Make outlier tree 
                #--------
                if(length(whNeg) > 5 | length(whPos)==0){ 
                    outlierDat <- if(type=="mat") x[,whNeg,drop=FALSE] else as.matrix(x)[whNeg,whNeg,drop=FALSE]
                    outbranchHclust <- if(type=="mat") stats::hclust(dist(t(outlierDat))^2) else  stats::hclust(as.dist(outlierDat))
                    outTree<- .convertToPhyClasses(x=outbranchHclust,"phylo") 
                    if(length(outTree$tip.label)!=length(whNeg)) stop("coding error - given hclust doesn't have correct number of tips.")
                }
                else{ #construct tree with just root and tips:
                    outTree<-.makeFakeBinary(tipNames=outNames,rootEdgeLength=1,edgeLength=.1)
                    #have to make it ultrametric -- since arbitrary doesn't matter.
                    if(length(outNames)>1) outTree<-.force.ultrametric(outTree)
                }
            }
            if(unassignedSamples=="cluster"){
                if(type=="dist")stop("cannot use unassigned='cluster' if input is a distance matrix")
                #code from assignUnassigned
                clFactor <- factor(cl[-whNeg])
                medoids <- do.call("rbind", by(t(x[,-whNeg]), clFactor, function(z){apply(z, 2, median)}))
                rownames(medoids) <- levels(clFactor)
                classif<-.genericClassify(inputMatrix=x[,whNeg],centers=medoids) 
                cl[whNeg]<-classif
                whPos<-which(cl>0)
                whNeg<-which(cl<0)
                if(length(whPos)!=length(cl)) stop("coding error -- predicting unassigned missed some")
                unassignedSamples<-"remove"
            }			
        }      
        
        ###################
        ## II. Make tree with main samples:
        ###################
        if(length(whPos)>0){
            #---------
            #make list of phylo trees for each cluster:
            #---------
            fakePhyloList <- tapply(sampleNames[whPos], as.factor(paste("ClusterId",cl[whPos],sep="")), .makeFakeBinary, simplify=FALSE)
            
            #need to reorder so in order of the tips of phyloObj
            m<-match(phyloObj$tip.label,names(fakePhyloList))
            if(any(is.na(m))) stop("coding error -- names in dendrogram of clusters don't match cluster ids")
            fakePhyloList<-fakePhyloList[m]
            newPhylo<-.addTreesToTips(mainTree=phyloObj,tipTrees=fakePhyloList)
        }
        else newPhylo<-outTree 
        
        
        ###################
        ## III. Add outlier samples if unassignedSamples=="outgroup"
        ###################
        if(unassignedSamples == "outgroup" & length(whNeg)>0 & length(whPos)>0){
            if(length(whNeg)>1){
                newPhylo<-.mergePhylo(tree1=newPhylo,tree2=outTree,mergeEdgeLength=outbranchLength)
            }
            else{
                newPhylo<-.mergePhylo(tree1=newPhylo, tree2=outNames, mergeEdgeLength=outbranchLength)
            }
        }
        
        ###################
        ## IV. Convert to return format
        ###################
        newPhylo<-.convertToPhyClasses(newPhylo,"phylo4")
				
		#create data for phylo4d object
		nodeLabs<-phylobase::getNode(newPhylo,type="all") 
		mClusterHier<-rep(NA,length(nodeLabs))
		position<-rep(NA,length(nodeLabs))
		whInternal<-grep("NodeId",names(nodeLabs))
		mClusterHier[whInternal]<-as.character(.matchToDendroData(inputValue=names(nodeLabs)[whInternal], dendro=clusterDendro, returnColumn="NodeId"))
		position[whInternal]<-as.character(.matchToDendroData(inputValue=names(nodeLabs)[whInternal], dendro=clusterDendro, returnColumn="Position"))

		whCluster<-grep("ClusterId",names(nodeLabs))
		mClusterHier[whCluster]<-as.character(.matchToDendroData(inputValue=names(nodeLabs)[whCluster], dendro=clusterDendro, matchColumn="ClusterIdDendro",returnColumn="NodeId"))
		position[whCluster]<-as.character(.matchToDendroData(inputValue=names(nodeLabs)[whCluster], dendro=clusterDendro, matchColumn="ClusterIdDendro",returnColumn="Position"))

				#rather than keep track of position as make trees (which require making it a phylo4d class much more frequently), figure it out from here.
				
		rootNode<-phylobase::rootNode(newPhylo)
		rootChildren <- phylobase::descendants(newPhylo,node=rootNode,type="children")
			 
		 whNAChild<-which(is.na(names(rootChildren)))
		 if(length(whNAChild)==0){
			 #all are part of cluster hierarchy
			 #all NA nodes -> "tip hierarchy"
			 #all tips -> "assigned tip"
			 position[is.na(names(nodeLabs))]<-"tip hierarchy"
			 position[phylobase::getNode(newPhylo,type="tip")]<-"assigned tip"
			 #check if there is a singleton sample for outlier:
			 whSingletonOutlier<-which(is.na(mClusterHier[rootChildren]))
			 if(length(whSingletonOutlier)>0){
			 	position[rootChildren[whSingletonOutlier]]<-"unassigned tip"
			 }
	 
		 }
		 else if(length(whNAChild)==1){
			 #all nodes descending from child of NA -> "outbranch hierarchy node"
			 #all tips descending from child of NA -> "unassigned tip"
			 #all tips descending from non-NA -> "assigned tip"
			 #all NA nodes descending from non-NA -> "tip hierarchy"
			 #root -> "outbranch root"
			 position[rootNode]<-"outbranch root"
			 naNode<-rootChildren[which(is.na(names(rootChildren)))]
			 position[phylobase::descendants(newPhylo,node=naNode,type="ALL")] <- "outbranch hierarchy node" #includes tips, but will overwrite this in next line...
			 position[phylobase::descendants(newPhylo,node=naNode,type="tip")] <- "unassigned tip"
			 nonNANode<-rootChildren[which(!is.na(names(rootChildren)))]
			  whDescendantNonNA <- phylobase::descendants(newPhylo,node=nonNANode,type="all")
				whDescendantNonNA<-whDescendantNonNA[is.na(names(whDescendantNonNA))]
				position[whDescendantNonNA]<-"tip hierarchy"
			  whTipsDescendantNonNA <- phylobase::descendants(newPhylo,node=nonNANode,type="tip")
				position[whTipsDescendantNonNA]<-"assigned tip"
		
		 }
		else stop("coding error -- neither child of root is part of cluster hierarchy")
		if(any(is.na(position))) stop("coding error -- not all nodes assigned a position value")
		position<-factor(position,levels=.positionLevels)

		mSamples<-match(names(nodeLabs),sampleNames)
		#this will miss singleton clusters (which will have name ClusterIdX instead of sample name)
		whMissed<-intersect(which(is.na(mSamples)),grep("assigned tip",position))
		if(length(whMissed)>0){
			clusters<-.matchToDendroData(inputValue=mClusterHier[whMissed],clusterDendro,returnColumn="ClusterIdDendro",matchColumn="NodeId")
			if(any(is.na(clusters))) stop("coding error -- didn't find singleton cluster")
			whSamplesSingle<-match(gsub("ClusterId","",clusters),as.character(cl))
			mSamples[whMissed]<-whSamplesSingle
		}
		if(any(is.na(mSamples[position %in% c("unassigned tip","assigned tip")]))) stop("coding error -- finding sample index failed")
		data.cl <- data.frame(NodeId=mClusterHier, Position=position, SampleIndex=mSamples, stringsAsFactors=FALSE)				
		row.names(data.cl)<-as.character(nodeLabs)

		# #Need to fix back up the nodeLabels so match that of clusterDendro (in conversion made them the internal names)
		# whMatchCluster<-which(!is.na(data.cl$NodeId))
		# mToCluster<-.matchToDendroData(inputValue=data.cl$NodeId[whMatchCluster],clusterDendro,returnColumn="NodeIndex",matchColumn="NodeId")
		# phylobase::labels(newPhylo,type="all")[whMatchCluster]<-phylobase::labels(clusterDendro,type="all")[mToCluster]
		
		phylobase::labels(newPhylo,type="all")<-NA
		return(phylobase::phylo4d(x=newPhylo, all.data = data.cl))

    }
    else return(NULL)
}
epurdom/clusterExperiment documentation built on Oct. 12, 2022, 5:27 a.m.