R/hopach.R

Defines functions hopach newsplitcluster newnextlevel msscomplete mssrundown mssmulticollap msscollap collap paircoll mssinitlevel orderelements mssnextlevel msssplitcluster nonzeros cutzeros cutdigits digits msscheck silcheck labelstomss

Documented in collap cutdigits cutzeros digits hopach labelstomss msscheck msscollap msscomplete mssinitlevel mssmulticollap mssnextlevel mssrundown msssplitcluster newnextlevel newsplitcluster nonzeros orderelements paircoll silcheck

#Hierarchical Ordered Partitioning and Collapsing Hybrid (HOPACH)#

#a. mean/median split silhoutte

labelstomss<-function(labels,dist,khigh=9,within="med",between="med",hierarchical=TRUE){
	if(!is.vector(labels))
		stop("First arg to labelstomss() must be a vector")

	p = dist@Size
	if(length(labels)!=p)
		stop("Distance matrix and labels dimensions do not agree in labelstomss()")

	unlabels<-sort(unique(labels))
	k<-length(unlabels)
	ss<-NULL
	for(i in 1:k){
		labs<-(1:p)[labels==unlabels[i]]
		pp<-length(labs)
		if(pp<3)
			ss[i]<-NA
		else{
			dissvec<-dist[labs,labs]@Data
			bestk<-silcheck(dissvec,min(khigh,(length(labs)-1),na.rm=TRUE),diss=TRUE)[1]
			if(within=="med") 
				ss[i]<-median(pam(dissvec,bestk,diss=TRUE)$silinfo$widths[,3])
			if(within=="mean") 
				ss[i]<-pam(dissvec,bestk,diss=TRUE)$silinfo$avg.width
		}
	}
	if(sum(is.na(ss))==k)
		out<-NA
	else{
		if(hierarchical==TRUE){
			parentlab<-trunc(labels/10)
			unparentlab<-sort(unique(parentlab))
			pk<-length(unparentlab)
			pss<-NULL
			for(i in 1:pk){
				if(between=="med") 
					pss[i]<-median(ss[trunc(unlabels/10)==unparentlab[i]],na.rm=TRUE)
				if(between=="mean") 
					pss[i]<-mean(ss[trunc(unlabels/10)==unparentlab[i]],na.rm=TRUE)
			}
			if(between=="med") 
				out<-median(pss,na.rm=TRUE)
			if(between=="mean") 
				out<-mean(pss,na.rm=TRUE)
		}
		else{
			if(between=="med") 
				out<-median(ss,na.rm=TRUE)
			if(between=="mean") 
				out<-mean(ss,na.rm=TRUE)
		}
	}
	return(out)
}

#b. optimizing number of clusters with average silhouette or mss

#silcheck (silhouettes)
#data is a vector of the distance matrix...

silcheck<-function(data, kmax=9, diss=FALSE, echo=FALSE, graph=FALSE)
{
    if( !diss ) {
        if( inherits(data, "dist"))
            stop("data argument is a dist object, but diss is FALSE")
        if( is.matrix(data) && (nrow(data) == ncol(data) ) )
            warning("data argument is square, could be a dissimilarity")
    }
    if( diss && is.matrix(data) && nrow(data) != ncol(data) )
        stop("should be a dissimilarity matrix - but is not square")
    
    sil<-NULL
    m<-min(kmax, max((!diss)*(dim(data)[1]-1),
                     (diss)*(0.5*(sqrt(1+8*length(data))-1)),
                     na.rm=TRUE)) 
    if(m<2)
        out<-c(1,NA)
    else{
        for(i in 1:(m-1))
            sil[i]<-pam(data, k=(i+1), diss=diss)$silinfo$avg.width
        if(echo)
            cat("best k = ", order(sil)[length(sil)]+1, ", sil(k) = ",
                round(max(sil),4), "\n")
        if(graph){
            plot(2:m, sil, type="n", xlab="Number of Clusters",
                 ylab="Average Silhouette")
            text(2:m, sil ,2:m)
        }
        out<-c(order(sil)[length(sil)]+1,max(sil))
    }
    return(out)
}

#msscheck (mss)
msscheck<-function(dist, kmax=9, khigh=9, within="med", between="med",
                   force=FALSE, echo=FALSE, graph=FALSE){
	p = dist@Size
    if(p<3)
        out<-c(1,NA)
    else{
        dvec<-dist@Data

        kmax<-min(kmax,p-1,na.rm=TRUE)
        if(force)
            mss<-0
        else
            mss<-labelstomss(rep(1,p),dist,khigh,within,between)
        for(k in 2:kmax)
            mss[k]<-labelstomss(pam(dvec, k, diss=TRUE)$clust, dist,
                                khigh, within, between)
        shift<-0
        if(force){
            mss<-mss[-1]
            shift<-1
        }
        if(echo)
            cat("best k = ", order(mss)[1]+shift, ", mss(k) = ",
                round(min(mss),4), "\n")
        if(graph){
            kmin<-ifelse(force,2,1)
            plot(kmin:kmax, mss, type="n" ,xlab="Number of Clusters",
                 ylab="MSS") 
            text(kmin:kmax,mss,kmin:kmax)
        }
        out<-c(order(mss)[1]+shift,min(mss))
    }
    return(out)
}

#2. Utility functions

#counts the number of digits in label
digits<-function(label){
	label<-label[1]
	count<-0
	while(label>=1){
		count<-count+1
		label<-label/10
	}
	return(count)
}

#truncates labels to dig digits
#cutdigits<-function(labels,dig){
#	dl<-NULL
#	for(i in 1:length(labels)) 
#		dl[i]<-digits(labels[i])
#	df<-max(0,dl-dig)
#	trunc(labels/(10^df))		
#}

# patrick's suggestion...
cutdigits <- function( labels, dig ){
	df <- max(0,digits(labels) - dig)
	return(trunc(labels/(10^df)))
}

#removes trailing zeros from labels
cutzeros<-function(labels){
	for(i in 1:length(labels)){
		while(trunc(labels[i]/10)*10==labels[i]){
			labels[i]<-labels[i]/10
		}
	}
	return(labels)
}

#returns the number of non-zero digits in labels
nonzeros<-function(labels){
	for(i in 1:length(labels)){
		while(trunc(labels[i]/10)*10==labels[i]){
			labels[i]<-labels[i]/10
		}
		labels[i]<-digits(labels[i])
	}
	return(labels)	
}

#3. functions for making the tree#

#a. msssplitcluster: splits a cluster# 
	#clust1 is gene by subjects dataframe for cluster1
	#l1 is an integer label of cluster 1: e.g 11,12,13,22, etc describing its path so far in the tree.
	#id1 is id's of cluster1 indicating row numbers in original data frame subdata
	#kmax is the maximum number of groups
	#khigh  is the maximum number of child groups for each group when computing mss
	#medoid1 is the row number (in full data matrix!) indicating  medoid for cluster 1
	#medtodist is the distance from each gene in clust1 to the neighboring cluster medoid,
	#right is 1 if medoid2 is to the right and is 0 if clust1 is the last cluster so that medoid2
	# is the medoid2 to the left of clust1
	#silh is the silhouette of clust1
	#dist1 is the distance matrix for all genes in clust1 
msssplitcluster<-function(clust1,l1,id1,medoid1,med2dist,right,dist1,kmax=9,khigh=9,within="med",between="med"){
	if(!medoid1) 
		warning("Medoid missing - continue to split cluster")
	else{
		if(sum(medoid1==id1)==0 & medoid1) 
			warning("Medoid not in cluster - continue to split cluster")
	}
	if(is.matrix(clust1)){			
		p1<-length(clust1[,1])
		n<-length(clust1[1,])
	}
	else p1<-1
	if(p1<3) 
		k1<-1				
	else{
		l<-length(clust1[,1])
		dissvec<-dist1@Data

		kmax<-min(p1-1,kmax,na.rm=TRUE)
		khigh<-min(p1-1,khigh,na.rm=TRUE)
		k1<-msscheck(dist1,kmax,khigh,within,between)[1]
		if(k1>1){
			pamobj<-pam(dissvec,k1,diss=TRUE)
			newclussizes<-pamobj$clusinfo[,1]
			newmedoids1<-id1[pamobj$medoids]  
			newlabels1<-pamobj$clustering
			distnewmedoids<-NULL 
			for(j in (1:k1)) 
				distnewmedoids[j]<-mean(med2dist[newlabels1==newlabels1[pamobj$medoids[j]]])  
			if(right==1) 
				#ord<-order(distnewmedoids, decreasing = TRUE)
				ord<-rev(order(distnewmedoids))
			else 
				ord<-order(distnewmedoids)
			newmedoids1<-newmedoids1[ord]
			newclussizes<-newclussizes[ord]
			oldlab<-newlabels1
			for(j in (1:k1)) 
				newlabels1[oldlab==ord[j]]<-j
			newlabels1<-rep(10*l1,l)+newlabels1
		}
	}
	if(k1==1){
		newmedoids1<-medoid1
		newlabels1<-rep(10*l1,p1)
		newclussizes<-p1
	}
	for(a in (1:length(newmedoids1))){
		if(sum(newmedoids1[a]==id1)==0) 
			warning("Problem with new medoids after splitting cluster")
	}
	list(k1,newmedoids1,newlabels1,newclussizes)
}

#b. mssnextlevel: calls mssspltitcluster to produce the next level of the tree#
	#data is the data frame
	#prevlevel is the previous level of the tree
	#dmat is the distance matrix
	#kmax is the maximum number of groups
	#khigh is the maximum number of child groups for each group when computing mss
	#within and between are either "med" for median split silhouette or "mean"
	# for mean split silhouette
mssnextlevel<-function(data,prevlevel,dmat,kmax=9,khigh=9,within="med",between="med"){
	n<-length(data[1,])
	p<-length(data[,1])

	id<-1:p
	k<-prevlevel[[1]]
	medoids<-prevlevel[[2]]
	labels<-prevlevel[[4]]
	newk<-0
	newlabels<-newmedoids<-newclussizes<-NULL
	count<-1
	ordlabels<-sort(unique(labels))
	if(length(ordlabels)!=k) 
		warning("Number of unique labels not equal number of clusters in mssnextlevel()")
	if(sum(is.na(medoids))){
		warning("Missing values in medoid vector in nextlevel()")
		medoids[is.na(medoids)]<-FALSE
	}
	if(length(unique(medoids))<k && sum(medoids)) 
		warning("Medoids not unique in mssnextlevel()") 
	checkmeans<-FALSE
	if(length(medoids)==1 && !medoids){
		warning("No medoids provided in mssnxtlevel()")
		usemean<-TRUE
	}
	else{
		if(sum(medoids>1)==k) 
			usemean<-FALSE
		else 
			checkmeans<-TRUE
	}
	for(j in (1:k)){
		clust1<-data[labels==ordlabels[j],]	
		id1<-id[labels==ordlabels[j]]
		if(length(id1)>1) 
			clust1<-as.matrix(clust1)
		l1<-ordlabels[j]
		right<-(j<k)
		medoid1<-ifelse(is.na(medoids[j]),0,medoids[j])
		if (j<k) 
			medoid2<-medoids[j+1]
		else 
			medoid2<-medoids[j-1]
		if(length(id1)>1) {
			#med2dist<-rowMeans(as(dmat[labels==ordlabels[j],labels==labels[medoid2]],"matrix"))
			med2dist<-rowMeans(dmat[labels==ordlabels[j],labels==labels[medoid2]])
		}else{
			med2dist<-mean(dmat[labels==ordlabels[j],labels==labels[medoid2]])
		}

		splitobj<-msssplitcluster(clust1,l1,id1,medoid1,med2dist,right,dmat[labels==l1,labels==l1],kmax,khigh,within,between)
		newlabels[labels==ordlabels[j]]<-splitobj[[3]]
		k1<-splitobj[[1]]
		start<-count
		end<-count+k1-1
		newmedoids[start:end]<-splitobj[[2]]
		newclussizes[start:end]<-splitobj[[4]]
		count<-count+k1		
	}
	count<-newk<-count-1
	newmedoids<-newmedoids[1:newk]
	newclussizes<-newclussizes[1:newk]
	final<-0
	if(count==k)
		final<-1
	if(max(newclussizes)==3) 
		final<-1
	list(newk,newmedoids,newclussizes,newlabels,final,rbind(prevlevel[[6]],cbind(sort(unique(newlabels)),newmedoids)))
}

#c. orderelements: produces an ordering of elements within a set of clusters 
	#level is a level of the tree
	#data is the data frame
	#rel is an indicator of whether to order elements in each cluster with respect 
	# to their own medoid ("own") or the neighboring medoid to the right ("neighbor")
	# or using improveordering() function ("co"). the default is "own"
	#d is an indicator of which distance function to use
	# choices are: "cosangle" (default),"abscosangle","euclid","abseuclid","cor","abscor".
	#dmat is the distance matrix. if this has already been calculated by the user, it can
	# be passed into the function in order to save calculation time
orderelements<-function(level,data,rel="own",d="cosangle",dmat=NULL){

	idn<-1:length(data[,1])
	k<-level[[1]]
	labels<-level[[4]]
	medoids<-level[[2]]
	clussizes<-level[[3]]
	ord<-order(labels)
	idnord<-idn[ord]
	subdataord<-data[ord,]

	if(is.null(dmat))
		dmat <- distancematrix(data,d) 
	distord<-dmat[ord,]

	labelsord<-labels[ord]
	count<-1
	for(j in (1:k)){
		start<-count
		end<-count+clussizes[j]-1
		if(clussizes[j]>2){
			tempid<-idnord[start:end]
			if(rel=="co"){
				distj<-distord[,ord][start:end,start:end]
				idnord[start:end]<-tempid[improveordering(distj)]
			}
			else{
				if(rel=="neighbor"){
					if(j<k) 
						mednext<-medoids[j+1]
					else 
						mednext<-medoids[j-1]
				}else 
					mednext<-medoids[j]

				dmednext<-distord[start:end,mednext]	

				if(rel=="neighbor"){
					if(j<k) 
						#ordtemp<-order(dmednext, decreasing = TRUE)
						ordtemp<-rev(order(dmednext))
					else 
						ordtemp<-order(dmednext)
				}
				else 
					ordtemp<-order(dmednext)
				idnord[start:end]<-tempid[ordtemp]
			}
		}
		else 
			idnord[start:end]<-idnord[start:end]
		count<-count+clussizes[j]
	}
	list(data[idnord,],idnord)
}


#  d. mssinitlevel: creates ordered initial level #
#  data is the data matrix
#  kmax is the maximum number of groups
#  khigh is the maximum number of child groups for each group when computing mss
#  d is an indicator of which distance function to use.
#  choices are: "cosangle" (default),"abscosangle","euclid","abseuclid","cor","abscor"	
#  dmat is the distance matrix of class hdist. 
#  if this has already been calculated by the user, it can
#  be passed into the function in order to save calculation time
#  within and between are either "med" for median split silhouette or "mean"
#  for mean split silhouette
#  ord is an indicator of how to order the clusters. choices are to maximize 
#  correlation ordering ("co") or to build a tree of cluster medoids ("clust")

mssinitlevel<-function(data, kmax=9, khigh=9, d="cosangle", dmat=NULL,
                       within="med", between="med", ord="co",
                       verbose=FALSE)
{
		  #print("mssinitlevel")
	if(!is.matrix(data))
		stop("First arg to mssinitlevel() must be a matrix")

	p<-nrow(data)

        if(!is.hdist(dmat)){
           if(is.matrix(dmat) && nrow(dmat)==p && ncol(dmat)==p)
		dmat<-as.hdist(dmat)
	   else
		dmat<-distancematrix(data,d=d)
        }

	if(dmat@Size != p)
		stop("Data and distance matrix dimensions do not agree in mssinitlevel()")

	m<-msscheck(dmat,kmax,khigh,within,between)
	if(m[1]==1){
      if(verbose)
		cat("No strong evidence for clusters in the first level - \n continuing to split root node anyway. \n")
		m<-msscheck(dmat,kmax,khigh,within,between,force=TRUE)
	}

	pamobj<-pam(dmat@Data, m[1], diss=TRUE)
	rowmedoids<-pamobj$medoids
	final<-ifelse(max(pamobj$clusinfo[,1])<3,1,0)
	if(m[1]>2){			
		medoidsdata<-as.matrix(data[rowmedoids,])
		l<-length(rowmedoids)
		medoidsdist<-dmat[rowmedoids,rowmedoids]
		if(ord=="co")
			medoidsord<-improveordering(medoidsdist)
		if(ord=="clust"){
			mpamobj<-pam(medoidsdist@Data,2,diss=TRUE)
			labelsmed<-mpamobj$clustering
			medmed<-mpamobj$medoids
			clussizes<-mpamobj$clusinfo[,1]
			
			prevlevel<-mssnextlevel(medoidsdata,list(2,medmed,clussizes,labelsmed,0,cbind(c(1,2),medmed)),dmat=medoidsdist,kmax,khigh,within,between)
			final<-prevlevel[[5]]
			if(final==0){
				depth<-(l-1)
				for(j in (1:depth)){
					if(final==0){
						clustnext<-mssnextlevel(medoidsdata,prevlevel,dmat=medoidsdist,kmax,khigh,within,between)
						final<-clustnext[[5]]
					}
					if(final==1){ 
						j<-depth
						prevlevel<-clustnext
					}
				}
			}					
			medoidsord<-orderelements(prevlevel,medoidsdata,rel="neighbor",d=d,dmat=medoidsdist)[[2]]
		}
		k<-m[1]
		rowmedoids<-rowmedoids[medoidsord]
		labels<-lab2<-pamobj$clustering
		for(j in (1:k)) 
			lab2[labels==medoidsord[j]]<-j
		output<-list(k,rowmedoids,pamobj$clusinfo[,1][medoidsord],lab2,final,cbind(1:k,rowmedoids))
	}
	else
		output<-list(2,pamobj$medoids,pamobj$clusinfo[,1],pamobj$clustering,final,cbind(1:2,pamobj$medoids))
	return(output)
}

#e. collapsing functions#
	#paircoll() collapses a pair of medoids (i,j)
	#collap() calls paircoll() to consider and possibly perform collapsing
	#msscollap() collapses by sequentially calling collap starting with 
	# the closest pair of clusters til there is no more improvement in mss
	#mssmulticollap tries all pairs of clusters and collapses any that improve mss
	###########################################################################################
	#data is the data matrix
	#level is level of the tree
	#d is an indicator of which distance function to use
	# choices are: "cosangle" (default),"abscosangle","euclid","abseuclid","cor","abscor"
	#dmat is the distance matrix. if this has already been calculated by the user, it can
	# be passed into the function in order to save calculation time.
	#newmed is an indicator of which way to find the medoid of the new cluster after collapsing.
	# choices are: "nn" to use the nearest neighbor of the clustersize-weighted 
	# mean of the two medoids as the medoid of a collapsed cluster, "uwnn" to use an unweighted 
	# version of nearest neighbor so that each cluster (rather than each gene) gets equal
	# weight in the mean, "center" to use the cluster center (element with min sum distance 
	# to all others), "medsil" (default) to use the medoid which maximizes the medoid based 
	# silhouette (i.e.: (a-b)/max(a,b), where a=dist(medoid), b=dist(next closest medoid)). 
	#the silhouettes and splits (arguments [[1]] and [[2]] of level) refer to the original
	# splits and loose their meaning if the child cluster(s) are collapsed
	#impr is a margin of improvement required to accept a collapse with msscollap and
	# mssmulticollap. the default is impr=0
paircoll<-function(i,j,data,level,d="cosangle",dmat=NULL,newmed="medsil"){
	p<-length(data[,1])
	k<-level[[1]]
	labels<-level[[4]]
	medoids<-level[[2]]
	clussizes<-level[[3]]
	N<-length(level[[6]][,1])
	block<-level[[6]][(N-k+1):N,]
	if(N==k)
		prevblock<-NULL
	else
		prevblock<-level[[6]][1:(N-k),]
	if(max(i,j)>k)
		stop("The clusters to collapse do not exist in paircoll()") 
	labeli<-labels[medoids[i]]
	labelj<-labels[medoids[j]]
	oldlabels<-labels
	labels[labels==labelj]<-labeli
	trunclabels<-trunc(oldlabels/10)
	labelparents<-unique(trunclabels)
	parenti<-order(labelparents)[labelparents==trunc(labeli/10)]
	parentj<-order(labelparents)[labelparents==trunc(labelj/10)]
	if(newmed=="nn")
		fakemed<-(data[medoids[i],]*clussizes[i]+data[medoids[j],]*clussizes[j])/(clussizes[i]+clussizes[j])
	if(newmed=="uwnn")
		fakemed<-(data[medoids[i],]+data[medoids[j],])/2
	if(newmed=="nn" || newmed=="uwnn"){ 
		rowsub<-(1:p)[labels==labeli]
		distsfm<-distancevector(data[rowsub,],as.vector(fakemed),d)
		medoids[i]<-rowsub[order(distsfm)[1]]
	}
	else{
		#colldist<-as.matrix(dmat[labels==labeli,labels==labeli])
		colldist<-as(dmat[labels==labeli,labels==labeli],"matrix")

		rowsub<-(1:p)[labels==labeli]
		if(newmed=="center"){
			sumdist<-rowSums(colldist)
		}
		if(newmed=="medsil"){
			othermed<-medoids[-c(i,j)]
			collp<-length(labels[labels==labeli])
			othern<-length(othermed)
			if(othern==0)
				stop("Not enough medoids to use newmed='medsil' in paircoll()")
			if(is(dmat,"hdist")){
			 	if(othern==1){ 
					#otherdist<-cbind(dmat[labels==labeli,othermed])
					otherdist<-dmat[labels==labeli,othermed]
				}else{ 
					#otherdist<-rbind(dmat[labels==labeli,othermed])
					otherdist<-dmat[labels==labeli,othermed]
				}
			}

			if(othern==1)
				b<-otherdist
			else
				b<-apply(otherdist,1,min)
			b<-matrix(b,nrow=collp,ncol=collp)
			diag(b)<-0
			b<-abs(b-colldist)/pmax(colldist,b)
			sumdist<-rowSums(b)
			medoids[i]<-rowsub[order(sumdist,decreasing=TRUE)==1]
			#medoids[i]<-rowsub[rev(order(sumdist))==1]
		}
	}	
	k<-k-1	
	clussizes[i]<-clussizes[i]+clussizes[j]
	block[i,2]<-medoids[i]
	if(j<=k){			
		for(l in (j:k)){
			medoids[l]<-medoids[l+1]
 		        clussizes[l]<-clussizes[l+1]
			block[l,]<-block[l+1,]
		}
	}
	medoids<-medoids[1:k]
	clussizes<-clussizes[1:k]
	block<-block[1:k,]
	if(labels[labels==labeli][1]/10==trunclabels[labels==labeli][1]){
		labels[labels==labeli]<-labels[labels==labeli]+1
		labels[labels==labelj]<-labels[labels==labelj]+1
		block[i,1]<-block[i,1]+1
	}
	return(list(k,medoids,clussizes,labels,level[[5]],rbind(prevblock,block)))
}

#note: this version of collap does not have silhbased arg: for use with MSS only (not silhouettes)
collap<-function(data,level,d="cosangle",dmat=NULL,newmed="medsil"){
	k<-level[[1]]
	if(k<3 && newmed!="nn"){
		warning("Not enough medoids to use newmed='medsil' in collap() - \n using newmed='nn' instead \n") 
		newmed<-"nn"
	}
	medoids<-level[[2]]
	clussizes<-level[[3]]
	if(sum(is.na(clussizes))) 
		warning("NA in clussizes")
	medoidsdata<-data[medoids,]
	if(sum(is.na(medoidsdata))>0) 
		warning("Missing value(s) in medoidsdata in collap()")

	distmed<-dmat[medoids,medoids]
	distv<-distmed@Data

	indexmin<-order(distv)[1]
	best<-vectmatrix(indexmin,k)
	clustfinal<-paircoll(best[1],best[2],data,level,d,dmat,newmed)
	return(clustfinal)
}

msscollap<-function(data,level,khigh,d="cosangle",dmat=NULL,newmed="medsil",within="med",between="med",impr=0){

	if(impr<0){
          warning("impr must be positive - setting impr=0.")
          impr<-0
        }
        newk<-level[[1]]
	mss1<-labelstomss(level[[4]],dmat,khigh,within,between)
	maxncoll<-max(0,newk-2)
        ncoll<-0
	coll<-1
        if(newk<=2) 
		coll<-0
 	while((coll==1) && (ncoll<= maxncoll)){
                levelc<-collap(data,level,d,dmat,newmed)
		mss2<-labelstomss(levelc[[4]],dmat,khigh,within,between)
		if(mss1==0) 
			r<-0
		else 
			r<-(mss1-mss2)/mss1
		if(r<impr) 
			coll<-0 	 
    		else{
			mss1<-mss2
			level<-levelc
                        ncoll<-ncoll+1
                }
	}
	return(level)
} 

mssmulticollap<-function(data,level,khigh,d="cosangle",dmat=NULL,newmed="medsil",within="med",between="med",impr=0){
	if(!is.matrix(data))
		stop("First arg to mssmulticollap() must be a matrix")
	if(impr<0){
          warning("impr must be positive - setting impr=0.")
          impr<-0
        }
	medoids<-level[[2]]
	medoidsdata<-data[medoids,]
	if(sum(is.na(medoidsdata))>0) 
		warning("Missing value(s) in medoidsdata in mssmulticollap()")

	distmed<-dmat[medoids,medoids]
	k<-level[[1]]
	ord<-order(distmed@Data)

	mss1<-labelstomss(level[[4]],dmat,khigh,within,between)
	maxncoll<-max(0,k*(k-1)/2)
	ncoll<-0 
	i<-1
	while(i<=maxncoll){
		clusts<-vectmatrix(ord[i],k)
		if(k<3){
			if(newmed=="medsil") 
				warning("Can't use newmed=medsil with less than 3 clusters. \n Substituting newmed=nn")
			newmed<-"nn"
		}
		levelc<-paircoll(clusts[1],clusts[2],data,level,d,dmat,newmed)
		mss2<-labelstomss(levelc[[4]],dmat,khigh,within,between)
		if(mss1==0) 
			r<-0
		else 
			r<-(mss1-mss2)/mss1
		if(r>=impr){
			mss1<-mss2
			level<-levelc
                        ncoll<-ncoll+1
			k<-level[[1]]
			maxncoll<-max(0,k*(k-1)/2)
			i<-0
			medoids<-level[[2]]
			medoidsdata<-data[medoids,]
			if(sum(is.na(medoidsdata))>0) 
				warning("Missing value(s) in medoidsdata in mssmulticollap()")

			distmed<-dmat[medoids,medoids]
			ord<-order(distmed@Data)
	        }
		i<-i+1
	}
	return(level)
}

#f. iterating functions to run down the tree#
	#mssrundown() runs down the tree K levels with a stopping rule to 
	# find the main clusters
	#msscomplete() runs down the tree to the final level from any level
	#############################################################################################
	#data is the data matrix
	#K is the maximum number of levels to compute
	#kmax is the maximum number of groups
	#khigh  is the maximum number of child groups for each group when computing mss
	#d is an indicator of which distance function to use
	# choices are: "cosangle" (default),"abscosangle","euclid","abseuclid","cor","abscor"	
	#dmat is the distance matrix. if this has already been calculated by the user, it can
	# be passed into the function in order to save calculation time
	#coll is an indicator of how to collapse. the choices are to begin with the closest 
	# pair of clusters and collapse til there is no more improvement in mss ("seq") 
	# or to try all pairs of clusters and accept any collapse that improves mss ("all").
	#newmed is an indicator of which way to find the medoid of the new cluster after collapsing.
	# choices are: "nn" to use the nearest neighbor of the clustersize-weighted 
	# mean of the two medoids as the medoid of a collapsed cluster, "uwnn" to use an unweighted 
	# version of nearest neighbor so that each cluster (rather than each gene) gets equal
	# weight in the mean, "center" to use the cluster center (element with min sum distance 
	# to all others), "medsil" (default) to use the medoid which maximizes the medoid based 
	# silhouette (i.e.: (a-b)/max(a,b), where a=dist(medoid), b=dist(next closest medoid)). 
	#stop is an indicator that the tree should stop as soon as there is an increase in 
	# mss moving to the next level
	#finish is an indicator that the tree should compute all K levels and return the
	# one with the minimum mss (when finish==FALSE and stop==FALSE, level K is returned)
	#within and between are either "med" for median split silhouette or "mean"
	# for mean split silhouette.
	#impr is a margin of improvement required to accept a collapse with msscollap and
	# mssmulticollap. the default is impr=0.

mssrundown<-function(data, K=16, kmax=9, khigh=9, d="cosangle",
	dmat=NULL, initord="co", coll="seq", newmed="medsil", stop=TRUE,
	finish=FALSE, within="med",between="med",impr=0, verbose=FALSE) 
{ 
		  #print("mssrundown")
	if(!is.matrix(data))
		stop("First arg to mssrundown() must be a matrix")

	bestlevel<-level<-mssinitlevel(data, kmax, khigh, d, dmat, within, between, initord, verbose)
	bestmss<-mss<-labelstomss(level[[4]],dmat,khigh,within,between)
	bestl<-l<-1
	ind<-0
        if(verbose)
            cat("Searching for main clusters... \n")
	if(level[[5]]==1)
		return(level)
	while((l<=K) && (ind==0)){
		if(verbose) cat("Level ",l,"\n")
		if(coll=="seq")	
			levelc<-msscollap(data,level,khigh,d,dmat,newmed,within,between,impr)
		if(coll=="all") 
			levelc<-mssmulticollap(data,level,khigh,d,dmat,newmed,within,between,impr)
		mss<-labelstomss(levelc[[4]],dmat,khigh,within,between)
		if(mss>=bestmss & stop==TRUE)
			ind<-1
		else{
			if(mss<bestmss){
				bestlevel<-levelc
				bestmss<-mss
				bestl<-l
			}
		}
		l<-l+1
		if(l<=K){	
			level<-mssnextlevel(data,levelc,dmat,kmax,khigh,within,between)
			if(finish==TRUE){
				if(sum(trunc(level[[4]]/10)*10==level[[4]])==length(level[[4]]) & l<=K){
					ind<-1
					bestlevel<-levelc
					bestmss<-mss
					bestl<-(l-1)
				}
			}
		}
	}
        if(verbose)
            cat("Identified", bestlevel[[1]],
                " main clusters in level",
                bestl, "with MSS =",bestmss,"\n")
	return(bestlevel)
}

msscomplete<-function(level, data, K=16, khigh=9, d="cosangle",
	dmat=NULL, within="med", between="med", verbose=FALSE)
{
	if(!is.matrix(data))
		stop("First arg to msscomplete() must be a matrix")

	count<-digits(level[[4]][1])
        if(verbose)
            cat("Running down without collapsing from Level",count,"\n")
	while((max(level[[3]])>3) & (count<K)){
		level<-newnextlevel(data,level,dmat,2,khigh)
		count<-count+1
                if(verbose) cat("Level",count,"\n")
	}
	return(level)
}


#newnextlevel and newsplitcluster are needed in msscomplete to rundown 
#completely without collapsing back to the main clusters.
	#data is the data matrix
	#prevlevel is the level from which a next level is produced
	#dmat is the distance matrix, as above
	#klow and khigh are the min and max number of children at each node
	#newnextlevel produces the args to newsplitcluster and calls
	# this function to do the splitting of each node

newnextlevel<-function(data,prevlevel,dmat,klow=2,khigh=6){
		  #print("newnextlevel")
	if(!is.matrix(data))
		stop("First arg to newnextlevel() must be a matrix")
	p<-length(data[,1])
	n<-length(data[1,])
	id<-1:p
	k<-prevlevel[[1]]
	medoids<-prevlevel[[2]]
	labels<-prevlevel[[4]]
	newk<-0
	newlabels<-newmedoids<-newclussizes<-NULL
	count<-1
	ordlabels<-sort(unique(labels))
	if(length(ordlabels)!=k) 
		warning("Number of unique labels not equal number of clusters in newnextlevel()")
	if(sum(is.na(medoids))){
		warning("Missing value(s) in medoid vector in newnextlevel()")
		medoids[is.na(medoids)]<-FALSE
	}
	if(length(unique(medoids))<k && sum(medoids)) 
		warning("Medoids in newnextlevel() are not unique") 
	checkmeans<-FALSE
	if(length(medoids)==1 && !medoids){
		warning("No medoids provided in newnextlevel()")
		usemean<-TRUE
	}
	else{
		if(sum(medoids>1)==k) 
			usemean<-FALSE
		else 
			checkmeans<-TRUE
	}
	for(j in (1:k)){
		clust1<-data[labels==ordlabels[j],]	
		id1<-id[labels==ordlabels[j]]
		if(length(id1)>1){
			kmax<-min(c(khigh,dim(clust1)[1]-1))
			clust1<-as.matrix(clust1)
		}
		l1<-ordlabels[j]
		right<-(j<k)
		medoid1<-ifelse(is.na(medoids[j]),0,medoids[j])
		if (j<k) 
			medoid2<-medoids[j+1]
		else 
			medoid2<-medoids[j-1]
		if(length(id1)>1) 
			med2dist<-rowMeans(as(dmat[labels==ordlabels[j],labels==labels[medoid2]],"matrix"))
		else 
			med2dist<-mean(dmat[labels==ordlabels[j],labels==labels[medoid2]])

		splitobj<-newsplitcluster(clust1,l1,id1,klow,kmax,medoid1,med2dist,right,dmat[labels==l1,labels==l1]) 
		newlabels[labels==ordlabels[j]]<-splitobj[[3]]
		k1<-splitobj[[1]]
		start<-count
		end<-count+k1-1
		newmedoids[start:end]<-splitobj[[2]]
		newclussizes[start:end]<-splitobj[[4]]
		count<-count+k1-1+1		
	}
	count<-count-1
	newk<-count
	newmedoids<-newmedoids[1:newk]
	newclussizes<-newclussizes[1:newk]
	final<-0
	if(count==k)
		final<-1
	if(max(newclussizes)==3) 
		final<-1
	return(list(newk,newmedoids,newclussizes,newlabels,final,rbind(rbind(prevlevel[[6]],cbind(sort(unique(newlabels)),newmedoids)))))
}

newsplitcluster<-function(clust1,l1,id1,klow=2,khigh=2,medoid1,med2dist,right,dist1){
		  #print("newsplitcluter")
	if(!medoid1) 
		warning("Medoid missing - continue to split cluster")
	else{
		if(sum(medoid1==id1)==0 & medoid1) 
			warning("Medoid not in cluster - continue to split cluster")
	}
	if(is.matrix(clust1)){			
		p1<-length(clust1[,1])
		n<-length(clust1[1,])
	}
	else p1<-1
	if(p1<3){
		k1<-1				
		newmedoids1<-medoid1
		newlabels1<-rep(10*l1,p1)
		newclussizes<-p1
	}
	else{
		l<-length(clust1[,1])
		dissvec<-dist1@Data

		kmax<-min(p1-1,khigh)
		a<-rep(0,(kmax-klow+2))
		best<-2
		for(j in (klow:kmax)){
			a[j]<-pam(dissvec,j,diss=TRUE)$silinfo$avg.width
			if (a[j]>a[best]) best<-j
		}
		k1<-best
		pamobj<-pam(dissvec,k1,diss=TRUE)
		newclussizes<-pamobj$clusinfo[,1]
		newmedoids1<-id1[pamobj$medoids]
		newlabels1<-pamobj$clustering
		distnewmedoids<-NULL
		for(j in (1:k1)) 
			distnewmedoids[j]<-mean(med2dist[newlabels1==newlabels1[pamobj$medoids[j]]])  
		if(right==1) 
			ord<-order(distnewmedoids,decreasing=TRUE)
			#ord<-rev(order(distnewmedoids))
		else 
			ord<-order(distnewmedoids)
		newmedoids1<-newmedoids1[ord]
		newclussizes<-newclussizes[ord]
		oldlab<-newlabels1
		for(j in (1:k1)) 
			newlabels1[oldlab==ord[j]]<-j
		newlabels1<-rep(10*l1,l)+newlabels1
	}
	for(a in (1:length(newmedoids1))){
		if(sum(newmedoids1[a]==id1)==0) 
			warning("Problem with new medoids after splitting cluster")
	}
	return(list(k1,newmedoids1,newlabels1,newclussizes))
}

#g. wrapper function to build the whole tree with clusters#
	#data is the data matrix.
	#clusters (default="best") tells how to identify the main clusters 
	# clusters="greedy" stops at the first level where MSS increases
	# clusters="none" does not identify main clusters
	# clusters="best" identifies the level<=K with best MSS as main clusters
	#K is the maximum number of levels to compute. right now, still have 
	# computational problem with doing more than 16, which is the default.
	#kmax is the maximum number of groups.
	#khigh  is the maximum number of child groups for each group when computing mss.
	#d is an indicator of which distance function to use.
	# choices are: "cosangle" (default),"abscosangle","euclid","abseuclid","cor","abscor".	
	#dmat is the distance matrix. if this has already been calculated by the user, it can
	# be passed into the function in order to save calculation time.
	#coll is an indicator of how to collapse. the choices are to begin with the closest 
	# pair of clusters and collapse til there is no more improvement in mss ("seq") 
	# or to try all pairs of clusters and accept any collapse that improves mss ("all").
	#newmed is an indicator of which way to find the medoid of the new cluster after collapsing.
	# choices are: "nn" to use the nearest neighbor of the clustersize-weighted 
	# mean of the two medoids as the medoid of a collapsed cluster, "uwnn" to use an unweighted 
	# version of nearest neighbor so that each cluster (rather than each gene) gets equal
	# weight in the mean, "center" to use the cluster center (element with min sum distance 
	# to all others), "medsil" (default) to use the medoid which maximizes the medoid based 
	# silhouette (i.e.: (a-b)/max(a,b), where a=dist(medoid), b=dist(next closest medoid)). 
	#mss is either "med" (default) for median split silhouettes or "mean" for mean 
	# split silhouettes.
	# impr is a margin of improvement required to accept a collapse with msscollap and
	# mssmulticollap. the default is impr=0.
	#initord is "co" (default) if improveordering() is used to order the clusters in 
	# the first level or "clust" if clsutering the medoids is used.
	#ord determines how elements are ordered within clusters: "co" is 
	# using improveordering(), "own" is distance to their own medoid, and "nieghbor"
	# is distance to the neighboring medoid (to the right).

hopach<-function(data, dmat=NULL, d="cosangle", clusters="best", K=15,
                 kmax=9, khigh=9, coll="seq", newmed="medsil",
                 mss="med", impr=0,initord="co",ord="own", verbose=FALSE){
	if(inherits(data,"ExpressionSet")) 
		data<-exprs(data)
	data<-as.matrix(data)
        p<-nrow(data)

	# Convert to hdist immediately #
	if( is.null(dmat) ){
      	dmat<-distancematrix(data,d)
	}else if( is.matrix(dmat) && nrow(dmat)==p && ncol(dmat)==p){
		dmat <- as(dmat,"hdist")
	}else if( class(dmat) == "dist" ){
		dmat <- hdist(Data=as.numeric(dmat), Size=attr(dmat,"Size"), Labels=(1:(attr(dmat,"Size"))), Call=as.character(attr(dmat,"call"))[3])
	}else if(!is.hdist(dmat)){
		stop("Distance matrix could not be transformed into hdist object.") 
	}

	if(K>15){
		K<-15
		warning("K set to 15 - can't do more than 15 splits")
	}
	if(K<1){
		K<-1
		warning("K set to 1 - can't do less than 1 level")
	}
	if(clusters!="none"){
		cuttree<-mssrundown(data, K, kmax, khigh, d, dmat,
                                    initord, coll, newmed,
                 stop=(clusters=="greedy"), finish=TRUE, within=mss,
                                    between=mss, impr, verbose) 
		if(cuttree[[1]]>1) 
			cutord<-orderelements(cuttree,data,rel=ord,d,dmat)[[2]]
		else 
			cutord<-NULL
		out1<-list(k=cuttree[[1]],medoids=cuttree[[2]],sizes=cuttree[[3]],labels=cuttree[[4]],order=cutord)
		finaltree<-msscomplete(cuttree, data, K, khigh, d,
                 dmat, within=mss, between=mss, verbose)
	}
	else{
		out1<-NULL
		finaltree<-msscomplete(mssinitlevel(as.matrix(data),
                 kmax, khigh, d, dmat, within=mss, between=mss,
                 initord), data, K, khigh, d, dmat, within=mss,
                 between=mss, verbose)
	}
	dimnames(finaltree[[6]])<-list(NULL,c("label","medoid"))
	out2<-list(labels=finaltree[[4]],
                 order=orderelements(finaltree, data, rel=ord, d,
                 dmat)[[2]], medoids=finaltree[[6]])
	return(list(clustering=out1, final=out2, call=match.call(), metric=d))
}

Try the hopach package in your browser

Any scripts or data that you put into this service are public.

hopach documentation built on Nov. 8, 2020, 4:54 p.m.