R/gap.r
In clusterGenomics: Identifying clusters in genomics data by recursive partitioning

####################################################################################################################
## Author: Gro Nilsen
## Maintainer: Gro Nilsen <gronilse@ifi.uio.no>
## License: Artistic 2.0
## Part of the clusterGenomics package
## Reference: "Identifying clusters in genomics data by recursive partitioning", Nilsen et al. (2013, preprint)
####################################################################################################################


### Requires: ##
## clusteringFunctions.r
	
### Required by: ##
## part.r

#gap finds the optimal number of clusters based on the "gap statistic" (Tibshirani et al. 2001) 

gap <- function(X,Kmax=10,B=100,ref.gen="PC",cl.lab=NULL,...){
	
  #Default ... values:
  default.par <- list(dist.method="euclidean",cl.method="hclust",linkage="average",cor.method="pearson",nstart=10)
  #Check for user modifications
  #Note: call could come from part in which case ... is a list called fixed.par, or it could be a independent call in which case ... could contain several parameters which must be converted to list
  if(hasArg(fixed.par)){
    fixed.par <- modifyList(default.par,as.list(list(...)$fixed.par))
  }else{
    fixed.par <- modifyList(default.par,list(...))    
  }
  
  n <- nrow(X)
	
  if(n<=Kmax){
    if(fixed.par$cl.method=="hclust"){
      Kmax = n
    }else{
      Kmax = n-1   #kmeans does not work if K=n
    }
  }
  	
	#Calculate distances:
	dX <- getDist(X,dist.method=fixed.par$dist.method,cor.method=fixed.par$cor.method)
	
	if(is.null(cl.lab)){
	 cl.lab <- findPartition(X=X,Kmax=Kmax,dX=dX,fixed.par)
	}
	
	#Find W: vector containing W_K for all choices of K
	W <- findW(dX=dX,K=Kmax,cl.lab=cl.lab)

		
	#Generate reference data sets and find Wb:
	Wb <- getReferenceW(X,Kmax,B,ref.gen,fixed.par)

	
	#Calculate gap statistic:
	L <- apply(log(Wb),1,mean)
  gap <- L - log(W)

	sdk <- apply(log(Wb),1,sd)*sqrt((B-1)/B)  #multiply by the last expression to get 1/B (as in original article) instead of 1/(B-1) (normal calculation)
	sk <- sqrt(1+(1/B))*sdk

	#Calculate gap-criterion for each k:
	diff <- gap[1:Kmax-1] - (gap[2:Kmax] - sk[2:Kmax])

	#Select the first k where the criterion is positive:
	kvec <- 1:Kmax
	posDiff <- diff[diff>=0]
	if(length(posDiff)==0){
		hatK <- 1  #no k satisfies criterion, define 1 cluster e.g. to make part run..
	}else{ 
		hatK <- kvec[diff>=0][1]
	}

	if(all(is.na(diff))){
		hatK <- 1   #when used with part; can happen if sub-cluster consists of only 1 or 2 objects
	}
	
  #Get labels for best partition:
	lab.hatK <- cl.lab[[hatK]]
		
	return(list(hatK=hatK,lab.hatK=lab.hatK,gap=gap,sk=sk,W=W))
}



## helper-functions only used by GAP:

#Simulate from a uniform distribution according to min and max of the feature vector:
sim <- function(Xcol) {
	min <- min(Xcol)
	max <- max(Xcol)
	U <- runif(length(Xcol),min,max)
	return(U)
}


#Calulates the total within-cluster dispersion (W_K) for a k=1,...,K:
findW <- function(dX,K,cl.lab){

	W <- rep(0,K)
	#n <- nrow(X)
	n <- nrow(as.matrix(dX))
	
	#Calculate W for k=1:
	W[1] <- sum(as.matrix(dX))/(2*n)

	k <- 2
	while(k<=K){
	  labX <- cl.lab[[k]]
		
		for(i in 1:k){
			#Sum of within-cluster dispersion:
			d.k <- as.matrix(dX)[labX==i,labX==i]
			D.k <- sum(d.k)
				
			nk <- nrow(as.matrix(d.k))
			W[k] <- W[k] + D.k/(2*nk)
			
		}#endfor
		k <- k+1
	}#endwhile
	return(W)

}#endfunction



getReferenceW <- function(X,Kmax,B,ref.gen,...)	{
  arg <- as.list(...)
	#Generate reference data sets and find Wb:
	Wb <- matrix(0,nrow=Kmax,ncol=B)
	if(ref.gen=="PC"){
    #Transform data using svd:
    m <- apply(X,2,mean,na.rm=T)   #First columncenter X:
    Xc <- sweep(X,2,m)
    #SVD:
    s <- svd(Xc)	
	  newX <- Xc%*%s$v
  }
	for(b in 1:B){
		if(ref.gen=="PC"){
      U <- apply(newX,2,sim)
		  Z1 <- U%*%t(s$v)      #backtransform
		  #Add mean
		  Z <- sweep(Z1,2,m,FUN="+")
		}else{
		  Z <- apply(X,2,sim)
		}
		#Calculate distances
		dZ <- getDist(Z,dist.method=arg$dist.method,cor.method=arg$cor.method)
		#Cluster reference data 
		clW.lab <- findPartition(X=Z,Kmax=Kmax,dX=dZ,arg)
		#Calculate Wb_K for all values of K
    Wb[,b] <- findW(dX=dZ,K=Kmax,cl.lab=clW.lab)
	}#endfor
	
	return(Wb)
	
}

Any scripts or data that you put into this service are public.

clusterGenomics documentation built on May 2, 2019, 7:04 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

clusterGenomics
Identifying clusters in genomics data by recursive partitioning

R/gap.r
In clusterGenomics: Identifying clusters in genomics data by recursive partitioning

Try the clusterGenomics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

clusterGenomics Identifying clusters in genomics data by recursive partitioning

R/gap.r In clusterGenomics: Identifying clusters in genomics data by recursive partitioning

Try the clusterGenomics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

clusterGenomics
Identifying clusters in genomics data by recursive partitioning

R/gap.r
In clusterGenomics: Identifying clusters in genomics data by recursive partitioning