R/clusterJudge_z_score.R

Defines functions clusterJudge_z_score

Documented in clusterJudge_z_score

clusterJudge_z_score <-
function(clusters, entity.attribute, nmb.randomizations=30, plot.saveRDS.file=NULL){
if(is.null(names(clusters))) stop('The clusters must have as names the entity names of the reference entity.attribute table (e.g. Yeast gene names)')
#####  calculate MI between Clusters and each Attribute: MI(C,Ai) . 
#####     For all attributes (assuming independence) caculate the sum MI(C,A) = Sum(MI(C,Ai)) = n*E(C)+sum(E(Ai))-sum(E(C,Ai))
#####         where C is the variable that has the cluster id ! 
#####  we will restrict the GO attributes table to the entities present in the clusters
    if(!validate_association(entity.attribute, message=FALSE)) return() 

    ##### drop clustered entities not in entities and reciprocally
    common.entities <- intersect(names(clusters), entity.attribute[,1])
    clusters         <- clusters[common.entities]
    entity.attribute <- entity.attribute[entity.attribute[,1] %in% common.entities,]
     
    eat <- table(entity.attribute)
     
#####  keep only attributes that have at least one gene
eat <- eat[,apply(eat,2,sum)>0]     
    if(dim(eat)[2] <=0 ) stop('No attributes found in the entity.attribute table for the clustered entities !')


#####   to maximize randomness take the same number of clusters but with aprox same size 
####  calculate z score and show plots 
no.clust      <- length(unique(clusters))  ### the number of clusters
RND.clust.size <- round(length(clusters)/no.clust) ### the size of the random clusters (as uniform as possible)
N.RND <- nmb.randomizations   ### number of randomizations
MIr <- vector(mode='numeric', length= nmb.randomizations)  ### will store the Mutual information after each randomization
clusters.RND <- rep(no.clust, length(clusters)) ###  create a randomized cluster of same size as the original
for(k in seq_len(length(clusters.RND)))  clusters.RND[k] <- floor((k-1)/RND.clust.size) + 1  ### set the clusters in order because randomization will be later
#### add to last class until it's size is RND.clust.size-1
k=1; while(sum(clusters.RND == no.clust ) < (RND.clust.size-1) ){ clusters.RND[clusters.RND ==k][1] <- no.clust; k <- k+1}

HCr  <- ncol(eat)*entropy(clusters.RND)
HAr  <- sum(apply(eat,2,entropy))
cat('randomizing clusters ')
for(j in seq_len(N.RND)){
     HCAr <- sum(apply(eat,2,function(v) entropy(cbind(sample(clusters.RND),v)))) ### here we randomize by sampling
     MIr[j]  <- HCr+HAr-HCAr
     cat('.') 	
}	

    calculate.mi <- function(cl,ea){
    	  n <- ncol(ea)
    	  n*entropy(cl)+sum(apply(ea,2,entropy)) - sum(sapply(seq_len(ncol(ea)), function(j) entropy(cbind(cl, ea[,j]))) )
    	}
    	
    MI <- calculate.mi(cl=clusters, ea=eat)

z.cl    <- (MI -mean(MIr))/ sd(MIr)
z.RND   <- (MIr -mean(MIr))/ sd(MIr)

 p1 <- bwplot( z.RND ~ rep('Random clusters of uniform size',length(MIr))
      ,ylim=c( min(c(z.cl,z.RND)), max(c(z.cl,z.RND)) )*1.1
      ,ylab='z-score = (MI.experiment - mean(MI.random))/stdev(MI.random)'
      ,panel=function(x,y,...){
      	panel.abline(v=1,h=seq(-10,100,10),alpha=0.5,lty=3)
      	panel.bwplot(x,y,...)
      	panel.xyplot(x,y,...,pch=19,alpha=0.2,jitter.x=TRUE)
      	panel.xyplot(1,z.cl,pch=19,alpha=1,col='red',cex=2)
      	panel.text(1,z.cl,pch=19,paste('z-score Clustering under study:',round(z.cl,2)), cex=0.8, pos=4)
      })
  plot(p1)
  if(!is.null(plot.saveRDS.file)) tryCatch( {saveRDS(p1,plot.saveRDS.file)}, error=function(e){ print(e)})
    

return(data.frame(z.score=c(z.cl, z.RND), method=c('experiment',rep('randomization',length(z.RND)))  ) )     
}
pasculescu/ClusterJudge documentation built on May 29, 2019, 4:52 p.m.