ClueR: Cluster Evaluation

Documented in clustEnrichment

#' Cluster enrichment test
#' 
#' Takes a clustering object generted by cmeans or kmeans algorithm and determine the enrichment of each cluster
#' and then the overall enrichment of this clustering object based on an annotation file.
#' 
#' @param clustObj the clustering object generated by cmeans or kmeans.
#' @param annotation a list with names correspond to kinases and elements correspond to substrates belonging to each kinase.
#' @param effectiveSize the size of kinase-substrate groups to be considered for calculating enrichment. Groups that are too small
#' or too large will be removed from calculating overall enrichment of the clustering.
#' @param pvalueCutoff a pvalue cutoff for determining which kinase-substrate groups to be included in calculating overall enrichment of the clustering.
#' @param universe the universe of genes/proteins/phosphosites etc. that the enrichment is calculated against.
#' @return a list that contains both the p-value indicating the overall enrichment and a sublist that details the enrichment of each individual cluster.
#' @export
#' 
#' @examples
#' # simulate a time-series data with six distinctive profile groups and each group with
#' # a size of 500 phosphorylation sites.
#' simuData <- temporalSimu(seed=1, groupSize=500, sdd=1, numGroups=4)
#' 
#' # create an artificial annotation database. Generate 100 kinase-substrate groups each
#' # comprising 50 substrates assigned to a kinase. 
#' # among them, create 5 groups each contains phosphorylation sites defined to have the
#' # same temporal profile.   
#' kinaseAnno <- list()
#' groupSize <- 500
#' for (i in 1:5) {
#'  kinaseAnno[[i]] <- paste("p", (groupSize*(i-1)+1):(groupSize*(i-1)+50), sep="_")
#' }
#'    
#' for (i in 6:100) {
#'  set.seed(i)
#'  kinaseAnno[[i]] <- paste("p", sample.int(nrow(simuData), size = 50), sep="_")
#' }
#' names(kinaseAnno) <- paste("KS", 1:100, sep="_")
#'
#' # testing enrichment of clustering results by partition the data into six clusters
#' # using cmeans algorithm.
#' clustObj <- e1071::cmeans(simuData, centers=6, iter.max=50, m=1.25)
#' clustEnrichment(clustObj, annotation=kinaseAnno, effectiveSize=c(5, 100), pvalueCutoff=0.05)
#' 
#' @import stats
clustEnrichment <- function(clustObj, annotation, effectiveSize, pvalueCutoff=0.05, universe=NULL) {
  
  numOfcluster <- nrow(clustObj$centers)
  enrich.list <- list()
  counter <- 0;
  listName <- c()
  for (j in 1:numOfcluster) {

    if (is.null(universe)) {
      enrich.stats <- enrichmentTest(clust=names(which(clustObj$cluster == j)), annotation=annotation, universe=names(clustObj$cluster))
    } else {
      enrich.stats <- enrichmentTest(clust=names(which(clustObj$cluster == j)), annotation=annotation, universe)
    }

    # effective cluster filtering
    sizes <- as.numeric(enrich.stats[,3])
    pvalues <- as.numeric(enrich.stats[,2])
    idx <- which((sizes >= effectiveSize[1]) & (sizes <= effectiveSize[2]) & (pvalues < pvalueCutoff))
    
    if (length(idx) != 0) {
      listName <- c(listName, paste("cluster", j))
      counter <- counter + 1
      enrich.list[[counter]] <- matrix(enrich.stats[idx,], ncol=4)
      colnames(enrich.list[[counter]]) <- c("kinase", "pvalue", "size", "substrates")
    }
  }
  names(enrich.list) <- listName
  
  
  # integrating the enrichment scores
  fisher.pvalue <- 1
  if(length(enrich.list) != 0) {
    # fitness
    cluster.pvalue <- sapply(enrich.list, function(x){
      # minimum p for a cluster
      min(as.numeric(x[,2]))
    })
    
    # fisher for combining overall clustering
    fisher.pvalue <- stats::pchisq(-2*sum(log(cluster.pvalue)), 2*length(cluster.pvalue), lower.tail = FALSE)
    #cluster.pvalue.sort <- sort(cluster.pvalue)
    #if (length(cluster.pvalue.sort) > 5) {
    #  fisher.pvalue <- pchisq(-2*sum(log(cluster.pvalue.sort[1:5])), 2*5, lower.tail = FALSE)
    #} else {
    #  fisher.pvalue <- pchisq(-2*sum(log(cluster.pvalue)), 2*length(cluster.pvalue), lower.tail = FALSE) 
    #}

    if(fisher.pvalue == 0) {
      print("Fisher's combined pvalue is too small. The estimation maybe inaccurate!")
    }
  }
  
  results <- list()
  results$fisher.pvalue <- fisher.pvalue
  results$enrich.list <- enrich.list
  return(results)
}