BS831: Boston University - Genomics Data Mining

#####################################################################################
## BEGIN documentation support (what follows are keyworded entries
## from which documentation pages will be extracted automatically)

#' Carry out set enrichment test based on hyper-geometric distribution
#'
#' @param drawn One or more sets of 'drawn' items (e.g., genes). Basically, a list of signatures.
#' @param categories list of gene sets (e.g., MSigDB c2)
#' @param ntotal background population, i.e., the total no. of items from which items are supposed to have been drawn
#' @param min.drawsize min no. of drawn items that must be among categories' items
#' @param mht correct for multiple hypothesis testing across multiple 'draws'
#'
#' @return a data.frame with rows indexed by the signature(s) tested
#'
#' @examples
#'
#' # load objects hyperSig (a list of signatures) and hyperGsets (a GeneSet object)
#' # and run hyper-enrichment test
#'
#' data(hyper) # contains objects hyperSig and hyperGsets
#' hyperE <- hyperEnrichment(drawn=hyperSig,categories=getGeneSet(hyperGsets),ntotal=10000)
#' head(hyperE)
#' 
#' @export 

## END documentation support
#####################################################################################

## function: HYPER ENRICHMENT
##
hyperEnrichment <- function
(
   drawn,          # one or more sets of 'drawn' items (e.g., genes). Basically, a list of signatures.
   categories,     # gene sets (list of gene sets)
   ntotal=length(unique(unlist(categories))),
                   # background population, i.e., the total no. of items from which
                   # ..items are supposed to have been drawn
   min.drawsize=4, # min no. of drawn items that must be among categories' items
   mht=TRUE,       # correct for MHT across multiple 'draws'
   verbose=TRUE
)
{
   ## checks on inputs
   ##
   if (!is(categories, "list") ) {
     stop( "categories expected to be a list of gene sets" )
   }
   gene.names<-unique(unlist(categories))
   if ( is.list(drawn) && is.null(names(drawn)) ) {
     stop( "drawn must have non-null names when a list" )
   }
   if ( ntotal<length(unique(unlist(categories)))) {
     warning( "background population's size less than unique categories' items: ", ntotal,"<",length(gene.names))
   }
   ##
   ## end checks
   
   cnames <-
      c("pval","fdr","set annotated","set size","category annotated","total annotated","category","hits")
   
   ## handling of multiple 'draws'
   ##
   if ( is.list(drawn) ) 
   {
     ncat <- length(categories)
     enrich <- matrix(NA,ncat*length(drawn),length(cnames)+1)
     
     VERBOSE(verbose,"Testing",length(drawn),"drawsets on",ncat,"categories and",
             length(gene.names),"total items ..\n")
     
     percent <- 0.1
     base <- 0
     ntst <- 0
     for ( i in 1:length(drawn) )
     {
       VERBOSE(verbose,"*** Testing", names(drawn)[i], ".. " )
       dset <- drawn[[i]]
       tmp <- hyperEnrichment(dset,categories,ntotal=ntotal,verbose=verbose)
       if (is.null(tmp)) {
         VERBOSE(verbose,"not enough items drawn\n")
         next
       }
       ntst <- ntst+1
       rng <- (base+1):(base+ncat)
       if (any(!is.na(enrich[rng,]))) stop( "something wrong")
       
       enrich[rng,] <- cbind(set=rep(names(drawn)[i],ncat),tmp)
       base <- base+ncat
       if (F && i>=round(length(drawn)*percent)) {
         VERBOSE(verbose, round(100*percent),"% ",sep="")
         percent <- percent+0.1
       }
       VERBOSE(verbose," (min fdr: ", signif(min(as.numeric(tmp[,"fdr"])),2),")\n",sep="")
     }
     VERBOSE(verbose,"done.\n")
     colnames(enrich) <- c("set",cnames)
      
     enrich <- enrich[1:base,,drop=F]
     if (mht) {
       VERBOSE(verbose,"MHT-correction across multiple draws ..")
       enrich[,"fdr"] <- pval2fdr(as.numeric(enrich[,"pval"]))
       VERBOSE(verbose,"done.\n")
     }    
     VERBOSE(verbose,
             "Categories tested: ",rjust(length(categories),4),"\n",
             "Candidate sets:    ",rjust(length(drawn),4),"\n",
             "Sets tested:       ",rjust(ntst,4),"\n",
             "Items tested:      ",rjust(sum(sapply(drawn,length)),4)," (min,med,max: ",
             paste(quantile(sapply(drawn,length),probs=c(0,.5,1)),collapse=","),")\n",
             "N(FDR<=0.25):      ",rjust(sum(enrich[,"fdr"]<=.25),4),"\n",
             "N(FDR<=0.05):      ",rjust(sum(enrich[,"fdr"]<=.05),4),"\n",
             "N(FDR<=0.01):      ",rjust(sum(enrich[,"fdr"]<=.01),4),"\n",
             sep="")
     return(enrich)
   }
   ## handling of a single draw
   ##
   m.idx<-drawn[drawn %in% gene.names] 
   
   if ( length(m.idx)<min.drawsize ) {
     VERBOSE(verbose,"insufficient annotated genes in the drawn set: ",
             paste(gene.names[m.idx],collapse=","),"\n")
     return(NULL)
   }
   VERBOSE(verbose,length(m.idx),"/",length(drawn), " annotated genes found",sep="")
   
   nhits <-sapply(categories, function(x,y) length(intersect(x,y)), m.idx)
   ndrawn <- length(drawn) # length(m.idx)
   ncats <- sapply(categories,length)
   nleft <- ntotal-ncats
   
   ## compute P[X>=nhits]
   enrich <- phyper(q=nhits-1,m=ncats,n=nleft,k=ndrawn,lower.tail=F)
   enrich <- cbind(pval=enrich,
                   fdr=pval2fdr(enrich),
                   nhits=nhits,
                   ndrawn=ndrawn,
                   ncats=ncats,
                   ntot=ntotal,
                   category=names(categories))
   
   enrich <- cbind(enrich,
                   hits=sapply(categories,function(x,y)paste(intersect(x,y),collapse=','),m.idx))
   ord <- order(as.numeric(enrich[,"pval"]))
   enrich <- enrich[ord,,drop=F]
   enrich[,"pval"] <- signif(as.numeric(enrich[,"pval"]),2)
   enrich[,"fdr"] <- signif(as.numeric(enrich[,"fdr"]),2)
   
   colnames(enrich) <- cnames
   rownames(enrich) <- names(categories)[ord]
   
   return(enrich)
}
## GENERATION OF THE DATA FOR THE EXAMPLE
##
if ( FALSE )
{
  ## data generation
  ##
  CBMMLAB <- Sys.getenv('CBMMLAB')
  CBMGIT <- Sys.getenv('CBMGIT')
  CBMRtools <- Sys.getenv('CBMRtools')
  source(paste(CBMRtools,'R/misc.R',sep='/'))
  source(paste(CBMRtools,'R/GeneSet.R',sep='/'))
  source(paste(CBMRtools,'R/hyperEnrichment.R',sep='/'))
  source(paste(CBMRtools,'R/broad.file.formats.R',sep='/'))

  GS <- GeneSet(paste(CBMMLAB,'/annot/c2.cp.v4.0.symbols.gmt',sep=''))

  SIG <- read.tab.delim('~/Research/Projects/oralcancer/taz_yap_dpagt1/results/SIGtab.xls')
  hyperSig <- table2list(SIG,fill="")

  HYP <- hyperEnrichment(drawn=hyperSig,categories=getGeneSet(GS),ntotal=10000)

  hyperGsets <- GS
  hyperGsets@source.file <- GS@name
  hyperGsets@name <- basename(GS@name)
  save(hyperSig,hyperGsets,file=paste(CBMRtools,'data/hyper.rda',sep='/'))

  ## testing the script outside the package
  ##
  load(file=paste(CBMRtools,'data/hyper.rda',sep='/'))
  HYP <- hyperEnrichment(drawn=hyperSig,categories=getGeneSet(hyperGsets),ntotal=10000)
}