R/topGOfunctions.R

Defines functions groupGOTerms init initWHAT initONT inverseList .samepleRunHDO .getTermsDefinition getPvalues .sigRatio.ratio .sigRatio.log .sigRatio.01 .getGeneData .printGeneData combineResults .get.sig.gene.in.term run.batch search.result reorder.result cutoff.result to.latex.table

Documented in combineResults combineResults getPvalues init initONT initWHAT inverseList

#################### misc functions ####################
## 
##
##
##
##
########################################################

## Function that split GOTERM in different ontologies.
## Every new environment contain only the terms from one
## of the ontologies 'BP', 'CC', 'MF'
## @Not using!!
groupGOTerms <- function(where) {
#   if(missing(where))
#     where <- .GlobalEnv
#   where <- as.environment(where)
#  
# #   e <- new.env(hash = T, parent = emptyenv())
# #   assign(paste("GO", "CC", "Term", sep = ""), e, envir = where)
# #   
#   #require('topOnto.db') || stop('package topOnto.db is required  ')
# 
#   sql <- "SELECT term_id FROM term WHERE ontology IN"
#   for(onto in c("CC")) {
#     xx <- dbGetQuery(ONT_dbconn(), paste(sql, "('", onto, "');", sep = ""))$term_id
#     e <- new.env(hash = T, parent = emptyenv())
#     multiassign(xx, value = rep(TRUE, length(xx)), envir = e)
#     assign(paste("GO", onto, "Term", sep = ""), e, envir = where)
#   }
# 
#   cat("\ngroupTerms: \tGOCCTerm environments built.\n")
}

#' @title init methods
#' @description functions to load the ontology objects 
#' @aliases init initWHAT initONT
#' @usage 
#' initWHAT()
#' initONT(ontology)
#' @param ontology the ontology to be loaded. This can be check by running initWHAT()
#' @details initONT loads the ontology object from topOnto.<ontolog>.db packages.
#' @examples
#' initWHAT()
#' \dontrun{
#'  require(topOnto.HDO.db)
#'  initONT('HDO')
#' }
#' @author Xin He
init <- function(){
  cat("topOnto has been loaded. Now you need to specify which ontology you want to use.\n")
  cat("run topOnto::intiWHAT() to find out what ontologies are current supportted.\n")
  cat("run topOnto::intiONT(ontology_name) to choose ontology.\n")
  ##cat("Then follow topGO menu for all the analysis.\n")
  initWHAT()
  #cat("ontology_name can be one of the following:\n")
}

initWHAT <- function(){
  cat("These ontology package(s) are currently available in your libPath:\n")
  x=list.files(path=.libPaths(),pattern='topOnto.*.db')
  cat(sapply(x,function(y){ y=sub('topOnto.','',y,perl=TRUE,);y=sub('.db','',y,perl=TRUE,)},USE.NAMES = FALSE),"\n") 
  #x=list.files(system.file("extdata", package ="topOnto"),pattern='*.sqlite')
  #
}

initONT <- function(ontology='HDO'){
  #require("methods", quietly=TRUE)
  pkg <- paste('topOnto',ontology,'db',sep='.')
  
  ##detach other topOnto.xx.db package to avoid name conflict
  while(length(grep("topOnto.\\w+.db", search(), perl=TRUE, value=FALSE)) >0 ){
    detach(pos = grep("topOnto.\\w+.db", search(), perl=TRUE, value=FALSE)[1], unload=TRUE,force=TRUE)
  }
  
  require(pkg, character.only = TRUE) || stop(paste('package ',pkg,' is required',sep=''))
  #detach(pos = match(paste("package", pkg, sep = ":"), search()))

  
  sql <- "SELECT id FROM term;"
  xx <- dbGetQuery(ONT_dbconn(), sql)$id
  e <- new.env(hash = TRUE, parent = emptyenv())
  multiassign(xx, value = rep(TRUE, length(xx)), envir = e)
  assign("Term", e, envir = .GlobalEnv)
}


## given a list of vectors this function is returning the reverse list
## given a mapping form genes to GO terms as a list, compute which are
## the genes mapped to each GO.
inverseList <- function(l) {
  rId <- unlist(l, use.names = FALSE)
  lId <- rep(names(l), sapply(l, length))
  
  return(split(lId, rId))
}

## A test function for HPO
.samepleRunHDO<-function(){
  cat("Loading HDO objects from db...\n")
  topOnto::initONT('HPO')
  a<-system.file("extdata/annotation","human_g2d_omim", package ="topOnto")
  g<-system.file("extdata/genelist","testList", package ="topOnto")
  geneID2GO <- readMappings(file = a)
  geneNames=names(geneID2GO)
  myInterestingGenes=(read.csv(header = FALSE, file = g))$V1
  geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
  names(geneList) <- geneNames
  GOdata <- new("topONTdata", ontology = "HDO", allGenes = geneList,annot = annFUN.gene2GO, gene2GO = geneID2GO)
  resultFis <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
  resultElimFis<- runTest(GOdata, algorithm = "elim", statistic = "fisher")
  allRes <- GenTable(GOdata,  elim = resultElimFis,elim = resultElimFis,topNodes = 30,useLevels=TRUE)
  print(allRes)
  cat("Demo done..Seems working!\n")
}



######################################################################
## function to print all the genes annotated to the specified GOs

if(!isGeneric("printGenes"))
  setGeneric("printGenes", function(object, whichTerms, file, ...) standardGeneric("printGenes"))


########
## TODO
##
## remove the chip argument and replace it with a function which,
## given a list of genes will provide information about these genes
##
## TODO
########


## if the file argument is missing the function will just return
## a list of data.frames, each data.frame containg the gene information for specified GO term
setMethod("printGenes",
          signature(object = "topONTdata", whichTerms = "character", file = "missing"),
          function(object, whichTerms, chip, numChar = 100, simplify = TRUE,
                   geneCutOff = 50, pvalCutOff) { 
            
            term.genes <- genesInTerm(object, whichTerms)
            all.genes <- character()
            all.genes <- unique(unlist(term.genes))
            #lapply(term.genes, function(x) all.genes <<- union(x, all.genes))

            chip <- sub(".db$", "", chip)
            
            LL.lib <- get(paste(chip, "ENTREZID", sep = ""))
            Sym.lib <- get(paste(chip, "SYMBOL", sep = ""))
            GNAME.lib <- get(paste(chip, "GENENAME", sep = ""))

            probeMapping <- data.frame(LL.id = as.integer(unlist(mget(all.genes,
                                         envir = LL.lib, ifnotfound = NA))),
                                       Symbol.id = unlist(mget(all.genes,
                                         envir = Sym.lib, ifnotfound = NA)))

            retList <- vector("list", length(whichTerms))
            names(retList) <- whichTerms
            
            for(gt in whichTerms) {
              affID <- term.genes[[gt]]

              pval <- sort(geneScore(object, affID, use.names = TRUE))
              if(missing(pvalCutOff))
                affID <- names(pval)
              else {
                pval <- pval[pval <= pvalCutOff]
                affID <- names(pval)
              }

              ## we restrict the output to the number of genes
              length(affID) <- min(length(affID), geneCutOff)
              
              ## if there are no genes, there is nothing to print
              if(length(affID) == 0) {
                cat("\n\t No genes over the cutOff for:", gt, "\n")
                next
              }
              
              genesNames <- sapply(mget(affID, envir = GNAME.lib),
                                   function(x) return(x[1]))
              genesNames <- paste(substr(genesNames, 1, numChar),
                                  ifelse(nchar(genesNames) > numChar, "...", ""), sep = "")
              
              retList[[gt]] <- cbind("Chip ID" = affID, probeMapping[affID, ], "Gene name" = genesNames,
                                     "raw p-value" = format.pval(pval[affID], dig = 3, eps = 1e-30))
            }            

            ## if we have only one GO term we return just the data.frame
            if(simplify && length(whichTerms) == 1)
              return(retList[[1]])

            return(retList)
          })


setMethod("printGenes",
          signature(object = "topONTdata", whichTerms = "character", file = "character"),
          ## numChar = "integer"),
          function(object, whichTerms, file, oneFile = FALSE, ...) { 

            infoList <- printGenes(object, whichTerms, ...)

            if(length(whichTerms) == 1) {
              write.table(infoList, quote = TRUE, sep = ",", append = FALSE,
                          file = paste(paste(file, sub(":", "_", whichTerms), sep = "_"), "csv", sep = "."),
                          col.names = colnames(infoList), row.names = 1:nrow(infoList))
              return()
            }
            
            if(oneFile) {
              fAppend <- FALSE
              for(gt in whichTerms) {
                write.table(cbind(GOID = rep(gt, nrow(infoList[[gt]])), infoList[[gt]]),
                            quote = TRUE, sep = ",", append = fAppend,
                            file = paste(file, "csv", sep = "."),
                            col.names = colnames(infoList[[gt]]), row.names = FALSE)
                fAppend <- TRUE
                return()
              }
            }
            
            for(gt in whichTerms) 
              write.table(infoList[[gt]], quote = TRUE, sep = ",", append = FALSE,
                          file = paste(paste(file, sub(":", "_", gt), sep = "_"), "csv", sep = "."),
                          col.names = colnames(infoList[[gt]]), row.names = 1:nrow(infoList[[gt]]))
          })

 ######################################################################


.getTermsDefinition <- function(ONTdata, whichTerms, numChar = 20, multipLines = FALSE) {
  
  termsNames=ONTdata@termName[whichTerms]
  
  if(!multipLines) 
    shortNames <- paste(substr(termsNames, 1, numChar),
                        ifelse(nchar(termsNames) > numChar, '...', ''), sep = '')
  else
    shortNames <- sapply(termsNames,
                         function(x) {
                           a <- strwrap(x, numChar)
                           return(paste(a, sep = "", collapse = "\\\n"))
                         })
  
  names(shortNames) <- names(termsNames)
  return(shortNames[whichTerms])
}


## methodsSig contains a named vector of p-values for each run method
.sigAllMethods <- function (methodsSig) 
{
  names.index <- names(methodsSig[[1]])
  retval <- as.data.frame(lapply(methodsSig, function(x) x[names.index]))
  names(retval) <- names(methodsSig)
  return(retval)
}



######################################################################
if(!isGeneric("GenTable"))
  setGeneric("GenTable", function(object, ...) standardGeneric("GenTable"))
# if(!isGeneric("GenTable2"))
  setGeneric("GenTable2", function(object, ...) standardGeneric("GenTable2"))
# if(!isGeneric("GenTableGSEA"))
  setGeneric("GenTableGSEA", function(object, ...) standardGeneric("GenTableGSEA"))

#' @title Diagnostic functions for topONTdata and topONTresult objects.
#' @aliases printGenes-methods printGenes printGenes,topONTdata,character,character-method printGenes,topONTdata,character,missing-method GenTable GenTable,topONTdata-method showGroupDensity
#' @rdname diagnosticMethods
#' @name diagnosticMethods
#' @description The \code{GenTable} function generates a summary of the
#'   results of the enrichment analysis.
#'   
#'   The \code{showGroupDensity} function plots the distributions of the
#'   gene' scores/ranks inside a GO term.
#' 
#'   The \code{printGenes} function shows a short summary of the top genes annotated
#'   to the specified GO terms.
#' 
#' 
#' 
#' @usage
#' GenTable(object, ...)
#' 
#' showGroupDensity(object, whichGO, ranks = FALSE, rm.one = TRUE) 
#' 
#' printGenes(object, whichTerms, file, ...)
#' 
#'   @param object an object of class \code{topONTdata}.
#'   @param whichGO the GO terms for which the plot should be generated.
#'   @param ranks if ranks should be used instead of scores.
#'   @param rm.one the p-values which are 1 are removed. 
#'   @param whichTerms character vector listing the GO terms for which the summary should be printed.
#'   @param file character string specifying the file in which the results should be printed.
#'   
#'   @param \dots other
#'        \code{topNodes}  the number of top GO terms to be included in the table / the gene description is trimmed such that it has \code{numChar} characters.
#'     %%GenTable(object, ..., orderBy = 1, ranksOf = 2, topNodes = 10, numChar = 40)
#'     Extra arguments for \code{GenTable} can be:
#'       \code{orderBy} if more than one \code{topONTresult} object is given then \code{orderBy} gives the index of which scores will be used to order the resulting table. Can be an integer index  or a character vector given the name of the \code{topONTresult} object.
#'       \code{ranksOf} same as \code{orderBy} argument except that this parameter shows the relative ranks of the specified result.   
#'       \code{cutoff} p-value cutoff. default 1
#'       \code{show.gene} default \code{FALSE}. Whether add a column in the result showing the significant genes.
#'       \code{use.symbol} default \code{FALSE}. Whether to use symbol or entrez_id for the significant genes if the \code{show.gene} is T.
#'       \code{entrez2symbol} default \code{NULL}. If \code{use.symbol} is \code{TRUE}, \code{use.symbol} must be provided as a list of entrez_id to gene symbol.
#'       \code{numChar} the GO term definition will be truncated such that only the first \code{numChar} characters are shown.
#'     %% printGenes(object, whichTerms, chip, numChar = 100, simplify = TRUE, geneCufOff = 50, pvalCutOff)
#'     %% printGenes(object, whichTerms, file, oneFile = FALSE, ...)
#'     Extra arguments for \code{printGenes} can be:
#'        \code{chip} character string containing the name of the Bioconductor annotation package for a microarray chip.
#'        \code{numChar} 
#'        \code{simplify} logical variable affecting how the results are returned.
#'        \code{geneCutOff} the maximal number of genes shown for each term.
#'        \code{pvalCutOff} only the genes with a p-value less than \code{pvalCutOff} are shown.
#'        \code{oneFile} if \code{TRUE} then a file for each GO term is generated.
#'   
#' @details
#' 
#'   \code{GenTable} is an easy to use function for summarising the most
#'   significant GO terms and the corresponding p-values. The function
#'   dispatches for \code{topONTdata} and \code{topONTresult} objects, and
#'   it can take an arbitrary number of the later, making comparison
#'   between various results easier.
#'   
#'   Note: One needs to type the complete attribute names (the exact name)
#'   of this function, like: \code{topNodes = 5}, \code{rankOf = "resultFis"}, etc. 
#'   This being the price paid for flexibility of specifying different
#'   number of \code{topONTdata} objects.
#'   
#' 
#'   
#'   The \code{showGroupDensity} function analyse the distribution of the
#'   gene-wise scores for a specified GO term.
#'   The function will show the distribution of the genes in a GO term
#'   compared with the complementary set, using a lattice plot.
#'   
#' 
#'   \code{printGenes} 
#'   The function will generate a table with all the probes annotated to
#'   the specified GO term. Various type of identifiers, the gene name and
#'   the gene-wise statistics are provided in the table. 
#'   
#'   One or more GO identifiers can be given to the function using the
#'   \code{whichTerms} argument. When more than one GO is specified, the
#'   function returns a list of \code{data.frames}, otherwise only one
#'   \code{data.frame} is returned.
#'   
#'   The function has a argument \code{file} which, when specified, will
#'   save the results into a file using the CSV format.
#' 
#'   For the moment the function will work only when the chip used has an
#'   annotation package available in Bioconductor. It will not work with
#'   other type of custom annotations.
#' 
#' 
#' @return A data.frame or a list of data.fames.
#' 
#' @author Adrian Alexa
#' 
#' @seealso
#'   \code{\link{groupStats-class}},
#'   \code{\link{getSigGroups-methods}}
#'

#' @examples
#' 
#' 
#' ########################################
#' ## GenTable
#' ########################################
#' 
#' ## load two topONTresult sample objects: resultFis and resultElimFis
#' data(ONTdata)
#' \dontrun{
#' ##These code needs the topOnto.xx.db package to run
#' ## generate the result of Fisher's exact test
#' sig.tab <- GenTable(GOdata, Fis = resultFis, topNodes = 20)
#' 
#' ## results of both test
#' sig.tab <- GenTable(GOdata, resultFis, resultElimFis, topNodes = 20)
#' 
#' ## results of both test with specified names
#' sig.tab <- GenTable(GOdata, Fis = resultFis, ElimFis = resultElimFis, topNodes = 20)
#' 
#' ## results of both test with specified names and specified ordering
#' sig.tab <- GenTable(GOdata, Fis = resultFis, ElimFis = resultElimFis, orderBy = "ElimFis", ranksOf = "Fis", topNodes = 20)
#' }
#' 
#' ########################################
#' ## showGroupDensity
#' ########################################
#' 
#' TERMID <- "DOID:10652"
#' print(showGroupDensity(GOdata, TERMID, ranks = FALSE, rm.one = FALSE))
#' 
#' 
#' ########################################
#' ## printGenes
#' ########################################
#' 
#' \dontrun{
#' library(hgu95av2.db)
#' goID <- "GO:0006629"
#' 
#' gt <- printGenes(GOdata, whichTerms = goID, chip = "hgu95av2.db", numChar = 40)
#' 
#' goIDs <- c("GO:0006629", "GO:0007076")
#' gt <- printGenes(GOdata, whichTerms = goIDs, chip = "hgu95av2.db", pvalCutOff = 0.01)
#' 
#' gt[goIDs[1]]
#' }
#' 
#' 
#' @keywords methods

setMethod("GenTable",
          signature(object = "topONTdata"),
          ## ... = list of topONTresult object
          ## orderBy = "ANY", ## integer or character (index/name)
          ## ranksOf = "ANY", ## which ranks to be computed (integer/character)
          ## topNodes = "integer",
          ## numChar = "integer",
          ## useLevels = "logical"),
          function(object, ..., orderBy = 1, ranksOf = 2,
                   topNodes = 10, numChar = 40,
                   format.FUN = format.pval, decreasing = FALSE,
                   useLevels = FALSE,cutoff=NULL,show.gene.sig=FALSE,show.gene.node=FALSE,use.symbol=FALSE,entrez2symbol=NULL) {
            
            resList <- list(...)
            #browser()
            ## first for the class of the elements in the list
            if(!all(sapply(resList, is, "topONTresult")))
              stop("Use: topONTdata, topONTresult_1, topONTresult_2, ..., \"parameters\".")
            
            ## if no names were provided we name them
            if(is.null(names(resList)))
              names(resList) <- paste("result", 1:length(resList), sep = "")
            
            ## obtain the score from the objects
            resList <- lapply(resList, score)
            
            ## order the scores and take care of the case in which only one result is provided
            ## in such case the orderBy and ranksOf parameters are ignored.
            if(length(resList) == 1) {
              orderBy <- ranksOf <- 1
              l <- data.frame(resList)
              names(l) <- ifelse(is.null(names(resList)), "", names(resList)) 
            } else {
              l <- .sigAllMethods(resList)
            }
            
            index <- order(l[, orderBy], decreasing = decreasing)
            l <- l[index, , drop = FALSE]
            
            if(decreasing)
              rr <- rank(-l[, ranksOf], ties = "first")
            else
              rr <- rank(l[, ranksOf], ties = "first")
            
            if(length(rownames(l)) < topNodes)
              topNodes = length(rownames(l))
            
            whichTerms <- rownames(l)[1:topNodes]
            
            l <- l[whichTerms, , drop = FALSE]
            
            ##apply cutoff
            if(!is.null(cutoff)){
              cut<-sum(l[,orderBy]<=cutoff)
              if(cut<topNodes)
                topNodes=cut
              
              whichTerms <- rownames(l)[1:topNodes]           
              l <- l[whichTerms, , drop = FALSE]
            }
            
            
            rr <- as.integer(rr[1:topNodes])
           
            shortNames <- .getTermsDefinition(ONTdata=object, whichTerms, numChar = numChar)
            
            infoMat <- data.frame('TERM ID' = whichTerms, 'Term' = shortNames, stringsAsFactors = FALSE)
            
            ## put the levels of the GO
            if(useLevels) {
              nodeLevel <- buildLevels(graph(object), leafs2root = TRUE)
              nodeLevel <- unlist(mget(whichTerms, envir = nodeLevel$nodes2level))
              infoMat <- data.frame(infoMat, Level = as.integer(nodeLevel))
            }
            
            annoStat <- termStat(object, whichTerms)
            
            ## if orderBy == ranksOf then there is no need to put the ranks
            if(ranksOf != orderBy) {
              dim(rr) <- c(length(rr), 1)
              colnames(rr) <- paste("Rank in ", ifelse(is.character(ranksOf), ranksOf, colnames(l)[ranksOf]), sep = "")
              
              if(nrow(l)==1)
                infoMat <- data.frame(infoMat, annoStat, rr,
                                      matrix(apply(l, 2, format.FUN, dig = 2, eps = 1e-30),ncol = ncol(l),dimnames=list(NULL,colnames(l))),
                                      check.names = FALSE, stringsAsFactors = FALSE)
                
              else
                infoMat <- data.frame(infoMat, annoStat, rr,
                                    apply(l, 2, format.FUN, dig = 2, eps = 1e-30),
                                    check.names = FALSE, stringsAsFactors = FALSE)
 
              
            } else {
                if(nrow(l)==1)
                  infoMat <- data.frame(infoMat, annoStat,
                                        matrix(apply(l, 2, format.FUN, dig = 2, eps = 1e-30),ncol = ncol(l),dimnames=list(NULL,colnames(l))),
                                        check.names = FALSE, stringsAsFactors = FALSE)
                else
                  infoMat <- data.frame(infoMat, annoStat,
                                    apply(l, 2, format.FUN, dig = 2, eps = 1e-30),
                                    check.names = FALSE, stringsAsFactors = FALSE)
            }
            
            ##rownames(infoMat) <- whichTerms
            rownames(infoMat) <- 1:length(whichTerms)
            #browser()
            if(show.gene.sig){
              sig.genes<-.get.sig.gene.in.term(object)
              if(use.symbol){
                if(is.null(entrez2symbol)){
                  #message("entrez2symbol is missing. Ignore show.symbol!")
                  message("entrez2symbol is not provided by user, using entrez2symbol mapping from org.Hs.eg.db!")
                  require('org.Hs.eg.db')
                  entrez2symbol<-revmap(as.list(org.Hs.egSYMBOL2EG))
                }else{
                  sig.gene2term<-revmap(sig.genes)
                  symbols<-entrez2symbol[as.character(names(sig.gene2term))]
                  names(sig.gene2term)<-unname(unlist(symbols))
                  sig.genes<-revmap(sig.gene2term)
                }
              }
              hits<-sapply(sig.genes[infoMat$TERM.ID],paste,collapse=',')
              
              
              infoMat<-data.frame(infoMat,sig.genes=hits)
            }
            
            if(show.gene.node){
              nodes.genes<-.genesInNode(object@graph,nNames =nodes(object@graph),score = F)
              if(use.symbol){
                if(is.null(entrez2symbol)){
                  message("entrez2symbol is missing. Ignore show.symbol!")
                  # require('org.Hs.eg.db')
                  # entrez2symbol<-revmap(as.list(org.Hs.egSYMBOL2EG))
                }else{
                  nodes.genes2term<-revmap(nodes.genes)
                  symbols<-entrez2symbol[as.character(names(nodes.genes2term))]
                  names(nodes.genes2term)<-unname(unlist(symbols))
                  nodes.genes<-revmap(nodes.genes2term)
                }
              }
              annotation<-sapply(nodes.genes[infoMat$TERM.ID],paste,collapse=',')
              
              infoMat<-data.frame(infoMat,annotation=annotation)
            }
            
            
            return(infoMat)           
          })




###for batch run
setMethod("GenTable2",
          signature(object = "topONTdata"),
          ## ... = list of topONTresult object
          ## orderBy = "ANY", ## integer or character (index/name)
          ## ranksOf = "ANY", ## which ranks to be computed (integer/character)
          ## topNodes = "integer",
          ## numChar = "integer",
          ## useLevels = "logical"),
          function(object, ..., orderBy = 1, ranksOf = 2,
                   topNodes = 10, numChar = 40,
                   format.FUN = format.pval, decreasing = FALSE,
                   useLevels = FALSE,cutoff=NULL,show.gene=FALSE,use.symbol=FALSE,entrez2symbol=NULL) {
            
            resList <- (...)
            
            ## first for the class of the elements in the list
            if(!all(sapply(resList, is, "topONTresult")))
              stop("Use: topONTdata, topONTresult_1, topONTresult_2, ..., \"parameters\".")
            
            ## if no names were provided we name them
            if(is.null(names(resList)))
              names(resList) <- paste("result", 1:length(resList), sep = "")
            
            ## obtain the score from the objects
            resList <- lapply(resList, score)
            
            ## order the scores and take care of the case in which only one result is provided
            ## in such case the orderBy and ranksOf parameters are ignored.
            if(length(resList) == 1) {
              orderBy <- ranksOf <- 1
              l <- data.frame(resList)
              names(l) <- ifelse(is.null(names(resList)), "", names(resList)) 
            } else {
              l <- .sigAllMethods(resList)
            }
            
            index <- order(l[, orderBy], decreasing = decreasing)
            l <- l[index, , drop = FALSE]
            
            if(decreasing)
              rr <- rank(-l[, ranksOf], ties = "first")
            else
              rr <- rank(l[, ranksOf], ties = "first")
            
            if(length(rownames(l)) < topNodes)
              topNodes = length(rownames(l))
            
            whichTerms <- rownames(l)[1:topNodes]
            
            l <- l[whichTerms, , drop = FALSE]
            
            ##apply cutoff
            if(!is.null(cutoff)){
              cut<-sum(l[,orderBy]<=cutoff)
              if(cut<topNodes)
                topNodes=cut
              
              whichTerms <- rownames(l)[1:topNodes]           
              l <- l[whichTerms, , drop = FALSE]
            }
            
            
            rr <- as.integer(rr[1:topNodes])
            
            shortNames <- .getTermsDefinition(ONTdata=object, whichTerms, numChar = numChar)
            
            infoMat <- data.frame('TERM ID' = whichTerms, 'Term' = shortNames, stringsAsFactors = FALSE)
            
            ## put the levels of the GO
            if(useLevels) {
              nodeLevel <- buildLevels(graph(object), leafs2root = TRUE)
              nodeLevel <- unlist(mget(whichTerms, envir = nodeLevel$nodes2level))
              infoMat <- data.frame(infoMat, Level = as.integer(nodeLevel))
            }
            
            annoStat <- termStat(object, whichTerms)
            
            ## if orderBy == ranksOf then there is no need to put the ranks
            if(ranksOf != orderBy) {
              dim(rr) <- c(length(rr), 1)
              colnames(rr) <- paste("Rank in ", ifelse(is.character(ranksOf), ranksOf, colnames(l)[ranksOf]), sep = "")
              
              infoMat <- data.frame(infoMat, annoStat, rr,
                                    apply(l, 2, format.FUN, dig = 2, eps = 1e-30),
                                    check.names = FALSE, stringsAsFactors = FALSE)
              
            } else {
              infoMat <- data.frame(infoMat, annoStat,
                                    apply(l, 2, format.FUN, dig = 2, eps = 1e-30),
                                    check.names = FALSE, stringsAsFactors = FALSE)
            }
            
            ##rownames(infoMat) <- whichTerms
            rownames(infoMat) <- 1:length(whichTerms)
            
            if(show.gene){
              sig.genes<-.get.sig.gene.in.term(GOdata)
              if(use.symbol){
                #require('org.Dm.eg.db')
                #entrez2symbol<-revmap(as.list(org.Dm.egSYMBOL2EG))
                if(is.null(entrez2symbol)){
                  message("entrez2symbol is missing. Ignore show.symbol!")
                }else{
                  sig.gene2term<-revmap(sig.genes)
                  symbols<-entrez2symbol[as.character(names(sig.gene2term))]
                  names(sig.gene2term)<-unname(unlist(symbols))
                  sig.genes<-revmap(sig.gene2term)
                }
              }
              hits<-sapply(sig.genes[infoMat$TERM.ID],paste,collapse=',')
              infoMat<-data.frame(infoMat,sig.genes=hits)
            }
            
            
            return(infoMat)            
          })



setMethod("GenTableGSEA",
          signature(object = "topONTdata"),
          ## ... = list of topONTresult object
          ## orderBy = "ANY", ## integer or character (index/name)
          ## ranksOf = "ANY", ## which ranks to be computed (integer/character)
          ## topNodes = "integer",
          ## numChar = "integer",
          ## useLevels = "logical"),
          function(object, ..., orderBy = 1, ranksOf = 2,
                   topNodes = 10, numChar = 40,
                   format.FUN = format.pval, decreasing = FALSE,
                   useLevels = FALSE,cutoff=NULL,show.gene=FALSE,use.symbol=FALSE,entrez2symbol=NULL) {
            
            resList <- list(...)
            browser()
            ## first for the class of the elements in the list
            if(!all(sapply(resList, is, "topONTresult")))
              stop("Use: topONTdata, topONTresult_1, topONTresult_2, ..., \"parameters\".")
            
            ## if no names were provided we name them
            if(is.null(names(resList)))
              names(resList) <- paste("result", 1:length(resList), sep = "")
            
            ## obtain the score from the objects
            resList <- lapply(resList, score)
            
            ## order the scores and take care of the case in which only one result is provided
            ## in such case the orderBy and ranksOf parameters are ignored.
            if(length(resList) == 1) {
              orderBy <- ranksOf <- 1
              l <- data.frame(resList)
              names(l) <- ifelse(is.null(names(resList)), "", names(resList)) 
            } else {
              l <- .sigAllMethods(resList)
            }
            
            index <- order(l[, orderBy], decreasing = decreasing)
            l <- l[index, , drop = FALSE]
            
            if(decreasing)
              rr <- rank(-l[, ranksOf], ties = "first")
            else
              rr <- rank(l[, ranksOf], ties = "first")
            
            if(length(rownames(l)) < topNodes)
              topNodes = length(rownames(l))
            
            whichTerms <- rownames(l)[1:topNodes]
            
            l <- l[whichTerms, , drop = FALSE]
            
            ##apply cutoff
            if(!is.null(cutoff)){
              cut<-sum(l[,orderBy]<=cutoff)
              if(cut<topNodes)
                topNodes=cut
              
              whichTerms <- rownames(l)[1:topNodes]           
              l <- l[whichTerms, , drop = FALSE]
            }
            
            
            rr <- as.integer(rr[1:topNodes])
            
            shortNames <- .getTermsDefinition(ONTdata=object, whichTerms, numChar = numChar)
            
            infoMat <- data.frame('TERM ID' = whichTerms, 'Term' = shortNames, stringsAsFactors = FALSE)
            
            ## put the levels of the GO
            if(useLevels) {
              nodeLevel <- buildLevels(graph(object), leafs2root = TRUE)
              nodeLevel <- unlist(mget(whichTerms, envir = nodeLevel$nodes2level))
              infoMat <- data.frame(infoMat, Level = as.integer(nodeLevel))
            }
            
            annoStat <- termStat(object, whichTerms)
            
            ## if orderBy == ranksOf then there is no need to put the ranks
            if(ranksOf != orderBy) {
              dim(rr) <- c(length(rr), 1)
              colnames(rr) <- paste("Rank in ", ifelse(is.character(ranksOf), ranksOf, colnames(l)[ranksOf]), sep = "")
              
              infoMat <- data.frame(infoMat, annoStat, rr,
                                    apply(l, 2, format.FUN, dig = 2, eps = 1e-30),
                                    check.names = FALSE, stringsAsFactors = FALSE)
              
            } else {
              infoMat <- data.frame(infoMat, annoStat,
                                    apply(l, 2, format.FUN, dig = 2, eps = 1e-30),
                                    check.names = FALSE, stringsAsFactors = FALSE)
            }
            
            ##rownames(infoMat) <- whichTerms
            rownames(infoMat) <- 1:length(whichTerms)
            
            if(show.gene){
              sig.genes<-.get.sig.gene.in.term(GOdata)
              if(use.symbol){
                #require('org.Dm.eg.db')
                #entrez2symbol<-revmap(as.list(org.Dm.egSYMBOL2EG))
                if(is.null(entrez2symbol)){
                  message("entrez2symbol is missing. Ignore show.symbol!")
                }else{
                  sig.gene2term<-revmap(sig.genes)
                  symbols<-entrez2symbol[as.character(names(sig.gene2term))]
                  names(sig.gene2term)<-unname(unlist(symbols))
                  sig.genes<-revmap(sig.gene2term)
                }
              }
              hits<-sapply(sig.genes[infoMat$TERM.ID],paste,collapse=',')
              infoMat<-data.frame(infoMat,sig.genes=hits)
            }
            
            
            return(infoMat)            
          })


## if(!isGeneric("genLatexTable"))
##   setGeneric("genLatexTable", function(object, resList, ...) standardGeneric("genLatexTable"))

## setMethod("genLatexTable",
##           signature(object = "topONTdata",
##                     resList = "list",
##                     orderBy = "ANY", ## integer or character (index/name)
##                     ranksOf = "ANY", ## which ranks to be computed (integer/character)
##                     topNodes = "integer",
##                     numChar = "integer"),
##           function(object, resList, orderBy, ranksOf, topNodes = 10, no.char = 40) {
            

## = infoMat, 
##                         pval.tab = xtable.matrix(infoMat, caption = 'GO terms p-value', label = 'tab:GOinfo')))
          



################################################################################
## function that computes the raw/adjusted p-values based on
## multtest package
#' @name getPvalues
#' @aliases getPvalues
#' @title Convenient function to compute p-values from a gene expression matrix.
#' @description Warping function of "mt.teststat", for computing p-values of a gene expression matrix.
#' @usage
#'    getPvalues(edata, classlabel, test = "t", alternative = c("greater", "two.sided", "less")[1],
#'    genesID = NULL, correction = c("none", "Bonferroni", "Holm", "Hochberg", "SidakSS", "SidakSD",
#'    "BH", "BY")[8]) 
#'    
#'  @param edata Gene expression matrix.
#'  @param classlabel The phenotype of the data
#'  @param test Which test statistic to use
#'  @param alternative The alternative of the test statistic
#'  @param genesID if a subset of genes is provided
#'  @param correction Multiple testing correction procedure
#' 
#' @return
#'   An named numeric vector of p-values.
#' 
#' 
#' @author Adrian Alexa
#' @seealso
#'   \code{\link{GOKSTest}},  \code{\link{groupStats-class}},
#'   \code{\link{getSigGroups-methods}}
#' 
#' @examples
#' 
#' library(ALL)
#' data(ALL)
#' 
#' ## discriminate B-cell from T-cell
#' classLabel <- as.integer(sapply(ALL$BT, function(x) return(substr(x, 1, 1) == 'T')))
#' 
#' ## Differentially expressed genes
#' geneList <- getPvalues(exprs(ALL), classlabel = classLabel,
#'                        alternative = "greater", correction = "BY")
#' 
#' hist(geneList, 50)
#' 
#' @keywords graphs
getPvalues <- function(edata, classlabel, test = "t",
                       alternative = c("greater", "two.sided", "less")[1],
                       genesID = NULL,
                       correction = c("none", "Bonferroni", "Holm", "Hochberg",
                         "SidakSS", "SidakSD", "BH", "BY")[8]) {
  
  #require('multtest') || stop('package multtest is required')

  ## restrict the dataset
  if(!is.null(genesID))
  edata <- edata[intersect(genesID, rownames(edata)), ]
  genesID <- rownames(edata)
  
  t.stats <- multtest::mt.teststat(edata, classlabel = classlabel, test = test)

  if (alternative == "less") 
    p.values <- pt(t.stats, df = length(classlabel) - 2)
  else if (alternative == "greater") 
    p.values <- pt(t.stats, df = length(classlabel) - 2, lower.tail = FALSE)
  else 
    p.values <- 2 * pt(-abs(t.stats), df = length(classlabel) - 2)
  
  if(correction != "none") {
    p.values <- multtest::mt.rawp2adjp(p.values, correction)
    p.values <- p.values$adjp[order(p.values$index), correction]
  }
  names(p.values) <- genesID
  
  return(p.values)
}

################################################################################

## a <= b 
.sigRatio.ratio <- function(a, b, tolerance = 1e-50) {
  
  ## if a and b are almost equal we return 2
  if(identical(all.equal(a, b, tolerance = tolerance), TRUE))
    return(2)
  
  ## we want to compute b / a, thus we must take care for a = 0
  if(identical(all.equal(a, 0, tolerance = tolerance), TRUE))
    return(abs(b / (a + tolerance)))
  
  return(abs(b / a))
}

.sigRatio.log <- function(a, b, tolerance = 1e-50) {
  
  ## if a and b are almost equal we return 2
  if(identical(all.equal(a, b, tolerance = tolerance), TRUE))
    return(2)
  
  ## we want to compute log(a) / log(b), thus we must take care for b = 1
  if(identical(all.equal(log10(b), 0, tolerance = tolerance), TRUE))
    return(abs(log10(a) / tolerance))
  
  return(abs(log10(a) / log10(b)))
}

## a <= b 
.sigRatio.01 <- function(a, b, tolerance = 1e-50) {
  
  ## if a and b are almost equal we return 2
  if(identical(all.equal(a, b, tolerance = tolerance), TRUE))
    return(2)
  
  if(a < b)
    return(1e50)
  
  return(2)
}

################################################################################
################################################################################

## functions to get gene stats from the topONTdata object
.getGeneData <- function(object) {
  return(c(Annotated = numGenes(object),
           Significant = numSigGenes(object),
           NodeSize = object@nodeSize))
}


## functions to get gene stats from the topONTdata object
.printGeneData <- function(x) {
  cat("Annotation data:\n")
  if("Annotated" %in% names(x))
    cat("    Annotated genes:", x["Annotated"], "\n")
  if("Significant" %in% names(x))
    cat("    Significant genes:", x["Significant"], "\n")
  cat("    Min. no. of genes annotated to a term:", x["NodeSize"], "\n")
  if("SigTerms" %in% names(x))
    cat("    Nontrivial nodes:", x["SigTerms"], "\n")
}

################################################################################
################################################################################

## function to aggregate 2 or more topONTdata objects
combineResults <- function(..., method = c("gmean", "mean", "median", "min", "max")) {

  resList <- list(...)
  ## first for the class of the elements in the list
  if((length(resList) < 2) || !all(sapply(resList, is, "topONTresult")))
    stop("Use: topONTresult_1, topONTresult_2, ..., method = \"mean\"")
  
  combMethod <- match.arg(method)

  retVal <- resList[[1]] 

  ## update the infos
  description(retVal) <- paste(description(retVal),
                               paste(combMethod, "(",
                                     paste(sapply(resList, algorithm), collapse = ", "),
                                     ")", sep = ""),
                               sep = "\n")
  testName(retVal) <- paste(unique(sapply(resList, testName)), collapse = ", ")
  algorithm(retVal) <- "ensemble"
    
  ## get the list of GOs
  nn <- names(score(retVal))
  ## obtain the score from the objects
  resList <- do.call(cbind, lapply(resList, score, whichGO = nn))

  newRes <- switch(combMethod, 
                   mean = rowMeans(resList),
                   median = rowMedians(resList),
                   min = rowMin(resList),
                   max = rowMax(resList),
                   gmean = exp(rowMeans(log(resList))))
  names(newRes) <- rownames(resList) # just to make sure ... some of the functions we use might drop the names
  score(retVal) <- newRes

  return(retVal)
}


.get.sig.gene.in.term<-function(GOdata){
  genes<-genesInTerm(GOdata)
  lapply(genes,intersect,sigGenes(GOdata))
}


run.batch<-function(ontologys=c('HDO','HPO'),gene.file=system.file("extdata/genelist","age", package ="topOnto"),
                    annotation.file=c(system.file("extdata/annotation","human_gene2HDO_o", package ="topOnto"),system.file("extdata/annotation","human_gene2HPO_o", package ="topOnto")),
                    clip=c(c('DOID:10652'),c('HP:0002511')),
                    algorithm=c('classic','elim'),
                    statistic=c('fisher','fisher'),
                    topNodes = 15,useLevels=TRUE,cutoff=1,
                    ...
                    )
{
 
  allRes <- list()
  for(i in 1:length(ontologys)){
    initONT(ontologys[i])
    geneID2TERM <- readMappings(annotation.file[i])
    geneNames=names(geneID2TERM)
    myInterestingGenes=(read.csv(header = FALSE, file = gene.file))$V1
    geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
    names(geneList) <- geneNames
    
    if(!is.null(clip)){
      term2geneID<-filter.ontology.annotation(clip[i],term2genes=revmap(geneID2TERM))
      geneID2TERM<-revmap(term2geneID)
    }
    
    
    ONTdata <- new("topONTdata", ontology = ontologys[i], allGenes = geneList,annot = annFUN.gene2GO, gene2GO = geneID2TERM)
    result<-list()
    for(j in 1:length(algorithm)){
      result[[paste(algorithm[j],statistic[j],sep = '')]]<-runTest(ONTdata, algorithm = algorithm[j], statistic = statistic[j])
    }
    allRes[[ontologys[i]]]<-list('ONTdata'=ONTdata,'result'=result,'tableView'=GenTable2(ONTdata,result, topNodes = topNodes,useLevels=useLevels,cutoff=cutoff,...))
  }
  
  allRes
}

search.result<-function(GenTable,key=c(),type=c(id,name)[1]){
  if(type==1){
    GenTable[which(GenTable$TERM.ID %in% key),]
  }else{
    pattern=paste(key,collapse = '|')
    GenTable[grep(pattern,GenTable$Term,perl = T,ignore.case = T,value = T),]
  }
}

reorder.result<-function(GenTable,key='classicfisher'){
  GenTable[order(as.numeric(GenTable[,key]),decreasing = F),]
}

cutoff.result<-function(GenTable,key='classicfisher',cutoff=0.05){
  GenTable[as.numeric(GenTable[,key])<=cutoff,]
}

to.latex.table<-function(tb,caption='',label='',...){
  require('xtable')
  align<-rep('r',ncol(tb)+1)
  xtable(tb, caption = caption, label = label,align=align,...)
}
hxin/topOnto documentation built on May 17, 2019, 9:15 p.m.