R/go-analysis.R

Defines functions mapGoToSlim myFisherTest collectChildren goClusterResults clusterPheatmap clusterHeatmap clusterResults getChildTerms goStringsToList ezGetGoByLevels ezGroupGO ezGSEA ezEnricher ezGoseq twoGroupsGOSE addGeneNamesEnrich twoGroupsGO compileEnrichmentInput addGoParents getGOparents separateGoIdsByOnto hasGoAnnotation doGo

Documented in addGoParents clusterHeatmap clusterResults doGo ezGetGoByLevels ezGoseq ezGroupGO getGOparents goClusterResults goStringsToList hasGoAnnotation separateGoIdsByOnto twoGroupsGO

###################################################################
# Functional Genomics Center Zurich
# This code is distributed under the terms of the GNU General
# Public License Version 3, June 2007.
# The terms are available here: http://www.gnu.org/licenses/gpl.html
# www.fgcz.ch


doGo = function(param, seqAnno){
  flag = param$runGO && (hasGeneMapping(param, seqAnno) || param$featureLevel == "gene") && hasGoAnnotation(seqAnno)
  message("doGo: ", flag)
  return(flag)
}

hasGoAnnotation = function(seqAnno){
  goColumns <- c("GO BP", "GO MF", "GO CC")
  hasGO <- all(goColumns %in% colnames(seqAnno))
  validGO <- all(colSums(seqAnno[, goColumns] == "") != nrow(seqAnno))
  validGO2 <- !any(is.na(seqAnno[, goColumns]))
  return(hasGO && validGO && validGO2)
}

##' @title Separates GO ID's by ontology
##' @description Separates GO ID's by ontology
##' @param goIdStrings the GO ID's to separate.
##' @template roxygen-template
##' @return Returns the separated GO ID's
##' @examples separateGoIdsByOnto(c("GO:0008150", "GO:0005575"))
separateGoIdsByOnto = function(goIdStrings){
  require("GO.db", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  bpGos = keys(GOBPPARENTS)
  mfGos = keys(GOMFPARENTS)
  ccGos = keys(GOCCPARENTS)
  result = data.frame(row.names=1:length(goIdStrings))
  goList = strsplit(trimWhiteSpace(gsub("; ", ";", goIdStrings)), ";") ## support both separators "; " and ";" 
  result[["GO BP"]] = sapply(lapply(goList, function(x,y){intersect(x,y)}, y=bpGos), ezCollapse)
  result[["GO MF"]] = sapply(lapply(goList, function(x,y){intersect(x,y)}, y=mfGos), ezCollapse)
  result[["GO CC"]] = sapply(lapply(goList, function(x,y){intersect(x,y)}, y=ccGos), ezCollapse)
  return(result)
}

##' @title Gets the GO parents
##' @description Gets the GO parents.
##' @param id the ID to check
##' @param onto the ontology to use
##' @template roxygen-template
##' @return Returns the ancestor terms if they are specified.
##' @examples
##' getGOparents("GO:0034767")
##' addGoParents(c("GO:0034767", "GO:0034768"), "BP")
getGOparents = function(id, onto="BP"){
  require(GO.db)
  go2Ancestor = switch(onto, BP=as.list(GOBPANCESTOR), MF=as.list(GOMFANCESTOR), CC=as.list(GOCCANCESTOR))
  ancestorIds = setdiff(go2Ancestor[[id]], "all") ## remove the all category
  if (is.null(ancestorIds)){
    return()
  } else {
    ancestorTerms = Term(GOTERM[ancestorIds])
    return(ancestorTerms)
  }
}

##' @describeIn getGOparents Adds the GO parents.
addGoParents = function(gene2goList, onto){
  require(GO.db)
  goParents = switch(onto, BP=as.list(GOBPPARENTS),
                     CC=as.list(GOCCPARENTS),
                     MF=as.list(GOMFPARENTS))
  return(lapply(gene2goList, function(x){setdiff(union(x, unlist(goParents[x])), "all")}))
}


compileEnrichmentInput = function(param, se){
  require(SummarizedExperiment)
  seqAnno = data.frame(rowData(se), row.names=rownames(se),
                       check.names = FALSE, stringsAsFactors=FALSE)
  logSignal = log2(shiftZeros(assays(se)$xNorm, param$minSignal))
  groupMeans = cbind(rowMeans(logSignal[ , param$grouping == param$sampleGroup, 
                                         drop=FALSE]),
                     rowMeans(logSignal[ , param$grouping == param$refGroup, 
                                         drop=FALSE])
  )
  colnames(groupMeans) = c(param$sampleGroup, param$refGroup)
  normalizedAvgSignal=rowMeans(groupMeans)
  log2Ratio <- set_names(rowData(se)$log2Ratio, rowData(se)$gene_id)
  
  if (param$featureLevel != "gene"){
    genes = getGeneMapping(param, seqAnno)
    goAnno = aggregateGoAnnotation(seqAnno, genes)
    seqAnno = ezFrame(gene_id=rownames(goAnno),
                      gene_name=seqAnno$gene_name[match(rownames(goAnno), seqAnno$gene_id)],
                      goAnno)
    if (!is.null(normalizedAvgSignal)){ ## if its not an identity mapping
      normalizedAvgSignal = tapply(normalizedAvgSignal[names(genes)], genes, mean)
      normalizedAvgSignal = normalizedAvgSignal[rownames(seqAnno)]
    }
    if (is.null(genes)){
      stop("no probe 2 gene mapping found found for ")
    }
  } else {
    genes = rownames(seqAnno)
    names(genes) = genes
  }
  
  isSig = rowData(se)$pValue <= param$pValThreshGO & rowData(se)$usedInTest
  isUp = rowData(se)$log2Ratio > param$log2RatioThreshGO & isSig
  isDown = rowData(se)$log2Ratio < -param$log2RatioThreshGO & isSig
  probes = rownames(groupMeans)
  presentGenes = na.omit(unique(genes[probes[rowData(se)$isPresentProbe]]))
  upGenes = na.omit(unique(genes[probes[isUp]]))
  downGenes = na.omit(unique(genes[probes[isDown]]))
  bothGenes = union(upGenes, downGenes)
  normalizedAvgSignal = normalizedAvgSignal[presentGenes]
  if (length(presentGenes) == 0 | length(bothGenes) == 0){
    ezWrite("presentGenes: ", length(presentGenes), " up: ", length(upGenes), 
            " down: ", length(downGenes), " both: ", length(bothGenes))
  }
  
  ontologies = c("BP", "MF", "CC")
  require(GO.db)
  go2geneDfList = list()
  for (onto in ontologies){
    gene2goList = goStringsToList(seqAnno[[paste("GO", onto)]], 
                                  listNames=rownames(seqAnno))[presentGenes]
    if (param$includeGoParentAnnotation){
      gene2goList = addGoParents(gene2goList, onto) 
    }
    ### consider only genes with annotation in the currently selected ontology!!!!
    allGos = switch(onto,
                    BP=keys(GOBPPARENTS),
                    MF=keys(GOMFPARENTS),
                    CC=keys(GOCCPARENTS),
                    NA)
    if (!all(unlist(gene2goList) %in% allGos)){
      gene2goList = lapply(gene2goList, function(x){intersect(x, allGos)})
    }
    gene2goList = gene2goList[lengths(gene2goList) > 0]
    goIDs <- unlist(gene2goList, use.names=FALSE)
    go2geneDF <- data.frame(ont=goIDs,
                            gene=rep(names(gene2goList), lengths(gene2goList)),
                            stringsAsFactors = FALSE)
    go2geneDfList[[onto]] = go2geneDF
  }
  
  ans = list(selections=list(upGenes=upGenes, downGenes=downGenes, bothGenes=bothGenes),
             presentGenes=presentGenes, 
             normalizedAvgSignal=normalizedAvgSignal, log2Ratio=log2Ratio, seqAnno=seqAnno,
             go2gene = go2geneDfList)
  return(ans)
}

##' @title Performs the GO analysis for two groups
##' @description Performs the GO analysis for two groups.
##' @param param a list of parameters:
##' \itemize{
##'   \item{featureLevel}{ which feature level to use.}
##'   \item{pValThreshGO}{ a numeric specifying the threshold for the GO p-value.}
##'   \item{log2RatioThreshGO}{ a numeric specifying the threshold for the GO log2 ratios.}
##'   \item{includeGoParentAnnotation}{ a logical indicating whether to include the annotation of the GO parents.}
##' }
##' @param testResult a list containing the results of an earlier test.
##' @param seqAnno the sequence annotation.
##' @param normalizedAvgSignal an optional normalized average signal.
##' @param method which method to pass to \code{ezGoseq()}.
##' @template roxygen-template
##' @return Returns the results of the GO analysis.
##' @seealso \code{\link{ezGoseq}}
twoGroupsGO = function(param, testResult, seqAnno, normalizedAvgSignal=NULL, method="Wallenius"){
  
  job = ezJobStart("twoGroupsGO")
  require("GOstats", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  require("annotate", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  
  if (param$featureLevel != "gene"){
    genes = getGeneMapping(param, seqAnno)
    seqAnno = aggregateGoAnnotation(seqAnno, genes)
    if (!is.null(normalizedAvgSignal)){ ## if its not an identity mapping
      normalizedAvgSignal = tapply(normalizedAvgSignal[names(genes)], genes, mean)
      normalizedAvgSignal = normalizedAvgSignal[rownames(seqAnno)]
    }
    if (is.null(genes)){
      stop("no probe 2 gene mapping found found for ")
    }
  } else {
    genes = rownames(seqAnno)
    names(genes) = genes
  }
  
  isSig = testResult$pValue < param$pValThreshGO & testResult$usedInTest
  isUp = testResult$log2Ratio > param$log2RatioThreshGO & isSig
  isDown = testResult$log2Ratio < -param$log2RatioThreshGO & isSig
  probes = rownames(testResult$groupMeans)
  presentGenes = na.omit(unique(genes[probes[testResult$isPresentProbe]]))
  upGenes = na.omit(unique(genes[probes[isUp]]))
  downGenes = na.omit(unique(genes[probes[isDown]]))
  bothGenes = union(upGenes, downGenes)
  normalizedAvgSignal = normalizedAvgSignal[presentGenes]
  if (length(presentGenes) == 0 | length(bothGenes) == 0){
    ezWrite("presentGenes: ", length(presentGenes), " up: ", length(upGenes), " down: ", length(downGenes), " both: ", length(bothGenes))
  }
  ontologies = c("BP", "MF", "CC")
  #goResults = list()
  #for (onto in ontologies){
  goResults = ezMclapply(ontologies, function(onto){
    gene2goList = goStringsToList(seqAnno[[paste("GO", onto)]], listNames=rownames(seqAnno))[presentGenes]
    if (param$includeGoParentAnnotation){
      gene2goList = addGoParents(gene2goList, onto) 
    }
    enrichUp = ezGoseq(param, selectedGenes=upGenes, allGenes=presentGenes, gene2goList=gene2goList, method=method, normalizedAvgSignal=normalizedAvgSignal, onto=onto)
    enrichDown = ezGoseq(param, selectedGenes=downGenes, allGenes=presentGenes, gene2goList=gene2goList, method=method, normalizedAvgSignal=normalizedAvgSignal, onto=onto)
    enrichBoth = ezGoseq(param, selectedGenes=bothGenes, allGenes=presentGenes, gene2goList=gene2goList, method=method, normalizedAvgSignal=normalizedAvgSignal, onto=onto)
    result = list(enrichUp=enrichUp, enrichDown=enrichDown, enrichBoth=enrichBoth)
    return(result)
  }, mc.cores=1)
  names(goResults) = ontologies
  #     goResults[[onto]] = list(enrichUp=enrichUp, enrichDown=enrichDown, enrichBoth=enrichBoth,
  #                              countsUp=countsUp, countsDown=countsDown, countsBoth=countsBoth)
  ezWriteElapsed(job)
  return(goResults)
}

addGeneNamesEnrich <- function(resEnrich, se){
  gene_ids <- strsplit(resEnrich$Genes, "; ")
  ids2names <- set_names(rowData(se)$gene_name, rowData(se)$gene_id)
  gene_names <- relist(ids2names[unlist(gene_ids)], gene_ids)
  gene_names <- sapply(gene_names, paste, collapse="; ")
  resEnrich$GenesNames <- gene_names
  return(resEnrich)
}


twoGroupsGOSE = function(param, se, method="Wallenius"){
  godata <- prepareGOData(param, se)
  seqAnno <- data.frame(rowData(se), row.names=rownames(se),
                        check.names = FALSE, stringsAsFactors=FALSE)
  
  upGenes <- godata$upGenes
  downGenes <- godata$downGenes
  bothGenes <- godata$bothGenes
  presentGenes <- godata$presentGenes
  normalizedAvgSignal <- godata$normalizedAvgSignal
  
  if (param$featureLevel != "gene"){
    genes = getGeneMapping(param, seqAnno)
    seqAnno = aggregateGoAnnotation(seqAnno, genes)
    if (!is.null(normalizedAvgSignal)){ ## if its not an identity mapping
      normalizedAvgSignal = tapply(normalizedAvgSignal[names(genes)], genes, mean)
      normalizedAvgSignal = normalizedAvgSignal[rownames(seqAnno)]
    }
    if (is.null(genes)){
      stop("no probe 2 gene mapping found found for ")
    }
  } else {
    genes = rownames(seqAnno)
    names(genes) = genes
  }
  
  job = ezJobStart("twoGroupsGO")
  require("GOstats", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  require("annotate", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)

  ontologies = c("BP", "MF", "CC")
  #goResults = list()
  #for (onto in ontologies){
  goResults = ezMclapply(ontologies, function(onto){
    gene2goList = goStringsToList(seqAnno[[paste("GO", onto)]],
                                  listNames=rownames(seqAnno))[presentGenes]
    if (param$includeGoParentAnnotation){
      gene2goList = addGoParents(gene2goList, onto) 
    }
    enrichUp = ezGoseq(param, selectedGenes=upGenes, allGenes=presentGenes, 
                       gene2goList=gene2goList, method=method, 
                       normalizedAvgSignal=normalizedAvgSignal, onto=onto)
    enrichDown = ezGoseq(param, selectedGenes=downGenes, allGenes=presentGenes, 
                         gene2goList=gene2goList, method=method, 
                         normalizedAvgSignal=normalizedAvgSignal, onto=onto)
    enrichBoth = ezGoseq(param, selectedGenes=bothGenes, allGenes=presentGenes, 
                         gene2goList=gene2goList, method=method, 
                         normalizedAvgSignal=normalizedAvgSignal, onto=onto)
    result = list(enrichUp=enrichUp, enrichDown=enrichDown, enrichBoth=enrichBoth)
    result <- lapply(result, addGeneNamesEnrich, se)
    return(result)
  }, mc.cores=1)
  names(goResults) = ontologies
  #     goResults[[onto]] = list(enrichUp=enrichUp, enrichDown=enrichDown, enrichBoth=enrichBoth,
  #                              countsUp=countsUp, countsDown=countsDown, countsBoth=countsBoth)
  ezWriteElapsed(job)
  return(goResults)
}


## ezGoseq considers only the genes that have annotations; genes without annotation are removed from the selectedGenes and allGenes
##' @describeIn twoGroupsGO Performs the GO analysis and returns a list of results.
ezGoseq = function(param, selectedGenes, allGenes, gene2goList=NULL, 
                   method=c("Wallenius", "Sampling", "Hypergeometric"),
                   onto=NULL, normalizedAvgSignal=NULL){
  method = match.arg(method)
  require("GO.db", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  #if (length(selectedGenes) <= 1){
  #  return(NA)
  #}
  
  stopifnot(names(gene2goList) %in% allGenes)
  stopifnot(!is.null(onto))
  ### consider only genes with annotation in the currently selected ontology!!!!
  allGos = switch(onto,
                  BP=keys(GOBPPARENTS),
                  MF=keys(GOMFPARENTS),
                  CC=keys(GOCCPARENTS),
                  NA)
  if (!all(unlist(gene2goList) %in% allGos)){
    gene2goList = lapply(gene2goList, function(x){intersect(x, allGos)})
  }
  
  gene2goList = gene2goList[lengths(gene2goList) > 0]
  
  ## GO analysis
  allGenes = intersect(allGenes, names(gene2goList))
  selectedGenes = intersect(selectedGenes, names(gene2goList))
  
  go2GenesList = inverseMapping(gene2goList)
  go2GenesList = go2GenesList[lengths(go2GenesList) >= param$minCountFisher]    
  goSizes = lengths(go2GenesList)
  go2SelectedGenes = lapply(go2GenesList, function(x, y){intersect(x, y)}, selectedGenes)  
  goCounts = lengths(go2SelectedGenes)
  
  gene.vector = as.integer(allGenes %in% selectedGenes)
  names(gene.vector) = allGenes
  pwf.counts = data.frame(DEgenes=gene.vector, bias.data=1, pwf=1, row.names=names(gene.vector))
  if (!is.null(normalizedAvgSignal)){
    stopifnot(names(gene.vector) %in% names(normalizedAvgSignal))
    tryCatch({pwf.counts = goseq::nullp(gene.vector, bias.data=2^normalizedAvgSignal[names(gene.vector)], plot.fit=FALSE)}, error=function(e){message("nullp failed")})
    #tryCatch({pwf.counts = goseq::nullp(gene.vector, bias.data=2^normalizedAvgSignal[names(gene.vector)], plot.fit=FALSE)})
    
  }
  #   else {
  #     stopifnot(method == "Hypergeometric") ## if there is no bias --> method must be Hypergeometric
  #     pwf.counts = data.frame(DEgenes=gene.vector, bias.data=1, pwf=1, row.names=names(gene.vector))
  #   }
  go.counts = goseq::goseq(pwf.counts, gene2cat=gene2goList, method=method)
  pvalues = go.counts[match(names(go2GenesList), go.counts$category), "over_represented_pvalue"]
  
  ## compile the result
  result = data.frame(row.names=names(go2GenesList), stringsAsFactors=FALSE, 
                      check.names=FALSE)
  result$Pvalue = pvalues
  result$fdr = p.adjust(pvalues, method="fdr")
  result$Count = goCounts
  result$Size = goSizes
  result$Term = Term(GOTERM[rownames(result)])
  result$Genes = sapply(go2SelectedGenes, function(x){paste(sort(x), collapse="; ")})
  return(result)
}

### -----------------------------------------------------------------
### ezEnricher with hypergeometric implementation from clusterProfiler
###
ezEnricher <- function(enrichInput, param){
  require(clusterProfiler)
  minGenesOverlap <- 3
  geneid2name = set_names(enrichInput$seqAnno$gene_name, enrichInput$seqAnno$gene_id)
  ontologies = c("BP", "MF", "CC")
  goResults = ezMclapply(ontologies, function(onto){
    result = list()
    for (mySel in names(enrichInput$selections)){
      enrichRes <- enricher(gene=enrichInput$selections[[mySel]],
                            universe=enrichInput$presentGenes,
                            TERM2GENE=enrichInput$go2gene[[onto]], pvalueCutoff = param$fdrThreshORA)
      if(!is.null(enrichRes)){
        tempTable <- enrichRes@result
        if(nrow(tempTable) != 0L){
          tempTable$Description <- Term(GOTERM[tempTable$ID])
          tempTable$geneName <- sapply(tempTable$geneID, function(idString){
            idString %>% 
              strsplit("/") %>% 
              unlist %>% 
              (function(x)geneid2name[x]) %>%  ## map to gene symbols
              na.omit %>%
              paste(collapse="/")
          })
          tempTable <- tempTable[tempTable$Count >= minGenesOverlap, ]
          
          enrichRes@result <- tempTable
        }
      }
      result[[mySel]] = enrichRes
    }
    return(result)
  }, mc.cores=1)
  names(goResults) = ontologies
  return(goResults)
}

### -----------------------------------------------------------------
### ezGSEA
###
ezGSEA <- function(enrichInput, param){
  require(clusterProfiler)
  require(GO.db)
  geneid2name = set_names(enrichInput$seqAnno$gene_name, enrichInput$seqAnno$gene_id)
  ontologies = c("BP", "MF", "CC")
  goResults = ezMclapply(ontologies, function(onto){
    enrichRes <- GSEA(gene=sort(enrichInput$log2Ratio, decreasing = TRUE), pvalueCutoff = param$fdrThreshGSEA,
                      TERM2GENE=enrichInput$go2gene[[onto]])
    if(!is.null(enrichRes)){
      tempTable <- enrichRes@result
      if(nrow(tempTable) != 0L){
        tempTable$Description <- Term(GOTERM[tempTable$ID])
        tempTable$geneName <- sapply(relist(geneid2name[unlist(strsplit(tempTable$core_enrichment, "/"))], 
                                            strsplit(tempTable$core_enrichment, "/")), paste, collapse="/")
        enrichRes@result <- tempTable
      }
    }
    return(enrichRes)
  }, mc.cores=1)
  names(goResults) = ontologies
  return(goResults)
}

##' @title Groups GO terms and information
##' @description Groups GO terms and information.
##' @param selectedGenes a character vector containing the selected genes.
##' @param go2GeneList a character vector.
##' @param onto a character representing the ontology to use.
##' @param levels an integer vector selecting the levels to get GO information by.
##' @param goSlim a character vector.
##' @template roxygen-template
##' @return Returns the results of grouping the GO information.
ezGroupGO = function(selectedGenes, go2GeneList, onto="CC", levels = 2:4, goSlim=NULL) {
  goByLevel = ezGetGoByLevels(onto, levels, goSlim=goSlim)
  levelCounts = list()
  for (l in levels){
    goIds = goByLevel[[paste("level", l)]]
    go2selGenes = lapply(go2GeneList[goIds], function(genesInCategory) intersect(selectedGenes, genesInCategory))
    result = data.frame("GO ID" = goIds, Level=rep(paste("level", l), length(goIds)), stringsAsFactors=FALSE, check.names=FALSE)
    result[["Term"]] = Term(GOTERM[goIds])
    result[["Count"]] = sapply(go2selGenes, length)
    result[["Size"]] = sapply(go2GeneList[goIds], length)
    result[["Genes"]] = sapply(go2selGenes, paste, collapse="; ")
    levelCounts[[paste("level", l)]] =result
  }
  levelCounts[["unknown"]] = data.frame("GO ID"="unknown", Level="unknown", Term="unknown",
                                        Count=length(setdiff(selectedGenes, unlist(go2GeneList))),
                                        Size=NA, Genes="", stringsAsFactors=FALSE, check.names=FALSE
  )
  result = do.call("rbind", levelCounts)
  return(result)
}

## copied from clusterProfiler
##' @describeIn ezGroupGO Gets the gene ontology by level.
ezGetGoByLevels = function(onto, levels, goSlim=NULL) {
  require("GO.db", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  switch(onto, MF = {
    topNode <- "GO:0003674"
    children <- GOMFCHILDREN
  }, BP = {
    topNode <- "GO:0008150"
    children <- GOBPCHILDREN
  }, CC = {
    topNode <- "GO:0005575"
    children <- GOCCCHILDREN
  })
  if (!is.null(goSlim)){
    goAncestor = switch(onto, BP=as.list(GOBPANCESTOR),
                        CC=as.list(GOCCANCESTOR),
                        MF=as.list(GOMFANCESTOR))
    goSlim = goSlim[goSlim %in% names(goAncestor)]
    goSlim = setdiff(c(goSlim, unlist(goAncestor[goSlim])), "all")
  }
  
  nodes = topNode
  nodesByLevel = list("level 1"=nodes)
  for (i in seq_len(max(levels) - 1)) {
    nodes <- keys(children[nodes])
    nodes <- na.omit(unique(unlist(nodes)))
    goIds = as.character(nodes)
    if (!is.null(goSlim)){
      goIds = intersect(goIds, goSlim)
    }
    nodesByLevel[[paste("level", i+1)]] = goIds
  }
  return(nodesByLevel[paste("level", levels)])
}

##' @title Parses GO strings to a list
##' @description Parses GO strings to a list.
##' @param goStrings a character vector containing the GO strings
##' @param listNames an optional character vector naming the list elements.
##' @template roxygen-template
##' @return Returns the GO strings as a list.
##' @examples
##' goStrings = c("GO 1;GO 2;GO 3", "GO 4")
##' listNames = letters[1:2]
##' goStringsToList(goStrings, listNames)
goStringsToList = function(goStrings, listNames=NULL){
  x = strsplit(goStrings, "; ")
  names(x) = listNames
  #  x = lapply(x, function(x){ifelse(x == "", character(0), x)})
  ## "" after strsplit is character(0)
  return(x)
}

##' @describeIn goClusterTable Gets the child terms from a GO term and returns them together.
getChildTerms = function(x, subset, goRelatives, indent="", childEnvir){
  
  result = character()
  if (is.element(x, subset)){
    term = getGOTerm(x)[[1]]
    displayLabel = paste0(indent, term)
    result[displayLabel] = x
    subset = setdiff(subset, x) ## this is the modification to make the table non-redundant!!
  }
  kids = intersect(childEnvir[[x]], goRelatives)
  indent = paste(indent, ".")
  for (kid in kids){
    kidResult = getChildTerms(kid, subset, goRelatives, indent=indent, childEnvir)
    subset = setdiff(subset, kidResult) ## again the non-redundancy modification
    result = c(result, kidResult)
  }
  return(result)
}

##' @describeIn clusterHeatmap Initializes and returns a list of results used by \code{clusterHeatmap()} and \code{goClusterResults()}.
clusterResults = function(x, nClusters=5, clusterColors=rainbow(nClusters), 
                          d=NULL, method="ward.D2"){
  if (is.null(d)){
    xx = x
    xx[is.na(x)] = min(x, na.rm=TRUE)
    d = dist(xx)
  }
  hcl = hclust(d, method=method)
  clusterNumbers = cutree(hcl, k=nClusters)
  result = list()
  result$nClusters = nClusters
  result$clusterNumbers = clusterNumbers
  result$clusterColors = clusterColors
  result$hcl = hcl
  return(result)
}

##' @title Plots the cluster heatmap
##' @description Plots the cluster heatmap.
##' @param x the data to plot.
##' @param param a list of parameters to extract the logical \code{showGeneClusterLabels} from.
##' @param result a list of cluster result properties obtained with \code{clusterResults()}.
##' @param file a character representing the name of the .png file to derive tables from.
##' @param method a character representing the method to pass to \code{hclust()}.
##' @param doClusterColumns a logical indicating whether to do cluster columns.
##' @param columnDist the distance matrix to use for the cluster columns.
##' @param colColors used for the \code{ColSideColors} argument passed to \code{heatmap.2()}.
##' @param lim two integers used for \code{breaks} argument passed to \code{heatmap.2()}.
##' @param cexRow an integer passed to \code{heatmap.2()}.
##' @param cexCol an integer passed to \code{heatmap.2()}.
##' @param labRow a character vector, possibly modified, then passed to \code{heatmap.2()}.
##' @param margins an integer, possibly modified, then passed to \code{heatmap.2()}.
##' @template colors-template
##' @param maxGenesWithLabel an integer specifying the maximum amount of genes with labels.
##' @template addargs-template
##' @templateVar fun heatmap.2()
##' @template roxygen-template
##' @seealso \code{\link[stats]{hclust}}
##' @seealso \code{\link[gplots]{heatmap.2}}
clusterHeatmap = function(x, param, result, file="cluster-heatmap.png", 
                          method="ward.D2",
                          doClusterColumns=FALSE, columnDist=NULL, 
                          colColors=NULL, lim=c(-4, 4),
                          lwid=c(1, 4), lhei=c(1,5),
                          cexRow=1.0, cexCol=1.5, labRow=rownames(x), 
                          margins=c(14,9),
                          colors=getBlueRedScale(), 
                          maxGenesWithLabel=50, ...){
  require("gplots", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  probeDendro = as.dendrogram(result$hcl)
  probeDendro = reorder(probeDendro, rowMeans(x, na.rm=TRUE))
  if (doClusterColumns){
    if (is.null(columnDist)){
      columnDist = dist(t(x))
    }
    result$hcl = hclust(columnDist, method=method)
    colDendro = as.dendrogram(result$hcl)
    colDendro = reorder(colDendro, colMeans(x, na.rm=TRUE))
    showDendro = "both"
  } else {
    colDendro = FALSE
    showDendro = "row"
  }
  if (param$showGeneClusterLabels & nrow(x) < maxGenesWithLabel){
    if (is.null(labRow)){
      labRow = rownames(x)
    }
  } else {
    labRow = ""
    margins[2] = 0.5 
  }
  if (!is.null(colColors)){
    if (result$nClusters > 1){
      heatmap.2(x,
                ColSideColors=colColors, RowSideColors=result$clusterColors[result$clusterNumbers],
                scale="none", dendrogram=showDendro, labRow=labRow,
                breaks=seq(from=lim[1], to=lim[2], length.out=257),
                Colv=colDendro, Rowv=probeDendro, col=colors,
                key=TRUE, density.info="none", trace="none", margins=margins, cexCol=cexCol, cexRow=cexRow, lwid=lwid, lhei=lhei, ...)
    } else {
      heatmap.2(x,
                ColSideColors=colColors,
                scale="none", dendrogram=showDendro, labRow=labRow,
                breaks=seq(from=lim[1], to=lim[2], length.out=257),
                Colv=colDendro, Rowv=probeDendro, col=colors,
                key=TRUE, density.info="none", trace="none", margins=margins, cexCol=cexCol, cexRow=cexRow, lwid=lwid, lhei=lhei, ...)
    }
  } else {
    if (result$nClusters > 1){
      heatmap.2(x,
                RowSideColors=result$clusterColors[result$clusterNumbers],
                scale="none", dendrogram=showDendro, labRow=labRow,
                breaks=seq(from=lim[1], to=lim[2], length.out=257),
                Colv=colDendro, Rowv=probeDendro, col=colors,
                key=TRUE, density.info="none", trace="none", margins=margins, cexCol=cexCol, cexRow=cexRow, lwid=lwid, lhei=lhei,...)
    } else {
      heatmap.2(x,
                scale="none", dendrogram=showDendro, labRow=labRow,
                breaks=seq(from=lim[1], to=lim[2], length.out=257),
                Colv=colDendro, Rowv=probeDendro, col=colors,
                key=TRUE, density.info="none", trace="none", margins=margins, cexCol=cexCol, cexRow=cexRow, lwid=lwid, lhei=lhei, ...)
    }
  }
  if (!is.null(file)){
    ## export also the clusters
    xTmp = as.data.frame(result$clusterNumbers)
    colnames(xTmp) = "Cluster"
    xTmp[ ,1] = result$clusterColors[xTmp[,1]]
    xTmp = xTmp[order(xTmp$Cluster), ,drop=FALSE]
    ezWrite.table(xTmp, file=sub(".png$", "-clusterMembers.txt", file))
  }
}

clusterPheatmap <- function(x, design, param, 
                            clusterColors=c("red", "yellow", "orange", 
                                            "green", "blue", "cyan"), 
                            method="ward.D2", doClusterColumns=FALSE,
                            colors=getBlueRedScale(),
                            colColors=NULL, lim=c(-4, 4),
                            maxGenesWithLabel=50,
                            condColors=NULL, ...){
  require(pheatmap)
  nClusters <- length(clusterColors)
  
  if(param$showGeneClusterLabels & nrow(x) < maxGenesWithLabel){
    isShowRowNames <- TRUE
  }else{
    isShowRowNames <- FALSE
  }
  
  callback = function(hc, mat){
    dend = reorder(as.dendrogram(hc), wts = rowMeans(mat, na.rm=TRUE))
    as.hclust(dend)
  }
  clusterInfo <- pheatmap(x, color=colors, clustering_method=method,
                          breaks=seq(from=lim[1], to=lim[2], length.out=257),
                          scale="none",
                          clustering_callback = callback,
                          silent=TRUE, ...)
  
  clusters <- as.factor(cutree(clusterInfo$tree_row, nClusters))
  annotation_row = data.frame(Clusters=clusters)
  if(doClusterColumns){
    colDendro <- clusterInfo$tree_col
  }else{
    colDendro <- FALSE
  }

  ann_colors <- c(condColors, 
                  list(Clusters=set_names(clusterColors, levels(clusters))))
  
  p <- pheatmap(x, color=colors, clustering_method=method,
           breaks=seq(from=lim[1], to=lim[2], length.out=257),
           scale="none", cluster_rows=clusterInfo$tree_row,
           cluster_cols=colDendro,
           show_rownames=isShowRowNames,
           annotation_col = design, annotation_row=annotation_row,
           annotation_colors = ann_colors, silent=TRUE, ...)
  
  ans <- list(nClusters=nClusters, clusterNumbers=clusters,
              clusterColors=clusterColors, hcl=clusterInfo$tree_row,
              pheatmap=p)
  invisible(ans)
}

##' @describeIn clusterHeatmap Applies a GO analysis to the cluster results if GO should be done.
goClusterResults = function(x, param, result, ontologies=c("BP", "MF", "CC"), seqAnno=NULL,
                            universeGeneIds=NULL, universeProbeIds=NULL, keggOrganism=NA){
  require("GOstats", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  require("annotate", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  if (param$featureLevel != "gene"){
    genes = getGeneMapping(param, seqAnno)
    if (is.null(genes)){
      stop("no probe 2 gene mapping found")
    }
    seqAnno = aggregateGoAnnotation(seqAnno, genes)
  } else {
    genes = rownames(seqAnno)
    names(genes) = genes
  }
  
  if (is.null(universeGeneIds)){
    universeGeneIds =na.omit(unique(unlist( genes[universeProbeIds]) ))
  }
  keggClusterResults = NULL
  if (!is.na(keggOrganism)){
    kegg2GenesList = getPathway2GenesList(param, keggOrganism)  ## TODOMF: function does not exist
    if (all(unlist(kegg2GenesList)) %in% universeGeneIds){
      keggClusterResults = list()
    }
  }
  genesByCluster = tapply(genes[rownames(x)], result$clusterNumbers, function(x){na.omit(unique(x))}, simplify=FALSE)
  goClusterResults = ezMclapply(ontologies, function(onto){
    gene2go = goStringsToList(seqAnno[[paste("GO", onto)]], listNames=rownames(seqAnno))[universeGeneIds]
    if (param$includeGoParentAnnotation){
      gene2go = addGoParents(gene2go, onto)
    }
    result = lapply(genesByCluster, function(selectedGenes){
      ezGoseq(param, selectedGenes, universeGeneIds, gene2goList=gene2go,
              method="Hypergeometric",
              onto=onto, normalizedAvgSignal=NULL)})
  }, mc.cores=1)
  names(goClusterResults) = ontologies
  result$GO = goClusterResults
  return(invisible(result))
}







## still used?
collectChildren = function(go2geneList, onto){
  goOffsprings = switch(onto, BP=as.list(GOBPOFFSPRING),
                        CC=as.list(GOCCOFFSPRING),
                        MF=as.list(GOMFOFFSPRING))
  goNames = union(names(go2geneList), names(goOffsprings))
  names(goNames) = goNames
  doCollect = function(goName, goOffsprings=NULL, go2genes=NULL){
    return(unique(unlist(go2genes[c(goOffsprings[[goName]], goName)])))
  }
  go2geneList = lapply(goNames, doCollect, goOffsprings=goOffsprings, go2genes=go2geneList)
  return(go2geneList)
}


## still used?
myFisherTest = function(goGenes, selGenes, allGenes, alternative="greater"){ 
  fisher.test(factor(allGenes %in% selGenes, levels=c("FALSE", "TRUE")), 
              factor(allGenes %in% goGenes, levels=c("FALSE", "TRUE")), alternative=alternative)$p.value
}

## NOTE: the GSEABase has also GO to slim mapping but does not include the GO parents
## at the geneontology there seems to be a map2slim perl script but
## a) the perl script installation seems cumbersome; lots of dependencies
## b) it requires a gene-assocation-file as input; a more complex than necessary file
mapGoToSlim = function(goList, ontology, slimGo){
  require("GO.db", warn.conflicts=WARN_CONFLICTS, quietly=!WARN_CONFLICTS)
  #   require(GSEABase)
  #   slim = getOBOCollection(oboFile)
  #   slimGo = ids(slim)
  goAncestor = switch(ontology, BP=as.list(GOBPANCESTOR),
                      CC=as.list(GOCCANCESTOR),
                      MF=as.list(GOMFANCESTOR))
  goOffspring = switch(ontology, MF=as.list(GOMFOFFSPRING),
                       BP=as.list(GOBPOFFSPRING), CC=as.list(GOCCOFFSPRING))
  slimGo = slimGo[slimGo %in% names(goAncestor)]
  slimGo = setdiff(union(slimGo, unlist(goAncestor[slimGo])), "all") ## this automatically filters on the ontology  
  stopifnot(slimGo %in% names(goOffspring))
  slim2full = sapply(slimGo, function(sg){c(sg, goOffspring[[sg]])}, simplify=FALSE)
  full2slim = inverseMapping(slim2full)
  ## remove all parents
  full2slim = lapply(full2slim, function(slimIds){setdiff(slimIds, unlist(goAncestor[slimIds]))})
  slimList = lapply(goList, function(goIds){unique(unlist(full2slim[goIds]))})
  return(slimList)
}









#
#writeChildren = function(x, subset, goRelatives, indent=""){
#
#  if (length(intersect(x, subset)>0)){
#    ezWrite(indent, x, " ", getGOTerm(x)[[1]])
#  }
#  kids = get(x, envir=eval(paste0("GO", onto, "CHILDREN")))
#  indent = paste(indent, " ")
#  for (kid in kids){
#    if (is.element(kid, goRelatives)){
#      writeChildren(kid, subset, goRelatives, indent=indent)
#    }
#  }
#}
#


### deprecated should no longer be used
# makeChipAnno = function(param, probeAnno, chip){
#   
#   if (missing(chip)){
#     stop("wrong call to makeChipAnno")
#   }
#   if (hasGeneMapping(param, probeAnno)){
#     message("make chip anno for ", chip, " -- have gene mapping")
#   } else {
#     message("make chip anno for ", chip, " -- gene mapping is missing in: ", paste(colnames(probeAnno), collapse=", "))
#     return(NULL)
#   }
#   
#   require("GOstats", quietly=TRUE)
#   require("GO.db", quietly=TRUE)
#   require("annotate", quietly=TRUE)
#   
#   #job = ezJobStart("entrez")
#   entrezid = new.env(hash=TRUE, parent=emptyenv(), size=nrow(probeAnno))
#   #  apply(probeAnno, 1, function(x){  assign(x["Probe"], unname(x["Gene Name"]), envir=entrezid)})
#   genes = getGeneMapping(param, probeAnno)
#   genes[genes == ""] = NA
#   probe2Gene = as.list(genes)
#   l2e(probe2Gene, entrezid)
#   #  apply(probeAnno, 1, function(x){  assign(x["Probe"], unname(x[ "Entrez Gene ID"]), envir=entrezid)})
#   #ezWriteElapsed(job)
#   
#   #job = ezJobStart("go")
#   if (all(c("GO BP", "GO CC", "GO MF") %in% colnames(probeAnno))){
#     go = new.env(hash=TRUE, parent=emptyenv(), size=nrow(probeAnno))
#     goIds = paste(probeAnno[ , "GO BP"], probeAnno[ ,"GO CC"], probeAnno[ ,"GO MF"], sep="; ")
#     goIds = sub("; ; ", "; ", goIds)
#     goIds = sub("^; ", "", goIds)
#     goIds = sub("; $", "", goIds)
#     probe2GoList = strsplit(goIds, "; ", fixed=TRUE)
#     
#     names(probe2GoList) = rownames(probeAnno)
#     ##goList = lapply(goList, function(x){ if (length(x)== 0){NA} else {paste0("GO:", x)}})
#     l2e(probe2GoList, go)
#     #ezWriteElapsed(job)
#     
#     #job = ezJobStart("go prep")
#     
#     #  goTermVec = unlist(probe2GoList)
#     #  probeVec = rep(rownames(probeAnno), sapply(probe2GoList, length))
#     #  probesByGO = split(probeVec, goTermVec)
#     probesByGo = inverseMapping(probe2GoList)
#     #ezWriteElapsed(job)
#     
#     #job = ezJobStart("go MF")
#     
#     allGo = names(probesByGo)
#     collectProbes = function(goName, goOffsprings, probesByGo){
#       return(unique(unlist(probesByGo[c(goName, goOffsprings)])))
#     }
#     
#     
#     goBpOff = as.list(GOBPOFFSPRING)
#     bpList = mapply(collectProbes, names(goBpOff), goBpOff, MoreArgs=list(probesByGo=probesByGo))
#     
#     goMfOff = as.list(GOMFOFFSPRING)
#     mfList = mapply(collectProbes, names(goMfOff), goMfOff, MoreArgs=list(probesByGo=probesByGo))
#     
#     goCcOff = as.list(GOCCOFFSPRING)
#     ccList = mapply(collectProbes, names(goCcOff), goCcOff, MoreArgs=list(probesByGo=probesByGo))
#     
#     goList = c(bpList, mfList, ccList)
#     stopifnot(!is.na(unlist(goList)))
#     
#     go2probe = new.env(hash=TRUE, parent=emptyenv(), size=length(goList))
#     l2e(goList, go2probe)
#     
#     l = list()
#     l[[paste0(chip, "GO")]] = go
#     l[[paste0(chip, "GO2ALLPROBES")]] = go2probe
#   } else {
#     l = list()
#   }
#   l[[paste0(chip, "ENTREZID")]] = entrezid
#   pkgName = paste0("package:", chip, ".db")
#   ezWrite("attach package: ", pkgName)
#   attach(l, name=pkgName)
#   return(chip)
# }
# 

# hasGoAnnotationInEnv = function(chip){
#   exists(paste0(chip, "GO2ALLPROBES"))
# }
uzh/ezRun documentation built on Sept. 16, 2024, 11:21 p.m.