R/GOenrichmentAnalysis.R
In WGCNA: Weighted Correlation Network Analysis

Documented in GOenrichmentAnalysis

# Function that returns GO terms with best enrichment and highest number of genes.

# So that I don't forget: information about GO categories is contained in the GO.db package

GOenrichmentAnalysis = function(labels, entrezCodes, 
               yeastORFs = NULL,
               organism = "human",
               ontologies = c("BP", "CC", "MF"),
               evidence = "all",
               includeOffspring = TRUE,
               backgroundType = "givenInGO",
               removeDuplicates = TRUE,
               leaveOutLabel = NULL,
               nBestP = 10, pCut = NULL, nBiggest = 0,
               getTermDetails = TRUE,
               verbose = 2, indent = 0 )
{
   warning(immediate. = TRUE,
     spaste("This function is deprecated and will be removed in the near future. \n",
            "We suggest using the replacement function enrichmentAnalysis \n",
            "in R package anRichment, available from the following URL:\n",
            "https://labs.genetics.ucla.edu/horvath/htdocs/CoexpressionNetwork/GeneAnnotation/"));

   sAF = options("stringsAsFactors")
   options(stringsAsFactors = FALSE);
   on.exit(options(stringsAsFactors = sAF[[1]]), TRUE)

   organisms = c("human", "mouse", "rat", "malaria", "yeast", "fly", "bovine", "worm", "canine",
                 "zebrafish", "chicken");
   allEvidence =  c("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "ISS", "ISO", "ISA",
                    "ISM", "IGC", "IBA", "IBD", "IKR", "IRD", "RCA", "TAS", "NAS", "IC", "ND", "IEA",
                    "NR");
   allOntologies = c("BP", "CC", "MF");

   backgroundTypes = c("allGiven", "allInGO", "givenInGO");

   spaces = indentSpaces(indent);
   orgInd = pmatch(organism, organisms);
   if (is.na(orgInd))
     stop(paste("Unrecognized 'organism' given. Recognized values are ", 
                paste(organisms, collapse = ", ")));

   if (length(evidence)==0)
     stop("At least one valid evidence code must be given in 'evidence'.");
   if (length(ontologies)==0)
     stop("At least one valid ontology code must be given in 'ontology'.");

   if (evidence=="all")
      evidence = allEvidence;

   evidInd = pmatch(evidence, allEvidence);
   if (sum(is.na(evidInd))!=0)
     stop(paste("Unrecognized 'evidence' given. Recognized values are ", 
                paste(allEvidence, collapse = ", ")));

   ontoInd = pmatch(ontologies, allOntologies);
   if (sum(is.na(ontoInd))!=0)
     stop(paste("Unrecognized 'ontologies' given. Recognized values are ", 
                paste(allEvidence, collapse = ", ")));

   backT = pmatch(backgroundType, backgroundTypes);
   if (is.na(backT))
     stop(paste("Unrecognized 'backgroundType' given. Recognized values are ", 
           paste(backgroundTypes,  collapse = ", ")));

   orgCodes = c("Hs", "Mm", "Rn", "Pf", "Sc", "Dm", "Bt", "Ce", "Cf", "Dr", "Gg");
   orgExtensions = c(rep(".eg", 4), ".sgd", rep(".eg", 6));
   reverseMap = c(rep(".egGO2EG", 4), ".sgdGO2ORF", rep(".egGO2EG", 6))

   missingPacks = NULL;
   packageName = paste("org.", orgCodes[orgInd], orgExtensions[orgInd], ".db", sep="");
   if (!eval(parse(text = "require(packageName, character.only = TRUE)")))
     missingPacks = c(missingPacks, packageName);

   if (!eval(parse(text="require(GO.db)")))
     missingPacks = c(missingPacks, "GO.db");

   if (!is.null(missingPacks)) 
     stop(paste("Could not load the requisite package(s)", 
           paste(missingPacks, collapse = ", "), ". Please install the package(s)."))

   if (verbose > 0)
   {
     printFlush(paste(spaces, "GOenrichmentAnalysis: loading annotation data..."));
   }

   if (orgInd==5)
   {
      # Yeast needs special care.
      if (!is.null(yeastORFs))
      {
        entrezCodes = yeastORFs
      } else {
        # Map the entrez IDs to yeast ORFs
        x = eval(parse(text = "org.Sc.sgd::org.Sc.sgdENTREZID"))
        # x = org.Sc.sgd::org.Sc.sgdENTREZID
        xx = as.list(x[mapped_genes])
        allORFs = names(xx);
        mappedECs = as.character(sapply(xx, as.character))
        entrez2orf = match(entrezCodes, mappedECs);
        fin = is.finite(entrez2orf);
        newCodes = paste("InvalidCode", c(1:length(entrezCodes)), sep = ".");
        newCodes[fin] = allORFs[entrez2orf[fin]];
        entrezCodes = newCodes;
     }
   } 

   labels = as.matrix(labels);

   nSets = ncol(labels);
   nGivenRaw = nrow(labels);

   if (removeDuplicates)
   {
     # Restrict given entrezCodes such that each code is unique
     ECtab = table(entrezCodes);
     uniqueEC = names(ECtab);
     keepEC = match(uniqueEC, entrezCodes);
     entrezCodes = entrezCodes[keepEC]
     labels = labels[keepEC, , drop = FALSE];
   } else
     keepEC = c(1:nGivenRaw);

   egGO = eval(parse(text = paste(packageName, "::org.", orgCodes[orgInd], orgExtensions[orgInd], 
                                  "GO", sep = "")));

   if (orgInd==5)
   {
      mapped_genes = as.character(do.call(match.fun("mappedkeys"), list(egGO)));
      encodes2mapped = match(as.character(entrezCodes), mapped_genes);
   } else {
      mapped_genes = as.numeric(as.character(do.call(match.fun("mappedkeys"), list(egGO))));
      encodes2mapped = match(as.numeric(entrezCodes), mapped_genes);
   }
   encMapped = is.finite(encodes2mapped);
   nAllIDsInGO = sum(encMapped);

   mapECodes = entrezCodes[encMapped];
   mapLabels = labels[encMapped, , drop = FALSE];
   nMappedGenes = nrow(mapLabels);

   if (nMappedGenes==0)
     stop(paste("None of the supplied gene identifiers map to the GO database.\n",
                "Please make sure you have specified the correct organism (default is human)."))

   Go2eg = eval(parse(text = paste("AnnotationDbi::as.list(", packageName, "::org.", orgCodes[orgInd], 
                                           reverseMap[orgInd],")", sep = "")));
   nTerms = length(Go2eg);

   goInfo = as.list(GO.db::GOTERM);
   if (length(goInfo) > 0)
   {
      orgGoNames = names(Go2eg);
      dbGoNames = as.character(sapply(goInfo, GOID));
      dbGoOntologies = as.character(sapply(goInfo, Ontology));
   } else {
      dbGoNames = "";
   }
   goOffSpr = list();
   if (includeOffspring)
   {
     goOffSpr[[1]] = as.list(GOBPOFFSPRING);
     goOffSpr[[2]] = as.list(GOCCOFFSPRING);
     goOffSpr[[3]] = as.list(GOMFOFFSPRING);
   }
   term2info = match(names(Go2eg), names(goInfo));
   termOntologies = dbGoOntologies[term2info];

   if (backT==1)
   {
     nBackgroundGenes = sum(!is.na(entrezCodes));
   } else
   {
     if (backT==2)
     {
       nBackgroundGenes = length(mapped_genes)
     } else
       nBackgroundGenes = nMappedGenes;
   }

   termCodes = vector(mode="list", length = nTerms);
   collectGarbage();
   nExpandLength = 0;
   blockSize = 3000; # For a more efficient concatenating of offspring genes
   nAllInTerm = rep(0, nTerms);
   if (verbose > 0)
   {
      printFlush(paste(spaces, " ..of the", length(entrezCodes), 
                      " Entrez identifiers submitted,", sum(encMapped), 
                      "are mapped in current GO categories."));
      printFlush(paste(spaces, " ..will use", nBackgroundGenes, 
                       "background genes for enrichment calculations."));
      cat(paste(spaces, " ..preparing term lists (this may take a while).."));
      pind = initProgInd();
   }

   for (c in 1:nTerms) if (!is.na(Go2eg[[c]][[1]]))
   {
      te = as.character(names(Go2eg[[c]])); # Term evidence codes
      tc = Go2eg[[c]];
      if (includeOffspring) 
      {
        termOffspring = NULL;
        for (ont in 1:length(goOffSpr))
        {
          term2off = match(names(Go2eg)[c], names(goOffSpr[[ont]]))
          if (!is.na(term2off))
            termOffspring = c(termOffspring, goOffSpr[[ont]][[term2off]]);
        }
        if (length(termOffspring)>0) 
        {
           maxLen = blockSize;
           tex = rep("", maxLen);
           tcx = rep("", maxLen);
           ind = 1;
           len = length(te);
           tex[ ind:len ] = te;
           tcx[ ind:len ] = tc;
           ind = len + 1;
           o2go = match(termOffspring, as.character(names(Go2eg)));
           o2go = o2go[is.finite(o2go)]
           if (length(o2go)>0) for (o in 1:length(o2go)) if (!is.na(Go2eg[[o2go[o]]][[1]]))
           {
             #printFlush(paste("Have offspring for term", c, ": ", names(Go2eg)[c], 
             #           Term(goInfo[[term2info[c]]])));
             newc = Go2eg[[o2go[o]]];
             newe = names(newc);
             newl = length(newe);
             if ((len + newl) > maxLen)
             {
               nExpand = ceiling( (len + newl - maxLen)/blockSize);
               maxLen = maxLen + blockSize * nExpand;
               tex = c(tex, rep("", maxLen - length(tex)));
               tcx = c(tcx, rep("", maxLen - length(tex)));
               nExpandLength = nExpandLength + 1;
             }
             tex[ind:(len + newl)] = newe;
             tcx[ind:(len + newl)] = newc;
             ind = ind + newl;
             len = len + newl;
           }
           te = tex[1:len];
           tc = tcx[1:len];
        }
      }
      use = is.finite(match(te, evidence));
      if (orgInd==5)
      {
         if (backT==2)
         {
            termCodes[[c]] = unique(as.character(tc[use]));
         } else
            termCodes[[c]] = as.character(intersect(tc[use], mapECodes));
      } else {
         if (backT==2)
         {
            termCodes[[c]] = unique(as.character(tc[use]));
         } else
            termCodes[[c]] = as.numeric(as.character(intersect(tc[use], mapECodes)));
      }
      nAllInTerm[c] = length(termCodes[[c]]); 
      if ( (c %%50 ==0) & (verbose > 0)) pind = updateProgInd(c/nTerms, pind);
   }
   if (verbose > 0) 
   {
      pind = updateProgInd(1, pind);
      printFlush("");
   }
   if ((verbose > 5) & (includeOffspring))
      printFlush(paste(spaces, " ..diagnostic for the developer: offspring buffer was expanded",
                       nExpandLength, "times."));

   ftp = function(...) { fisher.test(...)$p.value }

   setResults = list();

   for (set in 1:nSets)
   {
      if (verbose > 0)
        printFlush(paste(spaces, " ..working on label set", set, ".."));
      labelLevels = levels(factor(labels[, set]));
      if (!is.null(leaveOutLabel))
      {
        keep = !(labelLevels %in% as.character(leaveOutLabel));
        if (sum(keep)==0)
          stop("No labels were kept after removing labels that are supposed to be ignored.");
        labelLevels = labelLevels[keep]
      }
      nLabelLevels = length(labelLevels);

      modCodes = list();
      nModCodes = rep(0, nLabelLevels);
      if (backT==1)
      {
        for (ll in 1:nLabelLevels)
        {
           modCodes[[ll]] = entrezCodes[labels[, set]==labelLevels[ll]];
           nModCodes[ll] = length(modCodes[[ll]]);
        }
      } else 
      {
        for (ll in 1:nLabelLevels)
        {
           modCodes[[ll]] = mapECodes[mapLabels[, set]==labelLevels[ll]];
           nModCodes[ll] = length(modCodes[[ll]]);
        }
      }

      countsInTerm = matrix(0, nLabelLevels, nTerms);
      enrichment = matrix(1, nLabelLevels, nTerms);

      for (ll in 1:nLabelLevels)
        countsInTerm[ll, ] = sapply(lapply(termCodes, intersect, modCodes[[ll]]), length)

      nAllInTermMat = matrix(nAllInTerm, nLabelLevels, nTerms, byrow = TRUE);
      nModCodesMat = matrix(nModCodes, nLabelLevels, nTerms);
      tabArr = array(c(countsInTerm, nAllInTermMat - countsInTerm, 
                       nModCodesMat - countsInTerm,
                       nBackgroundGenes - nModCodesMat - nAllInTermMat + countsInTerm),
                     dim = c(nLabelLevels * nTerms, 2, 2));

      if (verbose > 0)
         printFlush(paste(spaces, "   ..calculating enrichments (this may also take a while).."));
      calculate = c(countsInTerm) > 0;
      enrichment[calculate] = apply(tabArr[calculate, , ], 1, ftp, alternative = "g");
    
      dimnames(enrichment) = list (labelLevels, names(Go2eg));
      dimnames(countsInTerm) = list (labelLevels, names(Go2eg));

      bestPTerms = list();
      modSizes = table(labels[ !(labels[, set] %in% leaveOutLabel), set]);

      if (!is.null(pCut) || nBestP > 0)
      {
         printFlush(paste(spaces, "   ..putting together terms with highest enrichment significance.."));
         nConsideredOntologies = length(ontologies)+1;
         for (ont in 1:nConsideredOntologies)
         {
            if (ont==nConsideredOntologies)
            {
               ontTerms = is.finite(match(termOntologies, ontologies))
               bestPTerms[[ont]] = list(ontology = ontologies);
               names(bestPTerms)[ont] = paste(ontologies, collapse = ", ");
            } else {
               ontTerms = termOntologies==ontologies[ont];
               bestPTerms[[ont]] = list(ontology = ontologies[ont]);
               names(bestPTerms)[ont] = ontologies[ont];
            }
            bestPTerms[[ont]]$enrichment = NULL;
            bestPTerms[[ont]]$termInfo = list();
            nOntTerms = sum(ontTerms)
            ontEnr = enrichment[, ontTerms, drop = FALSE];
            order = apply(ontEnr, 1, order);
            for (ll in 1:nLabelLevels)
            {
              if (!is.null(pCut))
              {
                 reportTerms = c(1:nTerms)[ontTerms][ontEnr[ll, ] < pCut];
                 reportTerms = reportTerms[order(ontEnr[ll, ][reportTerms])];
              } else
                 reportTerms =  c(1:nTerms)[ontTerms][order[1:nBestP, ll]];
              nRepTerms = length(reportTerms);
              enrTab = data.frame(module = rep(labelLevels[ll], nRepTerms), 
                                  modSize = rep(modSizes[ll], nRepTerms),
                                  bkgrModSize = rep(nModCodes[ll], nRepTerms), 
                                  rank = seq(length.out = nRepTerms),
                                  enrichmentP = enrichment[ll, reportTerms],
                                  BonferoniP = pmin(rep(1, nRepTerms), 
                                                    enrichment[ll, reportTerms] * nOntTerms),
                                  nModGenesInTerm = countsInTerm[ll, reportTerms],
                                  fracOfBkgrModSize = countsInTerm[ll, reportTerms]/nModCodes[ll],
                                  fracOfBkgrTermSize = 
                                       countsInTerm[ll, reportTerms]/nAllInTerm[reportTerms],
                                  bkgrTermSize = nAllInTerm[reportTerms],
                                  termID = names(Go2eg)[reportTerms],
                                  termOntology =  rep("NA", nRepTerms),
                                  termName = rep("NA", nRepTerms),
                                  termDefinition =  rep("NA", nRepTerms))
              bestPTerms[[ont]]$forModule[[ll]] = list();
              for (rci in seq(length.out = nRepTerms))
              {
                 term = reportTerms[rci];
                 termID = names(Go2eg)[term];
                 dbind = match(termID, dbGoNames);
                 if (is.finite(dbind))
                 {
                   enrTab$termName[rci] = eval(parse(text = "AnnotationDbi::Term(goInfo[[dbind]])"));
                   enrTab$termDefinition[rci] = 
                         eval(parse(text = "AnnotationDbi::Definition(goInfo[[dbind]])"));
                   enrTab$termOntology[rci] = 
                         eval(parse(text = "AnnotationDbi::Ontology(goInfo[[dbind]])"));
                 } 
                 if (getTermDetails)
                 {
                   geneCodes = intersect(modCodes[[ll]], termCodes[[term]])
                   bestPTerms[[ont]]$forModule[[ll]][[rci]] = list(termID = termID,
                           termName = enrTab$termName[rci],
                           enrichmentP = enrTab$enrichmentP[rci],
                           termDefinition = enrTab$termDefinition[rci],
                           termOntology = enrTab$termOntology[rci],
                           geneCodes = geneCodes,
                           genePositions = keepEC[match(geneCodes, entrezCodes)]);
                }
              }
              if (ll==1) 
              {
                 bestPTerms[[ont]]$enrichment = enrTab;
              } else 
                 bestPTerms[[ont]]$enrichment = rbind(bestPTerms[[ont]]$enrichment, enrTab);
           }
         }
      }
           
      biggestTerms = list();
   
      if (nBiggest > 0)
      {
         printFlush(paste(spaces, "   ..putting together terms with largest number of genes in modules.."));
         nConsideredOntologies = length(ontologies)+1;
         for (ont in 1:nConsideredOntologies)
         {
            if (ont==nConsideredOntologies)
            {
               ontTerms = is.finite(match(termOntologies, ontologies))
               biggestTerms[[ont]] = list(ontology = ontologies);
               names(biggestTerms)[ont] = paste(ontologies, collapse = ", ");
            } else {
               ontTerms = termOntologies==ontologies[ont];
               biggestTerms[[ont]] = list(ontology = ontologies[ont]);
               names(biggestTerms)[ont] = ontologies[ont];
            }
            biggestTerms[[ont]]$enrichment = NULL;
            biggestTerms[[ont]]$termInfo = list();
            nOntTerms = sum(ontTerms)
            ontNGenes = countsInTerm[, ontTerms, drop = FALSE];
            order = apply(-ontNGenes, 1, order);
            for (ll in 1:nLabelLevels)
            {
              reportTerms = c(1:nTerms)[ontTerms][order[1:nBiggest, ll]];
              nRepTerms = length(reportTerms);
              enrTab = data.frame(module = rep(labelLevels[ll], nRepTerms),
                                  modSize = rep(modSizes[ll], nRepTerms),
                                  bkgrModSize = rep(nModCodes[ll], nRepTerms), 
                                  rank = seq(length.out = nRepTerms),
                                  enrichmentP = enrichment[ll, reportTerms],
                                  BonferoniP = pmin(rep(1, nRepTerms), 
                                                    enrichment[ll, reportTerms] * nOntTerms),
                                  nModGenesInTerm = countsInTerm[ll, reportTerms],
                                  fracOfModSize = countsInTerm[ll, reportTerms]/nModCodes[ll],
                                  fracOfBkgrTermSize = 
                                       countsInTerm[ll, reportTerms]/nAllInTerm[reportTerms],
                                  bkgrTermSize = nAllInTerm[reportTerms],
                                  termID = names(Go2eg)[reportTerms],
                                  termOntology =  rep("NA", nRepTerms),
                                  termName = rep("NA", nRepTerms),
                                  termDefinition =  rep("NA", nRepTerms))
              biggestTerms[[ont]]$forModule[[ll]] = list();
              for (rci in seq(length.out = nRepTerms))
              {
                 term = reportTerms[rci];
                 termID = names(Go2eg)[term];
                 dbind = match(termID, dbGoNames);
                 if (is.finite(dbind))
                 {
                   enrTab$termName[rci] = eval(parse(text="AnnotationDbi::Term(goInfo[[dbind]])"));
                   enrTab$termDefinition[rci] =
                       eval(parse(text="AnnotationDbi::Definition(goInfo[[dbind]])"));
                   enrTab$termOntology[rci] = eval(parse(text="AnnotationDbi::Ontology(goInfo[[dbind]])"));
                 }
                 if (getTermDetails)
                 {
                   geneCodes = intersect(modCodes[[ll]], termCodes[[term]])
                   biggestTerms[[ont]]$forModule[[ll]][[rci]] = list(termID = termID,
                           termName = enrTab$termName[rci],
                           enrichmentP = enrTab$enrichmentP[rci],
                           termDefinition = enrTab$termDefinition[rci],
                           termOntology = enrTab$termOntology[rci],
                           geneCodes = geneCodes,
                           genePositions = keepEC[match(geneCodes, entrezCodes)]);
                }
              }
              if (ll==1)
              {
                 biggestTerms[[ont]]$enrichment = enrTab;
              } else
                 biggestTerms[[ont]]$enrichment = rbind(biggestTerms[[ont]]$enrichment, enrTab);
           }
         }
      }
      setResults[[set]] = list(countsInTerm = countsInTerm, 
                               enrichmentP = enrichment,
                               bestPTerms = bestPTerms,
                               biggestTerms = biggestTerms);
   }
   
   inGO = rep(FALSE, nGivenRaw);
   inGO[keepEC] = encMapped;
   kept = rep(FALSE, nGivenRaw);
   kept[keepEC] = TRUE;
   if (nSets==1)
   {
      list(keptForAnalysis = kept,
           inGO = inGO,
           countsInTerms = setResults[[1]]$countsInTerm,
           enrichmentP = setResults[[1]]$enrichmentP,
           bestPTerms = setResults[[1]]$bestPTerms,
           biggestTerms = setResults[[1]]$biggestTerms);
   } else {
      list(keptForAnalysis = kept,
           inGO = inGO,
           setResults = setResults);
   }
}