# Function that returns GO terms with best enrichment and highest number of genes.
# So that I don't forget: information about GO categories is contained in the GO.db package
GOenrichmentAnalysis = function(labels, entrezCodes,
yeastORFs = NULL,
organism = "human",
ontologies = c("BP", "CC", "MF"),
evidence = "all",
includeOffspring = TRUE,
backgroundType = "givenInGO",
removeDuplicates = TRUE,
leaveOutLabel = NULL,
nBestP = 10, pCut = NULL, nBiggest = 0,
getTermDetails = TRUE,
verbose = 2, indent = 0 )
{
warning(immediate. = TRUE,
spaste("This function is deprecated and will be removed in the near future. \n",
"We suggest using the replacement function enrichmentAnalysis \n",
"in R package anRichment, available from the following URL:\n",
"https://labs.genetics.ucla.edu/horvath/htdocs/CoexpressionNetwork/GeneAnnotation/"));
sAF = options("stringsAsFactors")
options(stringsAsFactors = FALSE);
on.exit(options(stringsAsFactors = sAF[[1]]), TRUE)
organisms = c("human", "mouse", "rat", "malaria", "yeast", "fly", "bovine", "worm", "canine",
"zebrafish", "chicken");
allEvidence = c("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "ISS", "ISO", "ISA",
"ISM", "IGC", "IBA", "IBD", "IKR", "IRD", "RCA", "TAS", "NAS", "IC", "ND", "IEA",
"NR");
allOntologies = c("BP", "CC", "MF");
backgroundTypes = c("allGiven", "allInGO", "givenInGO");
spaces = indentSpaces(indent);
orgInd = pmatch(organism, organisms);
if (is.na(orgInd))
stop(paste("Unrecognized 'organism' given. Recognized values are ",
paste(organisms, collapse = ", ")));
if (length(evidence)==0)
stop("At least one valid evidence code must be given in 'evidence'.");
if (length(ontologies)==0)
stop("At least one valid ontology code must be given in 'ontology'.");
if (evidence=="all")
evidence = allEvidence;
evidInd = pmatch(evidence, allEvidence);
if (sum(is.na(evidInd))!=0)
stop(paste("Unrecognized 'evidence' given. Recognized values are ",
paste(allEvidence, collapse = ", ")));
ontoInd = pmatch(ontologies, allOntologies);
if (sum(is.na(ontoInd))!=0)
stop(paste("Unrecognized 'ontologies' given. Recognized values are ",
paste(allEvidence, collapse = ", ")));
backT = pmatch(backgroundType, backgroundTypes);
if (is.na(backT))
stop(paste("Unrecognized 'backgroundType' given. Recognized values are ",
paste(backgroundTypes, collapse = ", ")));
orgCodes = c("Hs", "Mm", "Rn", "Pf", "Sc", "Dm", "Bt", "Ce", "Cf", "Dr", "Gg");
orgExtensions = c(rep(".eg", 4), ".sgd", rep(".eg", 6));
reverseMap = c(rep(".egGO2EG", 4), ".sgdGO2ORF", rep(".egGO2EG", 6))
missingPacks = NULL;
packageName = paste("org.", orgCodes[orgInd], orgExtensions[orgInd], ".db", sep="");
if (!eval(parse(text = "require(packageName, character.only = TRUE)")))
missingPacks = c(missingPacks, packageName);
if (!eval(parse(text="require(GO.db)")))
missingPacks = c(missingPacks, "GO.db");
if (!is.null(missingPacks))
stop(paste("Could not load the requisite package(s)",
paste(missingPacks, collapse = ", "), ". Please install the package(s)."))
if (verbose > 0)
{
printFlush(paste(spaces, "GOenrichmentAnalysis: loading annotation data..."));
}
if (orgInd==5)
{
# Yeast needs special care.
if (!is.null(yeastORFs))
{
entrezCodes = yeastORFs
} else {
# Map the entrez IDs to yeast ORFs
x = eval(parse(text = "org.Sc.sgd:::org.Sc.sgdENTREZID"))
# x = org.Sc.sgd:::org.Sc.sgdENTREZID
xx = as.list(x[mapped_genes])
allORFs = names(xx);
mappedECs = as.character(sapply(xx, as.character))
entrez2orf = match(entrezCodes, mappedECs);
fin = is.finite(entrez2orf);
newCodes = paste("InvalidCode", c(1:length(entrezCodes)), sep = ".");
newCodes[fin] = allORFs[entrez2orf[fin]];
entrezCodes = newCodes;
}
}
labels = as.matrix(labels);
nSets = ncol(labels);
nGivenRaw = nrow(labels);
if (removeDuplicates)
{
# Restrict given entrezCodes such that each code is unique
ECtab = table(entrezCodes);
uniqueEC = names(ECtab);
keepEC = match(uniqueEC, entrezCodes);
entrezCodes = entrezCodes[keepEC]
labels = labels[keepEC, , drop = FALSE];
} else
keepEC = c(1:nGivenRaw);
egGO = eval(parse(text = paste(packageName, ":::org.", orgCodes[orgInd], orgExtensions[orgInd],
"GO", sep = "")));
if (orgInd==5)
{
mapped_genes = as.character(do.call(match.fun("mappedkeys"), list(egGO)));
encodes2mapped = match(as.character(entrezCodes), mapped_genes);
} else {
mapped_genes = as.numeric(as.character(do.call(match.fun("mappedkeys"), list(egGO))));
encodes2mapped = match(as.numeric(entrezCodes), mapped_genes);
}
encMapped = is.finite(encodes2mapped);
nAllIDsInGO = sum(encMapped);
mapECodes = entrezCodes[encMapped];
mapLabels = labels[encMapped, , drop = FALSE];
nMappedGenes = nrow(mapLabels);
if (nMappedGenes==0)
stop(paste("None of the supplied gene identifiers map to the GO database.\n",
"Please make sure you have specified the correct organism (default is human)."))
Go2eg = eval(parse(text = paste("AnnotationDbi::as.list(", packageName, ":::org.", orgCodes[orgInd],
reverseMap[orgInd],")", sep = "")));
nTerms = length(Go2eg);
goInfo = as.list(GO.db::GOTERM);
if (length(goInfo) > 0)
{
orgGoNames = names(Go2eg);
dbGoNames = as.character(sapply(goInfo, GOID));
dbGoOntologies = as.character(sapply(goInfo, Ontology));
} else {
dbGoNames = "";
}
goOffSpr = list();
if (includeOffspring)
{
goOffSpr[[1]] = as.list(GOBPOFFSPRING);
goOffSpr[[2]] = as.list(GOCCOFFSPRING);
goOffSpr[[3]] = as.list(GOMFOFFSPRING);
}
term2info = match(names(Go2eg), names(goInfo));
termOntologies = dbGoOntologies[term2info];
if (backT==1)
{
nBackgroundGenes = sum(!is.na(entrezCodes));
} else
{
if (backT==2)
{
nBackgroundGenes = length(mapped_genes)
} else
nBackgroundGenes = nMappedGenes;
}
termCodes = vector(mode="list", length = nTerms);
collectGarbage();
nExpandLength = 0;
blockSize = 3000; # For a more efficient concatenating of offspring genes
nAllInTerm = rep(0, nTerms);
if (verbose > 0)
{
printFlush(paste(spaces, " ..of the", length(entrezCodes),
" Entrez identifiers submitted,", sum(encMapped),
"are mapped in current GO categories."));
printFlush(paste(spaces, " ..will use", nBackgroundGenes,
"background genes for enrichment calculations."));
cat(paste(spaces, " ..preparing term lists (this may take a while).."));
pind = initProgInd();
}
for (c in 1:nTerms) if (!is.na(Go2eg[[c]][[1]]))
{
te = as.character(names(Go2eg[[c]])); # Term evidence codes
tc = Go2eg[[c]];
if (includeOffspring)
{
termOffspring = NULL;
for (ont in 1:length(goOffSpr))
{
term2off = match(names(Go2eg)[c], names(goOffSpr[[ont]]))
if (!is.na(term2off))
termOffspring = c(termOffspring, goOffSpr[[ont]][[term2off]]);
}
if (length(termOffspring)>0)
{
maxLen = blockSize;
tex = rep("", maxLen);
tcx = rep("", maxLen);
ind = 1;
len = length(te);
tex[ ind:len ] = te;
tcx[ ind:len ] = tc;
ind = len + 1;
o2go = match(termOffspring, as.character(names(Go2eg)));
o2go = o2go[is.finite(o2go)]
if (length(o2go)>0) for (o in 1:length(o2go)) if (!is.na(Go2eg[[o2go[o]]][[1]]))
{
#printFlush(paste("Have offspring for term", c, ": ", names(Go2eg)[c],
# Term(goInfo[[term2info[c]]])));
newc = Go2eg[[o2go[o]]];
newe = names(newc);
newl = length(newe);
if ((len + newl) > maxLen)
{
nExpand = ceiling( (len + newl - maxLen)/blockSize);
maxLen = maxLen + blockSize * nExpand;
tex = c(tex, rep("", maxLen - length(tex)));
tcx = c(tcx, rep("", maxLen - length(tex)));
nExpandLength = nExpandLength + 1;
}
tex[ind:(len + newl)] = newe;
tcx[ind:(len + newl)] = newc;
ind = ind + newl;
len = len + newl;
}
te = tex[1:len];
tc = tcx[1:len];
}
}
use = is.finite(match(te, evidence));
if (orgInd==5)
{
if (backT==2)
{
termCodes[[c]] = unique(as.character(tc[use]));
} else
termCodes[[c]] = as.character(intersect(tc[use], mapECodes));
} else {
if (backT==2)
{
termCodes[[c]] = unique(as.character(tc[use]));
} else
termCodes[[c]] = as.numeric(as.character(intersect(tc[use], mapECodes)));
}
nAllInTerm[c] = length(termCodes[[c]]);
if ( (c %%50 ==0) & (verbose > 0)) pind = updateProgInd(c/nTerms, pind);
}
if (verbose > 0)
{
pind = updateProgInd(1, pind);
printFlush("");
}
if ((verbose > 5) & (includeOffspring))
printFlush(paste(spaces, " ..diagnostic for the developer: offspring buffer was expanded",
nExpandLength, "times."));
ftp = function(...) { fisher.test(...)$p.value }
setResults = list();
for (set in 1:nSets)
{
if (verbose > 0)
printFlush(paste(spaces, " ..working on label set", set, ".."));
labelLevels = levels(factor(labels[, set]));
if (!is.null(leaveOutLabel))
{
keep = !(labelLevels %in% as.character(leaveOutLabel));
if (sum(keep)==0)
stop("No labels were kept after removing labels that are supposed to be ignored.");
labelLevels = labelLevels[keep]
}
nLabelLevels = length(labelLevels);
modCodes = list();
nModCodes = rep(0, nLabelLevels);
if (backT==1)
{
for (ll in 1:nLabelLevels)
{
modCodes[[ll]] = entrezCodes[labels[, set]==labelLevels[ll]];
nModCodes[ll] = length(modCodes[[ll]]);
}
} else
{
for (ll in 1:nLabelLevels)
{
modCodes[[ll]] = mapECodes[mapLabels[, set]==labelLevels[ll]];
nModCodes[ll] = length(modCodes[[ll]]);
}
}
countsInTerm = matrix(0, nLabelLevels, nTerms);
enrichment = matrix(1, nLabelLevels, nTerms);
for (ll in 1:nLabelLevels)
countsInTerm[ll, ] = sapply(lapply(termCodes, intersect, modCodes[[ll]]), length)
nAllInTermMat = matrix(nAllInTerm, nLabelLevels, nTerms, byrow = TRUE);
nModCodesMat = matrix(nModCodes, nLabelLevels, nTerms);
tabArr = array(c(countsInTerm, nAllInTermMat - countsInTerm,
nModCodesMat - countsInTerm,
nBackgroundGenes - nModCodesMat - nAllInTermMat + countsInTerm),
dim = c(nLabelLevels * nTerms, 2, 2));
if (verbose > 0)
printFlush(paste(spaces, " ..calculating enrichments (this may also take a while).."));
calculate = c(countsInTerm) > 0;
enrichment[calculate] = apply(tabArr[calculate, , ], 1, ftp, alternative = "g");
dimnames(enrichment) = list (labelLevels, names(Go2eg));
dimnames(countsInTerm) = list (labelLevels, names(Go2eg));
bestPTerms = list();
modSizes = table(labels[ !(labels[, set] %in% leaveOutLabel), set]);
if (!is.null(pCut) || nBestP > 0)
{
printFlush(paste(spaces, " ..putting together terms with highest enrichment significance.."));
nConsideredOntologies = length(ontologies)+1;
for (ont in 1:nConsideredOntologies)
{
if (ont==nConsideredOntologies)
{
ontTerms = is.finite(match(termOntologies, ontologies))
bestPTerms[[ont]] = list(ontology = ontologies);
names(bestPTerms)[ont] = paste(ontologies, collapse = ", ");
} else {
ontTerms = termOntologies==ontologies[ont];
bestPTerms[[ont]] = list(ontology = ontologies[ont]);
names(bestPTerms)[ont] = ontologies[ont];
}
bestPTerms[[ont]]$enrichment = NULL;
bestPTerms[[ont]]$termInfo = list();
nOntTerms = sum(ontTerms)
ontEnr = enrichment[, ontTerms, drop = FALSE];
order = apply(ontEnr, 1, order);
for (ll in 1:nLabelLevels)
{
if (!is.null(pCut))
{
reportTerms = c(1:nTerms)[ontTerms][ontEnr[ll, ] < pCut];
reportTerms = reportTerms[order(ontEnr[ll, ][reportTerms])];
} else
reportTerms = c(1:nTerms)[ontTerms][order[1:nBestP, ll]];
nRepTerms = length(reportTerms);
enrTab = data.frame(module = rep(labelLevels[ll], nRepTerms),
modSize = rep(modSizes[ll], nRepTerms),
bkgrModSize = rep(nModCodes[ll], nRepTerms),
rank = seq(length.out = nRepTerms),
enrichmentP = enrichment[ll, reportTerms],
BonferoniP = pmin(rep(1, nRepTerms),
enrichment[ll, reportTerms] * nOntTerms),
nModGenesInTerm = countsInTerm[ll, reportTerms],
fracOfBkgrModSize = countsInTerm[ll, reportTerms]/nModCodes[ll],
fracOfBkgrTermSize =
countsInTerm[ll, reportTerms]/nAllInTerm[reportTerms],
bkgrTermSize = nAllInTerm[reportTerms],
termID = names(Go2eg)[reportTerms],
termOntology = rep("NA", nRepTerms),
termName = rep("NA", nRepTerms),
termDefinition = rep("NA", nRepTerms))
bestPTerms[[ont]]$forModule[[ll]] = list();
for (rci in seq(length.out = nRepTerms))
{
term = reportTerms[rci];
termID = names(Go2eg)[term];
dbind = match(termID, dbGoNames);
if (is.finite(dbind))
{
enrTab$termName[rci] = eval(parse(text = "AnnotationDbi:::Term(goInfo[[dbind]])"));
enrTab$termDefinition[rci] =
eval(parse(text = "AnnotationDbi:::Definition(goInfo[[dbind]])"));
enrTab$termOntology[rci] =
eval(parse(text = "AnnotationDbi:::Ontology(goInfo[[dbind]])"));
}
if (getTermDetails)
{
geneCodes = intersect(modCodes[[ll]], termCodes[[term]])
bestPTerms[[ont]]$forModule[[ll]][[rci]] = list(termID = termID,
termName = enrTab$termName[rci],
enrichmentP = enrTab$enrichmentP[rci],
termDefinition = enrTab$termDefinition[rci],
termOntology = enrTab$termOntology[rci],
geneCodes = geneCodes,
genePositions = keepEC[match(geneCodes, entrezCodes)]);
}
}
if (ll==1)
{
bestPTerms[[ont]]$enrichment = enrTab;
} else
bestPTerms[[ont]]$enrichment = rbind(bestPTerms[[ont]]$enrichment, enrTab);
}
}
}
biggestTerms = list();
if (nBiggest > 0)
{
printFlush(paste(spaces, " ..putting together terms with largest number of genes in modules.."));
nConsideredOntologies = length(ontologies)+1;
for (ont in 1:nConsideredOntologies)
{
if (ont==nConsideredOntologies)
{
ontTerms = is.finite(match(termOntologies, ontologies))
biggestTerms[[ont]] = list(ontology = ontologies);
names(biggestTerms)[ont] = paste(ontologies, collapse = ", ");
} else {
ontTerms = termOntologies==ontologies[ont];
biggestTerms[[ont]] = list(ontology = ontologies[ont]);
names(biggestTerms)[ont] = ontologies[ont];
}
biggestTerms[[ont]]$enrichment = NULL;
biggestTerms[[ont]]$termInfo = list();
nOntTerms = sum(ontTerms)
ontNGenes = countsInTerm[, ontTerms, drop = FALSE];
order = apply(-ontNGenes, 1, order);
for (ll in 1:nLabelLevels)
{
reportTerms = c(1:nTerms)[ontTerms][order[1:nBiggest, ll]];
nRepTerms = length(reportTerms);
enrTab = data.frame(module = rep(labelLevels[ll], nRepTerms),
modSize = rep(modSizes[ll], nRepTerms),
bkgrModSize = rep(nModCodes[ll], nRepTerms),
rank = seq(length.out = nRepTerms),
enrichmentP = enrichment[ll, reportTerms],
BonferoniP = pmin(rep(1, nRepTerms),
enrichment[ll, reportTerms] * nOntTerms),
nModGenesInTerm = countsInTerm[ll, reportTerms],
fracOfModSize = countsInTerm[ll, reportTerms]/nModCodes[ll],
fracOfBkgrTermSize =
countsInTerm[ll, reportTerms]/nAllInTerm[reportTerms],
bkgrTermSize = nAllInTerm[reportTerms],
termID = names(Go2eg)[reportTerms],
termOntology = rep("NA", nRepTerms),
termName = rep("NA", nRepTerms),
termDefinition = rep("NA", nRepTerms))
biggestTerms[[ont]]$forModule[[ll]] = list();
for (rci in seq(length.out = nRepTerms))
{
term = reportTerms[rci];
termID = names(Go2eg)[term];
dbind = match(termID, dbGoNames);
if (is.finite(dbind))
{
enrTab$termName[rci] = eval(parse(text="AnnotationDbi:::Term(goInfo[[dbind]])"));
enrTab$termDefinition[rci] =
eval(parse(text="AnnotationDbi:::Definition(goInfo[[dbind]])"));
enrTab$termOntology[rci] = eval(parse(text="AnnotationDbi:::Ontology(goInfo[[dbind]])"));
}
if (getTermDetails)
{
geneCodes = intersect(modCodes[[ll]], termCodes[[term]])
biggestTerms[[ont]]$forModule[[ll]][[rci]] = list(termID = termID,
termName = enrTab$termName[rci],
enrichmentP = enrTab$enrichmentP[rci],
termDefinition = enrTab$termDefinition[rci],
termOntology = enrTab$termOntology[rci],
geneCodes = geneCodes,
genePositions = keepEC[match(geneCodes, entrezCodes)]);
}
}
if (ll==1)
{
biggestTerms[[ont]]$enrichment = enrTab;
} else
biggestTerms[[ont]]$enrichment = rbind(biggestTerms[[ont]]$enrichment, enrTab);
}
}
}
setResults[[set]] = list(countsInTerm = countsInTerm,
enrichmentP = enrichment,
bestPTerms = bestPTerms,
biggestTerms = biggestTerms);
}
inGO = rep(FALSE, nGivenRaw);
inGO[keepEC] = encMapped;
kept = rep(FALSE, nGivenRaw);
kept[keepEC] = TRUE;
if (nSets==1)
{
list(keptForAnalysis = kept,
inGO = inGO,
countsInTerms = setResults[[1]]$countsInTerm,
enrichmentP = setResults[[1]]$enrichmentP,
bestPTerms = setResults[[1]]$bestPTerms,
biggestTerms = setResults[[1]]$biggestTerms);
} else {
list(keptForAnalysis = kept,
inGO = inGO,
setResults = setResults);
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.