# geneSetEnrichment.R
# take a subset of genes (like from a DE analysis, and a list of 'GeneSets'
# like a pathway groupings or GO groupings, and find the enrichment by these groups
geneSetEnrichment <- function( genes, geneSets=defaultGeneSets(),
upOnly=TRUE, minEnrich=2.0, maxPvalue=0.05, wt.enrich=1, wt.pvalue=2,
geneUniverse=NULL, reportGenes=FALSE, dropPathNameLinks=FALSE, verbose=T) {
# the 'geneSets' can be a vector of strings of geneSet datasets...,
# or the usual list of vectors of genes
cat( "\nGathering GeneSets..")
GSanswer <- gatherGeneSets( geneSets, descriptor="geneSets", mode="combined")
geneSets <- GSanswer$geneSets[[1]]
getPathwaySetData <- function( pathwaySets, geneUniverse=NULL) {
# given the list of all gene sets
setNames <- names( pathwaySets)
cnts <- sapply( pathwaySets, length)
allGenes <- unlist( pathwaySets)
allNames <- rep( setNames, times=cnts)
pathTable <- data.frame( "GeneID"=allGenes, "PathwayID"= allNames, stringsAsFactors=F)
rownames(pathTable) <- 1:nrow(pathTable)
# we may be given just a subset of valid genes in the 'universe'
if ( ! is.null( geneUniverse)) {
#shrink the universe of all known genes
geneUniverse <- as.GeneUniverse( geneUniverse)
keepers <- which( pathTable$GeneID %in% geneUniverse)
pathTable <- pathTable[ keepers, ]
}
nGenesInTable <- length( unique( pathTable$GeneID))
pathID.table <- table( pathTable$PathwayID)
pathGenePcts <- pathID.table * 100 / nGenesInTable
pathGeneCnts <- pathID.table
cat( "\nFound ", length(pathID.table), " Gene Sets to evaluate")
out <- list( "nRowsTotal"=nrow( pathTable), "nUniqueGenes"=nGenesInTable,
"PathwayTable"=pathTable, "idTable"=pathID.table,
"pctByGene"=pathGenePcts, "countByGene"=pathGeneCnts)
return( out)
}
getGeneSetData <- function( genes, pathTable) {
mySet <- subset( pathTable, GeneID %in% genes)
if ( nrow(mySet) < 1) {
cat( "\nWarning: None of the given genes are found in any GeneSet")
cat( "\nVerify the current species matches your genes.\n")
out <- list( "nRowsTotal"=0, "nUniqueGenes"=0,
"PathwayTable"=data.frame(), "idTable"=vector(),
"pctByGene"=vector(), "countByGene"=vector())
return( out)
}
rownames(mySet) <- 1:nrow(mySet)
nGenesInSet <- length( unique( mySet$GeneID))
pathID.table <- table( mySet$PathwayID)
pathGenePcts <- pathID.table * 100 / nGenesInSet
pathGeneCnts <- pathID.table
out <- list( "nRowsTotal"=nrow( mySet), "nUniqueGenes"=nGenesInSet,
"PathwayTable"=mySet, "idTable"=pathID.table,
"pctByGene"=pathGenePcts, "countByGene"=pathGeneCnts)
return( out)
}
findEnrich <- function( gSet, mData, gData, minEnrich=2.0, maxPvalue=0.1,
upOnly=TRUE, wt.fold=1, wt.pvalue=1) {
# given a set of genes, metabolic table count, and the gene set counts
namesG <- names( gData$countByGene)
namesM <- names( mData$countByGene)
# lets look at all, not just those present
# both <- intersect( namesG, namesM)
both <- union( namesG, namesM)
if ( length( both) < 1) return( data.frame())
whereG <- match( both, namesG, nomatch=0)
whereM <- match( both, namesM, nomatch=0)
# gather the gene counts and percentages for each pathway
Mcnt <- ifelse( whereM > 0, mData$countByGene[ whereM], 0)
Mpct <- ifelse( whereM > 0, mData$pctByGene[ whereM], 0)
Gcnt <- ifelse( whereG > 0, gData$countByGene[ whereG], 0)
Gpct <- ifelse( whereG > 0, gData$pctByGene[ whereG], 0)
# enrichment is how much more in our set than the entire table
enrich <- Gpct / Mpct
# the probalities are for each pathway... calculate them
pvals <- vector( length=length(enrich))
cat( "\nCalculating enrichment P-values..\n")
for ( i in 1:length(pvals)) {
x <- Gcnt[i]
m <- Mcnt[i]
n <- mData$nUniqueGenes - m
k <- gData$nUniqueGenes
lowTailWanted <- (enrich[i] < 1.0)
# get the entire prob. dist. and sum up the half we want...
allPs <- dhyper( 0:k, m, n, k)
if ( lowTailWanted) {
# the probability of 0 to X genes
pvals[i] <- sum( allPs[1:(x+1)])
} else {
# the probability of X up to K genes
pvals[i] <- sum( allPs[(x+1):length(allPs)])
}
if (verbose && i %% 10 == 0) cat( "\r", i, sub( "<a.+","",both[i]), enrich[i], pvals[i])
}
# clean the values a bit
enrich <- round( enrich, digits=3)
Mpct <- round( Mpct, digits=2)
Gpct <- round( Gpct, digits=2)
out <- data.frame( both, enrich, pvals, Mcnt, Mpct, Gcnt, Gpct, stringsAsFactors=FALSE)
colnames( out) <- c( "PathName", "Enrichment", "P_value", "N_Total", "Pct_Total", "N_Given", "Pct_Given")
# having a gene count of zero forces a zero enrichment, which is hard to rank
# use a small fudge factor to prevent exact zeros
enrichDE <- enrich
isZero <- which( Gcnt == 0)
if ( length( isZero)) {
smallCnt <- 1 / gData$nUniqueGenes
Gpct[ isZero] <- smallCnt / gData$nUniqueGenes
enrichDE[isZero] <- Gpct[isZero] / Mpct[isZero]
}
# sort by enrichment and P-value, where 1.0 means not enriched...
#ord <- diffExpressRankOrder( out$Enrichment, out$P_value, wt.fold=wt.enrich, wt.pvalue=wt.pvalue,
ord <- diffExpressRankOrder( enrichDE, out$P_value, wt.fold=wt.enrich, wt.pvalue=wt.pvalue,
notDE.value=1.0)
out <- out[ ord, ]
rownames( out) <- 1:nrow( out)
# put on extra info columns
signif <- rep.int( "", nrow(out))
enrich <- out$Enrichment
pvals <- out$P_value
signif[ pvals < 0.1] <- "."
signif[ pvals < 0.05] <- "*"
signif[ pvals < 0.01] <- "**"
signif[ pvals < 0.001] <- "***"
out$OverUnder <- ifelse( enrich >= 1, "Over", "Under")
out$OverUnder[ enrich >= 0.99 & enrich <= 1.01] <- ""
out$Signif <- signif
testValue <- out$Enrichment
testP <- out$P_value
who <- which( testValue >= minEnrich & testP <= maxPvalue)
if ( ! upOnly) {
who2 <- which( testValue <= (1/minEnrich) & testP <= maxPvalue)
who <- unique( c( who, who2))
}
out <- out[ who, ]
if ( nrow(out) > 0) rownames( out) <- 1:nrow( out)
return( out)
}
whichGenesEnriched <- function( genes, pathTable, enrichDF, wt.fold=1, wt.pvalue=1,
mode=c('vector','data.frame')) {
mode <- match.arg( mode)
foundGenes <- foundPaths <- foundEnrich <- foundPvals <- vector()
foundNgiven <- foundNtotal <- vector()
nout <- 0
if ( mode == "data.frame") {
if ( ! nrow(enrichDF)) return( data.frame())
cat( "\nGathering genes in enriched pathways..\n")
for (k in 1:nrow(enrichDF)) {
myPath <- enrichDF$PathName[k]
sml <- subset( pathTable, PathwayID == myPath)
myGenes <- intersect( genes, sml$GeneID)
N <- length(myGenes)
if ( N) {
now <- (nout+1) : (nout+N)
foundGenes[now] <- myGenes
foundPaths[now] <- myPath
foundEnrich[now] <- enrichDF$Enrichment[k]
foundPvals[now] <- enrichDF$P_value[k]
foundNgiven[now] <- enrichDF$N_Given[k]
foundNtotal[now] <- enrichDF$N_Total[k]
nout <- max( now)
}
if ( k %% 10 == 0) cat( "\r", k, sub( "<a.+","",myPath), nout)
}
out <- data.frame( "GeneID"=foundGenes, "Product"=gene2Product(foundGenes),
"PathName"=foundPaths,
"N_Given"=foundNgiven, "N_Total"=foundNtotal,
"Enrichment"=foundEnrich, "P_value"=foundPvals,
stringsAsFactors=FALSE)
#ord <- order( out$GeneID, -out$Enrichment, out$PathName)
#ord <- diffExpressRankOrder( out$Enrichment, out$P_value, wt.fold=wt.enrich,
# wt.pvalue=wt.pvalue, notDE.value=1.0)
#out <- out[ ord, ]
if ( nrow(out)) rownames(out) <- 1:nrow(out)
# don't keep the hyperlinks
out$PathName <- trimGeneSetNameLink( out$PathName)
return( out)
} else {
if ( ! nrow(enrichDF)) return( vector())
cat( "\nGathering genes in enriched pathways..\n")
for (k in 1:nrow(enrichDF)) {
myPath <- enrichDF$PathName[k]
sml <- subset( pathTable, PathwayID == myPath)
myGenes <- intersect( genes, sml$GeneID)
N <- length(myGenes)
now <- (nout+1)
if ( N) {
foundGenes[now] <- paste( sort( myGenes), collapse=", ")
} else {
foundGenes[now] <- ""
}
nout <- now
if ( k %% 10 == 0) cat( "\r", k, sub( "<a.+","",myPath), nout)
}
return( foundGenes)
}
}
# the pathway data is always the short gene names, never full IDs
genes <- shortGeneName( genes, keep=1)
setData <- getPathwaySetData( geneSets, geneUniverse=geneUniverse)
geneData <- getGeneSetData( genes, setData$PathwayTable)
enrichDF <- findEnrich( genes, setData, geneData, upOnly=upOnly,
minEnrich=minEnrich, maxPvalue=maxPvalue,
wt.fold=wt.enrich, wt.pvalue=wt.pvalue)
# with the enrichment result in hand, see what genes are the ones that hit
if ( is.logical(reportGenes) && reportGenes == FALSE) {
if (dropPathNameLinks) {
enrichDF$PathName <- trimGeneSetNameLink( enrichDF$PathName)
}
return( enrichDF)
}
if ( is.logical(reportGenes) && reportGenes == TRUE) {
enrichGenes <- whichGenesEnriched( genes, setData$PathwayTable, enrichDF,
wt.fold=wt.enrich, wt.pvalue=wt.pvalue, mode='data.frame')
if (dropPathNameLinks) {
enrichDF$PathName <- trimGeneSetNameLink( enrichDF$PathName)
enrichGenes$PathName <- trimGeneSetNameLink( enrichGenes$PathName)
}
out <- list( "pathways"=enrichDF, "genes"=enrichGenes)
return( out)
}
if ( is.character(reportGenes)) {
enrichGenes <- whichGenesEnriched( genes, setData$PathwayTable, enrichDF,
wt.fold=wt.enrich, wt.pvalue=wt.pvalue, mode=reportGenes)
if ( is.data.frame( enrichGenes)) {
if (dropPathNameLinks) {
enrichDF$PathName <- trimGeneSetNameLink( enrichDF$PathName)
enrichGenes$PathName <- trimGeneSetNameLink( enrichGenes$PathName)
}
out <- list( "pathways"=enrichDF, "genes"=enrichGenes)
return( out)
}
if (dropPathNameLinks) {
enrichDF$PathName <- trimGeneSetNameLink( enrichDF$PathName)
}
out <- data.frame( enrichDF, "GeneList"=enrichGenes, stringsAsFactors=F)
return( out)
}
}
enrichment2html <- function( tbl, file="enrichment.html", title="Pathway Enrichment results", maxRows=500) {
if ( ! nrow(tbl)) {
cat( "\nEmpty Enrichment table. ", "\n Nothing to write to file: ", file)
return()
}
# add a column to be more obvious
tbl$Over_Under <- ifelse( tbl$Enrichment > 1.0, "Over", "Under")
# make the names a bit easier to wrap around
linkPart <- sub( "(.+)(<a.+)", "\\2", tbl$PathName)
linkPart <- ifelse( linkPart == tbl$PathName, "", linkPart)
namePart <- sub( "(.+)(<a.+)", "\\1", tbl$PathName)
namePart <- gsub( "_", " ", namePart, fixed=T)
# the dots in the module source are needed...
#namePart <- gsub( ".", " ", namePart, fixed=T)
tbl$PathName <- paste( namePart, linkPart)
# do minor reformating
#c( "PathName", "N_Total", "Pct_Total", "N_Given", "Pct_Given", "Enrichment", "P_value")
colnames(tbl)[1] <- "Pathway Name"
tbl$Pct_Total <- as.percent( tbl$Pct_Total, big.value=100, digits=2, sep="")
tbl$Pct_Given <- as.percent( tbl$Pct_Given, big.value=100, digits=2, sep="")
tbl$Enrichment <- formatC( tbl$Enrichment, format="f", digits=3)
tbl$P_value <- formatC( tbl$P_value, format="e", digits=2)
colnames(tbl) <- gsub( "_", " ", colnames(tbl), fixed=T)
colnames(tbl) <- gsub( ".", " ", colnames(tbl), fixed=T)
table2html( tbl, file=file, title=title, linkColumnNames=NULL, maxRows=maxRows)
return()
}
geneSetMetaEnrichment <- function( genes, Ngenes=seq( 100, length(genes), by=100),
geneSets=defaultGeneSets(),
upOnly=TRUE, minEnrich=1.05, maxPvalue=0.5,
wt.enrich=1, wt.pvalue=2, verbose=T) {
# the Enrichment tool with multiple subsets and then do the meta ranking
ansList <- vector( mode="list")
nAns <- 0
for ( n in Ngenes) {
if ( n > length(genes)) next
cat( "\n\nDoing Enrichment of top N = ", n)
ans <- geneSetEnrichment( genes[1:n], geneSets=geneSets, upOnly=T, minEnrich=minEnrich,
maxPvalue=maxPvalue, wt.enrich=wt.enrich, wt.pvalue=wt.pvalue,
reportGenes=FALSE, geneUniverse=NULL, verbose=verbose)
# let's augment the pathway name with how many genes are in it...
ans$PathName <- paste( ans$PathName," (N=", ans$N_Total,")", sep="")
nAns <- nAns + 1
ansList[[nAns]] <- ans
names(ansList)[nAns] <- paste( "N_", n, sep="")
}
cat( "\nDone.\n")
ans <- metaRank.data.frames( ansList, geneColumn="PathName", valueColumn="Enrichment", pvalueColumn="P_value",
missingGenes="fill", missingValue=1)
# tidy up...
colnames(ans)[1] <- "PathName"
ans <- ans[ , -grep( "PRODUCT", colnames(ans))]
return( ans)
}
metaEnrichment2html <- function( tbl, file="metaEnrichment.html", title="Pathway Meta Enrichment Results",
maxRows=500) {
# make the names a bit easier to wrap around
linkPart <- sub( "(.+)(<a.+)", "\\2", tbl$PathName)
linkPart <- ifelse( linkPart == tbl$PathName, "", linkPart)
namePart <- sub( "(.+)(<a.+)", "\\1", tbl$PathName)
namePart <- gsub( "_", " ", namePart, fixed=T)
#namePart <- gsub( ".", " ", namePart, fixed=T)
tbl$PathName <- paste( namePart, linkPart)
# do minor reformating
colnames(tbl)[1] <- "Pathway Name"
tbl$Enrichment <- formatC( tbl$Enrichment, format="f", digits=3)
tbl$AVG_RANK <- formatC( tbl$AVG_RANK, format="f", digits=2)
colnames(tbl)[3] <- "Avg Rank"
tbl$AVG_PVALUE <- formatC( tbl$AVG_PVALUE, format="e", digits=2)
colnames(tbl)[4] <- "Avg Pvalue"
colnames(tbl) <- gsub( "_", " ", colnames(tbl), fixed=T)
colnames(tbl) <- gsub( ".", " ", colnames(tbl), fixed=T)
table2html( tbl, file=file, title=title, linkColumnNames=NULL, maxRows=maxRows)
return()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.