knitr::opts_chunk$set(message=FALSE, warning=FALSE) devtools::load_all(".")
library(BS831)
Here, we show the use of the
hyper-geometric distribution
to test for enrichment of a (biologically relevant) category (e.g., a
pathway) in a differential gene expression signature. We will show the
use of the functions phyper
and fisher.test
. We will then show the
definition of a simple script to perform "hyper-enrichment" across
multiple categories/pathways.
Recall the set-up from the slides (BS831_class05_ComparativeEnrichment.pptx
).
We start by loading the package containing the hyperEnrichment script.
We start by loading the code containing the hyperEnrichment scripts.
data(hyper) print(hyperGsets) # show the size of the first 10 genesets sapply(getGeneSet(hyperGsets)[1:10],length) # show the size of the signatures print(sapply(hyperSig,length)) # let's rename the signatures names(hyperSig) <- gsub("UP","REPRESSED",gsub("DN","ACTIVATED",names(hyperSig)))
The genesets (the categories) represent a subset of the genesets contained in the MSgigDB's c2.cp compendium. The signatures represent the up- and down-regulated genes in oral cancer cell lines where one of several regulators (TAZ, YAP, TAZ/YAP, DPAGT1) was knocked down (KD).
For example, the "YAP" signature was obtained by performing differential analysis on a 6-sample datasets corresponding to "knockdown vs. control" (3 vs. 3) experiments performed on the HSC3 oral cancer cell line. The genes significantly down-regulated (up-regulated) upon knockdown represent the genes assumed to be activated (repressed) by TAZ.
phyper
We first test for enrichment of a single geneset in a single signature
using the function phyper
(the cumulative function of the
hyper-geometric distribution). The background population is set to
23,467, which represents the number of annotated genes in the dataset
used to derive the differential signature.
## background population backpop <- 23467 ## let us extract the signature of YAP-activated genes signature <- hyperSig$YAP.ACTIVATED ## and a REACTOME pathway pathway <- getGeneSet(hyperGsets)[["REACTOME_TRANSCRIPTION"]] ## let's check the overlap (if any) print( overlap <- length(intersect(signature,pathway)) ) ## let us take a look at the phyper arguments (call ?phyper for more details) print(args(phyper)) ## let us apply the function to our data (see figure) phyper( q=overlap-1, # number of red marbles in the draw - 1 (see below) m=length(pathway), # number of red marbles in urn n=backpop-length(pathway), # number of green marbles in urn k=length(signature), # Number of drawn marbles lower.tail=FALSE) # compute P( X > overlap ), hence the '-1' above # ..to achieve P(X≥overlap)
Clearly, the category (pathway) "REACTOME_TRANSCRIPTOME" is highly significantly enriched in the signature of YAP-activated genes.
fisher.test
We then show how we can obtain equivalent results using the
fisher.test
function. We first need to properly fill-in a
contingency table, and then apply the function.
## we need to define the contingency table ## ## | DRAWN !DRAWN | TOT ## ------+----------------+---- ## GREEN | k-q n-m-k+q | n-m ## RED | q m-q | m ## ------+----------------+---- ## TOT | k n-k | n contable <- matrix(c( dg=length(signature)-overlap, dr=overlap, ng=backpop-length(signature)-length(pathway)+overlap, nr=length(pathway)-overlap),2,2,dimnames=list(c("GREEN","RED"),c("DRAWN","not DRAWN"))) print(contable) fisher.test(contable,alt="less")
As you can see, the p-value is the same (p=1.263e-06).
We here define a (relatively) simple function, hyperEnrichment
, to
run hyper-geometric-based enrichment tests on multiple signatures and
mutiple genesets.
## function copied from hypeR hyperEnrichment <- function( signature, genesets, background=length(unique(unlist(genesets))), plotting=TRUE) { if (!is(signature, "vector")) stop("Expected signature to be a vector of symbols\n") if (!is(genesets, "list")) stop("Expected genesets to be a list of genesets\n") signature <- unique(signature) genesets <- lapply(genesets, unique) # Construct table signature.found <- signature[signature %in% unique(unlist(genesets))] n.hits <- sapply(genesets, function(x, y) length(intersect(x, y)), signature.found) n.drawn <- length(signature) n.genesets <- sapply(genesets, length) n.left <- background-n.genesets # Hypergeometric test pvals <- suppressWarnings(stats::phyper(q=n.hits-1, m=n.genesets, n=n.left, k=n.drawn, lower.tail=FALSE)) # Format data data <- data.frame(label=names(genesets), pval=signif(pvals, 2), fdr=signif(stats::p.adjust(pvals, method="fdr"), 2), signature=length(signature), geneset=n.genesets, overlap=n.hits, background=background, hits=sapply(genesets, function(x, y) paste(intersect(x, y), collapse=','), signature.found), stringsAsFactors=FALSE) # Handle plots if (plotting) { plots <- mapply(function(geneset, title) { ggvenn(signature, geneset, "Signature", "Geneset", title) }, genesets, names(genesets), USE.NAMES=TRUE, SIMPLIFY=FALSE) } else { plots <- lapply(genesets, function(x) {ggempty()}) } return(list(data=data, plots=plots)) }
We then apply it to the list of oral cancer knockdown signatures and two MSigDB geneset compendia, the set of canonical pathways c2.cp and a set of hallmark genesets h.all [Liberzon et al., 2015].
library(openxlsx) hyperE <- hyperEnrichment(signature=hyperSig,categories=getGeneSet(hyperGsets),ntotal=backpop) head(hyperE) hyperE.fdr25 <- hyperE[hyperE[,'fdr']<=0.25,] head(hyperE.fdr25) ## let us save it as a '.xlsx' object for you to inspect write.xlsx(hyperE.fdr25, file=file.path(system.file("extdata", package="BS831"), "hyperE.fdr25.xls"))
Let's load a different geneset compendium (hallmark genesets)
HALL <- new("GeneSet", file.path(system.file("extdata", package="BS831"), "h.all.v6.1.symbols.gmt")) # show the size of the first 10 genesets sapply(getGeneSet(HALL)[1:10],length) hyperHALL <- hyperEnrichment(drawn=hyperSig,categories=getGeneSet(HALL),ntotal=backpop) hyperHALL.fdr25 <- hyperHALL[hyperHALL[,'fdr']<=0.25,] head(hyperHALL.fdr25)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.