# set global chunk options
library(knitr)
opts_chunk$set(size='tiny')

Purpose

The whole purpose of this vignette/cheatsheet is to show different approaches for testing for significance of certain over-representation in one group of genes vs another.

The data

we'll take a look what is enriched in the sixth cluster compare to the rest

library(vp.misc)
library(clusterProfiler)
library(GOstats)
library(ReactomePA)
data(gcSample)
# there are 8 clusters
sapply(gcSample, length)
# the IDs are Entrez Gene IDs
head(gcSample[[1]])

KEGG annotation

GOstats package

p <- new("KEGGHyperGParams",
     geneIds=gcSample[[6]],
     universeGeneIds=unique(unlist(gcSample)),
     pvalueCutoff = 1,
     annotation="org.Hs.eg.db",
     testDirection="over")
xke1 <- hyperGTest(p)
res <- summary(xke1)[,c("KEGGID","Term","Pvalue","Count","Size")]
kable(head(res))

clusterProfiler package

xke2 <- enrichKEGG(gcSample[[6]], universe = unlist((gcSample)),
                   pvalueCutoff = 1, qvalueCutoff = 1,
                   minGSSize = 0,
                   use_internal_data=TRUE) # then uses KEGG.db as GOstats below
res <- summary(xke2)[,c("ID","Description","pvalue","GeneRatio","BgRatio")]
rownames(res) <- NULL
kable(head(res))

GO BP annotation

Pretty much same thing as above, but with a few tweaks

GOstats package

p <- new("GOHyperGParams",
     geneIds=gcSample[[6]],
     universeGeneIds=unique(unlist(gcSample)),
     pvalueCutoff = 1,
     annotation="org.Hs.eg.db",
     ontology="BP",
     conditional=FALSE, # influential option
     testDirection="over")
xgobp1 <- hyperGTest(p)
res <- summary(xgobp1)[,c("GOBPID","Term","Pvalue","Count","Size")]
kable(head(res))

clusterProfiler package

xgobp2 <- enrichGO(gcSample[[6]], 
                   OrgDb = "org.Hs.eg.db",
                   universe = unique(unlist((gcSample))),
                   ont = "BP", pvalueCutoff = 1, qvalueCutoff = 1,
                   minGSSize = 0) 
xgobp2 <- subset_by_size(xgobp2, maxObsSize = 50)
res <- summary(xgobp2)[,c("ID","Description","pvalue","GeneRatio","BgRatio")]
rownames(res) <- NULL
kable(head(res))

There is a discrepancy between the results.

clusterProfiler does the test in the following chain:

enrichGO -> DOSE:::enrich.internal

GOstats chain:

hyperGTest -> GOstats:::.hyperGTestInternal -> Category:::.doHyperGTest -> Category:::.doHyperGInternal

It seems that GOstats does quite a bit of gene and GO term removal prior the test.


GO CC annotation

GOstats package

p <- new("GOHyperGParams",
     geneIds=gcSample[[6]],
     universeGeneIds=unique(unlist(gcSample)),
     pvalueCutoff = 1,
     annotation="org.Hs.eg.db",
     ontology="CC",
     conditional=FALSE, # influential option
     testDirection="over")
xgocc1 <- hyperGTest(p)
res <- summary(xgocc1)[,c("GOCCID","Term","Pvalue","Count","Size")]
kable(head(res))

clusterProfiler package

xgocc2 <- enrichGO(gcSample[[6]], 
                   OrgDb = "org.Hs.eg.db",
                   universe = unique(unlist((gcSample))),
                   ont = "CC", pvalueCutoff = 1, qvalueCutoff = 1,
                   minGSSize = 0) 
xgocc2 <- subset_by_size(xgocc2, maxObsSize = 50)
res <- summary(xgocc2)[,c("ID","Description","pvalue","GeneRatio","BgRatio")]
rownames(res) <- NULL
kable(head(res))

GO MF annotation

GOstats package

p <- new("GOHyperGParams",
     geneIds=gcSample[[6]],
     universeGeneIds=unique(unlist(gcSample)),
     pvalueCutoff = 1,
     annotation="org.Hs.eg.db",
     ontology="MF",
     conditional=FALSE, # influential option
     testDirection="over")
xgomf1 <- hyperGTest(p)
res <- summary(xgomf1)[,c("GOMFID","Term","Pvalue","Count","Size")]
kable(head(res))

clusterProfiler package

xgomf2 <- enrichGO(gcSample[[6]], 
                   OrgDb = "org.Hs.eg.db",
                   universe = unique(unlist((gcSample))),
                   ont = "MF", pvalueCutoff = 1, qvalueCutoff = 1,
                   minGSSize = 0) 
xgomf2 <- subset_by_size(xgomf2, maxObsSize = 50)
res <- summary(xgomf2)[,c("ID","Description","pvalue","GeneRatio","BgRatio")]
rownames(res) <- NULL
kable(head(res))

Reactome

As for Reactome there is only one pre-canned solution - from ReactomePA

ReactomePA package

xreac <- enrichPathway(gcSample[[6]], 
                       organism = "human",
                       universe = unique(unlist((gcSample))),
                       pvalueCutoff = 1, qvalueCutoff = 1, minGSSize = 0) 
xreac <- subset_by_size(xreac, maxObsSize = 50)
res <- summary(xreac)[,c("ID","Description","pvalue","GeneRatio","BgRatio")]
rownames(res) <- NULL
kable(head(res))

There is a warning because some genes map to both human and mouse pathways. Later this creates trouble. Here is my post about that on Bioconductor's mailing list.


PFAM

GOstats package

p <- new("PFAMHyperGParams",
     geneIds=gcSample[[6]],
     universeGeneIds=unique(unlist(gcSample)),
     pvalueCutoff = 1,
     annotation="org.Hs.eg.db",
     testDirection="over")
xpfam <- hyperGTest(p)
res <- summary(xpfam)
kable(head(res))

Adding description to PFAM term is something to think about.


Custom stuff

Taking advantage of enricher function from clusterProfiler

# randomly assign genes to 20 clusters
setnum <- replicate(length(unique(unlist(gcSample))), sample(1:20, 1))
setnum <- paste("set", setnum, sep = '')
TERM2GENE <- data.frame(term=setnum, gene=unique(unlist(gcSample)),
                        stringsAsFactors = FALSE)
TERM2NAME <- data.frame(term=setnum, 
                        name=paste("Description: ", setnum, sep = ''),
                        stringsAsFactors = FALSE)
xfake <- enricher(gene = gcSample[[6]], 
                  universe = unique(unlist(gcSample)),
                  pvalueCutoff = 1, qvalueCutoff = 1, minGSSize = 0,
                  TERM2GENE = TERM2GENE, 
                  TERM2NAME = TERM2NAME)
res <- summary(xfake)[,c("ID","Description","pvalue","GeneRatio","BgRatio")]
rownames(res) <- NULL
kable(head(res))
cnetplot(xfake, vertex.label.cex = 0.5)

A different layout

cnetplot(xfake, vertex.label.cex = 0.5, layout="kk")

Check out as well HTSanalyzeR and geecc Bioconductor packages.



vladpetyuk/vp.misc documentation built on June 25, 2021, 6:35 a.m.