cssFile <- system.file("extdata", "style.css", package="ccPaper") options(markdown.HTML.stylesheet = cssFile)
Note that running this analysis will require at least 4 GB of RAM unless objects are deleted when they are no longer required.
To demonstrate the utility of the general categoryCompare
approach, we will construct an artificial data set based on the Gene Ontology. We will attempt to demonstrate that differences in the set or list based comparisons are also dependent on the number of items annotated to a particular term.
Set based: calculations performed independently on each set of features, and then the results are combined.
List based: calculations performed on the intersected list of two sets of features.
library(ggplot2) useTheme <- theme_bw() + theme(legend.key=element_blank(), legend.title = element_text(size=25), axis.title = element_text(size=25), axis.text = element_text(size=15), legend.text = element_text(size=20), strip.text.y = element_text(size=20), strip.text.x = element_text(size=20)) savePlot <- function(plotObj, plotFile, figPath="/mlab/data/rmflight/Documents/projects/work/ccPaper/savedPlots"){ if (file.exists(figPath)){ plotObj <- plotObj + ggtitle(NULL) png(filename=file.path(figPath, plotFile), width=854, height=628) print(plotObj) dev.off() } }
We will make use of three sets of GO terms from H. sapiens. For each set, we want differing numbers of genes annotated to them.
Let's first look at the distribution of number of genes annotated to the GO terms in biological process
library(org.Hs.eg.db) library(GO.db) library(ccPaper) hsGO <- as.list(org.Hs.egGO2ALLEGS) goOntology <- Ontology(names(hsGO)) goBP <- names(goOntology)[goOntology == "BP"] hsGO <- hsGO[goBP] hsGO <- lapply(hsGO, unique) universeGenes <- unique(unlist(hsGO))
hsGO_count <- sapply(hsGO, length) hsGO_count <- data.frame(size=hsGO_count) p <- ggplot(hsGO_count, aes(x=size)) + geom_bar(binwidth=10) + xlim(0, 2000) + ylim(0, 500) + useTheme + ggtitle("Number of genes annotated to GO terms") + xlab('Num. of Genes') + ylab('Count') p savePlot(p, "go2gene_distribution.png")
Based on the distribution in the previous figure, we will set up 3 different groups of GO terms, with a different number from each group:
This gives us a total of 100 GO terms, with a good sample from each group. We will generate random samples that are defined by these limits, and use this set of GO terms going forward.
grpLow <- c(10, 100) grpMed <- c(250, 500) grpHi <- c(500, 1500)
set.seed(271113) # for reproducibility nGO <- c(50, 30, 20) names(nGO) <- c("low", "med", "hi") GO_low <- limitedRandomSample(hsGO_count, grpLow, nGO["low"]) GO_med <- limitedRandomSample(hsGO_count, grpMed, nGO["med"]) GO_hi <- limitedRandomSample(hsGO_count, grpHi, nGO["hi"]) useGO <- c(GO_low, GO_med, GO_hi) useGOSizeClass <- c(rep("low", nGO["low"]), rep("med", nGO["med"]), rep("hi", nGO["hi"])) names(useGOSizeClass) <- useGO
Before we go and generate our samples, we should actually process some real experimental data and examine the fraction of annotated genes in the experiment compared to the total number of genes annotated. We will use the ALL
data set from Bioconductor
and a paired lung cancer dataset available from GEO (available as part of this package).
The ALL
data set has two classes of tumor, B and T-cell lymphoblastoma. We will do a simple differential expression between the two cell types.
library(ALL) library(limma) data(ALL) grpALL <- factor(strtrim(pData(ALL)$BT, 1)) designALL <- model.matrix(~0+grpALL) colnames(designALL) <- c("B", "T") fitALL <- lmFit(ALL, designALL) contMatrixALL <- makeContrasts(BvT=B-T, levels=designALL) fitALL2 <- contrasts.fit(fitALL, contMatrixALL) fitALL2 <- eBayes(fitALL2) deALL <- rownames(topTable(fitALL2, adjust="BH", p.value=0.05, number=Inf))
Now for all the GO terms that map to the probes in the ALL
differentially expressed genes, how does the proportion of differentially expressed genes change as a function of the number of genes annotated to the GO term?
library(hgu95av2.db) ALLgo <- as.list(hgu95av2GO2ALLPROBES) ALL_sizeFrac <- trimGOFrac(ALLgo, deALL) ggplot(ALL_sizeFrac, aes(x=size, y=frac)) + geom_point() + xlim(0, 1500) + useTheme + ggtitle("DE ALL fraction as a function of size") + xlab("Number of Genes Annotated to Term") + ylab("Fraction of Term Genes in DE Sample")
The lung cancer data is a paired sample design from GEO.
data(lung) sampleStat <- strsplit2(pData(lung)$title, " ") notNP <- grep("^NP", sampleStat[,2], invert=T) lung <- lung[, notNP] sampleStat <- sampleStat[notNP,] sampleID <- factor(sampleStat[,2]) cancerID <- factor(sampleStat[,1]) lungDesign <- model.matrix(~sampleID+cancerID) fitLung <- lmFit(lung, lungDesign) fitLung <- eBayes(fitLung) deLung <- rownames(topTable(fitLung, coef="cancerIDTumor", number=2000, adjust.method="BY", p.value=0.00001))
library(hgu133plus2.db) lung2go <- as.list(hgu133plus2GO2ALLPROBES) lung_sizeFrac <- trimGOFrac(lung2go, deLung) lung_sizeFrac$genelist <- "lung" ggplot(lung_sizeFrac, aes(x=size, y=frac)) + geom_point() + xlim(0, 1500) + useTheme + ggtitle("DE Lung fraction as a function of size") + xlab("Num. of Genes") + ylab("Fraction")
Now, for all of these we need to generate two samples of genes from them to comprise our differentially expressed (DE) sets from the genome, representing two different expression experiments.
How do we select these two sets of genes?
What we expect from this is that in both datasets, the GO terms with lower numbers of genes annotated will have large fractions of their annotations present, while GO terms with more and more genes will have lower fractions, and there will be less overlap between the two datasets.
These expectations should be checked.
goList <- list(low=GO_low, med=GO_med, hi=GO_hi) nGene <- 1000 sample1_org <- sampleTerms(useGO, hsGO, 1000, 4)[1:nGene] sample2_org <- sampleTerms(useGO, hsGO, 1000, 4)[1:nGene] #check our expectations goFractions <- calcFraction(hsGO[useGO], list(sample1=sample1_org, sample2=sample2_org)) ggplot(goFractions, aes(x=size, y=frac, color=genelist)) + geom_point() + useTheme + ggtitle("Sample GO fractions") + xlab("Num. of Genes") + ylab("Fraction")
Lets actually compare this to the distribution of fractions vs sizes from the experimental data sets. Because those used all the GO terms, we will redo them with all GO terms as well, not just those from our sample.
sampleGOFracs <- calcFraction(hsGO, list(sample1=sample1_org, sample2=sample2_org)) ALL_sizeFrac$genelist <- "all" ALL_tmp <- rbind(ALL_sizeFrac, lung_sizeFrac, sampleGOFracs[,c("frac", "genelist", "size")]) ggplot(ALL_tmp, aes(x=size, y=frac)) + geom_point() + xlim(0, 1500) + facet_grid(genelist ~ .) + useTheme + ggtitle("Sample GO fractions for all") + xlab("Num. of Genes") + ylab("Fraction")
keepItems <- c("goFractions", "sample1_org", "sample2_org", "hsGO", "useGO", "useGOSizeClass", "GO_hi", "GO_low", "GO_med", "grpHi", "grpLow", "grpMed", "minGO", "nGO", "nGene", "universeGenes", "savePlot", "useTheme", "fitALL2") allItems <- ls() delItems <- allItems[!(allItems %in% keepItems)] rm(list=delItems)
For each geneList
, we will do the GO enrichment calculations, and then get the results. We also do the intersection of all the geneList
's, and do the calculations again.
samples_noNoise <- list(sample1 = sample1_org, sample2 = sample2_org) go_noNoise <- hyperGOMultiEnrichment(samples_noNoise, universe=universeGenes)
Now get the p-values out (-1 * log10(pvalue)
). For each of our test GO terms, we take the minimum p-value from the results of the set calculations, and also the values from the list calculations.
noNoise_results <- pvaluesMultiEnrich(c("sample1", "sample2"), useGO, go_noNoise) noNoise_pvalues <- pvalueDiffSig(noNoise_results$values, pCutoff=0.05, log=TRUE)
Finally, lets add in our values of the fraction, total genes, and the class (low, med, hi) so we can do some fancy plotting stuff with it.
noNoise_pvalues$sizeClass <- useGOSizeClass noNoise_pvalues$size <- goFractions[1:length(useGOSizeClass),4] noNoise_pvalues$frac <- goFractions$frac[1:length(useGOSizeClass)]
Now we will plot the difference in log-p-values (where we have set - list), so that positive values imply that set had lower p-values than list, based on the fraction of genes annotated.
noNoise_pvalues$sizeClass <- factor(noNoise_pvalues$sizeClass, levels=c("low", "med", "hi"), ordered=TRUE) p <- ggplot(noNoise_pvalues, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass, scales="free_x") + xlab("fraction") + useTheme + ggtitle("Sample - Combined Differences") + xlab("Fraction") + ylab("Diff. log p-values") p savePlot(p, "nonoise_sample_combineddiff.png") ggplot(noNoise_pvalues, aes(x=sizeClass, y=diff)) + geom_boxplot() + geom_point() + ggtitle("Sample - Combined Differences") + xlab("Size Class") + ylab("Diff. log p-values")
We can see that it appears in general that set has better p-values than list, especially in the med and hi groups. Also, both methods get pretty much the same GO terms as significant.
How many of the GO terms have better p-values in set compared to list?
tapply(noNoise_pvalues$diff, noNoise_pvalues$sizeClass, function(x){sum(x > 0)})
What is the average difference among each group?
tapply(noNoise_pvalues$diff, noNoise_pvalues$sizeClass, mean)
So ideally, the mean of the full set of differences, combined with how many pass in each case, should adequately quantify how well each of the methods is doing.
In addition to the genes that are annotated to the GO terms of interest, we will also add in genes that have no relation to the GO terms under consideration, these would be considered noise genes. To make the model simple, we will have the same genes as noise in both samples.
nNoise <- 500 noiseGenes <- possibleNoise(hsGO, useGO) noiseGenes <- sample(noiseGenes, nNoise) # check that we did this right, the fraction should not change after adding noise genes sample1 <- c(sample1_org, noiseGenes) sample2 <- c(sample2_org, noiseGenes) samplesNoise <- list(sample1=sample1, sample2=sample2) goFracNoise <- calcFraction(hsGO[useGO], samplesNoise) plot(goFracNoise$frac, goFractions$frac)
go_noise <- hyperGOMultiEnrichment(samplesNoise, universeGenes)
noise_results <- pvaluesMultiEnrich(c("sample1", "sample2"), useGO, go_noise) noise_pvalues <- pvalueDiffSig(noise_results$values, pCutoff=0.05, log=TRUE) noise_pvalues$sizeClass <- useGOSizeClass noise_pvalues$size <- goFracNoise[1:length(useGOSizeClass),4] noise_pvalues$frac <- goFracNoise$frac[1:length(useGOSizeClass)]
noise_pvalues$sizeClass <- factor(noise_pvalues$sizeClass, c("low", "med", "hi"), ordered=TRUE) ggplot(noise_pvalues, aes(x=frac, y=diff, color=sigState)) + geom_point() + facet_grid(. ~ sizeClass, scales="free_x") + useTheme + ggtitle("Sample - Combined with Noise") + xlab("Fraction") + ylab("Diff. log p-values")
tapply(noise_pvalues$diff, noise_pvalues$sizeClass, function(x){sum(x > 0)}) tapply(noise_pvalues$diff, noise_pvalues$sizeClass, mean)
Although using gene lists that were perfect (i.e. no noise) everything in the list method was pretty much still significant, on average sets gave much better p-values. However, as we add noise to the system in the form of genes that are not annotated to our test GO terms, more terms are significant only in the sets method (i.e. show up in both sets).
Is there any difference in the difference of p-values between noise vs noNoise (comparing set vs list)?
diff_pvalues <- noise_pvalues diff_pvalues$diff <- noise_pvalues$diff - noNoise_pvalues$diff ggplot(diff_pvalues, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass, scales="free_x") + ggtitle("difference of differences noise vs noNoise") + useTheme + xlab("Fraction") + ylab("Diff. noise / noNoise") tapply(diff_pvalues$diff, diff_pvalues$sizeClass, mean)
From this plot, it appears that not only do the set values return lower p-values, but return lower p-values when noise is added overall.
To make sure that the results are consistent, we will do several iterations of the above calculations.
nTimes <- 100 nGene <- 1000 testSamples <- lapply(seq(1, nTimes), function(x){ list(s1 = sampleTerms(useGO, hsGO, 1000, 4)[1:nGene], s2 = sampleTerms(useGO, hsGO, 1000, 4)[1:nGene], noise = sample(noiseGenes, nNoise)) }) save(useGO, testSamples, universeGenes, useGOSizeClass, file="inst/data/100GeneSamples.RData")
data("100GeneSamples") library(snowfall) # load("inst/data/100GeneSamples.RData") testFun <- function(x){ nnList <- x[c("s1", "s2")] nnRes <- hyperGOMultiEnrichment(nnList, universe=universeGenes) nn_res <- pvaluesMultiEnrich(c("s1", "s2"), useGO, nnRes) nnP <- pvalueDiffSig(nn_res$values, pCutoff=0.05, log=TRUE) nnP$class <- useGOSizeClass noiList <- list(s1 = c(x$s1, x$noise), s2 = c(x$s2, x$noise)) noiRes <- hyperGOMultiEnrichment(noiList, universe=universeGenes) noi_res <- pvaluesMultiEnrich(c("s1", "s2"), useGO, noiRes) noiP <- pvalueDiffSig(noi_res$values, pCutoff=0.05, log=TRUE) noiP$class <- useGOSizeClass return(list(clean = nnP, noise = noiP)) } sfInit(parallel=TRUE, cpus=10) sfExport("useGOSizeClass", "universeGenes", "useGO") sfLibrary(ccPaper) sfLibrary(GO.db) sfLibrary(org.Hs.eg.db) testRes <- sfLapply(testSamples, testFun) sfStop() save(testRes, file="inst/data/100GeneResults.RData")
For each of the GO terms, we want to calculate the mean, standard deviation, and then a 95% confidence interval to represent graphically.
data("100GeneResults") cleanRes <- lapply(testRes, function(x){x$clean}) testStats_clean <- sameGOStats(cleanRes) testStats_clean$class <- useGOSizeClass testStats_clean$frac <- goFractions$frac[1:length(useGOSizeClass)] testStats_clean$class <- factor(testStats_clean$class, levels=c("low", "med", "hi"), ordered=TRUE) p <- ggplot(testStats_clean, aes(x=frac, y=mean, ymax=max, ymin=min)) + geom_point() + geom_errorbar() + facet_grid(. ~ class, scales="free_x") + useTheme + ggtitle("Sample - List Differences, 100 Gene Samples") + xlab("Fraction") + ylab("Mean diff. log p-values") p savePlot(p, "genesamples100_sampleListDiffs.png")
noiseRes <- lapply(testRes, function(x){x$noise}) testStats_noise <- sameGOStats(noiseRes) testStats_noise$class <- useGOSizeClass testStats_noise$frac <- goFracNoise$frac[1:length(useGOSizeClass)] testStats_noise$class <- factor(testStats_noise$class, levels=c("low", "med", "hi"), ordered=TRUE) ggplot(testStats_noise, aes(x=frac, y=mean, ymax=max, ymin=min)) + geom_point() + geom_errorbar() + facet_grid(. ~ class, scales="free_x") + ggtitle("Sample - List Differences, 100 Gene Samples, with noise") + xlab("Fraction") + ylab("Mean diff. log p-values") + useTheme
These show that if there is a difference in the p-values from set and list, they are real.
What about differences between noise and clean?
noiseCleanDiff <- diffGOStats(testRes) noiseCleanDiff$class <- useGOSizeClass noiseCleanDiff$frac <- goFracNoise$frac[1:length(useGOSizeClass)] noiseCleanDiff$class <- factor(noiseCleanDiff$class, levels=c("low", "med", "hi"), ordered=TRUE) ggplot(noiseCleanDiff, aes(x=frac, y=mean, ymax=max, ymin=min)) + geom_point() + geom_errorbar() + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 Gene Samples") + useTheme + xlab("Fraction") + ylab("Diff noise / noNoise")
Here we see that the noisy case also tends to generate higher differences in p-values between set and list than the clean case.
Caveats: Currently the fraction of genes annotated to GO terms is assumed to be the same for each sample. This may not actually be the case, and ideally this should be repeated with independent fraction estimates for each sampling.
OK, the error bars are not too bad. How about sampling multiple sets of GO terms??
goLimits <- list(low = c(10, 100), med = c(250, 500), hi = c(500, 1500)) nSample <- 100 sfInit(parallel=TRUE, cpus=10) sfLibrary(ccPaper) sfExport("hsGO", "nGO", "goLimits") testGOSample <- sfLapply(seq(1, nSample), function(x){ t1 <- goSample(hsGO, nGO, goLimits, nSample=2, nGene=1000, nNoise=500) return(t1) }) sfStop() save(testGOSample, file="inst/data/100GOSamples.RData")
Running those samples:
data("100GOSamples") runGOSamples <- function(x){ cleanSample <- x$geneSample cleanRes <- hyperGOMultiEnrichment(cleanSample, universe=universeGenes) cleanP <- pvaluesMultiEnrich(names(cleanSample), x$goData$id, cleanRes) cleanP_dif <- pvalueDiffSig(cleanP$values, pCutoff=0.05, log=TRUE) cleanP_dif <- cbind(cleanP_dif, x$goData) noiseSample <- lapply(names(cleanSample), function(inSample){ c(x$geneSample[[inSample]], x$noiseSample[[inSample]]) }) names(noiseSample) <- names(cleanSample) noiseRes <- hyperGOMultiEnrichment(noiseSample, universe=universeGenes) noiseP <- pvaluesMultiEnrich(names(cleanSample), x$goData$id, noiseRes) noiseP_dif <- pvalueDiffSig(noiseP$values, pCutoff=0.05, log=TRUE) noiseP_dif <- cbind(noiseP_dif, x$goData) return(list(clean = cleanP_dif, noise = noiseP_dif)) } library(snowfall) sfInit(parallel=TRUE, cpus=10) sfLibrary(ccPaper) sfLibrary(org.Hs.eg.db) sfLibrary(GO.db) sfExport("universeGenes") testGORes <- sfLapply(testGOSample, runGOSamples) sfStop() save(testGORes, file="inst/data/100GOResults.RData")
data("100GOResults") grab_noNoise <- function(x){ x$clean[,c("diff", "class", "frac")] } go_noNoise <- lapply(testGORes, grab_noNoise) go_noNoise <- do.call(rbind, go_noNoise) go_noNoise$class <- factor(go_noNoise$class, levels=c("low", "med", "hi"), ordered=TRUE) go_noNoiseT <- go_noNoise[(!is.nan(go_noNoise$diff)),] p <- ggplot(go_noNoiseT, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class, scales="free_x") + useTheme + stat_density2d(aes(fill = ..level..), geom="polygon") + ggtitle("Sample - List Differences, 100 GO Samples, noNoise") + xlab("Fraction") + ylab("Diff. log p-values") + labs(fill="density") p savePlot(p, "gosamples100_samplelistDiff.png") grab_noise <- function(x){ x$noise[, c("diff", "class", "frac")] } go_noise <- lapply(testGORes, grab_noise) go_noise <- do.call(rbind, go_noise) go_noise$class <- factor(go_noise$class, levels=c("low", "med", "hi"), ordered=TRUE) ggplot(go_noise, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class, scales="free_x") + ggtitle("Sample - List Differences, 100 GO Samples, Noise") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme grab_noiseDiff <- function(x){ outRes <- x$clean[, c("diff", "class", "frac")] outRes$diff <- x$noise$diff - x$clean$diff outRes } go_diff <- lapply(testGORes, grab_noiseDiff) go_diff <- do.call(rbind, go_diff) go_diff$class <- factor(go_diff$class, levels=c("low", "med", "hi"), ordered=TRUE) ggplot(go_diff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme ggplot(go_diff, aes(x = frac, y = diff, fill=class)) + geom_boxplot(position="identity", alpha=0.4) + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. noise / noNoise") + useTheme
How about we compare the distribution of noise and no-noise using violin plots?
go_noise$noise <- "noise" go_noNoise$noise <- "none" go_all <- rbind(go_noise, go_noNoise) rownames(go_all) <- NULL ggplot(go_all, aes(x = frac, y = diff, fill = noise)) + geom_violin(trim=TRUE, alpha=0.5, position="identity") + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme
What if we aggregate things into bins first, based on their class and fraction?
go_diff$frac10 <- round(go_diff$frac * 10) / 10 ggplot(go_diff, aes(x = factor(frac10), y = diff)) + geom_boxplot() + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme probPoint <- is.infinite(go_diff$diff) | is.na(go_diff$diff) | is.nan(go_diff$diff) go_diff <- go_diff[!probPoint, ] ggplot(go_diff, aes(x=frac, y=diff)) + stat_smooth() + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme
Although the improvement of noise over clean data dips below 0 at medium fractions, it does tend to improve rather significantly as the fraction of genes annotated to a GO term increases.
For a given set of genes and GO terms, lets now keep adding more and more noise genes, and for the set amount of noise genes, go from 0% to 100% shared noise genes. And then see how things change. For this, we are going to use our initial set of GO terms (useGO
), and a single original set of genes, simply sampling noise genes as we go along. Note that in this case noise genes are defined as genes that are not annotated to any of the GO terms that the DE genes are annotated to.
As in the previous examples, we will generate the samples first, and then run the calculations on them. This is because it is easier to test sample generation and enrichment separately.
noiseGenes <- possibleNoise(hsGO, useGO) sizeNoise <- seq(0, 1000, 10) fracShared <- seq(0, 1, 0.01) noiseSamples <- sweepNoiseSample(noiseGenes, nSamples=2, sizeNoise, fracShared) nGene <- 1000 cleanSample <- list(s1 = sampleTerms(useGO, hsGO, nGene, 4)[1:nGene], s2 = sampleTerms(useGO, hsGO, nGene, 4)[1:nGene]) save(noiseSamples, cleanSample, sizeNoise, fracShared, universeGenes, useGO, useGOSizeClass, file=file.path(useDir, 'inst/data/noiseSamples.RData'))
data(noiseSamples) useNoise <- unlist(noiseSamples, recursive=FALSE) sizeNoiseAll <- rep(sizeNoise, each=length(fracShared)) fracSharedAll <- rep(fracShared, length(sizeNoise)) runSweepNoise <- function(noiseGenes){ inNoise <- lapply(seq(1, length(cleanSample)), function(x){ c(cleanSample[[x]], noiseGenes[[x]]) }) names(inNoise) <- names(cleanSample) noiseRes <- hyperGOMultiEnrichment(inNoise, universe=universeGenes) noiseP <- pvaluesMultiEnrich(names(inNoise), useGO, noiseRes) noisePComp <- pvalueDiffSig(noiseP$values, pCutoff=0.05, log=TRUE) noisePComp$class <- useGOSizeClass return(noisePComp) } library(snowfall) sfInit(parallel=TRUE, cpus=10) sfLibrary(ccPaper) sfLibrary(org.Hs.eg.db) sfLibrary(GO.db) sfExport("universeGenes", "useGO", "useGOSizeClass", "cleanSample") noiseSweepRes <- sfLapply(useNoise, runSweepNoise) sfStop() save(noiseSweepRes, file.path(useDir, "inst/data/noiseSweepRes.RData"))
Examine the noise results.
data(noiseSamples) data(noiseSweepRes) noiseFrac <- data.frame(size=rep(sizeNoise, each=length(fracShared)), shared=rep(fracShared, length(sizeNoise)))
outMedians <- lapply(noiseSweepRes, function(inRes){ tapply(inRes$diff, inRes$class, median) }) outMedians <- do.call(rbind, outMedians) outMedians <- cbind(outMedians, noiseFrac) ggplot(outMedians, aes(x=size, y=hi, color=shared)) + geom_point() + useTheme + ggtitle("Hi Median Difference") + xlab("Noise Added") + ylab("Median Diff.") ggplot(outMedians, aes(x=size, y=med, color=shared)) + geom_point() + useTheme + ggtitle("Medium Median Difference") + xlab("Noise Added") + ylab("Median Diff.") ggplot(outMedians, aes(x=size, y=low, color=shared)) + geom_point() + useTheme + ggtitle("Low Median Difference") + xlab("Noise Added") + ylab("Median Diff.")
outMedians2 <- data.frame(diff=c(outMedians$low, outMedians$med, outMedians$hi), class=c(rep("low", nrow(outMedians)), rep("med", nrow(outMedians)), rep("hi", nrow(outMedians))), size=rep(outMedians$size, 3), shared=rep(outMedians$shared, 3)) outMedians2$class <- factor(outMedians2$class, c("low", "med", "hi"), ordered=TRUE) p <- ggplot(outMedians2, aes(x=size, y=diff, color=shared)) + geom_point() + useTheme + facet_grid(class ~ ., scales="free_y") + ggtitle("Sample - Combined as a function of size and shared fraction") p savePlot(p, "samplecombined_functionSizeFraction.png")
Another way to do the analysis is using a Gene Set Enrichment Analysis, or GSEA. In contrast to over-representation analysis, where we count if a group is more highly represented in a group vs the background; GSEA tests if a set of genes are more highly ranked than random sets of genes.
For a hypothetical example, we are going to use the geneSetTest
from limma
, because it allows us to test on a set of gene statistics themselves.
The sets will be our same GO terms, and we will change the proportion of genes from each GO term that are enhanced above a base value. All genes will be assigned a value from a uniform distribution with a value of 0.5. For each GO term set, some genes will be selected (with an exponential decaying probability for more genes), and assigned p-values from a uniform distribution with a value of 0.01.
For the list version, the p-values of genes will be combined using Fishers method, see the function fishersMethod
provided in this package.
We will use the t-values that limma
has generated for the ALL data set, and just sample from those.
tALL <- topTable(fitALL2, number=Inf) tALL <- tALL$t[(tALL$t < 10) & (tALL$t > -10)] tValues <- runif(length(universeGenes), -6, 6) gseaSamples <- list(s1 = sampleTerms(useGO, hsGO, 1000, 4), s2 = sampleTerms(useGO, hsGO, 1000, 4)) gseaGOFracS1 <- calcFraction(hsGO[useGO], list(s1=gseaSamples[[1]])) gseaTValues <- sampletValueGenGSEA(gseaSamples, universeGenes, tValues)
Check that we did what we thought. If done right, all of the sample genes should be on one side of the a histogram.
tmpDat <- data.frame(x=gseaTValues[[1]], type="all", stringsAsFactors=FALSE) tmpDat$type[(universeGenes %in% gseaSamples[[1]])] <- "sample" ggplot(tmpDat, aes(x=x, fill=type)) + geom_histogram(binwidth=0.1, position="identity") + ggtitle("location of samples") + useTheme tmpComb <- data.frame(x=gseaTValues[["comb"]], type="all", stringsAsFactors=FALSE) tmpComb$type[(universeGenes %in% gseaSamples[[1]])] <- "sample" ggplot(tmpComb, aes(x=x, fill=type)) + geom_histogram(binwidth=0.1, position="identity") + ggtitle("location of samples") + useTheme
Now lets actually do some calculations.
go2index <- symbols2indices(hsGO[useGO], universeGenes) gseaSetResults <- lapply(gseaTValues, function(x){ mmGeneSetTest(go2index, x) })
And figure out what is doing better.
cleanDiff <- gseaDiffs(gseaSetResults) cleanDiff$sizeClass <- useGOSizeClass cleanDiff$sizeClass <- factor(cleanDiff$sizeClass, c("low", "med", "hi"), ordered=TRUE) cleanDiff$frac <- gseaGOFracS1$frac ggplot(cleanDiff, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass) + ggtitle("Sample - List for GSEA, using GO") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
So this appears to be a different trend than for the hypergeometric case, in that a meta sample (achieved by averaging the t-statistics) actually does no worse, and in many cases much better than the worst p-value. This effect is less-pronounced as the number of genes in a class become larger, and as the fraction covered increases.
So what happens if we add some noisy genes into the top 1000? Stuff that isn't annotated at all to the GO terms of interest?
nNoise <- 2000 noiseGenes <- possibleNoise(hsGO, useGO) noiseGenes <- sample(noiseGenes, nNoise) gseaNoise <- list(s1=c(gseaSamples[["s1"]], noiseGenes), s2=c(gseaSamples[["s2"]], noiseGenes)) gseaTValues_noise <- sampletValueGenGSEA(gseaNoise, universeGenes, tValues) tmpDat <- data.frame(x=gseaTValues_noise[[1]], type="all", stringsAsFactors=FALSE) tmpDat$type[(universeGenes %in% gseaSamples[[1]])] <- "sample" ggplot(tmpDat, aes(x=x, fill=type)) + geom_histogram(binwidth=0.1, position="identity", alpha=0.75) + ggtitle("Sample - List GSEA GO, adding noise genes") + useTheme
gseaSetResults_noise <- lapply(gseaTValues_noise, function(x){ mmGeneSetTest(go2index, x) })
noiseDiff <- gseaDiffs(gseaSetResults_noise) noiseDiff$sizeClass <- useGOSizeClass noiseDiff$sizeClass <- factor(noiseDiff$sizeClass, c("low", "med", "hi"), ordered=TRUE) noiseDiff$frac <- gseaGOFracS1$frac ggplot(noiseDiff, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass) + ggtitle("Sample - List, GSEA GO with Noise") + useTheme + xlab("Fraction") + ylab("Diff. log p-values") diffDiff <- data.frame(diff=(noiseDiff$diff - cleanDiff$diff), sizeClass=noiseDiff$sizeClass, frac=noiseDiff$frac) ggplot(diffDiff, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass) + ggtitle("Difference of Differences, GSEA GO") + useTheme + xlab("Fraction") + ylab("Diff. noise / noNoise")
So, it appears that having a uniform distribution does not make things behave the way we might expect. What if it is more an issue of independence, that GO is a bad fit for GSEA type analysis because of the issue that many of the terms are heavily overlapped.
What if we force independence on our objects? i.e. each object has independent sets of genes, and treat them appropriately? i.e. to count as "diff" we will put genes (features) into the top 50% of ranked t-values (therefore reducing coverage means pushing annotated genes down into the bottom 50% of ranked values), and adding noise means to push un-annotated (i.e. random) above in rank.
Lets try this for a single example.
nGene <- 10000 indepTValues <- sort(runif(nGene, -6, 6)) names(indepTValues) <- as.character(seq(1, nGene)) gseaMap <- list(t1=sample(seq(1, nGene/2), 100)) tmpDat <- data.frame(tVal=indepTValues, status="all", stringsAsFactors=F) tmpDat$status[gseaMap[["t1"]]] <- "sample" ggplot(tmpDat, aes(x=tVal, fill=status)) + geom_histogram(binwidt=0.1, position="identity") + ggtitle("Check independent distribution, 100%") + useTheme + xlab("t-statistic") geneSetTest(gseaMap$t1, indepTValues, alternative="down", ranks.only=FALSE) gseaMap80 <- list(t1 = c(sample(seq(1, nGene/2), 80), sample(seq((nGene/2) + 1, nGene), 20 ))) geneSetTest(gseaMap80$t1, indepTValues, alternative="down", ranks.only=FALSE) tmpDat <- data.frame(tVal=indepTValues, status="all", stringsAsFactors=F) tmpDat$status[gseaMap80[["t1"]]] <- "sample" ggplot(tmpDat, aes(x=tVal, fill=status)) + geom_histogram(binwidt=0.1, position="identity") + ggtitle("check independent distribution, 80%") + useTheme + xlab("t-statistic") gseaMap50 <- list(t1 = c(sample(seq(1, nGene/2), 50), sample(seq((nGene/2) + 1, nGene), 50))) geneSetTest(gseaMap50$t1, indepTValues, alternative="down", ranks.only=FALSE) gseaMapUnif <- list(t1 = sample(nGene, 100)) geneSetTest(gseaMapUnif$t1, indepTValues, alternative="down", ranks.only=FALSE)
OK, so at least we verify that as the percentage of genes in the top 50% decreases, then the p-values decrease as well, as we expect they should. What about as we increase the number of genes above above the set, while keeping the number in the top 50 the same?
gseaAbove100 <- list(t1=sample(seq(100, nGene/2), 100)) geneSetTest(gseaAbove100$t1, indepTValues, alternative="down", ranks.only=FALSE) gseaAbove500 <- list(t1=sample(seq(501, nGene/2), 100)) geneSetTest(gseaAbove500$t1, indepTValues, alternative="down", ranks.only=FALSE) gseaAbove1000 <- list(t1=sample(seq(1001, nGene/2), 100)) geneSetTest(gseaAbove1000$t1, indepTValues, alternative="down", ranks.only=FALSE) gseaAbove2000 <- list(t1=sample(seq(2001, nGene/2), 100)) geneSetTest(gseaAbove2000$t1, indepTValues, alternative="down", ranks.only=FALSE) gseaAbove4000 <- list(t1=sample(seq(4001, nGene/2), 100)) geneSetTest(gseaAbove4000$t1, indepTValues, alternative="down", ranks.only=FALSE)
OK, so it takes a rather large proportion above to change the p-value any amount.
So, what should we do. We need a function that will let us vary the size and proportion of genes that are in the bottom 50% of values.
?gseaSizeRange ?gseaProportion
Lets verify this function is doing what we think it is.
testSize <- gseaSizeRange(c(500,500), 50) useStatistics <- runif(10000, -6, 6) useSig <- runif(50, .1, 1) testStats <- gseaProportion(testSize, useSig, useStatistics) fullIndex <- seq(1, length(useStatistics)) useCut <- 5000 realRanks <- lapply(testStats, function(inStat){ apply(inStat$stats, 2, function(inCol){ outRank <- rank(inCol)[inStat$index] sum(outRank < useCut) / length(inStat$index) }) }) realRanks <- do.call(rbind, realRanks) plot(realRanks[,1], useSig) plot(realRanks[,2], useSig) plot(realRanks[,3], useSig)
OK, it looks like it is working. That's good.
testSize <- gseaSizeRange(c(500,500), 50) useStatistics <- runif(10000, -6, 6) useSig <- runif(50, .1, 1) testStats <- gseaProportion(testSize, useSig, useStatistics) outP <- lapply(testStats, function(inStat){ tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){ geneSetTest(inStat$index, inStat$stats[,useColumn], "down", ranks.only=FALSE) }) tmpP <- do.call(cbind, tmpP) colnames(tmpP) <- colnames(inStat$stats) return(tmpP) }) outP2 <- do.call(rbind, outP) outP2 <- -1 * log(outP2) tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"]) tmpP$diff <- tmpP$set - tmpP$list tmpP$frac <- useSig p <- ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + useTheme + ggtitle("Sample - Combined average t-statistics") + xlab("Fraction") + ylab("Diff. log p-values") p savePlot(p, "samplecombined_tstatistics.png")
Yeah, that doesn't seem to really work.
What if GSEA is dependent on the shared proportion that is in the up or down state? i.e. if we allow the proportion "down" to vary between the two samples, and plot the difference in set and list values based on how close the value of the proportion in the top is??
?gseaProportionVary
Check that we generate different significant proportions for independent samples, and what is the combined.
testSize <- gseaSizeRange(c(500,500), 50) useStatistics <- runif(10000, -6, 6) useSig <- list(s1=runif(50, .1, 1), s2=runif(50, .1, 1)) testStats <- gseaProportionVary(testSize, useSig, useStatistics) fullIndex <- seq(1, length(useStatistics)) useCut <- 5000 realRanks <- lapply(testStats, function(inStat){ apply(inStat$stats, 2, function(inCol){ outRank <- rank(inCol)[inStat$index] sum(outRank < useCut) / length(inStat$index) }) }) realRanks <- do.call(rbind, realRanks) plot(realRanks[,1], useSig[[1]]) plot(realRanks[,2], useSig[[2]]) plot(realRanks[,3], useSig[[1]]) plot(realRanks[,3], useSig[[2]])
OK, this seems to be working still.
Now, lets try this on some fake data.
testSize <- gseaSizeRange(c(500,500), 50) useStatistics <- runif(10000, -6, 6) useSig <- list(s1=runif(50, .1, 1), s2=runif(50, .1, 1)) testStats <- gseaProportionVary(testSize, useSig, useStatistics) outP <- lapply(testStats, function(inStat){ tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){ geneSetTest(inStat$index, inStat$stats[,useColumn], "down", ranks.only=FALSE) }) tmpP <- do.call(cbind, tmpP) colnames(tmpP) <- colnames(inStat$stats) return(tmpP) }) outP2 <- do.call(rbind, outP) outP2 <- -1 * log(outP2) tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"]) tmpP$diff <- tmpP$set - tmpP$list realRanks <- lapply(testStats, function(inStat){ apply(inStat$stats, 2, function(inCol){ outRank <- rank(inCol)[inStat$index] sum(outRank < useCut) / length(inStat$index) }) }) realRanks <- do.call(rbind, realRanks) tmpP$frac <- realRanks[,3] ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + ggtitle("Sample - Combined vary fraction") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
So, none of this has really worked, at least not in the way that was hoped.
For our hypothetical comparison, we need to think of how one would combine two studies. In this case, one might combine two studies by combining the p-values. Therefore, we will use ranked p-values as our metric in the geneSetTest
.
For this to work with geneSetTest
, we need to make the range 0, 0.5, 1 conform to -1, 0, 1, because geneSetTest
works nicer with signed values. We will use a linear regression to transform the p-values to a signed range.
regData <- data.frame(y=c(-1, 0, 1), x=c(0, 0.5, 1)) regModel <- lm(y ~ x, regData) testData <- runif(10000, 0, 1) outData <- regModel$coefficients[1] + regModel$coefficients[2]*testData hist(outData, 100)
So our transformation works. Lets make sure it will work like the t-statistic.
nGene <- 10000 indepTValues <- sort(runif(nGene, 0, 1)) names(indepTValues) <- as.character(seq(1, nGene)) gseaMap <- list(t1=sample(seq(1, nGene/4), 100)) tmpDat <- data.frame(tVal=indepTValues, status="all", stringsAsFactors=F) tmpDat$status[gseaMap[["t1"]]] <- "sample" ggplot(tmpDat, aes(x=tVal, fill=status)) + geom_histogram(binwidt=0.01, position="identity") + ggtitle("check p-values, 100%") + useTheme + xlab("P-values") geneSetTest(gseaMap$t1, p2signed(regModel,indepTValues), alternative="down", ranks.only=FALSE) gseaMap80 <- list(t1 = c(sample(seq(1, nGene/2), 80), sample(seq((nGene/2) + 1, nGene), 20 ))) geneSetTest(gseaMap80$t1, p2signed(regModel,indepTValues), alternative="down", ranks.only=FALSE) tmpDat <- data.frame(tVal=indepTValues, status="all", stringsAsFactors=F) tmpDat$status[gseaMap80[["t1"]]] <- "sample" ggplot(tmpDat, aes(x=tVal, fill=status)) + geom_histogram(binwidt=0.01, position="identity") + ggtitle("check p-values, 80%") + useTheme + xlab("P-values") gseaMap50 <- list(t1 = c(sample(seq(1, nGene/2), 50), sample(seq((nGene/2) + 1, nGene), 50))) geneSetTest(gseaMap50$t1, p2signed(regModel,indepTValues), alternative="down", ranks.only=FALSE)
Now, lets maintain proportions between samples as we did before, but for the list method, combine the p-values using Fisher's method.
?gseaProportionFisher
testSize <- gseaSizeRange(c(500,500), 50) useStatistics <- runif(10000, 0, 1) useSig <- runif(50, .1, 1) testStats <- gseaProportionFisher(testSize, useSig, useStatistics) fullIndex <- seq(1, length(useStatistics)) useCut <- 5000 realRanks <- lapply(testStats, function(inStat){ apply(inStat$stats, 2, function(inCol){ outRank <- rank(inCol)[inStat$index] sum(outRank < useCut) / length(inStat$index) }) }) realRanks <- do.call(rbind, realRanks) plot(realRanks[,1], useSig) plot(realRanks[,2], useSig) plot(realRanks[,3], useSig)
testSize <- gseaSizeRange(c(500,500), 50) useStatistics <- runif(10000, 0, 1) useSig <- runif(50, .1, 1) testStats <- gseaProportionFisher(testSize, useSig, useStatistics) outP <- lapply(testStats, function(inStat){ tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){ geneSetTest(inStat$index, p2signed(regModel, inStat$stats[,useColumn]), "down", ranks.only=FALSE) }) tmpP <- do.call(cbind, tmpP) colnames(tmpP) <- colnames(inStat$stats) return(tmpP) }) outP2 <- do.call(rbind, outP) outP2 <- -1 * log(outP2) tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"]) tmpP$diff <- tmpP$set - tmpP$list tmpP$frac <- useSig ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + ggtitle("Sample - Combined Fishers") + useTheme + xlab("Fraction") + ylab("Diff. log p-values") # try also using ranks only outPRank <- lapply(testStats, function(inStat){ tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){ geneSetTest(inStat$index, inStat$stats, ranks.only=TRUE) }) tmpP <- do.call(cbind, tmpP) colnames(tmpP) <- colnames(inStat$stats) return(tmpP) }) outP2Rank <- do.call(rbind, outPRank) outP2Rank <- -1 * log(outP2Rank) tmpP <- data.frame(set=apply(outP2Rank[, c("s1", "s2")], 1, min), list=outP2Rank[,"comb"]) tmpP$diff <- tmpP$set - tmpP$list tmpP$frac <- useSig ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + ggtitle("Sample - Combined Fishers Ranks") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
Finally, we note that Shen & Tseng from 2010 combined independent experiments by taking the maximum p-value between samples.
?gseaProportionMax
testSize <- gseaSizeRange(c(500,500), 50) useStatistics <- runif(10000, 0, 1) useSig <- runif(50, .1, 1) testStats <- gseaProportionMax(testSize, useSig, useStatistics) fullIndex <- seq(1, length(useStatistics)) useCut <- 5000 realRanks <- lapply(testStats, function(inStat){ apply(inStat$stats, 2, function(inCol){ outRank <- rank(inCol)[inStat$index] sum(outRank < useCut) / length(inStat$index) }) }) realRanks <- do.call(rbind, realRanks) plot(realRanks[,1], useSig) plot(realRanks[,2], useSig) plot(realRanks[,3], useSig)
testSize <- gseaSizeRange(c(500,500), 50) useStatistics <- runif(10000, 0, 1) useSig <- runif(50, .1, 1) testStats <- gseaProportionMax(testSize, useSig, useStatistics) outP <- lapply(testStats, function(inStat){ tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){ geneSetTest(inStat$index, p2signed(regModel, inStat$stats[,useColumn]), "down", ranks.only=FALSE) }) tmpP <- do.call(cbind, tmpP) colnames(tmpP) <- colnames(inStat$stats) return(tmpP) }) outP2 <- do.call(rbind, outP) outP2 <- -1 * log(outP2) tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"]) tmpP$diff <- tmpP$set - tmpP$list tmpP$frac <- useSig ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + ggtitle("Sample - Combined Max-P") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
And wow! It appears to work!
Lets test using different size ranges and the proportions.
gseaLo <- gseaSizeRange(c(20, 250), 50) gseaMed <- gseaSizeRange(c(250, 500), 30) gseaHi <- gseaSizeRange(c(500, 1000), 20) gseaSizes <- c(gseaLo, gseaMed, gseaHi) gseaSizeClass <- c(rep("low", 50), rep("med", 30), rep("hi", 20)) gseaFrac <- runif(100, 0.3, 1) gseaStatistics <- runif(10000, 0, 1) testStats <- gseaProportionMax(gseaSizes, gseaFrac, gseaStatistics) outP <- lapply(testStats, function(inStat){ tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){ geneSetTest(inStat$index, p2signed(regModel, inStat$stats[,useColumn]), "down", ranks.only=FALSE) }) tmpP <- do.call(cbind, tmpP) colnames(tmpP) <- colnames(inStat$stats) return(tmpP) }) outP2 <- do.call(rbind, outP) outP2 <- -1 * log(outP2) tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"]) tmpP$diff <- tmpP$set - tmpP$list tmpP$frac <- gseaFrac tmpP$class <- gseaSizeClass tmpP$class <- factor(tmpP$class, c("low", "med", "hi"), ordered=TRUE) ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ class) + useTheme p <- ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + useTheme + ggtitle("Sample - Combined Max-P") + xlab("Fraction") + ylab("Diff. log p-values") p savePlot(p, "samplecombined_maxp.png")
So, this is very important. The method of combining p-values for the list method is extremely important.
Does this hold using ranks only for the calculation?
outPRank <- lapply(testStats, function(inStat){ tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){ geneSetTest(inStat$index, inStat$stats[,useColumn], ranks.only=TRUE) }) tmpP <- do.call(cbind, tmpP) colnames(tmpP) <- colnames(inStat$stats) return(tmpP) }) outP2Rank <- do.call(rbind, outPRank) outP2Rank <- -1 * log(outP2Rank) tmpPRank <- data.frame(set=apply(outP2Rank[, c("s1", "s2")], 1, min), list=outP2Rank[,"comb"]) tmpPRank$diff <- tmpPRank$set - tmpPRank$list tmpPRank$frac <- gseaFrac tmpPRank$class <- gseaSizeClass tmpPRank$class <- factor(tmpPRank$class, c("low", "med", "hi"), ordered=TRUE) ggplot(tmpPRank, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ class) + ggtitle("Sample - Combined Max-P Ranks") + xlab("Fraction") + ylab("Diff. log p-values")
No, it doesn't because the ranks.only=TRUE
will only check for mixed values, not at one end or the other.
Sys.time() sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.