## ----customCSS, include=FALSE--------------------------------------------
cssFile <- system.file("extdata", "style.css", package="ccPaper")
options(markdown.HTML.stylesheet = cssFile)
## ----setupPlot-----------------------------------------------------------
library(ggplot2)
useTheme <- theme_bw() + theme(legend.key=element_blank(), legend.title = element_text(size=25), axis.title = element_text(size=25), axis.text = element_text(size=15), legend.text = element_text(size=20), strip.text.y = element_text(size=20), strip.text.x = element_text(size=20))
savePlot <- function(plotObj, plotFile, figPath="/mlab/data/rmflight/Documents/projects/work/ccPaper/savedPlots"){
if (file.exists(figPath)){
plotObj <- plotObj + ggtitle(NULL)
png(filename=file.path(figPath, plotFile), width=854, height=628)
print(plotObj)
dev.off()
}
}
## ----goAnnotations, message=FALSE----------------------------------------
library(org.Hs.eg.db)
library(GO.db)
library(ccPaper)
hsGO <- as.list(org.Hs.egGO2ALLEGS)
goOntology <- Ontology(names(hsGO))
goBP <- names(goOntology)[goOntology == "BP"]
hsGO <- hsGO[goBP]
hsGO <- lapply(hsGO, unique)
universeGenes <- unique(unlist(hsGO))
## ----goCountDist---------------------------------------------------------
hsGO_count <- sapply(hsGO, length)
hsGO_count <- data.frame(size=hsGO_count)
p <- ggplot(hsGO_count, aes(x=size)) + geom_bar(binwidth=10) + xlim(0, 2000) + ylim(0, 500) + useTheme + ggtitle("Number of genes annotated to GO terms") + xlab('Num. of Genes') + ylab('Count')
p
savePlot(p, "go2gene_distribution.png")
## ----defineGroupLimits---------------------------------------------------
grpLow <- c(10, 100)
grpMed <- c(250, 500)
grpHi <- c(500, 1500)
## ----sampleGroups--------------------------------------------------------
set.seed(271113) # for reproducibility
nGO <- c(50, 30, 20)
names(nGO) <- c("low", "med", "hi")
GO_low <- limitedRandomSample(hsGO_count, grpLow, nGO["low"])
GO_med <- limitedRandomSample(hsGO_count, grpMed, nGO["med"])
GO_hi <- limitedRandomSample(hsGO_count, grpHi, nGO["hi"])
useGO <- c(GO_low, GO_med, GO_hi)
useGOSizeClass <- c(rep("low", nGO["low"]), rep("med", nGO["med"]), rep("hi", nGO["hi"]))
names(useGOSizeClass) <- useGO
## ----deALL---------------------------------------------------------------
library(ALL)
library(limma)
data(ALL)
grpALL <- factor(strtrim(pData(ALL)$BT, 1))
designALL <- model.matrix(~0+grpALL)
colnames(designALL) <- c("B", "T")
fitALL <- lmFit(ALL, designALL)
contMatrixALL <- makeContrasts(BvT=B-T, levels=designALL)
fitALL2 <- contrasts.fit(fitALL, contMatrixALL)
fitALL2 <- eBayes(fitALL2)
deALL <- rownames(topTable(fitALL2, adjust="BH", p.value=0.05, number=Inf))
## ----goDistALL-----------------------------------------------------------
library(hgu95av2.db)
ALLgo <- as.list(hgu95av2GO2ALLPROBES)
ALL_sizeFrac <- trimGOFrac(ALLgo, deALL)
ggplot(ALL_sizeFrac, aes(x=size, y=frac)) + geom_point() + xlim(0, 1500) + useTheme + ggtitle("DE ALL fraction as a function of size") + xlab("Number of Genes Annotated to Term") + ylab("Fraction of Term Genes in DE Sample")
## ----deLung--------------------------------------------------------------
data(lung)
sampleStat <- strsplit2(pData(lung)$title, " ")
notNP <- grep("^NP", sampleStat[,2], invert=T)
lung <- lung[, notNP]
sampleStat <- sampleStat[notNP,]
sampleID <- factor(sampleStat[,2])
cancerID <- factor(sampleStat[,1])
lungDesign <- model.matrix(~sampleID+cancerID)
fitLung <- lmFit(lung, lungDesign)
fitLung <- eBayes(fitLung)
deLung <- rownames(topTable(fitLung, coef="cancerIDTumor", number=2000, adjust.method="BY", p.value=0.00001))
## ----goDistLung----------------------------------------------------------
library(hgu133plus2.db)
lung2go <- as.list(hgu133plus2GO2ALLPROBES)
lung_sizeFrac <- trimGOFrac(lung2go, deLung)
lung_sizeFrac$genelist <- "lung"
ggplot(lung_sizeFrac, aes(x=size, y=frac)) + geom_point() + xlim(0, 1500) + useTheme + ggtitle("DE Lung fraction as a function of size") + xlab("Num. of Genes") + ylab("Fraction")
## ----generateSamples-----------------------------------------------------
goList <- list(low=GO_low, med=GO_med, hi=GO_hi)
nGene <- 1000
sample1_org <- sampleTerms(useGO, hsGO, 1000, 4)[1:nGene]
sample2_org <- sampleTerms(useGO, hsGO, 1000, 4)[1:nGene]
#check our expectations
goFractions <- calcFraction(hsGO[useGO], list(sample1=sample1_org, sample2=sample2_org))
ggplot(goFractions, aes(x=size, y=frac, color=genelist)) + geom_point() + useTheme + ggtitle("Sample GO fractions") + xlab("Num. of Genes") + ylab("Fraction")
## ----compareFracs--------------------------------------------------------
sampleGOFracs <- calcFraction(hsGO, list(sample1=sample1_org, sample2=sample2_org))
ALL_sizeFrac$genelist <- "all"
ALL_tmp <- rbind(ALL_sizeFrac, lung_sizeFrac, sampleGOFracs[,c("frac", "genelist", "size")])
ggplot(ALL_tmp, aes(x=size, y=frac)) + geom_point() + xlim(0, 1500) + facet_grid(genelist ~ .) + useTheme + ggtitle("Sample GO fractions for all") + xlab("Num. of Genes") + ylab("Fraction")
## ----cleanupData---------------------------------------------------------
keepItems <- c("goFractions", "sample1_org", "sample2_org", "hsGO", "useGO", "useGOSizeClass", "GO_hi", "GO_low", "GO_med", "grpHi", "grpLow", "grpMed", "minGO", "nGO", "nGene", "universeGenes", "savePlot", "useTheme", "fitALL2")
allItems <- ls()
delItems <- allItems[!(allItems %in% keepItems)]
rm(list=delItems)
## ----runCalcs------------------------------------------------------------
samples_noNoise <- list(sample1 = sample1_org, sample2 = sample2_org)
go_noNoise <- hyperGOMultiEnrichment(samples_noNoise, universe=universeGenes)
## ----noNoisePvalues------------------------------------------------------
noNoise_results <- pvaluesMultiEnrich(c("sample1", "sample2"), useGO, go_noNoise)
noNoise_pvalues <- pvalueDiffSig(noNoise_results$values, pCutoff=0.05, log=TRUE)
## ----noNoiseAddInfo------------------------------------------------------
noNoise_pvalues$sizeClass <- useGOSizeClass
noNoise_pvalues$size <- goFractions[1:length(useGOSizeClass),4]
noNoise_pvalues$frac <- goFractions$frac[1:length(useGOSizeClass)]
## ----noNoisePlotStuff----------------------------------------------------
noNoise_pvalues$sizeClass <- factor(noNoise_pvalues$sizeClass, levels=c("low", "med", "hi"), ordered=TRUE)
p <- ggplot(noNoise_pvalues, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass, scales="free_x") + xlab("fraction") + useTheme + ggtitle("Sample - Combined Differences") + xlab("Fraction") + ylab("Diff. log p-values")
p
savePlot(p, "nonoise_sample_combineddiff.png")
ggplot(noNoise_pvalues, aes(x=sizeClass, y=diff)) + geom_boxplot() + geom_point() + ggtitle("Sample - Combined Differences") + xlab("Size Class") + ylab("Diff. log p-values")
## ----noNoiseBetterPVals--------------------------------------------------
tapply(noNoise_pvalues$diff, noNoise_pvalues$sizeClass, function(x){sum(x > 0)})
## ----noNoiseBetterMean---------------------------------------------------
tapply(noNoise_pvalues$diff, noNoise_pvalues$sizeClass, mean)
## ----noiseGenes----------------------------------------------------------
nNoise <- 500
noiseGenes <- possibleNoise(hsGO, useGO)
noiseGenes <- sample(noiseGenes, nNoise)
# check that we did this right, the fraction should not change after adding noise genes
sample1 <- c(sample1_org, noiseGenes)
sample2 <- c(sample2_org, noiseGenes)
samplesNoise <- list(sample1=sample1, sample2=sample2)
goFracNoise <- calcFraction(hsGO[useGO], samplesNoise)
plot(goFracNoise$frac, goFractions$frac)
## ----noisyEnrichment-----------------------------------------------------
go_noise <- hyperGOMultiEnrichment(samplesNoise, universeGenes)
## ----noisePvalues--------------------------------------------------------
noise_results <- pvaluesMultiEnrich(c("sample1", "sample2"), useGO, go_noise)
noise_pvalues <- pvalueDiffSig(noise_results$values, pCutoff=0.05, log=TRUE)
noise_pvalues$sizeClass <- useGOSizeClass
noise_pvalues$size <- goFracNoise[1:length(useGOSizeClass),4]
noise_pvalues$frac <- goFracNoise$frac[1:length(useGOSizeClass)]
## ----plotSummarizeNoise--------------------------------------------------
noise_pvalues$sizeClass <- factor(noise_pvalues$sizeClass, c("low", "med", "hi"), ordered=TRUE)
ggplot(noise_pvalues, aes(x=frac, y=diff, color=sigState)) + geom_point() + facet_grid(. ~ sizeClass, scales="free_x") + useTheme + ggtitle("Sample - Combined with Noise") + xlab("Fraction") + ylab("Diff. log p-values")
## ----summarizeCountsNoise------------------------------------------------
tapply(noise_pvalues$diff, noise_pvalues$sizeClass, function(x){sum(x > 0)})
tapply(noise_pvalues$diff, noise_pvalues$sizeClass, mean)
## ----diffDiffs-----------------------------------------------------------
diff_pvalues <- noise_pvalues
diff_pvalues$diff <- noise_pvalues$diff - noNoise_pvalues$diff
ggplot(diff_pvalues, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass, scales="free_x") + ggtitle("difference of differences noise vs noNoise") + useTheme + xlab("Fraction") + ylab("Diff. noise / noNoise")
tapply(diff_pvalues$diff, diff_pvalues$sizeClass, mean)
## ----geneSamples, eval=FALSE---------------------------------------------
## nTimes <- 100
## nGene <- 1000
##
## testSamples <- lapply(seq(1, nTimes), function(x){
## list(s1 = sampleTerms(useGO, hsGO, 1000, 4)[1:nGene],
## s2 = sampleTerms(useGO, hsGO, 1000, 4)[1:nGene],
## noise = sample(noiseGenes, nNoise))
## })
##
## save(useGO, testSamples, universeGenes, useGOSizeClass, file="inst/data/100GeneSamples.RData")
## ----runSamples, eval=FALSE----------------------------------------------
## data("100GeneSamples")
## library(snowfall)
## # load("inst/data/100GeneSamples.RData")
## testFun <- function(x){
## nnList <- x[c("s1", "s2")]
## nnRes <- hyperGOMultiEnrichment(nnList, universe=universeGenes)
## nn_res <- pvaluesMultiEnrich(c("s1", "s2"), useGO, nnRes)
## nnP <- pvalueDiffSig(nn_res$values, pCutoff=0.05, log=TRUE)
## nnP$class <- useGOSizeClass
##
## noiList <- list(s1 = c(x$s1, x$noise), s2 = c(x$s2, x$noise))
## noiRes <- hyperGOMultiEnrichment(noiList, universe=universeGenes)
## noi_res <- pvaluesMultiEnrich(c("s1", "s2"), useGO, noiRes)
## noiP <- pvalueDiffSig(noi_res$values, pCutoff=0.05, log=TRUE)
## noiP$class <- useGOSizeClass
##
## return(list(clean = nnP, noise = noiP))
## }
## sfInit(parallel=TRUE, cpus=10)
## sfExport("useGOSizeClass", "universeGenes", "useGO")
## sfLibrary(ccPaper)
## sfLibrary(GO.db)
## sfLibrary(org.Hs.eg.db)
## testRes <- sfLapply(testSamples, testFun)
## sfStop()
## save(testRes, file="inst/data/100GeneResults.RData")
## ----noNoiseStats--------------------------------------------------------
data("100GeneResults")
cleanRes <- lapply(testRes, function(x){x$clean})
testStats_clean <- sameGOStats(cleanRes)
testStats_clean$class <- useGOSizeClass
testStats_clean$frac <- goFractions$frac[1:length(useGOSizeClass)]
testStats_clean$class <- factor(testStats_clean$class, levels=c("low", "med", "hi"), ordered=TRUE)
p <- ggplot(testStats_clean, aes(x=frac, y=mean, ymax=max, ymin=min)) + geom_point() + geom_errorbar() + facet_grid(. ~ class, scales="free_x") + useTheme + ggtitle("Sample - List Differences, 100 Gene Samples") + xlab("Fraction") + ylab("Mean diff. log p-values")
p
savePlot(p, "genesamples100_sampleListDiffs.png")
## ----noiseStats----------------------------------------------------------
noiseRes <- lapply(testRes, function(x){x$noise})
testStats_noise <- sameGOStats(noiseRes)
testStats_noise$class <- useGOSizeClass
testStats_noise$frac <- goFracNoise$frac[1:length(useGOSizeClass)]
testStats_noise$class <- factor(testStats_noise$class, levels=c("low", "med", "hi"), ordered=TRUE)
ggplot(testStats_noise, aes(x=frac, y=mean, ymax=max, ymin=min)) + geom_point() + geom_errorbar() + facet_grid(. ~ class, scales="free_x") + ggtitle("Sample - List Differences, 100 Gene Samples, with noise") + xlab("Fraction") + ylab("Mean diff. log p-values") + useTheme
## ----100GeneNoiseClean---------------------------------------------------
noiseCleanDiff <- diffGOStats(testRes)
noiseCleanDiff$class <- useGOSizeClass
noiseCleanDiff$frac <- goFracNoise$frac[1:length(useGOSizeClass)]
noiseCleanDiff$class <- factor(noiseCleanDiff$class, levels=c("low", "med", "hi"), ordered=TRUE)
ggplot(noiseCleanDiff, aes(x=frac, y=mean, ymax=max, ymin=min)) + geom_point() + geom_errorbar() + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 Gene Samples") + useTheme + xlab("Fraction") + ylab("Diff noise / noNoise")
## ----genSamples, eval=FALSE----------------------------------------------
## goLimits <- list(low = c(10, 100), med = c(250, 500), hi = c(500, 1500))
##
## nSample <- 100
##
## sfInit(parallel=TRUE, cpus=10)
## sfLibrary(ccPaper)
## sfExport("hsGO", "nGO", "goLimits")
##
## testGOSample <- sfLapply(seq(1, nSample), function(x){
## t1 <- goSample(hsGO, nGO, goLimits, nSample=2, nGene=1000, nNoise=500)
## return(t1)
## })
##
## sfStop()
## save(testGOSample, file="inst/data/100GOSamples.RData")
## ----runGOSamples, eval=FALSE--------------------------------------------
## data("100GOSamples")
## runGOSamples <- function(x){
## cleanSample <- x$geneSample
## cleanRes <- hyperGOMultiEnrichment(cleanSample, universe=universeGenes)
##
## cleanP <- pvaluesMultiEnrich(names(cleanSample), x$goData$id, cleanRes)
## cleanP_dif <- pvalueDiffSig(cleanP$values, pCutoff=0.05, log=TRUE)
## cleanP_dif <- cbind(cleanP_dif, x$goData)
##
## noiseSample <- lapply(names(cleanSample), function(inSample){
## c(x$geneSample[[inSample]], x$noiseSample[[inSample]])
## })
## names(noiseSample) <- names(cleanSample)
## noiseRes <- hyperGOMultiEnrichment(noiseSample, universe=universeGenes)
## noiseP <- pvaluesMultiEnrich(names(cleanSample), x$goData$id, noiseRes)
## noiseP_dif <- pvalueDiffSig(noiseP$values, pCutoff=0.05, log=TRUE)
## noiseP_dif <- cbind(noiseP_dif, x$goData)
## return(list(clean = cleanP_dif, noise = noiseP_dif))
## }
##
## library(snowfall)
## sfInit(parallel=TRUE, cpus=10)
## sfLibrary(ccPaper)
## sfLibrary(org.Hs.eg.db)
## sfLibrary(GO.db)
## sfExport("universeGenes")
##
## testGORes <- sfLapply(testGOSample, runGOSamples)
## sfStop()
## save(testGORes, file="inst/data/100GOResults.RData")
## ----lookDiffs-----------------------------------------------------------
data("100GOResults")
grab_noNoise <- function(x){
x$clean[,c("diff", "class", "frac")]
}
go_noNoise <- lapply(testGORes, grab_noNoise)
go_noNoise <- do.call(rbind, go_noNoise)
go_noNoise$class <- factor(go_noNoise$class, levels=c("low", "med", "hi"), ordered=TRUE)
go_noNoiseT <- go_noNoise[(!is.nan(go_noNoise$diff)),]
p <- ggplot(go_noNoiseT, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class, scales="free_x") + useTheme + stat_density2d(aes(fill = ..level..), geom="polygon") + ggtitle("Sample - List Differences, 100 GO Samples, noNoise") + xlab("Fraction") + ylab("Diff. log p-values") + labs(fill="density")
p
savePlot(p, "gosamples100_samplelistDiff.png")
grab_noise <- function(x){
x$noise[, c("diff", "class", "frac")]
}
go_noise <- lapply(testGORes, grab_noise)
go_noise <- do.call(rbind, go_noise)
go_noise$class <- factor(go_noise$class, levels=c("low", "med", "hi"), ordered=TRUE)
ggplot(go_noise, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class, scales="free_x") + ggtitle("Sample - List Differences, 100 GO Samples, Noise") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme
grab_noiseDiff <- function(x){
outRes <- x$clean[, c("diff", "class", "frac")]
outRes$diff <- x$noise$diff - x$clean$diff
outRes
}
go_diff <- lapply(testGORes, grab_noiseDiff)
go_diff <- do.call(rbind, go_diff)
go_diff$class <- factor(go_diff$class, levels=c("low", "med", "hi"), ordered=TRUE)
ggplot(go_diff, aes(x = frac, y = diff)) + geom_point() + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme
ggplot(go_diff, aes(x = frac, y = diff, fill=class)) + geom_boxplot(position="identity", alpha=0.4) + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. noise / noNoise") + useTheme
## ----compNoiseNoNoise----------------------------------------------------
go_noise$noise <- "noise"
go_noNoise$noise <- "none"
go_all <- rbind(go_noise, go_noNoise)
rownames(go_all) <- NULL
ggplot(go_all, aes(x = frac, y = diff, fill = noise)) + geom_violin(trim=TRUE, alpha=0.5, position="identity") + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme
## ----aggregate-----------------------------------------------------------
go_diff$frac10 <- round(go_diff$frac * 10) / 10
ggplot(go_diff, aes(x = factor(frac10), y = diff)) + geom_boxplot() + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme
probPoint <- is.infinite(go_diff$diff) | is.na(go_diff$diff) | is.nan(go_diff$diff)
go_diff <- go_diff[!probPoint, ]
ggplot(go_diff, aes(x=frac, y=diff)) + stat_smooth() + facet_grid(. ~ class, scales="free_x") + ggtitle("Difference of Differences, 100 GO Samples") + xlab("Fraction") + ylab("Diff. log p-values") + useTheme
## ----sweepNoise, eval=FALSE----------------------------------------------
## noiseGenes <- possibleNoise(hsGO, useGO)
## sizeNoise <- seq(0, 1000, 10)
## fracShared <- seq(0, 1, 0.01)
## noiseSamples <- sweepNoiseSample(noiseGenes, nSamples=2, sizeNoise, fracShared)
##
## nGene <- 1000
## cleanSample <- list(s1 = sampleTerms(useGO, hsGO, nGene, 4)[1:nGene],
## s2 = sampleTerms(useGO, hsGO, nGene, 4)[1:nGene])
##
## save(noiseSamples, cleanSample, sizeNoise, fracShared, universeGenes, useGO, useGOSizeClass, file=file.path(useDir, 'inst/data/noiseSamples.RData'))
## ----runSweptNoise, eval=FALSE-------------------------------------------
## data(noiseSamples)
##
## useNoise <- unlist(noiseSamples, recursive=FALSE)
## sizeNoiseAll <- rep(sizeNoise, each=length(fracShared))
## fracSharedAll <- rep(fracShared, length(sizeNoise))
##
## runSweepNoise <- function(noiseGenes){
## inNoise <- lapply(seq(1, length(cleanSample)), function(x){
## c(cleanSample[[x]], noiseGenes[[x]])
## })
## names(inNoise) <- names(cleanSample)
##
## noiseRes <- hyperGOMultiEnrichment(inNoise, universe=universeGenes)
## noiseP <- pvaluesMultiEnrich(names(inNoise), useGO, noiseRes)
## noisePComp <- pvalueDiffSig(noiseP$values, pCutoff=0.05, log=TRUE)
## noisePComp$class <- useGOSizeClass
## return(noisePComp)
## }
##
## library(snowfall)
## sfInit(parallel=TRUE, cpus=10)
## sfLibrary(ccPaper)
## sfLibrary(org.Hs.eg.db)
## sfLibrary(GO.db)
## sfExport("universeGenes", "useGO", "useGOSizeClass", "cleanSample")
## noiseSweepRes <- sfLapply(useNoise, runSweepNoise)
## sfStop()
## save(noiseSweepRes, file.path(useDir, "inst/data/noiseSweepRes.RData"))
## ----loadSweptNoise------------------------------------------------------
data(noiseSamples)
data(noiseSweepRes)
noiseFrac <- data.frame(size=rep(sizeNoise, each=length(fracShared)),
shared=rep(fracShared, length(sizeNoise)))
## ----examineSweptNoise---------------------------------------------------
outMedians <- lapply(noiseSweepRes, function(inRes){
tapply(inRes$diff, inRes$class, median)
})
outMedians <- do.call(rbind, outMedians)
outMedians <- cbind(outMedians, noiseFrac)
ggplot(outMedians, aes(x=size, y=hi, color=shared)) + geom_point() + useTheme + ggtitle("Hi Median Difference") + xlab("Noise Added") + ylab("Median Diff.")
ggplot(outMedians, aes(x=size, y=med, color=shared)) + geom_point() + useTheme + ggtitle("Medium Median Difference") + xlab("Noise Added") + ylab("Median Diff.")
ggplot(outMedians, aes(x=size, y=low, color=shared)) + geom_point() + useTheme + ggtitle("Low Median Difference") + xlab("Noise Added") + ylab("Median Diff.")
## ----examine2------------------------------------------------------------
outMedians2 <- data.frame(diff=c(outMedians$low, outMedians$med, outMedians$hi), class=c(rep("low", nrow(outMedians)), rep("med", nrow(outMedians)), rep("hi", nrow(outMedians))), size=rep(outMedians$size, 3), shared=rep(outMedians$shared, 3))
outMedians2$class <- factor(outMedians2$class, c("low", "med", "hi"), ordered=TRUE)
p <- ggplot(outMedians2, aes(x=size, y=diff, color=shared)) + geom_point() + useTheme + facet_grid(class ~ ., scales="free_y") + ggtitle("Sample - Combined as a function of size and shared fraction")
p
savePlot(p, "samplecombined_functionSizeFraction.png")
## ----generateSampleRanks-------------------------------------------------
tALL <- topTable(fitALL2, number=Inf)
tALL <- tALL$t[(tALL$t < 10) & (tALL$t > -10)]
tValues <- runif(length(universeGenes), -6, 6)
gseaSamples <- list(s1 = sampleTerms(useGO, hsGO, 1000, 4),
s2 = sampleTerms(useGO, hsGO, 1000, 4))
gseaGOFracS1 <- calcFraction(hsGO[useGO], list(s1=gseaSamples[[1]]))
gseaTValues <- sampletValueGenGSEA(gseaSamples, universeGenes, tValues)
## ----checkValues---------------------------------------------------------
tmpDat <- data.frame(x=gseaTValues[[1]], type="all", stringsAsFactors=FALSE)
tmpDat$type[(universeGenes %in% gseaSamples[[1]])] <- "sample"
ggplot(tmpDat, aes(x=x, fill=type)) + geom_histogram(binwidth=0.1, position="identity") + ggtitle("location of samples") + useTheme
tmpComb <- data.frame(x=gseaTValues[["comb"]], type="all", stringsAsFactors=FALSE)
tmpComb$type[(universeGenes %in% gseaSamples[[1]])] <- "sample"
ggplot(tmpComb, aes(x=x, fill=type)) + geom_histogram(binwidth=0.1, position="identity") + ggtitle("location of samples") + useTheme
## ----unSomeGSEA----------------------------------------------------------
go2index <- symbols2indices(hsGO[useGO], universeGenes)
gseaSetResults <- lapply(gseaTValues, function(x){
mmGeneSetTest(go2index, x)
})
## ----gseaBetterGO--------------------------------------------------------
cleanDiff <- gseaDiffs(gseaSetResults)
cleanDiff$sizeClass <- useGOSizeClass
cleanDiff$sizeClass <- factor(cleanDiff$sizeClass, c("low", "med", "hi"), ordered=TRUE)
cleanDiff$frac <- gseaGOFracS1$frac
ggplot(cleanDiff, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass) + ggtitle("Sample - List for GSEA, using GO") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
## ----gseaNoise-----------------------------------------------------------
nNoise <- 2000
noiseGenes <- possibleNoise(hsGO, useGO)
noiseGenes <- sample(noiseGenes, nNoise)
gseaNoise <- list(s1=c(gseaSamples[["s1"]], noiseGenes),
s2=c(gseaSamples[["s2"]], noiseGenes))
gseaTValues_noise <- sampletValueGenGSEA(gseaNoise, universeGenes, tValues)
tmpDat <- data.frame(x=gseaTValues_noise[[1]], type="all", stringsAsFactors=FALSE)
tmpDat$type[(universeGenes %in% gseaSamples[[1]])] <- "sample"
ggplot(tmpDat, aes(x=x, fill=type)) + geom_histogram(binwidth=0.1, position="identity", alpha=0.75) + ggtitle("Sample - List GSEA GO, adding noise genes") + useTheme
## ----runGSEANoise--------------------------------------------------------
gseaSetResults_noise <- lapply(gseaTValues_noise, function(x){
mmGeneSetTest(go2index, x)
})
## ----gseaBetter----------------------------------------------------------
noiseDiff <- gseaDiffs(gseaSetResults_noise)
noiseDiff$sizeClass <- useGOSizeClass
noiseDiff$sizeClass <- factor(noiseDiff$sizeClass, c("low", "med", "hi"), ordered=TRUE)
noiseDiff$frac <- gseaGOFracS1$frac
ggplot(noiseDiff, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass) + ggtitle("Sample - List, GSEA GO with Noise") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
diffDiff <- data.frame(diff=(noiseDiff$diff - cleanDiff$diff), sizeClass=noiseDiff$sizeClass, frac=noiseDiff$frac)
ggplot(diffDiff, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ sizeClass) + ggtitle("Difference of Differences, GSEA GO") + useTheme + xlab("Fraction") + ylab("Diff. noise / noNoise")
## ----independentGSEA-----------------------------------------------------
nGene <- 10000
indepTValues <- sort(runif(nGene, -6, 6))
names(indepTValues) <- as.character(seq(1, nGene))
gseaMap <- list(t1=sample(seq(1, nGene/2), 100))
tmpDat <- data.frame(tVal=indepTValues, status="all", stringsAsFactors=F)
tmpDat$status[gseaMap[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x=tVal, fill=status)) + geom_histogram(binwidt=0.1, position="identity") + ggtitle("Check independent distribution, 100%") + useTheme + xlab("t-statistic")
geneSetTest(gseaMap$t1, indepTValues, alternative="down", ranks.only=FALSE)
gseaMap80 <- list(t1 = c(sample(seq(1, nGene/2), 80),
sample(seq((nGene/2) + 1, nGene), 20 )))
geneSetTest(gseaMap80$t1, indepTValues, alternative="down", ranks.only=FALSE)
tmpDat <- data.frame(tVal=indepTValues, status="all", stringsAsFactors=F)
tmpDat$status[gseaMap80[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x=tVal, fill=status)) + geom_histogram(binwidt=0.1, position="identity") + ggtitle("check independent distribution, 80%") + useTheme + xlab("t-statistic")
gseaMap50 <- list(t1 = c(sample(seq(1, nGene/2), 50),
sample(seq((nGene/2) + 1, nGene), 50)))
geneSetTest(gseaMap50$t1, indepTValues, alternative="down", ranks.only=FALSE)
gseaMapUnif <- list(t1 = sample(nGene, 100))
geneSetTest(gseaMapUnif$t1, indepTValues, alternative="down", ranks.only=FALSE)
## ----checkNabove---------------------------------------------------------
gseaAbove100 <- list(t1=sample(seq(100, nGene/2), 100))
geneSetTest(gseaAbove100$t1, indepTValues, alternative="down", ranks.only=FALSE)
gseaAbove500 <- list(t1=sample(seq(501, nGene/2), 100))
geneSetTest(gseaAbove500$t1, indepTValues, alternative="down", ranks.only=FALSE)
gseaAbove1000 <- list(t1=sample(seq(1001, nGene/2), 100))
geneSetTest(gseaAbove1000$t1, indepTValues, alternative="down", ranks.only=FALSE)
gseaAbove2000 <- list(t1=sample(seq(2001, nGene/2), 100))
geneSetTest(gseaAbove2000$t1, indepTValues, alternative="down", ranks.only=FALSE)
gseaAbove4000 <- list(t1=sample(seq(4001, nGene/2), 100))
geneSetTest(gseaAbove4000$t1, indepTValues, alternative="down", ranks.only=FALSE)
## ----gseaFunctions, eval=FALSE-------------------------------------------
## ?gseaSizeRange
## ?gseaProportion
## ----checkGSEAProp-------------------------------------------------------
testSize <- gseaSizeRange(c(500,500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- runif(50, .1, 1)
testStats <- gseaProportion(testSize, useSig, useStatistics)
fullIndex <- seq(1, length(useStatistics))
useCut <- 5000
realRanks <- lapply(testStats, function(inStat){
apply(inStat$stats, 2, function(inCol){
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut) / length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
plot(realRanks[,1], useSig)
plot(realRanks[,2], useSig)
plot(realRanks[,3], useSig)
## ----testFunctions-------------------------------------------------------
testSize <- gseaSizeRange(c(500,500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- runif(50, .1, 1)
testStats <- gseaProportion(testSize, useSig, useStatistics)
outP <- lapply(testStats, function(inStat){
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){
geneSetTest(inStat$index, inStat$stats[,useColumn], "down", ranks.only=FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
p <- ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + useTheme + ggtitle("Sample - Combined average t-statistics") + xlab("Fraction") + ylab("Diff. log p-values")
p
savePlot(p, "samplecombined_tstatistics.png")
## ----gseaVaryProp, eval=FALSE--------------------------------------------
## ?gseaProportionVary
## ----checkgseaProportionsVary--------------------------------------------
testSize <- gseaSizeRange(c(500,500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- list(s1=runif(50, .1, 1), s2=runif(50, .1, 1))
testStats <- gseaProportionVary(testSize, useSig, useStatistics)
fullIndex <- seq(1, length(useStatistics))
useCut <- 5000
realRanks <- lapply(testStats, function(inStat){
apply(inStat$stats, 2, function(inCol){
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut) / length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
plot(realRanks[,1], useSig[[1]])
plot(realRanks[,2], useSig[[2]])
plot(realRanks[,3], useSig[[1]])
plot(realRanks[,3], useSig[[2]])
## ----gseaProportions2Data------------------------------------------------
testSize <- gseaSizeRange(c(500,500), 50)
useStatistics <- runif(10000, -6, 6)
useSig <- list(s1=runif(50, .1, 1), s2=runif(50, .1, 1))
testStats <- gseaProportionVary(testSize, useSig, useStatistics)
outP <- lapply(testStats, function(inStat){
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){
geneSetTest(inStat$index, inStat$stats[,useColumn], "down", ranks.only=FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"])
tmpP$diff <- tmpP$set - tmpP$list
realRanks <- lapply(testStats, function(inStat){
apply(inStat$stats, 2, function(inCol){
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut) / length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
tmpP$frac <- realRanks[,3]
ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + ggtitle("Sample - Combined vary fraction") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
## ----generateRegression--------------------------------------------------
regData <- data.frame(y=c(-1, 0, 1), x=c(0, 0.5, 1))
regModel <- lm(y ~ x, regData)
testData <- runif(10000, 0, 1)
outData <- regModel$coefficients[1] + regModel$coefficients[2]*testData
hist(outData, 100)
## ----pvalRanks-----------------------------------------------------------
nGene <- 10000
indepTValues <- sort(runif(nGene, 0, 1))
names(indepTValues) <- as.character(seq(1, nGene))
gseaMap <- list(t1=sample(seq(1, nGene/4), 100))
tmpDat <- data.frame(tVal=indepTValues, status="all", stringsAsFactors=F)
tmpDat$status[gseaMap[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x=tVal, fill=status)) + geom_histogram(binwidt=0.01, position="identity") + ggtitle("check p-values, 100%") + useTheme + xlab("P-values")
geneSetTest(gseaMap$t1, p2signed(regModel,indepTValues), alternative="down", ranks.only=FALSE)
gseaMap80 <- list(t1 = c(sample(seq(1, nGene/2), 80),
sample(seq((nGene/2) + 1, nGene), 20 )))
geneSetTest(gseaMap80$t1, p2signed(regModel,indepTValues), alternative="down", ranks.only=FALSE)
tmpDat <- data.frame(tVal=indepTValues, status="all", stringsAsFactors=F)
tmpDat$status[gseaMap80[["t1"]]] <- "sample"
ggplot(tmpDat, aes(x=tVal, fill=status)) + geom_histogram(binwidt=0.01, position="identity") + ggtitle("check p-values, 80%") + useTheme + xlab("P-values")
gseaMap50 <- list(t1 = c(sample(seq(1, nGene/2), 50),
sample(seq((nGene/2) + 1, nGene), 50)))
geneSetTest(gseaMap50$t1, p2signed(regModel,indepTValues), alternative="down", ranks.only=FALSE)
## ----gseaPropFisher, eval=FALSE------------------------------------------
## ?gseaProportionFisher
## ----checkGSEAPropFisher-------------------------------------------------
testSize <- gseaSizeRange(c(500,500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, .1, 1)
testStats <- gseaProportionFisher(testSize, useSig, useStatistics)
fullIndex <- seq(1, length(useStatistics))
useCut <- 5000
realRanks <- lapply(testStats, function(inStat){
apply(inStat$stats, 2, function(inCol){
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut) / length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
plot(realRanks[,1], useSig)
plot(realRanks[,2], useSig)
plot(realRanks[,3], useSig)
## ----testFishers---------------------------------------------------------
testSize <- gseaSizeRange(c(500,500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, .1, 1)
testStats <- gseaProportionFisher(testSize, useSig, useStatistics)
outP <- lapply(testStats, function(inStat){
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){
geneSetTest(inStat$index, p2signed(regModel, inStat$stats[,useColumn]), "down", ranks.only=FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + ggtitle("Sample - Combined Fishers") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
# try also using ranks only
outPRank <- lapply(testStats, function(inStat){
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){
geneSetTest(inStat$index, inStat$stats, ranks.only=TRUE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2Rank <- do.call(rbind, outPRank)
outP2Rank <- -1 * log(outP2Rank)
tmpP <- data.frame(set=apply(outP2Rank[, c("s1", "s2")], 1, min), list=outP2Rank[,"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + ggtitle("Sample - Combined Fishers Ranks") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
## ----gseaProportionMax, eval=FALSE---------------------------------------
## ?gseaProportionMax
## ----checkGSEAMax--------------------------------------------------------
testSize <- gseaSizeRange(c(500,500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, .1, 1)
testStats <- gseaProportionMax(testSize, useSig, useStatistics)
fullIndex <- seq(1, length(useStatistics))
useCut <- 5000
realRanks <- lapply(testStats, function(inStat){
apply(inStat$stats, 2, function(inCol){
outRank <- rank(inCol)[inStat$index]
sum(outRank < useCut) / length(inStat$index)
})
})
realRanks <- do.call(rbind, realRanks)
plot(realRanks[,1], useSig)
plot(realRanks[,2], useSig)
plot(realRanks[,3], useSig)
## ----runGSEAMax----------------------------------------------------------
testSize <- gseaSizeRange(c(500,500), 50)
useStatistics <- runif(10000, 0, 1)
useSig <- runif(50, .1, 1)
testStats <- gseaProportionMax(testSize, useSig, useStatistics)
outP <- lapply(testStats, function(inStat){
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){
geneSetTest(inStat$index, p2signed(regModel, inStat$stats[,useColumn]), "down", ranks.only=FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- useSig
ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + ggtitle("Sample - Combined Max-P") + useTheme + xlab("Fraction") + ylab("Diff. log p-values")
## ----varySizesAndProps---------------------------------------------------
gseaLo <- gseaSizeRange(c(20, 250), 50)
gseaMed <- gseaSizeRange(c(250, 500), 30)
gseaHi <- gseaSizeRange(c(500, 1000), 20)
gseaSizes <- c(gseaLo, gseaMed, gseaHi)
gseaSizeClass <- c(rep("low", 50), rep("med", 30), rep("hi", 20))
gseaFrac <- runif(100, 0.3, 1)
gseaStatistics <- runif(10000, 0, 1)
testStats <- gseaProportionMax(gseaSizes, gseaFrac, gseaStatistics)
outP <- lapply(testStats, function(inStat){
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){
geneSetTest(inStat$index, p2signed(regModel, inStat$stats[,useColumn]), "down", ranks.only=FALSE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2 <- do.call(rbind, outP)
outP2 <- -1 * log(outP2)
tmpP <- data.frame(set=apply(outP2[, c("s1", "s2")], 1, min), list=outP2[,"comb"])
tmpP$diff <- tmpP$set - tmpP$list
tmpP$frac <- gseaFrac
tmpP$class <- gseaSizeClass
tmpP$class <- factor(tmpP$class, c("low", "med", "hi"), ordered=TRUE)
ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ class) + useTheme
p <- ggplot(tmpP, aes(x=frac, y=diff)) + geom_point() + useTheme + ggtitle("Sample - Combined Max-P") + xlab("Fraction") + ylab("Diff. log p-values")
p
savePlot(p, "samplecombined_maxp.png")
## ----maxPRankOnly--------------------------------------------------------
outPRank <- lapply(testStats, function(inStat){
tmpP <- lapply(seq(1, ncol(inStat$stats)), function(useColumn){
geneSetTest(inStat$index, inStat$stats[,useColumn], ranks.only=TRUE)
})
tmpP <- do.call(cbind, tmpP)
colnames(tmpP) <- colnames(inStat$stats)
return(tmpP)
})
outP2Rank <- do.call(rbind, outPRank)
outP2Rank <- -1 * log(outP2Rank)
tmpPRank <- data.frame(set=apply(outP2Rank[, c("s1", "s2")], 1, min), list=outP2Rank[,"comb"])
tmpPRank$diff <- tmpPRank$set - tmpPRank$list
tmpPRank$frac <- gseaFrac
tmpPRank$class <- gseaSizeClass
tmpPRank$class <- factor(tmpPRank$class, c("low", "med", "hi"), ordered=TRUE)
ggplot(tmpPRank, aes(x=frac, y=diff)) + geom_point() + facet_grid(. ~ class) + ggtitle("Sample - Combined Max-P Ranks") + xlab("Fraction") + ylab("Diff. log p-values")
## ------------------------------------------------------------------------
Sys.time()
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.