```r
knitr::opts_chunk$set(echo = TRUE)
```
The objective of this document is to benchmark methods for controlling the false discovery rate (FDR) in the context of differential gene expression.
The data consist of 20 samples from two regions of the human basal ganglia, the nucleus accumbens and the putamen, from the GTEx project. Briefly, samples were downloaded using the Short Read Archive Toolkit and mapped to the human reference genome version GRCh38 using STAR v2.4.2a. htseq-count was used to tabulate the number of uniquely mapping reads for each gene. We used DESeq2 to format the data into a `DESeqDataSet` object.
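That construction step is not shown in this document; as a rough sketch of what it involves (the file names and the `region` column below are hypothetical placeholders), the htseq-count table and sample annotations would be combined with `DESeqDataSetFromMatrix`:

```r
## Sketch only: build a DESeqDataSet from an htseq-count style count matrix
## and a sample table. "counts.tsv", "samples.tsv", and the "region" column
## are hypothetical names, not files shipped with this analysis.
library(DESeq2)

cts <- as.matrix(read.table("counts.tsv", header = TRUE, row.names = 1))
coldata <- read.table("samples.tsv", header = TRUE, row.names = 1)

dsd <- DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ region)
```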
```r
library(dplyr)
library(ggplot2)
library(DESeq2)
library(SummarizedBenchmark)
library(BiocParallel)

## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
    source(f)
}

## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/RNAseq"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE)

## intermediary files we create below
count_file <- file.path(resdir, "brain-counts.rds")
result_file <- file.path(resdir, "brain-results.rds")
bench_file <- file.path(sbdir, "brain-benchmark.rds")
bench_file_uninf <- file.path(sbdir, "brain-uninf-benchmark.rds")

## set up parallel backend
cores <- as.numeric(Sys.getenv("SLURM_NTASKS"))
multicoreParam <- SerialParam()
```
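Note that the chunk above reads the number of SLURM tasks but then assigns a serial backend. If multiple cores are in fact available, the backend could be swapped for a multicore one; a minimal sketch, assuming the pipeline runs under SLURM with `SLURM_NTASKS` set:

```r
## Sketch (assumption): switch to a multicore backend when SLURM provides
## more than one task; otherwise keep the SerialParam() defined above.
if (!is.na(cores) && cores > 1) {
    multicoreParam <- MulticoreParam(workers = cores)
}
```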
We download from Zenodo the `DESeqDataSet` object containing the gene-level counts for the GTEx samples.
```r
if (!file.exists(count_file)) {
    download.file("https://zenodo.org/record/1475409/files/rnaseq-brain-counts.rds?download=1",
                  destfile = count_file)
}
dsd <- readRDS(count_file)
```
We use DESeq2 to test for differential gene expression between the two brain regions. We set the parameter `independentFiltering = FALSE` to skip the independent filtering step, as this step would be redundant with some of the FDR control methods, which use gene expression as an independent covariate to increase power.
```r
if (file.exists(result_file)) {
    res <- readRDS(result_file)
} else {
    dds <- DESeq(dsd, parallel = TRUE, BPPARAM = multicoreParam)
    res <- results(dds, independentFiltering = FALSE) %>%
        as.data.frame() %>%
        na.omit() %>%
        dplyr::select(pvalue, baseMean, log2FoldChange, lfcSE, stat) %>%
        dplyr::rename(pval = pvalue,
                      ind_covariate = baseMean,
                      effect_size = log2FoldChange,
                      SE = lfcSE,
                      test_statistic = stat)
    saveRDS(res, file = result_file)
}
```
We also add a random (uninformative) covariate.
```r
set.seed(83750)
res$rand_covar <- rnorm(nrow(res))
```
In RNA-seq differential expression analysis, it is well established that mean expression is an informative covariate that is independent of the p-values under the null hypothesis and can therefore be used to increase power while maintaining FDR control.
```r
rank_scatter(res, pvalue = "pval", covariate = "ind_covariate") +
    ggtitle("Mean coverage as independent covariate") +
    xlab("Mean Expression")
```
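As a side illustration (not part of the benchmark below, which is run through `SummarizedBenchmark`), this is roughly how a covariate-aware method such as IHW would consume the covariate directly; a minimal sketch, assuming the IHW Bioconductor package is available:

```r
## Sketch: apply IHW directly with mean expression as the covariate.
library(IHW)
ihw_res <- ihw(pval ~ ind_covariate, data = res, alpha = 0.05)
rejections(ihw_res)          # number of discoveries at alpha = 0.05
head(adj_pvalues(ihw_res))   # covariate-aware adjusted p-values
```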
We noticed some discreteness in the distribution of p-values near 1, which was particularly pronounced in the first covariate bin. Such a distribution could violate some of the assumptions of the FDR control methods.
```r
strat_hist(res, pvalue = "pval", covariate = "ind_covariate", maxy = 25)
```
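One simple way to quantify that spike near 1 (a hypothetical diagnostic, not part of the original analysis) is to tabulate the proportion of large p-values within quartiles of the covariate:

```r
## Sketch: proportion of p-values above 0.9 within quartiles of mean expression.
res %>%
    dplyr::mutate(covar_bin = dplyr::ntile(ind_covariate, 4)) %>%
    dplyr::group_by(covar_bin) %>%
    dplyr::summarize(n = dplyr::n(), prop_above_0.9 = mean(pval > 0.9))
```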
This discreteness, however, corresponded to genes with very low expression values. For example, there were `r nrow(dplyr::filter(res, ind_covariate <= 1))` genes with fewer than an average of 1 read across the 20 samples. The distribution of p-values looked much better after removing these very lowly expressed genes.
```r
res <- dplyr::filter(res, ind_covariate >= 1)
strat_hist(res, pvalue = "pval", covariate = "ind_covariate", maxy = 25)
```
```r
rank_scatter(res, pvalue = "pval", covariate = "rand_covar") +
    ggtitle("Random independent covariate")
```
```r
strat_hist(res, pvalue = "pval", covariate = "rand_covar", maxy = 25)
```
We use the common `BenchDesign` with the set of multiple testing correction methods already included. We also add in Scott's FDR regression (both `nulltype = "empirical"` and `nulltype = "theoretical"`), since our test statistics are approximately t-distributed.
```r
if (file.exists(bench_file)) {
    sb <- readRDS(bench_file)
} else {
    bd <- initializeBenchDesign()
    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))
    bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file)
}
```
```r
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)
```
ash was the method that rejected the largest number of hypotheses, followed by lfdr and fdrreg-theoretical.
```r
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", bins = 4, supplementary = FALSE)
```
```r
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
```
```r
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
```
We repeat the benchmark using the random (uninformative) covariate in place of mean expression. As before, we use the common `BenchDesign` with the set of multiple testing correction methods already included, and we again add Scott's FDR regression (both `nulltype = "empirical"` and `nulltype = "theoretical"`), since our test statistics are approximately t-distributed.
```r
if (file.exists(bench_file_uninf)) {
    sb <- readRDS(bench_file_uninf)
} else {
    bd <- initializeBenchDesign()
    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))
    bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))
    res <- res %>% dplyr::mutate(ind_covariate = rand_covar)
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file_uninf)
}
```
```r
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)
```
```r
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", bins = 4, supplementary = FALSE)
```
```r
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
```
```r
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
```
Here we compare the method ranks for the different covariates at `alpha = 0.10`.
```r
plotMethodRanks(c(bench_file, bench_file_uninf),
                colLabels = c("mean", "uninf"),
                alpha = 0.10, xlab = "Comparison")
```
```r
sessionInfo()
```