```r
knitr::opts_chunk$set(echo = TRUE)
```
As a second RNA-seq dataset, we will test for differences in gene expression upon knockout of the microRNA mir-200c (Kim et al., 2013). The raw fastq files can be found under the accession number SRP030475. As the number of samples is limited, the experiment might be underpowered, as is the case in many RNA-seq analyses. This is an experimental scenario that could benefit from the power gained by using modern FDR control methods.
```r
library(dplyr)
library(ggplot2)
library(DESeq2)
library(SummarizedBenchmark)
library(BiocParallel)
library(recount)

## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
    source(f)
}

## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/RNAseq"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE)

## intermediary files we create below
count_file <- file.path(datdir, "rse_gene.Rdata")
result_file <- file.path(resdir, "mir200c-results.rds")
bench_file <- file.path(sbdir, "mir200c-benchmark.rds")
bench_file_uninf <- file.path(sbdir, "mir200c-uninf-benchmark.rds")

## set up parallel backend
cores <- as.numeric(Sys.getenv("SLURM_NTASKS"))
multicoreParam <- SerialParam()
```
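The setup above reads the SLURM task count but then falls back to a serial backend. A minimal sketch (an assumption, not part of the original configuration) of how one could enable a multicore backend when cores are available:

```r
## Optional (not in the original setup): use a multicore backend when SLURM
## reports more than one task; otherwise keep the serial default above.
if (!is.na(cores) && cores > 1) {
    multicoreParam <- MulticoreParam(workers = cores)
}
```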
We will download the pre-processed gene-level counts available through recount2.
```r
if (!file.exists(count_file)) {
    download_study('SRP030475', outdir = datdir)
}
load(count_file)
dsd <- scale_counts(rse_gene)
```
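As a quick, optional sanity check (not part of the original workflow), we can inspect the scaled object and the sample titles that are used for subsetting below:

```r
## Optional check: dimensions of the downloaded object and the sample titles
## used to identify the control and mir-200c knockout samples.
dim(dsd)
head(colData(dsd)$title)
```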
We next subset to the control samples and the samples in which mir-200c was knocked down. recount2 downloads the data as a RangedSummarizedExperiment object, so we convert it into a DESeqDataSet object.
```r
dsd <- dsd[, grepl("WT|200c", colData(dsd)$title)]
colData(dsd)$mir200c <- factor(ifelse(grepl("WT", colData(dsd)$title), "WT", "KO"))
dsd <- as(dsd, "DESeqDataSet")
storage.mode(assays(dsd)[["counts"]]) <- "integer"
```
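A brief, optional check (an addition, not from the original analysis) confirms the conversion and the sample grouping:

```r
## Optional check: the object should now be a DESeqDataSet, with samples
## assigned to the WT and KO groups.
class(dsd)
table(colData(dsd)$mir200c)
```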
Then, we set the design formula to test for differences in expression upon mir-200c knockout and run DESeq2. As with the previous dataset, we set `independentFiltering = FALSE` so that no genes are filtered out prior to the downstream FDR analysis.
```r
if (file.exists(result_file)) {
    res <- readRDS(result_file)
} else {
    design(dsd) <- ~ mir200c
    dds <- DESeq(dsd)
    res <- results(dds, independentFiltering = FALSE) %>%
        as.data.frame() %>%
        na.omit() %>%
        dplyr::select(pvalue, baseMean, log2FoldChange, lfcSE, stat) %>%
        dplyr::rename(pval = pvalue,
                      ind_covariate = baseMean,
                      effect_size = log2FoldChange,
                      SE = lfcSE,
                      test_statistic = stat)
    saveRDS(res, file = result_file)
}
```
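An optional look at the results table (not part of the original script) shows the columns expected by the downstream benchmarking helpers:

```r
## Optional check: p-values, the mean-expression covariate, effect sizes,
## standard errors, and test statistics for each gene.
dim(res)
head(res)
```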
We also add a random (uninformative) covariate.
```r
set.seed(4719)
res$rand_covar <- rnorm(nrow(res))
```
As with the GTEx example, mean counts are used as the informative covariate.
```r
rank_scatter(res, pvalue = "pval", covariate = "ind_covariate") +
    ggtitle("Mean coverage as independent covariate") +
    xlab("Mean Expression")
```
As with the GTEx dataset, keeping all tests results in a strange discreteness in the p-value distribution. This is removed once we filter out very lowly expressed genes. For the first covariate bin, however, there is still some unusual behaviour: the distribution appears slightly skewed towards larger p-values.
```r
strat_hist(res, pvalue = "pval", covariate = "ind_covariate", maxy = 7.5)

res <- filter(res, ind_covariate > 1)
strat_hist(res, pvalue = "pval", covariate = "ind_covariate", maxy = 3)
```
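As an optional sanity check (not from the original analysis), one can record how many tests remain after removing very lowly expressed genes:

```r
## Optional check: number of tests retained after filtering on mean expression,
## and the distribution of the covariate among the retained tests.
nrow(res)
summary(res$ind_covariate)
```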
```r
rank_scatter(res, pvalue = "pval", covariate = "rand_covar") +
    ggtitle("Random independent covariate")
```
```r
strat_hist(res, pvalue = "pval", covariate = "rand_covar", maxy = 7.5)
```
We use the common BenchDesign with the set of multiple-testing correction methods already included. We also add Scott's FDR Regression (both `nulltype = "empirical"` and `nulltype = "theoretical"`), since our test statistics are t-distributed.
```r
if (file.exists(bench_file)) {
    sb <- readRDS(bench_file)
} else {
    bd <- initializeBenchDesign()
    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))
    bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file)
}
```
There are some warnings from both BH and fdrreg-empirical; however, both methods do return results. We tried increasing the `nmids` parameter for Scott's FDR Regression, but this does not appear to make a difference.
```r
head(assays(sb)[["bench"]])
```
fdrreg-empirical rejects many more hypotheses than the rest of the methods, followed by lfdr and ihw.
```r
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)

estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)
```
```r
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", bins = 4, supplementary = FALSE)
```
```r
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
```
```r
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
```
We repeat the benchmark with the same BenchDesign, again including both variants of Scott's FDR Regression, but this time substitute the random (uninformative) covariate for mean expression.
```r
if (file.exists(bench_file_uninf)) {
    sb <- readRDS(bench_file_uninf)
} else {
    bd <- initializeBenchDesign()
    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))
    bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))
    res <- res %>% dplyr::mutate(ind_covariate = rand_covar)
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file_uninf)
}
```
```r
head(assays(sb)[["bench"]])
```
```r
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)

estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)
```
```r
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", bins = 4, supplementary = FALSE)
```
```r
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
```
```r
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
```
Here we compare the method ranks for the different covariates at alpha = 0.10.
```r
plotMethodRanks(c(bench_file, bench_file_uninf),
                colLabels = c("mean", "uninf"),
                alpha = 0.10, xlab = "Comparison")
```
```r
sessionInfo()
```