knitr::opts_chunk$set(echo = TRUE)
As a second RNA-seq dataset, we test for differences in gene expression upon knockout of the microRNA mir-200c (Kim et al., 2013). The raw fastq files are available under the accession number SRP030475. Because the number of samples is limited, the experiment may be underpowered, as is common in RNA-seq analyses. This is an experimental scenario that could benefit from the power gained by modern FDR-control methods.
library(dplyr)
library(ggplot2)
library(DESeq2)
library(SummarizedBenchmark)
library(BiocParallel)
library(recount)

## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
    source(f)
}

## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/RNAseq"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE)

## intermediary files we create below
count_file <- file.path(datdir, "rse_gene.Rdata")
result_file <- file.path(resdir, "mir200c-results.rds")
bench_file <- file.path(sbdir, "mir200c-benchmark.rds")
bench_file_uninf <- file.path(sbdir, "mir200c-uninf-benchmark.rds")

## set up parallel backend
cores <- as.numeric(Sys.getenv("SLURM_NTASKS"))
multicoreParam <- SerialParam()
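The chunk above reads SLURM_NTASKS into cores but then registers a serial backend. If you want the benchmark to use the cores allocated by the scheduler, a minimal sketch along the following lines should work; this is an assumption about your environment rather than part of the original analysis, and MulticoreParam is not available on Windows.

## hypothetical alternative: use the SLURM allocation when it is set,
## otherwise fall back to the serial backend used above
if (!is.na(cores) && cores > 1) {
    multicoreParam <- MulticoreParam(workers = cores)
} else {
    multicoreParam <- SerialParam()
}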
We will download the pre-processed gene-level counts available through recount2.
if (!file.exists(count_file)) {
    download_study('SRP030475', outdir = datdir)
}
load(count_file)
dsd <- scale_counts(rse_gene)
We next subset to the control (WT) samples and the samples where mir200c was knocked out. recount2 returns the data as a RangedSummarizedExperiment object, so we convert it into a DESeqDataSet object.
dsd <- dsd[, grepl("WT|200c", colData(dsd)$title)]
colData(dsd)$mir200c <- factor(ifelse(grepl("WT", colData(dsd)$title), "WT", "KO"))
dsd <- as(dsd, "DESeqDataSet")
storage.mode(assays(dsd)[["counts"]]) <- "integer"
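Since the concern above is that this experiment may be underpowered, a quick sanity check of the group sizes after subsetting is worthwhile. This is only an inspection step and assumes nothing beyond the mir200c factor created above.

## number of samples per group after subsetting
table(colData(dsd)$mir200c)

## full sample titles, for reference
colData(dsd)$title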
Then, we set the design formula to test for differences in expression upon mir200c knockout and run DESeq2. As with the previous dataset, we set independentFiltering = FALSE.
if (file.exists(result_file)) {
    res <- readRDS(result_file)
} else {
    design(dsd) <- ~ mir200c
    dds <- DESeq(dsd)
    res <- results(dds, independentFiltering = FALSE) %>%
        as.data.frame() %>%
        na.omit() %>%
        dplyr::select(pvalue, baseMean, log2FoldChange, lfcSE, stat) %>%
        dplyr::rename(pval = pvalue,
                      ind_covariate = baseMean,
                      effect_size = log2FoldChange,
                      SE = lfcSE,
                      test_statistic = stat)
    saveRDS(res, file = result_file)
}
We also add a random (uninformative) covariate.
set.seed(4719)
res$rand_covar <- rnorm(nrow(res))
As with the GTEx example, the mean of normalized counts (baseMean) is used as the informative covariate.
rank_scatter(res, pvalue = "pval", covariate = "ind_covariate") +
    ggtitle("Mean coverage as independent covariate") +
    xlab("Mean Expression")
As in the GTEx dataset, keeping all tests results in a strange discreteness of the p-value distribution. This disappears once we filter out very lowly expressed genes. In the first covariate bin, however, the distribution still looks somewhat skewed towards larger p-values.
strat_hist(res, pvalue = "pval", covariate = "ind_covariate", maxy = 7.5)

res <- filter(res, ind_covariate > 1)
strat_hist(res, pvalue = "pval", covariate = "ind_covariate", maxy = 3)
rank_scatter(res, pvalue = "pval", covariate = "rand_covar") +
    ggtitle("Random independent covariate")
strat_hist(res, pvalue = "pval", covariate = "rand_covar", maxy = 7.5)
We use the common BenchDesign with the set of multiple-testing correction methods already included. We also add Scott's FDR regression (both nulltype = "empirical" and nulltype = "theoretical"), since our test statistics are t-distributed.
if (file.exists(bench_file)) {
    sb <- readRDS(bench_file)
} else {
    bd <- initializeBenchDesign()
    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))
    bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file)
}
There are some warnings from both BH and fdrreg-e; however, both still return results. We also tried increasing the nmids parameter for Scott's FDR regression, but this did not appear to make a difference.
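For completeness, this is roughly how an alternative fdrreg-e configuration could be slotted into the same BenchDesign for comparison. Note that passing nmids through the control list is an assumption about the FDRreg interface (it may be ignored or unsupported depending on the package version), so treat this as a sketch rather than a tested configuration.

## hypothetical fdrreg-e variant with a larger nmids value;
## whether FDRreg::FDRreg forwards nmids from `control` to its
## empirical-null estimation is an assumption, not documented behaviour
bd <- addBMethod(bd, "fdrreg-e-nmids",
                 FDRreg::FDRreg,
                 function(x) { x$FDR },
                 z = test_statistic,
                 features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                 nulltype = 'empirical',
                 control = list(lambda = 0.01, nmids = 300))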
head(assays(sb)[["bench"]])
fdrreg-e rejects many more hypotheses than the rest of the methods, followed by lfdr and ihw.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)

estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
We repeat the benchmarking, this time using the random (uninformative) covariate in place of mean expression. As before, we use the common BenchDesign with the set of multiple-testing correction methods already included and add Scott's FDR regression (both nulltype = "empirical" and nulltype = "theoretical"), since our test statistics are t-distributed.
if (file.exists(bench_file_uninf)) {
    sb <- readRDS(bench_file_uninf)
} else {
    bd <- initializeBenchDesign()
    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))
    bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))
    ## swap in the random covariate as the independent covariate
    res <- res %>% dplyr::mutate(ind_covariate = rand_covar)
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file_uninf)
}
head(assays(sb)[["bench"]])
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)

estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate", bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
Here we compare the method ranks for the different covariates at alpha = 0.10.
plotMethodRanks(c(bench_file, bench_file_uninf),
                colLabels = c("mean", "uninf"),
                alpha = 0.10, xlab = "Comparison")
sessionInfo()