knitr::opts_chunk$set(echo = TRUE)

Summary

As a second RNA-seq dataset, we test for differences in gene expression upon knockout of the microRNA mir-200c (Kim et al., 2013). The raw fastq files are available under accession number SRP030475. Because the number of samples is limited, the experiment may be underpowered, as is the case in many RNA-seq analyses. This is an experimental scenario that could benefit from the power gained by using modern FDR control methods.

Workspace Setup

library(dplyr)
library(ggplot2)
library(DESeq2)
library(SummarizedBenchmark)
library(BiocParallel)
library(recount)

## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
    source(f)
}

## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/RNAseq"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE)

## intermediary files we create below
count_file <- file.path(datdir, "rse_gene.Rdata")
result_file <- file.path(resdir, "mir200c-results.rds")
bench_file <- file.path(sbdir, "mir200c-benchmark.rds")
bench_file_uninf <- file.path(sbdir, "mir200c-uninf-benchmark.rds")

## set up parallel backend; the number of SLURM tasks is read here,
## but a serial backend is used by default
cores <- as.numeric(Sys.getenv("SLURM_NTASKS"))
multicoreParam <- SerialParam()
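
If this analysis were run on a cluster node with several SLURM tasks, a multicore backend could be used instead of the serial one. A minimal sketch, assuming SLURM_NTASKS is set in the environment (not part of the original workflow):

if (!is.na(cores) && cores > 1) {
    ## switch to a multicore backend with one worker per SLURM task
    multicoreParam <- MulticoreParam(workers = cores)
}
register(multicoreParam)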

Data Preparation

We will download the pre-processed gene-level counts available through recount2.

if (!file.exists(count_file)) {
    download_study('SRP030475', outdir = datdir)
}
load(count_file)
dsd <- scale_counts(rse_gene)

We next subset to the control samples and the samples in which mir-200c was knocked out. recount2 provides the data as a RangedSummarizedExperiment object, so we convert it into a DESeqDataSet object.

dsd <- dsd[, grepl("WT|200c", colData(dsd)$title)]
colData(dsd)$mir200c <- factor(ifelse(grepl("WT", colData(dsd)$title), "WT", "KO"))
dsd <- as(dsd, "DESeqDataSet")
storage.mode(assays(dsd)[["counts"]]) <- "integer"

Data Analysis

Differential Testing

We then set the design formula to test for differences in expression upon mir-200c knockout and run DESeq2. As with the previous dataset, we set the parameter independentFiltering=FALSE.

if (file.exists(result_file)) {
    res <- readRDS(result_file)
} else {
    design(dsd) <- ~ mir200c
    dds <- DESeq(dsd)
    res <- results(dds, independentFiltering = FALSE) %>% 
        as.data.frame() %>%
        na.omit() %>% 
        dplyr::select(pvalue, baseMean, log2FoldChange, lfcSE, stat) %>%
        dplyr::rename(pval = pvalue,
                      ind_covariate = baseMean, 
                      effect_size = log2FoldChange,
                      SE = lfcSE, 
                      test_statistic = stat)
    saveRDS(res, file = result_file)
}

We also add a random (uninformative) covariate.

set.seed(4719)
res$rand_covar <- rnorm(nrow(res))
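
As a quick sanity check (not part of the original analysis), one could confirm that the random covariate carries no information about the p-values, e.g. via a rank correlation:

## should be close to zero, since rand_covar was drawn independently of the data
cor(res$rand_covar, res$pval, method = "spearman")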

Covariate Diagnostics

As with the GTEx example, the mean counts are used as the informative covariate.

Mean Counts

rank_scatter(res, pvalue = "pval", covariate = "ind_covariate") +
    ggtitle("Mean coverage as independent covariate") +
    xlab("Mean Expression")

As in the GTEx dataset, keeping all tests results in a strange discreteness in the p-value distribution. This goes away once we filter out very lowly expressed genes. In the first covariate bin, however, the distribution appears somewhat skewed towards larger p-values.

strat_hist(res, pvalue = "pval",
           covariate = "ind_covariate", maxy = 7.5)

res <- filter(res, ind_covariate > 1)
strat_hist(res, pvalue = "pval",
           covariate = "ind_covariate", maxy = 3)

Random

rank_scatter(res, pvalue = "pval", covariate = "rand_covar") +
    ggtitle("Random independent covariate")
strat_hist(res, pvalue = "pval",
           covariate = "rand_covar", maxy = 7.5)

Multiple-Testing Correction - mean

We use the common BenchDesign with the set of multiple-testing correction methods already included. We also add Scott's FDR regression (with both nulltype = "empirical" and nulltype = "theoretical"), since the DESeq2 Wald test statistics are approximately normally distributed.
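
As a quick sanity check (not part of the original workflow), one could compare the Wald statistics to a standard normal before handing them to FDRreg:

## the bulk of the statistics should track the diagonal if the normal
## approximation is reasonable; heavy tails reflect truly differential genes
qqnorm(res$test_statistic)
qqline(res$test_statistic)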

if (file.exists(bench_file)) {
    sb <- readRDS(bench_file)
} else {
    bd <- initializeBenchDesign()

    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))

   bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))

    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file)
}

There are some warnings from both BH and fdrreg-empirical; however, both still return results. We tried increasing the nmids parameter for Scott's FDR regression, but this does not appear to make a difference.
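
To check whether these warnings translated into missing results, one could count missing adjusted p-values per method (a quick check, not part of the original analysis):

## number of NA q-values returned by each method
colSums(is.na(assays(sb)[["bench"]]))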

head(assays(sb)[["bench"]])

Benchmark Metrics - mean

fdrreg-empirical rejects many more hypotheses than the rest of the methods, followed by lfdr and ihw.

assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate",
                       bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")

Multiple-Testing Correction - random

As above, we use the common BenchDesign with the set of multiple-testing correction methods already included and add Scott's FDR regression with both null types. Here, however, the random (uninformative) covariate is used as the independent covariate.

if (file.exists(bench_file_uninf)) {
    sb <- readRDS(bench_file_uninf)
} else {
    bd <- initializeBenchDesign()

    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))

   bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix( ~  splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))

    res <- res %>% dplyr::mutate(ind_covariate = rand_covar)
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file_uninf)
}
head(assays(sb)[["bench"]])

Benchmark Metrics - random

assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate",
                       bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")

Covariate comparison

Here we compare the method ranks for the different covariates at alpha = 0.10.

plotMethodRanks(c(bench_file, bench_file_uninf), 
                colLabels = c("mean", "uninf"), 
                alpha = 0.10, xlab = "Comparison")

Session Info

sessionInfo()
