knitr::opts_chunk$set(echo = TRUE)

Summary

The objective of this document is to benchmark methods that control the false discovery rate (FDR) in the context of differential gene expression.

The data consist of 20 samples from two regions of the human basal ganglia, the nucleus accumbens and the putamen, from the GTEx project. Briefly, samples were downloaded using the SRA (Sequence Read Archive) Toolkit and mapped to the human reference genome version GRCh38 with STAR v2.4.2a. htseq-count was used to tabulate the number of uniquely mapped reads for each gene, and DESeq2 was used to format the data into a DESeqDataSet object.
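For reference, a DESeqDataSet along these lines could be assembled directly from the htseq-count output with DESeq2's DESeqDataSetFromHTSeqCount. This is only a sketch: the sample table and file names below are hypothetical placeholders, and the actual object is downloaded pre-built from Zenodo below.

## minimal sketch of assembling a DESeqDataSet from htseq-count output
## (hypothetical sample table and file names)
library(DESeq2)
sampleTable <- data.frame(
    sampleName = c("accumbens_1", "accumbens_2", "putamen_1", "putamen_2"),
    fileName   = c("accumbens_1.counts", "accumbens_2.counts",
                   "putamen_1.counts", "putamen_2.counts"),
    region     = c("accumbens", "accumbens", "putamen", "putamen"))
dsd <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
                                  directory = "htseq-counts",
                                  design = ~ region)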

Workspace Setup

library(dplyr)
library(ggplot2)
library(DESeq2)
library(SummarizedBenchmark)
library(BiocParallel)

## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
    source(f)
}

## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/RNAseq"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE, recursive = TRUE)  ## nested path

## intermediary files we create below
count_file <- file.path(resdir, "brain-counts.rds")
result_file <- file.path(resdir, "brain-results.rds")
bench_file <- file.path(sbdir, "brain-benchmark.rds")
bench_file_uninf <- file.path(sbdir, "brain-uninf-benchmark.rds")

## set up parallel backend (serial when not running under SLURM)
cores <- suppressWarnings(as.numeric(Sys.getenv("SLURM_NTASKS")))
multicoreParam <- if (is.na(cores)) SerialParam() else MulticoreParam(workers = cores)

Data Preparation

We download from Zenodo the DESeqDataSet object containing the gene-level counts for the GTEx samples.

if (!file.exists(count_file)) {
    download.file("https://zenodo.org/record/1475409/files/rnaseq-brain-counts.rds?download=1", destfile = count_file)
}
dsd <- readRDS(count_file)

Data Analysis

Differential Testing

We use DESeq2 to test for differential gene expression between the two brain regions. We set the parameter independentFiltering=FALSE to skip the independent filtering step, since that step would be redundant with the FDR control methods that use mean gene expression as an independent covariate to increase power.

if (file.exists(result_file)) {
    res <- readRDS(result_file)
} else {
    dds <- DESeq(dsd, parallel = TRUE, BPPARAM = multicoreParam)
    res <- results(dds, independentFiltering = FALSE) %>% 
        as.data.frame() %>%
        na.omit() %>% 
        dplyr::select(pvalue, baseMean, log2FoldChange, lfcSE, stat) %>%
        dplyr::rename(pval = pvalue,
                      ind_covariate = baseMean, 
                      effect_size = log2FoldChange,
                      SE = lfcSE, 
                      test_statistic = stat)
    saveRDS(res, file = result_file)
}

We also add a random (uninformative) covariate to serve as a baseline for comparison.

set.seed(83750)
res$rand_covar <- rnorm(nrow(res))

Covariate Diagnostics

In RNA-seq differential expression analysis, it is well established that mean expression is an informative covariate that is independent of the p-values under the null hypothesis, and that it can be used to increase power while maintaining FDR control.

Mean Counts

rank_scatter(res, pvalue = "pval", covariate = "ind_covariate") +
    ggtitle("Mean coverage as independent covariate") +
    xlab("Mean Expression")

We noticed some discreteness in the distribution of p-values near 1 that was particularly pronounced in the first covariate bin. This discreteness could violate the assumptions of some of the FDR control methods.

strat_hist(res, pvalue = "pval",
           covariate = "ind_covariate", maxy = 25)

This discreteness, however, corresponded to genes with very low expression. For example, there were `r nrow(dplyr::filter(res, ind_covariate < 1))` genes with an average of fewer than 1 read across the 20 samples. The distribution of p-values looked much better after removing these very lowly expressed genes.

res <- dplyr::filter(res, ind_covariate >= 1)
strat_hist(res, pvalue = "pval",
           covariate = "ind_covariate", maxy = 25)

Random

rank_scatter(res, pvalue = "pval", covariate = "rand_covar") +
    ggtitle("Random independent covariate")
strat_hist(res, pvalue = "pval",
           covariate = "rand_covar", maxy = 25)

Multiple-Testing Correction - mean

We use the common BenchDesign with the set of multiple-testing correction methods already included. We also add Scott's FDR regression (with both nulltype = "empirical" and nulltype = "theoretical"), since our test statistics are approximately normally distributed.

if (file.exists(bench_file)) {
    sb <- readRDS(bench_file)
} else {
    bd <- initializeBenchDesign()

    ## Scott's FDR regression with a theoretical null
    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))

    ## Scott's FDR regression with an empirical null
    bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))

    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file)
}

Benchmark Metrics - mean

assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)

ash rejected the largest number of hypotheses, followed by lfdr and fdrreg-t.

rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate",
                       bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")

Multiple-Testing Correction - random

We use the same BenchDesign as above, again including Scott's FDR regression, but here we substitute the random covariate for mean expression as the independent covariate.

if (file.exists(bench_file_uninf)) {
    sb <- readRDS(bench_file_uninf)
} else {
    bd <- initializeBenchDesign()

    ## Scott's FDR regression with a theoretical null
    bd <- addBMethod(bd, "fdrreg-t",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'theoretical',
                     control = list(lambda = 0.01))

    ## Scott's FDR regression with an empirical null
    bd <- addBMethod(bd, "fdrreg-e",
                     FDRreg::FDRreg,
                     function(x) { x$FDR },
                     z = test_statistic,
                     features = model.matrix(~ splines::bs(ind_covariate, df = 3) - 1),
                     nulltype = 'empirical',
                     control = list(lambda = 0.01))

    res <- res %>% dplyr::mutate(ind_covariate = rand_covar)
    sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
    saveRDS(sb, file = bench_file_uninf)
}

Benchmark Metrics - random

assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
    filter(performanceMetric == "rejections") %>%
    select(blabel, performanceMetric, alpha, value) %>%
    mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
    arrange(desc(value)) %>%
    as_tibble() %>%
    print(n = 40)
rejections_scatter(sb, supplementary = FALSE)
rejection_scatter_bins(sb, covariate = "ind_covariate",
                       bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
                      order.by = "freq", decreasing = TRUE,
                      supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")

Covariate comparison

Here we compare the method ranks for the different covariates at alpha = 0.10.

plotMethodRanks(c(bench_file, bench_file_uninf), 
                colLabels = c("mean", "uninf"), 
                alpha = 0.10, xlab = "Comparison")

Session Info

sessionInfo()

