In this set of simulations, we consider settings with varying number of hypothesis
tests. Since many newer FDR controlling approaches require fitting some model to
the independent covariates, they may be sensitive to lower numbers of tests. Both
informative and uninformative covariates are included in these settings as described in
simulations-informative-sine.Rmd
.
library(dplyr) library(ggplot2) library(SummarizedBenchmark) library(parallel) ## load helper functions for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) { source(f) } ## project data/results folders resdir <- "results" dir.create(resdir, showWarnings = FALSE, recursive = TRUE) ## intermediary files we create below n100_file <- file.path(resdir, "varyingntests-benchmark-n100.rds") n500_file <- file.path(resdir, "varyingntests-benchmark-n500.rds") n1000_file <- file.path(resdir, "varyingntests-benchmark-n1000.rds") n5000_file <- file.path(resdir, "varyingntests-benchmark-n5000.rds") n10000_file <- file.path(resdir, "varyingntests-benchmark-n10000.rds") n50000_file <- file.path(resdir, "varyingntests-benchmark-n50000.rds") ## number of cores for parallelization cores <- 20 B <- 100 ## define bechmarking design bd <- initializeBenchDesign()
As described in simulations-null.Rmd
, we include Scott's FDR Regression in the analysis
for simulations with Gaussian test statistics. Again, we include both
nulltype = "empirical"
and nulltype = "theoretical"
. Since all settings in this
series of simulations use test statistics simulated with Gaussian test statistics, we include
Scott's FDR Regression in all of the comparisons.
bdplus <- bd bdplus <- addBMethod(bdplus, "fdrreg-t", FDRreg::FDRreg, function(x) { x$FDR }, z = test_statistic, features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1), nulltype = 'theoretical', control = list(lambda = 0.01)) bdplus <- addBMethod(bdplus, "fdrreg-e", FDRreg::FDRreg, function(x) { x$FDR }, z = test_statistic, features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1), nulltype = 'empirical', control = list(lambda = 0.01))
All simulation settings will share the following parameters.
es_dist <- rnorm_generator(3) # functional: dist of alternative test stats ts_dist <- rnorm_perturber(1) # functional: sampling dist/noise for test stats null_dist <- rnorm_2pvaluer(1) # functional: dist to calc p-values icovariate <- runif # functional: independent covariate pi0 <- pi0_sine(0.90) # numeric: proportion of null hypotheses
Simulation results will be presented excluding a subset of methods, and for certain plots (upset plots), a single alpha cutoff will be used.
excludeSet <- c("unadjusted", "bl-df02", "bl-df04", "bl-df05") ualpha <- 0.05
First, we consider the setting where 100 hypotheses are tested.
m <- 100 # integer: number of hypothesis tests seed <- 1010
We next run the simulations.
if (file.exists(n100_file)) { res <- readRDS(n100_file) } else { res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m, pi0 = pi0, es_dist = es_dist, icovariate = icovariate, ts_dist = ts_dist, null_dist = null_dist, seed = seed, mc.cores = cores) saveRDS(res, file = n100_file) } res_i <- lapply(res, `[[`, "informative") res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist, icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
We plot the averaged results across r B
replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01)) plotsim_average(resdf, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size") covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at r ualpha
) we take a look at
the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) { aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE) } else { message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot") }
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01)) resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key, performanceMetric, alpha, value), select(resdfu, rep, blabel, param.alpha, key, performanceMetric, alpha, value), by = c("rep", "blabel", "param.alpha", "key", "performanceMetric", "alpha"), suffix = c(".info", ".uninfo")) resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo) plotsim_average(resdfiu, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
Next, we consider the setting where 500 hypotheses are tested.
m <- 500 # integer: number of hypothesis tests seed <- 5050
We next run the simulations.
if (file.exists(n500_file)) { res <- readRDS(n500_file) } else { res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m, pi0 = pi0, es_dist = es_dist, icovariate = icovariate, ts_dist = ts_dist, null_dist = null_dist, seed = seed, mc.cores = cores) saveRDS(res, file = n500_file) } res_i <- lapply(res, `[[`, "informative") res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist, icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
We plot the averaged results across r B
replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01)) plotsim_average(resdf, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size") covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at r ualpha
) we take a look at
the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) { aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE) } else { message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot") }
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01)) resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key, performanceMetric, alpha, value), select(resdfu, rep, blabel, param.alpha, key, performanceMetric, alpha, value), by = c("rep", "blabel", "param.alpha", "key", "performanceMetric", "alpha"), suffix = c(".info", ".uninfo")) resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo) plotsim_average(resdfiu, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
Next, we consider the setting where 1000 hypotheses are tested.
m <- 1000 # integer: number of hypothesis tests seed <- 10001
We next run the simulations.
if (file.exists(n1000_file)) { res <- readRDS(n1000_file) } else { res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m, pi0 = pi0, es_dist = es_dist, icovariate = icovariate, ts_dist = ts_dist, null_dist = null_dist, seed = seed, mc.cores = cores) saveRDS(res, file = n1000_file) } res_i <- lapply(res, `[[`, "informative") res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist, icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
We plot the averaged results across r B
replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01)) plotsim_average(resdf, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size") covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at r ualpha
) we take a look at
the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) { aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE) } else { message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot") }
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01)) resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key, performanceMetric, alpha, value), select(resdfu, rep, blabel, param.alpha, key, performanceMetric, alpha, value), by = c("rep", "blabel", "param.alpha", "key", "performanceMetric", "alpha"), suffix = c(".info", ".uninfo")) resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo) plotsim_average(resdfiu, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
Next, we consider the setting where 5000 hypotheses are tested.
m <- 5000 # integer: number of hypothesis tests seed <- 50005
We next run the simulations.
if (file.exists(n5000_file)) { res <- readRDS(n5000_file) } else { res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m, pi0 = pi0, es_dist = es_dist, icovariate = icovariate, ts_dist = ts_dist, null_dist = null_dist, seed = seed, mc.cores = cores) saveRDS(res, file = n5000_file) } res_i <- lapply(res, `[[`, "informative") res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist, icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
We plot the averaged results across r B
replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01)) plotsim_average(resdf, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size") covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at r ualpha
) we take a look at
the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) { aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE) } else { message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot") }
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01)) resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key, performanceMetric, alpha, value), select(resdfu, rep, blabel, param.alpha, key, performanceMetric, alpha, value), by = c("rep", "blabel", "param.alpha", "key", "performanceMetric", "alpha"), suffix = c(".info", ".uninfo")) resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo) plotsim_average(resdfiu, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
Next, we consider the setting where 10000 hypotheses are tested.
m <- 10000 # integer: number of hypothesis tests seed <- 1515
We next run the simulations.
if (file.exists(n10000_file)) { res <- readRDS(n10000_file) } else { res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m, pi0 = pi0, es_dist = es_dist, icovariate = icovariate, ts_dist = ts_dist, null_dist = null_dist, seed = seed, mc.cores = cores) saveRDS(res, file = n10000_file) } res_i <- lapply(res, `[[`, "informative") res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist, icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
We plot the averaged results across r B
replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01)) plotsim_average(resdf, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size") covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at r ualpha
) we take a look at
the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) { aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE) } else { message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot") }
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01)) resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key, performanceMetric, alpha, value), select(resdfu, rep, blabel, param.alpha, key, performanceMetric, alpha, value), by = c("rep", "blabel", "param.alpha", "key", "performanceMetric", "alpha"), suffix = c(".info", ".uninfo")) resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo) plotsim_average(resdfiu, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
Next, we consider the setting where 50000 hypotheses are tested.
m <- 50000 # integer: number of hypothesis tests seed <- 5555
We next run the simulations.
if (file.exists(n50000_file)) { res <- readRDS(n50000_file) } else { res <- mclapply(X = 1:B, FUN = simIteration, bench = bdplus, m = m, pi0 = pi0, es_dist = es_dist, icovariate = icovariate, ts_dist = ts_dist, null_dist = null_dist, seed = seed, mc.cores = cores) saveRDS(res, file = n50000_file) } res_i <- lapply(res, `[[`, "informative") res_u <- lapply(res, `[[`, "uninformative")
Here, we show the relationship between the independent covariate and p-values for a single replication of the experiment.
onerun <- simIteration(bdplus, m = m, pi0 = pi0, es_dist = es_dist, ts_dist = ts_dist, icovariate = icovariate, null_dist = null_dist, execute = FALSE)
rank_scatter(onerun, pvalue = "pval", covariate = "ind_covariate")
strat_hist(onerun, pvalue = "pval", covariate = "ind_covariate", maxy = 10, numQ = 3)
We plot the averaged results across r B
replications.
resdf <- plotsim_standardize(res_i, alpha = seq(0.01, 0.10, 0.01)) plotsim_average(resdf, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE) plotsim_average(resdf, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE)
We also take a look at the distribution of rejects for each method as a function of the effect size and independent covariate.
covariateLinePlot(res_i, alpha = ualpha, covname = "effect_size") covariateLinePlot(res_i, alpha = ualpha, covname = "ind_covariate")
Finally, (if enough methods produce rejections at r ualpha
) we take a look at
the overlap of rejections between methods.
if (numberMethodsReject(resdf, alphacutoff = ualpha, filterSet = excludeSet) >= 3) { aggupset(res_i, alpha = ualpha, supplementary = FALSE, return_list = FALSE) } else { message("Not enough methods found rejections at alpha ", ualpha, "; skipping upset plot") }
We also compare the simulation results with and without an informative covariate.
resdfu <- plotsim_standardize(res_u, alpha = seq(0.01, 0.10, 0.01)) resdfiu <- dplyr::full_join(select(resdf, rep, blabel, param.alpha, key, performanceMetric, alpha, value), select(resdfu, rep, blabel, param.alpha, key, performanceMetric, alpha, value), by = c("rep", "blabel", "param.alpha", "key", "performanceMetric", "alpha"), suffix = c(".info", ".uninfo")) resdfiu <- dplyr::mutate(resdfiu, value = value.info - value.uninfo) plotsim_average(resdfiu, met="rejections", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="FDR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TPR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE) plotsim_average(resdfiu, met="TNR", filter_set = excludeSet, merge_ihw = TRUE, errorBars = TRUE, diffplot = TRUE)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.