inst/scripts/make-data/R/helpers-simulations.R

#' Simulate Collection of Test Statistics
#'
#' Function for simulating a large set of (independent) test results.
#' For Normal and t-distributed test statistics, effect sizes are directly simulated
#' from a specified true effect size distribution (`es_dist`) and perturbed according to a
#' specified noise distribution with correspond standard errors to obtain the observed set of
#' test statistics (`ts_dist`). For Chi-squared distributed test statistics, the notions of
#' effect size and standard error are not well defined, and we therefore just set the standard
#' error to 1 and effect size equal to the test statistic. This does not matter for our simulations,
#' where the effect size and standard error are only used by a single method (ASH) which is
#' not included in the comparisons under the Chi-squared distribution. This simulator resembles
#' the simulations carried out in the manuscripts of the Boca-Leek, ASH, and IHW methods.
#' (For IHW, in particular, the simulations illustrating the "size investing strategy" fall under
#' this class of simulations.)
#'
#' @param X simulation index
#' @param bench BenchDesign object of methods to run on simulated data set.
#' @param m number of test statistics.
#' @param pi0 proportion of null hypotheses. Can either be a single numeric value
#'        between 0 and 1 or a function which takes a numeric vector (i.e. 'informative
#'        covariate' values) and returns a vector of the same length containing null
#'        probabilities between 0 and 1. 
#' @param icovariate specification for the independent covariate, must be a function
#'        which takes an integer value and returns a vector of the specified length to
#'        be used as the independent covariate.
#' @param es_dist expected effect size under the alternative, can either be a single
#'        numeric value or a function which takes an integer as input and returns a
#'        numeric vector of effect sizes of the specified length.
#' @param ts_dist sampling distribution of effect size, must be a function taking two
#'        inputs, a vector of true effect sizes and a logical value whether to return
#'        the perturbed effect sizes or the corresponding standard error. The perturbed
#'        effect sizes and standard errors are used to compute the test statistics.
#' @param null_dist distribution under the null used to calculate p-values from the
#'        test statistic, must be a function which takes the full vector of null and
#'        alternative test statistics and return the corresponding p-values.
#' @param execute logical whether benchmarking should be executed or if the simulated
#'        data set should be returned. (default = TRUE)
#' @param seed integer seed for random number generator, ignored if NULL. (default = NULL) 
#'
#' @return
#' SummarizedBenchmark generated by calling `buildBench` with the specified
#' BenchDesign object and a simulated data set. If `execute = FALSE`, the simulated
#' data set is returned.
#' 
#' @details
#' If a function is specified for either of `effect_size` or `icovariate`, a function must also be
#' specified for the other parameter.
#' Within the function, the specified BenchDesign object is run against a simulated
#' data.frame with the following  columns.
#' * `qvalue`: 0/1 indicator whether data simulated under null (0) or alternative (1)
#' * `effect_size`: simulated effect size - true effect size + sampling noise
#' * `test_statistic`: `effect_size` divided by `SE`
#' * `pval`: test p-value calculated from test-statistic using `null_dist`
#' * `ind_covariate`: the independent covariate
#' * `SE`: standard errors for the `test_statistic`
#' 
#' @md
#' @author Patrick Kimes
simIteration <- function(X, bench, m, pi0, es_dist, ts_dist, null_dist,
                         icovariate, execute = TRUE, seed = NULL) {
    if (!is.null(seed)) { set.seed(seed * X) }
    stopifnot(is.function(es_dist))
    stopifnot(is.function(ts_dist))
    stopifnot(is.function(icovariate))
    
    ## simulate indep covariate from icovariate function
    ind_cov <- icovariate(m)
    stopifnot(length(ind_cov) == m)

    ## pi0 returns probability of null, sample alts from [1 - pi0s]
    if (is.function(pi0)) {
        pi0s <- pi0(ind_cov)
    } else if (length(pi0) == 1) {
        pi0s <- rep(pi0, m)
    } else {
        stop("pi0 must be function or single numeric value")
    }
    stopifnot(length(pi0s) == m)
    stopifnot(min(pi0s) >= 0 && max(pi0s) <= 1)
    alts <- which(rbinom(m, 1, 1 - pi0s) == 1)
    
    ## generate set of null (0) and alternative true effect sizes
    es <- rep(0, m)
    if (length(alts) > 0) {
        es[alts] <- es_dist(length(alts))
    }

    ## determine SE of test statistics
    SE <- ts_dist(es, se = TRUE)

    ## perturb effect sizes to get observed effect sizes
    es <- ts_dist(es)
    stopifnot(length(es) == m)
    
    ## null/alt indicator
    H <- rep(0, m)
    H[alts] <- 1

    ## test stat for normal and t-dist is just (effect size) / SE 
    ts <- es / SE

    ## calculate p-values 
    pv <- null_dist(ts)

    ## organize in data.frame
    dat <- data.frame(qvalue = H, effect_size = es, test_statistic = ts, 
                      pval = pv, ind_covariate = ind_cov, SE = SE)

    ## return data if not executing
    if (!execute) {
        return(as_tibble(dat))
    }

    ## run methods w/ specified independent covariate
    res_i <- buildBench(bench, dat, truthCol = "qvalue", ptabular = TRUE,
                        ftCols = c("ind_covariate", "effect_size"))


    ## create data.frame w/ explicitly uninformative covariate
    dat_u <- data.frame(qvalue = H, effect_size = es, test_statistic = ts, 
                        pval = pv, ind_covariate = runif(m), SE = SE)
    res_u <- buildBench(bench, dat_u, truthCol = "qvalue", ptabular = TRUE,
                        ftCols = c("ind_covariate", "effect_size"))
    
    list(informative = res_i, uninformative = res_u)
}



## ##############################################################################
## informative covariates
## - if x ~ U(0,1), then expect 80%, 90%, 95% pi0 for all cases
## ##############################################################################

## step function (4 steps)
pi0_step <- function(pi0) {
    stopifnot(pi0 <= 1L, pi0 >= 0L)
    if (pi0 >= 0.5) {
        function(x) {
            pi0 - (1-pi0)/2 +
                (1-pi0)/4 * (x > 0.25) +
                (1-pi0)/2 * (x > 0.5) +
                (1-pi0)/4 * (x > 0.75)
        }
    } else {
        function(x) {
            pi0 + pi0/2 -
                pi0/4 * (x > 0.25) -
                pi0/2 * (x > 0.5) -
                pi0/4 * (x > 0.75)
        }
    }
}

## shifted/stretched cubic function
pi0_cubic <- function(pi0) {
    stopifnot(pi0 <= 1L, pi0 >= 3/4)
    function(x) { (1-x)^(1/3) * 4*(1-pi0) + 4 * pi0 - 3 }
}

## shifted/stretched cosine function (non-monotone)
pi0_cosine <- function(pi0) {
    stopifnot(pi0 <= 1L, pi0 >= 0L)
    if (pi0 >= 0.5) {
        function(x) { (1-pi0) * cos(2*pi*x) + pi0 }
    } else {
        function(x) { pi0 - pi0 * cos(2*pi*x) }
    }
}

## shifted/stretched sine function (non-monotone)
pi0_sine <- function(pi0) {
    stopifnot(pi0 < 1L, pi0 >= 0L)
    if (pi0 >= 0.5) {
        function(x) { (1-pi0) * sin(2*pi*x) + pi0 }
    } else {
        function(x) { pi0 - pi0 * sin(2*pi*x) }
    }
}

## simple step function to test different levels of "informativeness" with pi0 = 80%
pi0_varyinfo80 <- function(zz) {
    stopifnot(zz <= 1L, zz >= 0L)
    function(x) { ifelse(x > 4/5, 0.8 * (1 - zz), 0.2 * (4 + zz)) }
}

## logistic function to test different levels of "informativeness" with pi0 = 80%
pi0_varyinfo80l <- function(zz) {
    stopifnot(zz <= 1L, zz >= 0L)
    function(x) { zz / (1+exp(5-25*x)) + (1 - zz) * 0.8 }
}


## ##############################################################################
## CODE DERIVED FROM EXTERNAL SOURCE
## source: https://github.com/stephenslab/ash/blob/94a3347/code/dsc-shrink/datamakers/datamaker.R
## source: https://github.com/stephenslab/ash/blob/d06047e/code/dsc-shrink/add_named_scenarios.R
## license: not listed
## date: 2017/10/18
## modified: Patrick Kimes
## ##############################################################################

## test statistic distributions used in ASH simulations
params <- list(spiky = list(c(.4, .2, .2, .2), c(0, 0, 0, 0), c(.25, .5, 1, 2)),
               skew = list(c(1/4, 1/4, 1/3, 1/6), c(-2, -1, 0, 1), c(2, 1.5, 1, 1)),
               bignormal = list(c(1), c(0), c(4)),
               bimodal = list(c(0.5, 0.5), c(-2, 2), c(1, 1)),
               flat_top = list(rep(1/7, 7), c(-1.5, -1, -0.5, 0, 0.5, 1, 1.5), rep(0.5, 7)),
               near_normal = list(c(2/3, 1/3), c(0, 0), c(1, 2)))

## function factory to generate mixture normal samplers 
rnormmix_generator <- function(g) {
    k <- unique(sapply(g, length))
    stopifnot(length(k) == 1)
    stopifnot(sum(g[[1]]) == 1)
    function(n) {
        comp <- sample(1:k, n, g[[1]], replace=TRUE)
        rnorm(n, g[[2]][comp], g[[3]][comp])
    }
}

## ASH effect-size distributions
sampler_spiky <- rnormmix_generator(params$spiky)
sampler_skew <- rnormmix_generator(params$skew)
sampler_bignormal <- rnormmix_generator(params$bignormal)
sampler_bimodal <- rnormmix_generator(params$bimodal)
sampler_flat_top <- rnormmix_generator(params$flat_top)
sampler_near_normal <- rnormmix_generator(params$near_normal)

## ##############################################################################
## END CODE DERIVED FROM EXTERNAL SOURCE
## ##############################################################################

## function factory: normal sampler
rnorm_generator <- function(m, s = 1) {
    function(n) {
        rnorm(n, m, s)
    }
}


## ##############################################################################
## ##############################################################################

## function factory: observed effect size and SE generators

#' normal distribution samplers
rnorm_perturber <- function(s = 1) {
    function(m, se = FALSE) {
        if (se) { return(rep(1, length(m))) }
        rnorm(length(m), m, s)
    }
}

#' non-central t-distribution samplers
rt_perturber <- function(df) {
    if (df <= 2) {
        stop("var not defined for non-central t with df <= 2.")
    }
    function(ncp, se = FALSE) {
        if (se) { return( sqrt(rchisq(length(ncp), df = df) / df) ) }
        rnorm(length(ncp), ncp, 1)
    }
}

#' non-central chi-square distirbution samplers
rchisq_perturber <- function(df) {
    function(ncp, se = FALSE) {
        stopifnot(ncp >= 0)
        if (se) { return(rep(1, length(ncp))) }
        rchisq(length(ncp), df, ncp)
    }
}


## ##############################################################################
## ##############################################################################

## function factory: two-sided normal p-value calculator
## @param s standard deviation of null zero-centered normal distribution
rnorm_2pvaluer <- function(s) {
    function(x) { 2 * (1 - pnorm(abs(x), 0, s)) }
}


## function factory: two-sided t p-value calculator
## @param df degrees of freedom of null zero-centered t-distribution
rt_2pvaluer <- function(df) {
    function(x) { 2 * (1 - pt(abs(x), df)) }
}


## function factory: chi-sq p-value calculator
## @param df degrees of freedom of null zero-centered t-distribution
rchisq_pvaluer <- function(df) {
    function(x) { 1 - pchisq(x, df) }
}
stephaniehicks/benchmarkfdrData2019 documentation built on June 20, 2021, 10 a.m.