R/eval_tibbles.R
In simTool: Conduct Simulation Studies with a Minimal Amount of Source Code

Documented in eval_tibbles

#' Workhorse for simulation studies
#'
#'  Generates data according to all provided
#'  constellations in \code{data_tibble} and applies
#'  all provided constellations in \code{proc_tibble}
#'  to them.
#'
#'
#' @param data_grid  a \code{data.frame} or \code{tibble} where the first column
#'  is a character vector with function names. The other
#'  columns contain parameters for the functions specified
#'  in the first column. Parameters with NA are ignored. If a column with name
#'  \code{.truth} exist, then the corresponding entry is passed to functions
#'    generated from
#'  \code{proc_grid} and the function specified in \code{post_analyze}.
#' @param proc_grid  similar as \code{data_grid} the first
#'  column must contain function names. The other columns
#'  contain parameters for the functions specified in the
#'  first column. The data generated according to
#'  \code{data_grid} will always be passed to the first
#'  unspecified argument of the functions specified in the first
#'  column of \code{proc_grid}. If a function specified in
#'  \code{proc_grid} has an argument \code{.truth}, then the corresponding
#'  entry in the
#'  \code{.truth} column from \code{data_grid} is passed to the
#'  \code{.truth} parameter or if no column \code{.truth} exist in
#'  \code{data_grid}, then all parameters used
#'  for the data generation are passed to the \code{.truth} parameter.
#' @param replications  number of replications for the simulation
#' @param discard_generated_data  if \code{TRUE} the generated
#'  data is deleted after all function constellations in
#'  \code{proc_grid} have been applied. Otherwise, ALL
#'  generated data sets will be part of the returned object.
#' @param post_analyze this is a convenience function, that is applied
#'  directly after the data analyzing function. If this function has an
#'  argument \code{.truth}, then the corresponding entry in the
#'  \code{.truth} column from \code{data_grid} is passed to the \code{.truth}
#'  parameter or if no column \code{.truth} exist in \code{data_grid},
#'  then all parameters used for the data generation are passed to the
#'  \code{.truth} parameter.
#' @param summary_fun  named list of univariate function to summarize the
#'  results (numeric or logical) over the replications, e.g.
#'  list(mean = mean, sd = sd).
#' @param group_for_summary if the result returned by the data analyzing
#'  function  or \code{post_analyze}
#'  is a \code{data.frame} with more than one row, one usually is interested
#'  in summarizing the results while grouping for some variables. This group
#'  variables can be passed as a character vector into \code{group_for_summary}
#' @param ncpus  a cluster of \code{ncpus} workers (R-processes)
#'  is created on the local machine to conduct the
#'  simulation. If \code{ncpus}
#'  equals one no cluster is created and the simulation
#'  is conducted by the current R-process.
#' @param cluster  a cluster generated by the \code{parallel}
#'  package that will be used to conduct the simulation.
#'  If \code{cluster} is specified, then \code{ncpus} will
#'  be ignored.
#' @param cluster_seed if the simulation is done in parallel
#'  manner, then the combined multiple-recursive generator from L'Ecuyer (1999)
#'  is used to generate random numbers. Thus \code{cluster_seed} must be a
#'  (signed) integer vector of length 6.
#'  The 6 elements of the seed are internally regarded as
#'  32-bit unsigned integers. Neither the first three nor the last three
#'  should be all zero, and they are limited to less than 4294967087 and
#'  4294944443 respectively.
#' @param cluster_libraries  a character vector specifying
#'  the packages that should be loaded by the workers.
#' @param cluster_global_objects  a character vector specifying
#'  the names of R objects in the global environment that should
#'  be exported to the global environment of every worker.
#' @param envir  must be provided if the functions specified
#'  in \code{data_grid} or \code{proc_grid} are not part
#'  of the global environment.
#' @param simplify usually the result column is nested, by default it is
#'  tried to unnest it.
#' @return  The returned object list of the class
#'  \code{eval_tibbles}, where the element \code{simulations} contain
#'  the results of the simulation.
#' @note  If \code{cluster} is provided by the user the
#'  function \code{eval_tibbles} will NOT stop the cluster.
#'  This has to be done by the user. Conducting parallel
#'  simulations by specifying \code{ncpus} will internally
#'  create a cluster and stop it after the simulation
#'  is done.
#' @author  Marsel Scheer
#' @examples
#' rng <- function(data, ...) {
#'   ret <- range(data)
#'   names(ret) <- c("min", "max")
#'   ret
#' }
#'
#' ### The following line is only necessary
#' ### if the examples are not executed in the global
#' ### environment, which for instance is the case when
#' ### the oneline-documentation
#' ### http://marselscheer.github.io/simTool/reference/eval_tibbles.html
#' ### is build. In such case eval_tibble() would search the
#' ### above defined function rng() in the global environment where
#' ### it does not exist!
#' eval_tibbles <- purrr::partial(eval_tibbles, envir = environment())
#'
#' dg <- expand_tibble(fun = "rnorm", n = c(5L, 10L))
#' pg <- expand_tibble(proc = c("rng", "median", "length"))
#'
#' eval_tibbles(dg, pg, rep = 2, simplify = FALSE)
#' eval_tibbles(dg, pg, rep = 2)
#' eval_tibbles(dg, pg,
#'   rep = 2,
#'   post_analyze = purrr::compose(as.data.frame, t)
#' )
#' eval_tibbles(dg, pg, rep = 2, summary_fun = list(mean = mean, sd = sd))
#'
#' regData <- function(n, SD) {
#'   data.frame(
#'     x = seq(0, 1, length = n),
#'     y = rnorm(n, sd = SD)
#'   )
#' }
#'
#' eg <- eval_tibbles(
#'   expand_tibble(fun = "regData", n = 5L, SD = 1:2),
#'   expand_tibble(proc = "lm", formula = c("y~x", "y~I(x^2)")),
#'   replications = 3
#' )
#' eg
#'
#' presever_rownames <- function(mat) {
#'   rn <- rownames(mat)
#'   ret <- tibble::as_tibble(mat)
#'   ret$term <- rn
#'   ret
#' }
#'
#' eg <- eval_tibbles(
#'   expand_tibble(fun = "regData", n = 5L, SD = 1:2),
#'   expand_tibble(proc = "lm", formula = c("y~x", "y~I(x^2)")),
#'   post_analyze = purrr::compose(presever_rownames, coef, summary),
#'   # post_analyze = broom::tidy, # is a nice out of the box alternative
#'   summary_fun = list(mean = mean, sd = sd),
#'   group_for_summary = "term",
#'   replications = 3
#' )
#' eg$simulation
#'
#' dg <- expand_tibble(fun = "rexp", rate = c(10, 100), n = c(50L, 100L))
#' pg <- expand_tibble(proc = c("t.test"), conf.level = c(0.8, 0.9, 0.95))
#' et <- eval_tibbles(dg, pg,
#'   ncpus = 1,
#'   replications = 10^1,
#'   post_analyze = function(ttest, .truth) {
#'     mu <- 1 / .truth$rate
#'     ttest$conf.int[1] <= mu && mu <= ttest$conf.int[2]
#'   },
#'   summary_fun = list(mean = mean, sd = sd)
#' )
#' et
#'
#' dg <- dplyr::bind_rows(
#'   expand_tibble(fun = "rexp", rate = 10, .truth = 1 / 10, n = c(50L, 100L)),
#'   expand_tibble(fun = "rnorm", .truth = 0, n = c(50L, 100L))
#' )
#' pg <- expand_tibble(proc = c("t.test"), conf.level = c(0.8, 0.9, 0.95))
#' et <- eval_tibbles(dg, pg,
#'   ncpus = 1,
#'   replications = 10^1,
#'   post_analyze = function(ttest, .truth) {
#'     ttest$conf.int[1] <= .truth && .truth <= ttest$conf.int[2]
#'   },
#'   summary_fun = list(mean = mean, sd = sd)
#' )
#' et
#' ### need to remove the locally adapted eval_tibbles()
#' ### otherwise executing the examples would mask
#' ### eval_tibbles from simTool-namespace.
#' rm(eval_tibbles)
#' @export
eval_tibbles <-
  function(data_grid, proc_grid = expand_tibble(proc = "length"),
           replications = 1, discard_generated_data = FALSE,
           post_analyze = identity,
           summary_fun = NULL, group_for_summary = NULL,
           ncpus = 1L, cluster = NULL, cluster_seed = rep(12345, 6),
           cluster_libraries = NULL,
           cluster_global_objects = NULL,
           envir = globalenv(),
           simplify = TRUE) {
    if (is.element(".truth", names(proc_grid))) {
      stop(".truth column in proc_grid not allowed!")
    }

    mc <- match.call()

    user_provided_cluster <- !is.null(cluster)
    summary_fun <- prepare_summary_fun(summary_fun)
    df <- data_grid_to_fun(data_grid, envir)
    pf <- proc_grid_to_fun(proc_grid, envir)
    generator_info <- RNGkind()
    cluster <- prepare_cluster(
      cluster, ncpus, cluster_global_objects,
      cluster_libraries, cluster_seed, df, pf
    )
    post_analyze <- extend_with_truth_parameter(post_analyze)
    sim_fun <- define_simulation(
      pf, discard_generated_data, cluster,
      replications, summary_fun,
      group_for_summary, post_analyze
    )

    pb <- progress_bar(df)

    t1 <- Sys.time()

    tryCatch({
        simulation_list <- lapply(df, function(fc) {
          ret <- sim_fun(fc)
          pb()
          ret
        })
        t2 <- Sys.time()
      },
      finally = {
        if (ncpus > 1 && !user_provided_cluster) {
          # cluster created by the user wont be stopped
          parallel::stopCluster(cluster)
          RNGkind(kind = generator_info[1])
        }
      }
    )

    est_reps_per_hour <-
      as.integer(replications / as.numeric(difftime(t2, t1, units = "hour")))

    ret <- list(
      call = mc, data_grid = data_grid, proc_grid = proc_grid,
      simulation = frame_simulation(
        data_grid, proc_grid,
        simulation_list, summary_fun
      ),
      summary_fun = summary_fun,
      replications = replications,
      start_time = t1,
      end_time = t2,
      est_reps_per_hour = est_reps_per_hour,
      session_info = utils::sessionInfo()
    )
    if (simplify) {
      ret <- unnest_simulation(ret)
    }
    if (!discard_generated_data) {
      ret$generated_data <- purrr::map(
        purrr::flatten(simulation_list), ~ `[[`(., "data")
      )
    }
    class(ret) <- "eval_tibbles"
    ret
  }