#' Workhorse for simulation studies
#'
#' Generates data according to all provided
#' constellations in \code{data_tibble} and applies
#' all provided constellations in \code{proc_tibble}
#' to them.
#'
#'
#' @param data_grid a \code{data.frame} or \code{tibble} where the first column
#' is a character vector with function names. The other
#' columns contain parameters for the functions specified
#' in the first column. Parameters with NA are ignored. If a column with name
#' \code{.truth} exist, then the corresponding entry is passed to functions
#' generated from
#' \code{proc_grid} and the function specified in \code{post_analyze}.
#' @param proc_grid similar as \code{data_grid} the first
#' column must contain function names. The other columns
#' contain parameters for the functions specified in the
#' first column. The data generated according to
#' \code{data_grid} will always be passed to the first
#' unspecified argument of the functions specified in the first
#' column of \code{proc_grid}. If a function specified in
#' \code{proc_grid} has an argument \code{.truth}, then the corresponding
#' entry in the
#' \code{.truth} column from \code{data_grid} is passed to the
#' \code{.truth} parameter or if no column \code{.truth} exist in
#' \code{data_grid}, then all parameters used
#' for the data generation are passed to the \code{.truth} parameter.
#' @param replications number of replications for the simulation
#' @param discard_generated_data if \code{TRUE} the generated
#' data is deleted after all function constellations in
#' \code{proc_grid} have been applied. Otherwise, ALL
#' generated data sets will be part of the returned object.
#' @param post_analyze this is a convenience function, that is applied
#' directly after the data analyzing function. If this function has an
#' argument \code{.truth}, then the corresponding entry in the
#' \code{.truth} column from \code{data_grid} is passed to the \code{.truth}
#' parameter or if no column \code{.truth} exist in \code{data_grid},
#' then all parameters used for the data generation are passed to the
#' \code{.truth} parameter.
#' @param summary_fun named list of univariate function to summarize the
#' results (numeric or logical) over the replications, e.g.
#' list(mean = mean, sd = sd).
#' @param group_for_summary if the result returned by the data analyzing
#' function or \code{post_analyze}
#' is a \code{data.frame} with more than one row, one usually is interested
#' in summarizing the results while grouping for some variables. This group
#' variables can be passed as a character vector into \code{group_for_summary}
#' @param ncpus a cluster of \code{ncpus} workers (R-processes)
#' is created on the local machine to conduct the
#' simulation. If \code{ncpus}
#' equals one no cluster is created and the simulation
#' is conducted by the current R-process.
#' @param cluster a cluster generated by the \code{parallel}
#' package that will be used to conduct the simulation.
#' If \code{cluster} is specified, then \code{ncpus} will
#' be ignored.
#' @param cluster_seed if the simulation is done in parallel
#' manner, then the combined multiple-recursive generator from L'Ecuyer (1999)
#' is used to generate random numbers. Thus \code{cluster_seed} must be a
#' (signed) integer vector of length 6.
#' The 6 elements of the seed are internally regarded as
#' 32-bit unsigned integers. Neither the first three nor the last three
#' should be all zero, and they are limited to less than 4294967087 and
#' 4294944443 respectively.
#' @param cluster_libraries a character vector specifying
#' the packages that should be loaded by the workers.
#' @param cluster_global_objects a character vector specifying
#' the names of R objects in the global environment that should
#' be exported to the global environment of every worker.
#' @param envir must be provided if the functions specified
#' in \code{data_grid} or \code{proc_grid} are not part
#' of the global environment.
#' @param simplify usually the result column is nested, by default it is
#' tried to unnest it.
#' @return The returned object list of the class
#' \code{eval_tibbles}, where the element \code{simulations} contain
#' the results of the simulation.
#' @note If \code{cluster} is provided by the user the
#' function \code{eval_tibbles} will NOT stop the cluster.
#' This has to be done by the user. Conducting parallel
#' simulations by specifying \code{ncpus} will internally
#' create a cluster and stop it after the simulation
#' is done.
#' @author Marsel Scheer
#' @examples
#' rng <- function(data, ...) {
#' ret <- range(data)
#' names(ret) <- c("min", "max")
#' ret
#' }
#'
#' ### The following line is only necessary
#' ### if the examples are not executed in the global
#' ### environment, which for instance is the case when
#' ### the oneline-documentation
#' ### http://marselscheer.github.io/simTool/reference/eval_tibbles.html
#' ### is build. In such case eval_tibble() would search the
#' ### above defined function rng() in the global environment where
#' ### it does not exist!
#' eval_tibbles <- purrr::partial(eval_tibbles, envir = environment())
#'
#' dg <- expand_tibble(fun = "rnorm", n = c(5L, 10L))
#' pg <- expand_tibble(proc = c("rng", "median", "length"))
#'
#' eval_tibbles(dg, pg, rep = 2, simplify = FALSE)
#' eval_tibbles(dg, pg, rep = 2)
#' eval_tibbles(dg, pg,
#' rep = 2,
#' post_analyze = purrr::compose(as.data.frame, t)
#' )
#' eval_tibbles(dg, pg, rep = 2, summary_fun = list(mean = mean, sd = sd))
#'
#' regData <- function(n, SD) {
#' data.frame(
#' x = seq(0, 1, length = n),
#' y = rnorm(n, sd = SD)
#' )
#' }
#'
#' eg <- eval_tibbles(
#' expand_tibble(fun = "regData", n = 5L, SD = 1:2),
#' expand_tibble(proc = "lm", formula = c("y~x", "y~I(x^2)")),
#' replications = 3
#' )
#' eg
#'
#' presever_rownames <- function(mat) {
#' rn <- rownames(mat)
#' ret <- tibble::as_tibble(mat)
#' ret$term <- rn
#' ret
#' }
#'
#' eg <- eval_tibbles(
#' expand_tibble(fun = "regData", n = 5L, SD = 1:2),
#' expand_tibble(proc = "lm", formula = c("y~x", "y~I(x^2)")),
#' post_analyze = purrr::compose(presever_rownames, coef, summary),
#' # post_analyze = broom::tidy, # is a nice out of the box alternative
#' summary_fun = list(mean = mean, sd = sd),
#' group_for_summary = "term",
#' replications = 3
#' )
#' eg$simulation
#'
#' dg <- expand_tibble(fun = "rexp", rate = c(10, 100), n = c(50L, 100L))
#' pg <- expand_tibble(proc = c("t.test"), conf.level = c(0.8, 0.9, 0.95))
#' et <- eval_tibbles(dg, pg,
#' ncpus = 1,
#' replications = 10^1,
#' post_analyze = function(ttest, .truth) {
#' mu <- 1 / .truth$rate
#' ttest$conf.int[1] <= mu && mu <= ttest$conf.int[2]
#' },
#' summary_fun = list(mean = mean, sd = sd)
#' )
#' et
#'
#' dg <- dplyr::bind_rows(
#' expand_tibble(fun = "rexp", rate = 10, .truth = 1 / 10, n = c(50L, 100L)),
#' expand_tibble(fun = "rnorm", .truth = 0, n = c(50L, 100L))
#' )
#' pg <- expand_tibble(proc = c("t.test"), conf.level = c(0.8, 0.9, 0.95))
#' et <- eval_tibbles(dg, pg,
#' ncpus = 1,
#' replications = 10^1,
#' post_analyze = function(ttest, .truth) {
#' ttest$conf.int[1] <= .truth && .truth <= ttest$conf.int[2]
#' },
#' summary_fun = list(mean = mean, sd = sd)
#' )
#' et
#' ### need to remove the locally adapted eval_tibbles()
#' ### otherwise executing the examples would mask
#' ### eval_tibbles from simTool-namespace.
#' rm(eval_tibbles)
#' @export
eval_tibbles <-
function(data_grid, proc_grid = expand_tibble(proc = "length"),
replications = 1, discard_generated_data = FALSE,
post_analyze = identity,
summary_fun = NULL, group_for_summary = NULL,
ncpus = 1L, cluster = NULL, cluster_seed = rep(12345, 6),
cluster_libraries = NULL,
cluster_global_objects = NULL,
envir = globalenv(),
simplify = TRUE) {
if (is.element(".truth", names(proc_grid))) {
stop(".truth column in proc_grid not allowed!")
}
mc <- match.call()
user_provided_cluster <- !is.null(cluster)
summary_fun <- prepare_summary_fun(summary_fun)
df <- data_grid_to_fun(data_grid, envir)
pf <- proc_grid_to_fun(proc_grid, envir)
generator_info <- RNGkind()
cluster <- prepare_cluster(
cluster, ncpus, cluster_global_objects,
cluster_libraries, cluster_seed, df, pf
)
post_analyze <- extend_with_truth_parameter(post_analyze)
sim_fun <- define_simulation(
pf, discard_generated_data, cluster,
replications, summary_fun,
group_for_summary, post_analyze
)
pb <- progress_bar(df)
t1 <- Sys.time()
tryCatch({
simulation_list <- lapply(df, function(fc) {
ret <- sim_fun(fc)
pb()
ret
})
t2 <- Sys.time()
},
finally = {
if (ncpus > 1 && !user_provided_cluster) {
# cluster created by the user wont be stopped
parallel::stopCluster(cluster)
RNGkind(kind = generator_info[1])
}
}
)
est_reps_per_hour <-
as.integer(replications / as.numeric(difftime(t2, t1, units = "hour")))
ret <- list(
call = mc, data_grid = data_grid, proc_grid = proc_grid,
simulation = frame_simulation(
data_grid, proc_grid,
simulation_list, summary_fun
),
summary_fun = summary_fun,
replications = replications,
start_time = t1,
end_time = t2,
est_reps_per_hour = est_reps_per_hour,
session_info = utils::sessionInfo()
)
if (simplify) {
ret <- unnest_simulation(ret)
}
if (!discard_generated_data) {
ret$generated_data <- purrr::map(
purrr::flatten(simulation_list), ~ `[[`(., "data")
)
}
class(ret) <- "eval_tibbles"
ret
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.