Nothing
#' Collapse separate simulation files into a single result
#'
#' This function collects and aggregates the results from
#' \code{SimDesign}'s \code{\link{runSimulation}} into a single
#' objects suitable for post-analyses, or combines all the saved results directories and combines
#' them into one. This is useful when results are run piece-wise on one node (e.g., 500 replications
#' in one batch, 500 again at a later date, though be careful about the \code{\link{set.seed}}
#' use as the random numbers will tend to correlate the more it is used) or run independently across different
#' nodes/computing cores (e.g., see \code{\link{runArraySimulation}}.
#'
#'
#' @param dir a \code{character} vector pointing to the directory name containing the \code{.rds} files.
#' All \code{.rds} files in this directory will be used after first checking
#' their status with \code{\link{SimCheck}}. For greater specificity use the
#' \code{files} argument
#'
#' @param files a \code{character} vector containing the names of the simulation's final \code{.rds} files.
#'
#' @param filename (optional) name of .rds file to save aggregate simulation file to. If not specified
#' then the results will only be returned in the R console.
#'
#' @param select a character vector indicating columns to variables to select from the
#' \code{SimExtract(what='results')} information. This is mainly useful when RAM is an issue
#' given simulations with many stored estimates. Default includes the results objects
#' in their entirety, though to omit all internally stored simulation results pass the
#' character \code{'NONE'}
#'
#' @param check.only logical; for larger simulations file sets, such as those generated by
#' \code{\link{runArraySimulation}}, return the design conditions that do no satisfy the
#' \code{target.reps} and throw warning if files are unexpectedly missing
#'
#' @param target.reps (optional) number of replications to check against to evaluate whether
#' the simulation files returned the desired number of replications. If missing, the highest
#' detected value from the collected set of replication information will be used
#'
#' @param warning_details logical; include the aggregate of the warnings to be extracted via
#' \code{\link{SimExtract}}?
#'
#' @param error_details logical; include the aggregate of the errors to be extracted via
#' \code{\link{SimExtract}}?
#'
#' @return returns a \code{data.frame/tibble} with the (weighted) average/aggregate
#' of the simulation results
#'
#' @references
#'
#' Chalmers, R. P., & Adkins, M. C. (2020). Writing Effective and Reliable Monte Carlo Simulations
#' with the SimDesign Package. \code{The Quantitative Methods for Psychology, 16}(4), 248-280.
#' \doi{10.20982/tqmp.16.4.p248}
#'
#' Sigal, M. J., & Chalmers, R. P. (2016). Play it again: Teaching statistics with Monte
#' Carlo simulation. \code{Journal of Statistics Education, 24}(3), 136-156.
#' \doi{10.1080/10691898.2016.1246953}
#'
#' @seealso \code{\link{runSimulation}}, \code{\link{runArraySimulation}},
#' \code{\link{SimCheck}}
#'
#' @export
#'
#' @author Phil Chalmers \email{rphilip.chalmers@@gmail.com}
#'
#' @examples
#' \dontrun{
#'
#' setwd('my_working_directory')
#'
#' ## run simulations to save the .rds files (or move them to the working directory)
#' # seeds1 <- genSeeds(design)
#' # seeds2 <- genSeeds(design, old.seeds=seeds1)
#' # ret1 <- runSimulation(design, ..., seed=seeds1, filename='file1')
#' # ret2 <- runSimulation(design, ..., seed=seeds2, filename='file2')
#'
#' # saves to the hard-drive and stores in workspace
#' final <- SimCollect(files = c('file1.rds', 'file2.rds'))
#' final
#'
#' # If filename not included, can be extracted from results
#' # files <- c(SimExtract(ret1, 'filename'), SimExtract(ret2, 'filename'))
#' # final <- SimCollect(files = files)
#'
#'
#' #################################################
#' # Example where each row condition is repeated, evaluated independently,
#' # and later collapsed into a single analysis object
#'
#' # Each condition repeated four times (hence, replications
#' # should be set to desired.reps/4)
#' Design <- createDesign(mu = c(0,5),
#' N = c(30, 60))
#' Design
#'
#' # assume the N=60 takes longer, and should be spread out across more arrays
#' Design_long <- expandDesign(Design, c(2,2,4,4))
#' Design_long
#'
#' replications <- c(rep(50, 4), rep(25,8))
#' data.frame(Design_long, replications)
#'
#' #-------------------------------------------------------------------
#'
#' Generate <- function(condition, fixed_objects) {
#' dat <- with(condition, rnorm(N, mean=mu))
#' dat
#' }
#'
#' Analyse <- function(condition, dat, fixed_objects) {
#' ret <- c(mean=mean(dat), SD=sd(dat))
#' ret
#' }
#'
#' Summarise <- function(condition, results, fixed_objects) {
#' ret <- colMeans(results)
#' ret
#' }
#'
#' #-------------------------------------------------------------------
#'
#' # create directory to store all final simulation files
#' dir.create('sim_files/')
#'
#' iseed <- genSeeds()
#'
#' # distribute jobs independently
#' sapply(1:nrow(Design_long), \(i) {
#' runArraySimulation(design=Design_long, replications=replications,
#' generate=Generate, analyse=Analyse, summarise=Summarise,
#' arrayID=i, dirname='sim_files/', filename='job', iseed=iseed)
#' }) |> invisible()
#'
#' # check that all replications satisfy target
#' SimCollect('sim_files/', check.only = TRUE)
#'
#' # this would have been returned were the target.rep supposed to be 1000
#' SimCollect('sim_files/', check.only = TRUE, target.reps=1000)
#'
#' # aggregate into single object
#' sim <- SimCollect('sim_files/')
#' sim
#'
#' SimClean(dir='sim_files/')
#'
#' }
SimCollect <- function(dir=NULL, files = NULL, filename = NULL,
select = NULL, check.only = FALSE,
target.reps = NULL,
warning_details = FALSE,
error_details = TRUE){
if(is.null(dir) && is.null(files))
stop('either dir or files must be specified')
if(!is.null(dir) && !is.null(files))
stop('dir OR files must be specified, not both')
if(check.only) select <- 'REPLICATIONS'
if(!is.null(dir)) SimCheck(dir=dir)
if(!is.null(dir)){
files <- dir(dir)
} else dir <- './'
files <- paste0(dir, files)
oldfiles <- files
files <- oldfiles
if(!is.null(files)){
filenames <- files
} else {
return(invisible(NULL))
}
if(!is.null(filename))
if(filename %in% dir(dir))
stop(sprintf('File \'%s\' already exists in working directory', filename),
call.=FALSE)
replications_only <- ifelse(!is.null(select) && length(select) == 1L &&
select == 'REPLICATIONS', TRUE, FALSE)
print_when <- NA
if(length(filenames) > 20L){
cat("Reading in files ")
print_when <- floor(seq(1, length(filenames), length.out=20))
}
readin <- vector('list', length(filenames))
for(i in 1:length(filenames)){
if(i %in% print_when) cat(".")
tmp <- try(readRDS(filenames[i]), TRUE)
if(is(tmp, 'try-error'))
stop(c('Could not read file ', filenames[i]))
readin[[i]] <- subset_results(tmp, select=select)
}
extra_info1 <- attr(readin[[1L]], 'extra_info')
ncores <- sum(sapply(readin, function(x) attr(x, 'extra_info')$ncores))
extra_info1[c("seeds", "date_completed", "summarise_list", "stored_results",
"total_elapsed_time", "stored_Random.seeds_list")] <- NULL
errors <- lapply(readin, function(x)
as.data.frame(x[ ,grepl('ERROR', colnames(x)), drop=FALSE]))
warnings <- lapply(readin, function(x)
as.data.frame(x[ ,grepl('WARNINGS', colnames(x)), drop=FALSE]))
nms <- unique(do.call(c, lapply(errors, function(x) colnames(x))))
if(!length(nms)) nms <- 'ERRORS'
nmsw <- unique(do.call(c, lapply(warnings, function(x) colnames(x))))
if(!length(nmsw)) nmsw <- 'WARNINGS'
readin <- lapply(readin, function(x) x[ ,!(
grepl('ERROR', colnames(x)) | grepl('WARNINGS', colnames(x))), drop=FALSE])
if(length(unique(sapply(readin, ncol))) > 1L){
tab <- table(sapply(readin, ncol))
pick <- filenames[as.integer(names(which.min(tab))) == sapply(readin, ncol)]
stop(sprintf(c('Number of columns not equal. ',
'The following files had the fewest columns:\n%s'),
paste0(pick, collapse=', ')))
}
Design.ID <- sapply(readin, \(x) SimExtract(x, 'Design.ID'))
if(is.matrix(Design.ID)){
set.index <- rep(1L, ncol(Design.ID))
Design.ID <- Design.ID[,1]
} else {
set.index <- Design.ID
}
unique.set.index <- unique(set.index)
full_out <- vector('list', length(unique.set.index))
readin.old <- readin
errors.old <- errors
warnings.old <- warnings
design_names <- attr(readin[[1L]], "design_names")$design
errors_info <- lapply(readin.old, \(x) SimExtract(x, 'errors',
append=FALSE, fuzzy=FALSE))
warnings_info <- lapply(readin.old, \(x) SimExtract(x, 'warnings',
append=FALSE, fuzzy=FALSE))
for(j in unique.set.index){
readin <- readin.old[which(j == set.index)]
errors <- errors.old[which(j == set.index)]
warnings <- warnings.old[which(j == set.index)]
try_errors <- as.data.frame(matrix(0L, nrow(readin[[1L]]), length(nms)))
caught_warnings <- as.data.frame(matrix(0L, nrow(readin[[1L]]), length(nms)))
names(try_errors) <- nms
names(caught_warnings) <- nmsw
ret <- readin[[1L]]
pick <- sapply(readin[[1L]], is.numeric) & !(colnames(readin[[1]]) %in% design_names)
ret[, pick] <- 0
pick <- pick & !(colnames(readin[[1L]]) %in% c('SIM_TIME', 'REPLICATIONS', 'SEED'))
if(check.only)
ret <- ret[,c(design_names, 'REPLICATIONS')]
weights <- sapply(readin, function(x) x$REPLICATIONS[1L])
stopifnot(all(sapply(readin, function(x) length(unique(x$REPLICATIONS)) == 1L)))
weights <- weights / sum(weights)
has_stored_results <- !is.null(SimExtract(ret, 'results'))
for(i in seq_len(length(readin))){
ret$REPLICATIONS <- ret$REPLICATIONS + readin[[i]]$REPLICATIONS
if(replications_only) next
tmp <- stats::na.omit(match(nms, names(errors[[i]])))
if(length(tmp) > 0L){
try_errors[,match(nms, names(try_errors))] <- errors[[i]][ ,tmp] +
try_errors[,match(nms, names(try_errors))]
}
tmp <- stats::na.omit(match(nmsw, names(warnings[[i]])))
if(length(tmp) > 0L){
caught_warnings[,match(nmsw, names(caught_warnings))] <- warnings[[i]][ ,tmp] +
caught_warnings[,match(nmsw, names(caught_warnings))]
}
ret$SIM_TIME <- ret$SIM_TIME + readin[[i]]$SIM_TIME
ret[ ,pick] <- ret[ ,pick] + weights[i] * readin[[i]][ ,pick]
if(has_stored_results & i > 1L){
attr(ret, 'extra_info')$stored_results <-
rbind(attr(ret, 'extra_info')$stored_results,
attr(readin[[i]], 'extra_info')$stored_results)
}
}
if(has_stored_results)
results <- attr(ret, 'extra_info')$stored_results
try_errors[try_errors == 0L] <- NA
caught_warnings[caught_warnings == 0L] <- NA
out <- dplyr::as_tibble(data.frame(ret, try_errors, caught_warnings, check.names = FALSE))
out$SEED <- NULL
if(has_stored_results)
attr(out, 'extra_info') <- list(stored_results=results)
full_out[[j]] <- out
}
if(length(unique.set.index) == 1L){
out <- full_out[[1L]]
extra_info1$stored_results <- attr(out, 'extra_info')$stored_results
if(error_details)
errors_info <- add_cbind(errors_info)
if(warning_details)
warnings_info <- add_cbind(warnings_info)
} else {
out <- do.call(rbind, full_out)
if(has_stored_results)
extra_info1$stored_results <- dplyr::bind_rows(
lapply(full_out, \(x) attr(x, 'extra_info')$stored_results))
if(error_details)
errors_info <- dplyr::bind_rows(errors_info)
if(warning_details)
warnings_info <- dplyr::bind_rows(warnings_info)
}
if(check.only){
if(is.null(target.reps)) target.reps <- max(out$REPLICATIONS)
reps_bad <- out$REPLICATIONS != target.reps
if(any(reps_bad)){
diff <- target.reps - out$REPLICATIONS
out$MISSED_REPLICATIONS <- as.integer(diff)
out$TARGET_REPLICATIONS <- as.integer(target.reps)
out$REPLICATIONS <- NULL
message("The following design conditions did not satisfy the target.reps")
return(out[reps_bad,])
} else {
message(c('All replications satisfied target.reps criteria of ', target.reps))
return(invisible(TRUE))
}
}
if(all(is.na(out$WARNINGS))) out$WARNINGS <- NULL
if(all(is.na(out$ERRORS))) out$ERRORS <- NULL
class(out) <- c('SimDesign', class(out))
if(length(unique(out$REPLICATIONS)) != 1L)
warning("Simulation results do not contain the same number of REPLICATIONS")
extra_info1$total_elapsed_time <- sum(out$SIM_TIME)
extra_info1$number_of_conditions <- nrow(out)
extra_info1$ncores <- ncores
attr(out, 'extra_info') <- extra_info1
if(error_details)
attr(out, 'ERROR_msg') <- errors_info
if(warning_details)
attr(out, 'WARNING_msg') <- warnings_info
attr(out, "design_names") <- attr(readin[[1L]], "design_names")
if(!is.null(filename)){
message(sprintf('Writing combinded file from %i simulations to \"%s\"',
length(filenames), filename))
saveRDS(out, filename)
}
out
}
subset_results <- function(obj, select){
if(is.null(select)) return(obj)
res <- attr(obj, 'extra_info')$stored_results
if(length(select) == 1L && select %in% c('NONE', 'REPLICATIONS')){
res <- NULL
} else {
res <- dplyr::select(res, select)
}
attr(obj, 'extra_info')$stored_results <- res
obj
}
#' @rdname SimCollect
#' @param ... not used
#' @export
aggregate_simulations <- function(...){
.Deprecated('SimCollect')
SimCollect(...)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.