R/SimCollect.R

Defines functions aggregate_simulations subset_results SimCollect

Documented in aggregate_simulations SimCollect

#' Collapse separate simulation files into a single result
#'
#' This function collects and aggregates the results from
#' \code{SimDesign}'s \code{\link{runSimulation}} into a single
#' objects suitable for post-analyses, or combines all the saved results directories and combines
#' them into one. This is useful when results are run piece-wise on one node (e.g., 500 replications
#' in one batch, 500 again at a later date, though be careful about the \code{\link{set.seed}}
#' use as the random numbers will tend to correlate the more it is used) or run independently across different
#' nodes/computing cores (e.g., see \code{\link{runArraySimulation}}.
#'
#'
#' @param dir a \code{character} vector pointing to the directory name containing the \code{.rds} files.
#'   All \code{.rds} files in this directory will be used after first checking
#'   their status with \code{\link{SimCheck}}. For greater specificity use the
#'   \code{files} argument
#'
#' @param files a \code{character} vector containing the names of the simulation's final \code{.rds} files.
#'
#' @param filename (optional) name of .rds file to save aggregate simulation file to. If not specified
#'   then the results will only be returned in the R console.
#'
#' @param select a character vector indicating columns to variables to select from the
#'   \code{SimExtract(what='results')} information. This is mainly useful when RAM is an issue
#'   given simulations with many stored estimates. Default includes the results objects
#'   in their entirety, though to omit all internally stored simulation results pass the
#'   character \code{'NONE'}
#'
#' @param check.only logical; for larger simulations file sets, such as those generated by
#'   \code{\link{runArraySimulation}}, return the design conditions that do no satisfy the
#'   \code{target.reps} and throw warning if files are unexpectedly missing
#'
#' @param target.reps (optional) number of replications to check against to evaluate whether
#'   the simulation files returned the desired number of replications. If missing, the highest
#'   detected value from the collected set of replication information will be used
#'
#' @param warning_details logical; include the aggregate of the warnings to be extracted via
#'   \code{\link{SimExtract}}?
#'
#' @param error_details logical; include the aggregate of the errors to be extracted via
#'   \code{\link{SimExtract}}?
#'
#' @return returns a \code{data.frame/tibble} with the (weighted) average/aggregate
#'   of the simulation results
#'
#' @references
#'
#' Chalmers, R. P., & Adkins, M. C.  (2020). Writing Effective and Reliable Monte Carlo Simulations
#' with the SimDesign Package. \code{The Quantitative Methods for Psychology, 16}(4), 248-280.
#' \doi{10.20982/tqmp.16.4.p248}
#'
#' Sigal, M. J., & Chalmers, R. P. (2016). Play it again: Teaching statistics with Monte
#' Carlo simulation. \code{Journal of Statistics Education, 24}(3), 136-156.
#' \doi{10.1080/10691898.2016.1246953}
#'
#' @seealso \code{\link{runSimulation}}, \code{\link{runArraySimulation}},
#'   \code{\link{SimCheck}}
#'
#' @export
#'
#' @author Phil Chalmers \email{rphilip.chalmers@@gmail.com}
#'
#' @examples
#' \dontrun{
#'
#' setwd('my_working_directory')
#'
#' ## run simulations to save the .rds files (or move them to the working directory)
#' # seeds1 <- genSeeds(design)
#' # seeds2 <- genSeeds(design, old.seeds=seeds1)
#' # ret1 <- runSimulation(design, ..., seed=seeds1, filename='file1')
#' # ret2 <- runSimulation(design, ..., seed=seeds2, filename='file2')
#'
#' # saves to the hard-drive and stores in workspace
#' final <- SimCollect(files = c('file1.rds', 'file2.rds'))
#' final
#'
#' # If filename not included, can be extracted from results
#' # files <- c(SimExtract(ret1, 'filename'), SimExtract(ret2, 'filename'))
#' # final <- SimCollect(files = files)
#'
#'
#' #################################################
#' # Example where each row condition is repeated, evaluated independently,
#' # and later collapsed into a single analysis object
#'
#' # Each condition repeated four times (hence, replications
#' # should be set to desired.reps/4)
#' Design <- createDesign(mu = c(0,5),
#'                        N  = c(30, 60))
#' Design
#'
#' # assume the N=60 takes longer, and should be spread out across more arrays
#' Design_long <- expandDesign(Design, c(2,2,4,4))
#' Design_long
#'
#' replications <- c(rep(50, 4), rep(25,8))
#' data.frame(Design_long, replications)
#'
#' #-------------------------------------------------------------------
#'
#' Generate <- function(condition, fixed_objects) {
#'     dat <- with(condition, rnorm(N, mean=mu))
#'     dat
#' }
#'
#' Analyse <- function(condition, dat, fixed_objects) {
#'     ret <- c(mean=mean(dat), SD=sd(dat))
#'     ret
#' }
#'
#' Summarise <- function(condition, results, fixed_objects) {
#'     ret <- colMeans(results)
#'     ret
#' }
#'
#' #-------------------------------------------------------------------
#'
#' # create directory to store all final simulation files
#' dir.create('sim_files/')
#'
#' iseed <- genSeeds()
#'
#' # distribute jobs independently
#' sapply(1:nrow(Design_long), \(i) {
#'   runArraySimulation(design=Design_long, replications=replications,
#'                 generate=Generate, analyse=Analyse, summarise=Summarise,
#'                 arrayID=i, dirname='sim_files/', filename='job', iseed=iseed)
#' }) |> invisible()
#'
#' # check that all replications satisfy target
#' SimCollect('sim_files/', check.only = TRUE)
#'
#' # this would have been returned were the target.rep supposed to be 1000
#' SimCollect('sim_files/', check.only = TRUE, target.reps=1000)
#'
#' # aggregate into single object
#' sim <- SimCollect('sim_files/')
#' sim
#'
#' SimClean(dir='sim_files/')
#'
#' }
SimCollect <- function(dir=NULL, files = NULL, filename = NULL,
                       select = NULL, check.only = FALSE,
                       target.reps = NULL,
                       warning_details = FALSE,
                       error_details = TRUE){
    if(is.null(dir) && is.null(files))
        stop('either dir or files must be specified')
    if(!is.null(dir) && !is.null(files))
        stop('dir OR files must be specified, not both')
    if(check.only) select <- 'REPLICATIONS'
    if(!is.null(dir)) SimCheck(dir=dir)
    if(!is.null(dir)){
        files <- dir(dir)
    } else dir <- './'
    files <- paste0(dir, files)
    oldfiles <- files
    files <- oldfiles
    if(!is.null(files)){
        filenames <- files
    } else {
        return(invisible(NULL))
    }
    if(!is.null(filename))
        if(filename %in% dir(dir))
            stop(sprintf('File \'%s\' already exists in working directory', filename),
                 call.=FALSE)
    replications_only <- ifelse(!is.null(select) && length(select) == 1L &&
                                    select == 'REPLICATIONS', TRUE, FALSE)

    print_when <- NA
    if(length(filenames) > 20L){
        cat("Reading in files ")
        print_when <- floor(seq(1, length(filenames), length.out=20))
    }
    readin <- vector('list', length(filenames))
    for(i in 1:length(filenames)){
        if(i %in% print_when) cat(".")
        tmp <- try(readRDS(filenames[i]), TRUE)
        if(is(tmp, 'try-error'))
            stop(c('Could not read file ', filenames[i]))
        readin[[i]] <- subset_results(tmp, select=select)
    }
    extra_info1 <- attr(readin[[1L]], 'extra_info')
    ncores <- sum(sapply(readin, function(x) attr(x, 'extra_info')$ncores))
    extra_info1[c("seeds", "date_completed", "summarise_list", "stored_results",
                  "total_elapsed_time", "stored_Random.seeds_list")] <- NULL
    errors <- lapply(readin, function(x)
        as.data.frame(x[ ,grepl('ERROR', colnames(x)), drop=FALSE]))
    warnings <- lapply(readin, function(x)
        as.data.frame(x[ ,grepl('WARNINGS', colnames(x)), drop=FALSE]))
    nms <- unique(do.call(c, lapply(errors, function(x) colnames(x))))
    if(!length(nms)) nms <- 'ERRORS'
    nmsw <- unique(do.call(c, lapply(warnings, function(x) colnames(x))))
    if(!length(nmsw)) nmsw <- 'WARNINGS'
    readin <- lapply(readin, function(x) x[ ,!(
        grepl('ERROR', colnames(x)) | grepl('WARNINGS', colnames(x))), drop=FALSE])
    if(length(unique(sapply(readin, ncol))) > 1L){
        tab <- table(sapply(readin, ncol))
        pick <- filenames[as.integer(names(which.min(tab))) == sapply(readin, ncol)]
        stop(sprintf(c('Number of columns not equal. ',
                     'The following files had the fewest columns:\n%s'),
                     paste0(pick, collapse=', ')))
    }
    Design.ID <- sapply(readin, \(x) SimExtract(x, 'Design.ID'))
    if(is.matrix(Design.ID)){
        set.index <- rep(1L, ncol(Design.ID))
        Design.ID <- Design.ID[,1]
    } else {
        set.index <- Design.ID
    }
    unique.set.index <- unique(set.index)
    full_out <- vector('list', length(unique.set.index))
    readin.old <- readin
    errors.old <- errors
    warnings.old <- warnings
    design_names <- attr(readin[[1L]], "design_names")$design
    errors_info <- lapply(readin.old, \(x) SimExtract(x, 'errors',
                                                      append=FALSE, fuzzy=FALSE))
    warnings_info <- lapply(readin.old, \(x) SimExtract(x, 'warnings',
                                                        append=FALSE, fuzzy=FALSE))
    for(j in unique.set.index){
        readin <- readin.old[which(j == set.index)]
        errors <- errors.old[which(j == set.index)]
        warnings <- warnings.old[which(j == set.index)]
        try_errors <- as.data.frame(matrix(0L, nrow(readin[[1L]]), length(nms)))
        caught_warnings <- as.data.frame(matrix(0L, nrow(readin[[1L]]), length(nms)))
        names(try_errors) <- nms
        names(caught_warnings) <- nmsw
        ret <- readin[[1L]]
        pick <- sapply(readin[[1L]], is.numeric) & !(colnames(readin[[1]]) %in% design_names)
        ret[, pick] <- 0
        pick <- pick & !(colnames(readin[[1L]]) %in% c('SIM_TIME', 'REPLICATIONS', 'SEED'))
        if(check.only)
            ret <- ret[,c(design_names, 'REPLICATIONS')]
        weights <- sapply(readin, function(x) x$REPLICATIONS[1L])
        stopifnot(all(sapply(readin, function(x) length(unique(x$REPLICATIONS)) == 1L)))
        weights <- weights / sum(weights)
        has_stored_results <- !is.null(SimExtract(ret, 'results'))
        for(i in seq_len(length(readin))){
            ret$REPLICATIONS <- ret$REPLICATIONS + readin[[i]]$REPLICATIONS
            if(replications_only) next
            tmp <- stats::na.omit(match(nms, names(errors[[i]])))
            if(length(tmp) > 0L){
                try_errors[,match(nms, names(try_errors))] <- errors[[i]][ ,tmp] +
                    try_errors[,match(nms, names(try_errors))]
            }
            tmp <- stats::na.omit(match(nmsw, names(warnings[[i]])))
            if(length(tmp) > 0L){
                caught_warnings[,match(nmsw, names(caught_warnings))] <- warnings[[i]][ ,tmp] +
                    caught_warnings[,match(nmsw, names(caught_warnings))]
            }
            ret$SIM_TIME <- ret$SIM_TIME + readin[[i]]$SIM_TIME
            ret[ ,pick] <- ret[ ,pick] + weights[i] * readin[[i]][ ,pick]
            if(has_stored_results & i > 1L){
                attr(ret, 'extra_info')$stored_results <-
                rbind(attr(ret, 'extra_info')$stored_results,
                      attr(readin[[i]], 'extra_info')$stored_results)
            }
        }
        if(has_stored_results)
            results <- attr(ret, 'extra_info')$stored_results
        try_errors[try_errors == 0L] <- NA
        caught_warnings[caught_warnings == 0L] <- NA
        out <- dplyr::as_tibble(data.frame(ret, try_errors, caught_warnings, check.names = FALSE))
        out$SEED <- NULL
        if(has_stored_results)
            attr(out, 'extra_info') <- list(stored_results=results)
        full_out[[j]] <- out
    }
    if(length(unique.set.index) == 1L){
        out <- full_out[[1L]]
        extra_info1$stored_results <- attr(out, 'extra_info')$stored_results
        if(error_details)
            errors_info <- add_cbind(errors_info)
        if(warning_details)
            warnings_info <- add_cbind(warnings_info)
    } else {
        out <- do.call(rbind, full_out)
        if(has_stored_results)
            extra_info1$stored_results <- dplyr::bind_rows(
                lapply(full_out, \(x) attr(x, 'extra_info')$stored_results))
        if(error_details)
            errors_info <- dplyr::bind_rows(errors_info)
        if(warning_details)
            warnings_info <- dplyr::bind_rows(warnings_info)
    }
    if(check.only){
        if(is.null(target.reps)) target.reps <- max(out$REPLICATIONS)
        reps_bad <- out$REPLICATIONS != target.reps
        if(any(reps_bad)){
            diff <- target.reps - out$REPLICATIONS
            out$MISSED_REPLICATIONS <- as.integer(diff)
            out$TARGET_REPLICATIONS <- as.integer(target.reps)
            out$REPLICATIONS <- NULL
            message("The following design conditions did not satisfy the target.reps")
            return(out[reps_bad,])
        } else {
            message(c('All replications satisfied target.reps criteria of ', target.reps))
            return(invisible(TRUE))
        }
    }
    if(all(is.na(out$WARNINGS))) out$WARNINGS <- NULL
    if(all(is.na(out$ERRORS))) out$ERRORS <- NULL
    class(out) <- c('SimDesign', class(out))
    if(length(unique(out$REPLICATIONS)) != 1L)
        warning("Simulation results do not contain the same number of REPLICATIONS")
    extra_info1$total_elapsed_time <- sum(out$SIM_TIME)
    extra_info1$number_of_conditions <- nrow(out)
    extra_info1$ncores <- ncores
    attr(out, 'extra_info') <- extra_info1
    if(error_details)
        attr(out, 'ERROR_msg') <- errors_info
    if(warning_details)
        attr(out, 'WARNING_msg') <- warnings_info
    attr(out, "design_names") <- attr(readin[[1L]], "design_names")
    if(!is.null(filename)){
        message(sprintf('Writing combinded file from %i simulations to \"%s\"',
                        length(filenames), filename))
        saveRDS(out, filename)
    }
    out
}

subset_results <- function(obj, select){
    if(is.null(select)) return(obj)
    res <- attr(obj, 'extra_info')$stored_results
    if(length(select) == 1L && select %in% c('NONE', 'REPLICATIONS')){
        res <- NULL
    } else {
        res <- dplyr::select(res, select)
    }
    attr(obj, 'extra_info')$stored_results <- res
    obj
}

#' @rdname SimCollect
#' @param ... not used
#' @export
aggregate_simulations <- function(...){
    .Deprecated('SimCollect')
    SimCollect(...)
}

Try the SimDesign package in your browser

Any scripts or data that you put into this service are public.

SimDesign documentation built on Sept. 11, 2024, 8 p.m.