R/resample.R

Defines functions initialAggregateBootstrapNetCDF4DataOne getProcessNameAndTableName ReportBootstrapNetCDF4 aggregateRelevantBootstrapData ReportBootstrap ResampleMeanNASCData ResampleBioticAssignmentByPSU ResampleBioticAssignmentByStratum ResampleBioticAssignment ResampleMeanSpeciesCategoryCatchData ResampleMeanLengthDistributionData resampleOne resampleDataBy getResampledCount resampleDataBy_new4.0.0_excludingPSUsThatAreNotResampled estimateTimeOfProcesses progressOfProcesses stopProcesses renameDataFolders getMaxNchar selectRobust runOneBootstrap_NetCDF4 runOneBootstrapSaveOutput createReplaceData addFunctionNameToReplaceData createReplaceDataSansFunctionName rbindSublistByName rbindlistByName drawSeedList getDataTypeFromBootstrapNetCDF4 getProcessNamesAndTableNames getVariableNameElementsList getProcessNameTableNameVariableName getSelection getNumberOfBootstrapsFromNetCDF4Data getBootstrapNetCDF4Data prepareBootstrap BootstrapNetCDF4 addBootstrapID Bootstrap

Documented in Bootstrap BootstrapNetCDF4 estimateTimeOfProcesses progressOfProcesses ReportBootstrap ReportBootstrapNetCDF4 ResampleBioticAssignmentByPSU ResampleBioticAssignmentByStratum ResampleMeanLengthDistributionData ResampleMeanNASCData ResampleMeanSpeciesCategoryCatchData stopProcesses

#' Bootstrap the baseline model
#' 
#' Run a subset of the baseline model a number of times after resampling e.g. Hauls in each Stratum, EDUSs in each Stratum.
#' 
#' @param outputData The output of the function from an earlier run.
#' @param projectPath The path to the project to containing the baseline to bootstrap.
#' @param BootstrapMethodTable A table of the columns ProcessName, ResampleFunction and Seed, where each row defines the resample function to apply to the output of the given process, and the seed to use in the resampling. The seed is used to draw one seed per bootstrap run using \code{\link[RstoxBase]{getSeedVector}}. Run RstoxFramework::getResampleFunctions() to get a list of the implemented resample functions. Note that if a process is selected inn \code{BootstrapMethodTable} that is not used in the model up to the \code{OutputProcesses}, the bootstrapping of that process will not be effective on the end result (e.g. select the correct process that returns BioticAssignment data type).
#' @param NumberOfBootstraps Integer: The number of bootstrap replicates.
#' @param OutputProcesses A vector of the processes to save from each bootstrap replicate.
#' @param UseOutputData Logical: Bootstrapping can be time consuming, and by setting \code{UseOutputData} to TRUE the output file generated by a previous run of the process will be used instead of re-running the bootstrapping. Use this parameter with caution. Any changes made to the Baseline model or to the parameters of the Bootstrap itself will not be accounted for unless UseOutputData = FALSE. The option UseOutputData = TRUE is intended only for saving time when one needs to generate a report from an existing Bootstrap run."
#' @param NumberOfCores The number of cores to use for parallel processing. A copy of the project is created in tempdir() for each core, also when using only one core. Note that this will require disc space equivalent to the \code{NumberOfCores} time the size of the project folder (excluding the output/analysis/Bootstrap folder, which will be deleted before copies are made).
#' @param BaselineSeedTable A table of ProcessName and Seed, giving the seed to use for the Baseline processes that requires a Seed parameter. The seed is used to draw one seed per bootstrap run using \code{\link[RstoxBase]{getSeedVector}}.
#' 
#' @details A copy of the project is made for each core given by \code{NumberOfCores}. In the case that NumberOfCores == 1, this is still done for safety.
#' Note that for acoustic-trawl survey estimates, if the AcousticPSUs of a Stratum have different assigned Hauls (not using the Stratum assignment method  in \code{\link[RstoxBase]{DefineBioticAssignment}}), there is a probability that none the assigned Hauls of an AcousticPSU are re-sampled in a bootstrap replicate. This will lead to missing acoustic density for that PSU for the target species, which will propagate throughout to the reports. This forces the use of RemoveMissingValues = TRUE, which implies some degree of under-estimation from what the estimate would be if none of the AcousticPSUs came out with missing acoustic density.
#' 
#' Note on limitatiton on \code{NumberOfBootstraps}: All output requested data from all the bootstrap runs are accumulated in R memory, and written to one RData file at the end of the function, which effectively imposes a restriction of some hundred bootstrap runs for large StoX projects. Use \code{\link{Bootstrap}} instead. Backwards compatibility sets the function Bootstrap to Bootstrap_3.6.0 for StoX projects saved in StoX 3.6.0 and older.
#' 
#' @return
#' A \code{\link{BootstrapData}} object, which is a list of the RstoxData \code{\link[RstoxData]{DataTypes}} and RstoxBase \code{\link[RstoxBase]{DataTypes}}.
#' 
#' @export
#' 
Bootstrap <- function(
        outputData, 
        projectPath, 
        # Table with ProcessName, ResamplingFunction, ResampleWithin, Seed:
        BootstrapMethodTable = data.table::data.table(), 
        NumberOfBootstraps = 1L, 
        OutputProcesses = character(), 
        #OutputVariables = character(), # Uncomment in StoX 4.0.0
        UseOutputData = FALSE, 
        NumberOfCores = 1L, 
        BaselineSeedTable = data.table::data.table()
) {
    # Move this to arguments in StoX 4.0.0:
    OutputVariables = character()
    
    # Use preivously generated output data if specified:
    if(UseOutputData) {
        warning("StoX: Using UseOutputData = TRUE in the function Bootstrap implies reading the BootstrapData from a previous run (stored in the output folder)). Any changes made to the Baseline model or to the parameters of the Bootstrap itself will not be accounted for unless UseOutputData = FALSE. The option UseOutputData = TRUE is intended only for saving time when one needs to generate a report from an existing Bootstrap run.")
        # This was moved to getFunctionArguments() on 2020-10-22:
        #outputData <- get(load(outputDataPath))
        
        # Read the outputData (from a former run of the proecss). Use functionArguments["outputData"] to add the data, since using functionArguments$outputData will delete this element if trying to giev it the value NULL:
        if(is.character(outputData)) {
            if(length(outputData)) {
                if(!file.exists(outputData)) {
                    stop("The file ", outputData, " does not exist. Please re-run the Bootstrap process.")
                }
                else {
                    outputData <- tryCatch(
                        get(load(outputData)), 
                        error = function(err) list(NULL)
                    )
                }
            }
            else {
                outputData <- list(NULL)
            }
        }
        
        return(outputData)
    }
    
    # Prepare the bootstrapping:
    bootstrapSpec <- prepareBootstrap(projectPath, BootstrapMethodTable, OutputProcesses, NumberOfBootstraps, NumberOfCores, BaselineSeedTable)
    
    # Grap the outputs from the bootstrap preparation:
    processesSansProcessData = bootstrapSpec$processesSansProcessData
    NumberOfBootstrapsFile = bootstrapSpec$NumberOfBootstrapsFile
    bootstrapProgressFile = bootstrapSpec$bootstrapProgressFile
    projectPath_copies = bootstrapSpec$projectPath_copies
    stopBootstrapFile = bootstrapSpec$stopBootstrapFile
    replaceDataList = bootstrapSpec$replaceDataList
    BaselineSeedList = bootstrapSpec$BaselineSeedList
    #processNames = bootstrapSpec$processNames
    dataTypes = bootstrapSpec$dataTypes
    
    
    # Delete files on exit:
    on.exit(unlink(NumberOfBootstrapsFile, force = TRUE, recursive = TRUE))
    on.exit(unlink(bootstrapProgressFile, force = TRUE, recursive = TRUE), add = TRUE)
    on.exit(unlink(projectPath_copies, force = TRUE, recursive = TRUE), add = TRUE)
    on.exit(if(file.exists(NumberOfBootstrapsFile)) unlink(NumberOfBootstrapsFile, force = TRUE, recursive = TRUE), add = TRUE)
    on.exit(if(file.exists(bootstrapProgressFile)) unlink(bootstrapProgressFile, force = TRUE, recursive = TRUE), add = TRUE)
    on.exit(if(file.exists(stopBootstrapFile)) unlink(stopBootstrapFile, force = TRUE, recursive = TRUE), add = TRUE)
    
    
    # Prepare for progress info:
    writeLines(as.character(NumberOfBootstraps), NumberOfBootstrapsFile)
    
     # Run the bootstrap:
    bootstrapIndex <- seq_len(NumberOfBootstraps)
    BootstrapData <- RstoxData::mapplyOnCores(
        FUN = runOneBootstrapSaveOutput, 
        NumberOfCores = NumberOfCores, 
        ind = bootstrapIndex, 
        replaceArgsList = BaselineSeedList, # This has length equal to NumberOfCores, so is will be recycled
        replaceDataList = replaceDataList, # This has length equal to NumberOfCores, so is will be recycled
        projectPath = projectPath_copies, # This has length equal to NumberOfCores, so is will be recycled
        
        # Other inputs:
        MoreArgs = list(
            projectPath_original = projectPath, 
            startProcess = min(processesSansProcessData$processIndex), 
            endProcess = max(processesSansProcessData$processIndex), 
            outputProcessesIDs = OutputProcesses, 
            outputVariableNames = OutputVariables, 
            bootstrapProgressFile = bootstrapProgressFile, 
            stopBootstrapFile = stopBootstrapFile
        )
    )
    
    allMessages <- unlist(lapply(BootstrapData, "[[", "message"))
    allWarnings <- unlist(lapply(BootstrapData, "[[", "warning"))
    allErrors <- unlist(lapply(BootstrapData, "[[", "error"))
    BootstrapData <- lapply(BootstrapData, "[[", "processOutput")
    
    # Issue the (unique) messages, warnings and errors, so that these can be picked up:
    allMessages <- uniqueText(allMessages, tag = "messages")
    allWarnings <- uniqueText(allWarnings, tag = "warnings")
    allErrors <- uniqueText(allErrors, tag = "errors")
    if(length(allMessages)) {
        lapply(allMessages, message)
    }
    if(length(allWarnings)) {
        lapply(allWarnings, warning)
    }
    if(length(allErrors)) {
        lapply(allErrors, stop)
    }
    
    
    # Here we need to merge the NeCDF4 bootstrap files, when we get these files implemented. For now all bootstrap data are accumulated in memory and dumped to an RData file.
    
    # Changed on 2020-11-02 to run the baseline out after bootstrapping (with no modification, so a clean baseline run):
    ### # Reset the model to the last process before the bootstrapped processes:
    ### resetModel(
    ###     projectPath = projectPath, 
    ###     modelName = "baseline", 
    ###     processID = processesSansProcessData$processID[1], 
    ###     processDirty = FALSE, 
    ###     shift = -1
    ### )
    
    
    ## Rerun the baseline processes that were run in the bootstrapping:
    #temp <- runProcesses(
    #    projectPath, 
    #    modelName = "baseline", 
    #    startProcess = min(processesSansProcessData$processIndex), 
    #    endProcess = max(processesSansProcessData$processIndex, activeProcessIndex), 
    #    save = FALSE, 
    #    # Be sure to not touch the process data and file output:
    #    saveProcessData = FALSE, 
    #    fileOutput = FALSE
    #)
    
    # Add the process names after rbinding each output:
    bootstrapDataNames <- names(BootstrapData[[1]])
    BootstrapData <- lapply(bootstrapDataNames, rbindlistByName, x = BootstrapData)
    names(BootstrapData) <- bootstrapDataNames
    
    # Drop the list for data types with only one table:
    for(bootstrapDataName in names(BootstrapData)) {
        if(isProcessOutputDataType(BootstrapData[[bootstrapDataName]])) {
            BootstrapData[[bootstrapDataName]] <- BootstrapData[[bootstrapDataName]][[1]]
        }
    }
    
    # Set the dataTypes as attributes:
    for(bootstrapDataName in names(BootstrapData)) {
        attr(BootstrapData[[bootstrapDataName]], "dataType") <- dataTypes[[bootstrapDataName]]
    }
    
    # Set the class to control the output file to an RData file:
    class(BootstrapData) <- "BootstrapData"
    
    return(BootstrapData)
}

addBootstrapID <- function(x, ID) {
    for(process in names(x)) {
        for(data in names(x[[process]])) {
            # For some reason this did not stick (as in https://stackoverflow.com/questions/51877642/adding-a-column-by-reference-to-every-data-table-in-a-list-does-not-stick):
            #x[[process]][[data]][, BootstrapID := ID]
            x[[process]][[data]] <- data.table::data.table(x[[process]][[data]], BootstrapID = ID)
        }
    }
    return(x)
}


# Change noRd to export in StoX 4.0.0:

#' Bootstrap the baseline model, write to NetCDF4 file
#' 
#' Run a subset of the baseline model a number of times after resampling e.g. Hauls in each Stratum, EDUSs in each Stratum.
#' 
#' @inheritParams Bootstrap
#' @param outputMemoryFile The path to the output memory file to copy the \code{outputData} to in the case that \code{UseOutputData} is TRUE.
#' @param OutputVariables An optional list of variables to keep in the output. A typical set of variables could be c("Survey", "Stratum", "SpeciesCategory", "IndividualTotalLength", "IndividualAge", "Abundance", "Biomass"), which should cover the most frequently used variables in reports. Any variable that is used in a report must be present in \code{OutputVariables}. Empty list (the default) implies to keep all variables. This parameter is included to facilitate smaller disc space for the bootstrap objects. The \code{OutputVariables} are kept across the \code{OutputProcesses}.
#' 
#' @details A copy of the project is made for each core given by \code{NumberOfCores}. In the case that NumberOfCores == 1, this is still done for safety.
#' Note that for acoustic-trawl survey estimates, if the AcousticPSUs of a Stratum have different assigned Hauls (not using the Stratum assignment method  in \code{\link[RstoxBase]{DefineBioticAssignment}}), there is a probability that none the assigned Hauls of an AcousticPSU are re-sampled in a bootstrap  replicate. This will lead to missing acoustic density for that PSU for the target species, which will propagate throughout to the reports. This forces the use of RemoveMissingValues = TRUE, which implies some degree of under-estimation from what the estimate would be if none of the AcousticPSUs came out with missing acoustic density.
#' 
#' @return
#' A \code{\link{BootstrapNetCDF4Data}} object, which is a list of the RstoxData \code{\link[RstoxData]{DataTypes}} and RstoxBase \code{\link[RstoxBase]{DataTypes}}.
#' 
#' @export
#' 
BootstrapNetCDF4 <- function(
    outputData, 
    outputMemoryFile, 
    projectPath, 
    # Table with ProcessName, ResamplingFunction, ResampleWithin, Seed:
    BootstrapMethodTable = data.table::data.table(), 
    NumberOfBootstraps = 1L, 
    OutputProcesses = character(), 
    OutputVariables = character(), 
    UseOutputData = FALSE, 
    NumberOfCores = 1L, 
    BaselineSeedTable = data.table::data.table()
) {
    
    # Use preivously generated output data if specified:
    if(UseOutputData) {
        warning("StoX: Using UseOutputData = TRUE in the function BootstrapNetCDF4 implies reading the BootstrapNetCDF4Data from a previous run (stored in the output folder)). Any changes made to the Baseline model or to the parameters of the BootstrapNetCDF4 itself will not be accounted for unless UseOutputData = FALSE. The option UseOutputData = TRUE is intended only for saving time when one needs to generate a report from an existing BootstrapNetCDF4 run.")
        
        # Read the outputData (from a former run of the proecss). Use functionArguments["outputData"] to add the data, since using functionArguments$outputData will delete this element if trying to giev it the value NULL:
        if(is.character(outputData)) {
            if(length(outputData)) {
                if(!file.exists(outputData)) {
                    stop("The file ", outputData, " does not exist. Please re-run the BootstrapNetCDF4 process.")
                }
                else {
                    outputData <- tryCatch(
                        {
                            if(!file.exists(dirname(outputMemoryFile))) {
                                dir.create(dirname(outputMemoryFile), recursive = TRUE)
                            }
                            file.copy(outputData, outputMemoryFile)
                            # Restore the class:
                            outputMemoryFile <- createStoXNetCDF4FileDataType(outputMemoryFile)
                            return(outputMemoryFile)
                        }, 
                        error = function(err) list(NULL)
                    )
                }
            }
            else {
                outputData <- list(NULL)
            }
        }
        else {
            stop("outputData must be the path to the output NetCDF4 file from a preivous BootstrapNetCDF4() run.")
        }
        
        return(outputData)
    }
    
    
    # Prepare the bootstrapping:
    bootstrapSpec <- prepareBootstrap(projectPath, BootstrapMethodTable, OutputProcesses, NumberOfBootstraps, NumberOfCores, BaselineSeedTable)
    
    # Grap the outputs from the bootstrap preparation:
    processesSansProcessData = bootstrapSpec$processesSansProcessData
    NumberOfBootstrapsFile = bootstrapSpec$NumberOfBootstrapsFile
    bootstrapProgressFile = bootstrapSpec$bootstrapProgressFile
    projectPath_copies = bootstrapSpec$projectPath_copies
    stopBootstrapFile = bootstrapSpec$stopBootstrapFile
    replaceDataList = bootstrapSpec$replaceDataList
    BaselineSeedList = bootstrapSpec$BaselineSeedList
    processNames = bootstrapSpec$processNames
    dataTypes = bootstrapSpec$dataType
    
    # Delete files on exit:
    on.exit(unlink(NumberOfBootstrapsFile, force = TRUE, recursive = TRUE))
    on.exit(unlink(bootstrapProgressFile, force = TRUE, recursive = TRUE), add = TRUE)
    on.exit(unlink(projectPath_copies, force = TRUE, recursive = TRUE), add = TRUE)
    on.exit(if(file.exists(NumberOfBootstrapsFile)) unlink(NumberOfBootstrapsFile, force = TRUE, recursive = TRUE), add = TRUE)
    on.exit(if(file.exists(bootstrapProgressFile)) unlink(bootstrapProgressFile, force = TRUE, recursive = TRUE), add = TRUE)
    on.exit(if(file.exists(stopBootstrapFile)) unlink(stopBootstrapFile, force = TRUE, recursive = TRUE), add = TRUE)
    
     # Run the bootstrap:
    bootstrapIndex <- seq_len(NumberOfBootstraps)
    temp <- RstoxData::mapplyOnCores(
        FUN = runOneBootstrap_NetCDF4, 
        NumberOfCores = NumberOfCores, 
        ind = bootstrapIndex, 
        replaceArgsList = BaselineSeedList, 
        replaceDataList = replaceDataList, 
        projectPath = projectPath_copies, 
        # Other inputs:
        MoreArgs = list(
            projectPath_original = projectPath, 
            startProcess = min(processesSansProcessData$processIndex), 
            endProcess = max(processesSansProcessData$processIndex), 
            outputProcessesIDs = OutputProcesses, 
            outputVariableNames = OutputVariables, 
            bootstrapProgressFile = bootstrapProgressFile, 
            stopBootstrapFile = stopBootstrapFile
        ), 
        SIMPLIFY = FALSE
    )
    # Store the dimensions:
    dims <- lapply(temp, "[[", "dims")
    nchars <- lapply(temp, "[[", "nchars")
    
    # Treat messages, warnings and errors:
    allMessages <- unlist(lapply(temp, "[[", "message"))
    allWarnings <- unlist(lapply(temp, "[[", "warning"))
    allErrors <- unlist(lapply(temp, "[[", "error"))
    # Issue the (unique) messages, warnings and errors, so that these can be picked up:
    allMessages <- uniqueText(allMessages, tag = "messages")
    allWarnings <- uniqueText(allWarnings, tag = "warnings")
    allErrors <- uniqueText(allErrors, tag = "errors")
    if(length(allMessages)) {
        lapply(allMessages, message)
    }
    if(length(allWarnings)) {
        lapply(allWarnings, warning)
    }
    if(length(allErrors)) {
        lapply(allErrors, stop)
    }
    
     
    # List all saved memory data folders:
    memoryDataFolders <- lapply(projectPath_copies, getProjectPaths, "dataModelsFolder")
    memoryDataSubFolders <- unlist(lapply(memoryDataFolders, list.dirs, recursive = FALSE))
    
    # Get the original folders:
    originalFolders <- memoryDataSubFolders[basename(memoryDataSubFolders) == "baseline"]
    # Start by deleting all original folders (for all cores):
    unlink(originalFolders, recursive = TRUE)
    
    # Get only those with suffix:
    hasSuffix <- grepl("_", basename(memoryDataSubFolders))
    memoryDataSubFolders <- memoryDataSubFolders[hasSuffix]
    
    # Order the sub folders by suffix:
    suffix <- as.numeric(sub("^[^_]*_", "", basename(memoryDataSubFolders)))
    memoryDataSubFolders <- memoryDataSubFolders[order(suffix)]
    
    # Create a tempfile to write the bootstrap data to, and return the path of this tempfile at the end. Then runProcess() will recognize the tempfile and rename it to a memory file and make a copy to the output file.
    filePath <- tempfile()
    nc <- NULL
    
    # Run through the bootstrap runs and rename the saved baseline folders to the original name:
    message("Writing NetCDF4 file...")
    
    for(ind in seq_along(memoryDataSubFolders)) {
        
        # Get the project path:
        thisProjectPath <- dirname(dirname(dirname(dirname(dirname(memoryDataSubFolders[ind])))))
        # Rename the sub folder:
        originalFolder <- file.path(dirname(memoryDataSubFolders[ind]), "baseline")
        file.rename(memoryDataSubFolders[ind], originalFolder)
        
        # Read the data:
        processOutput <- getModelData(
            projectPath = thisProjectPath, 
            modelName = "baseline", 
            processes = OutputProcesses, 
            drop.datatype = FALSE
        )
        
        # If outputVariableNames is given keep only those variables in all tables:
        if(length(OutputVariables)) {
            processOutput <- lapply(processOutput, function(x) lapply(x, selectRobust, OutputVariables))
        }
        
        # Rename back:
        file.rename(originalFolder, memoryDataSubFolders[ind])
        
        
        # Set the dataTypes as attributes:
        attr(processOutput, "dataType") <- unname(dataTypes[OutputProcesses])
        attr(processOutput, "processName") <- OutputProcesses
        
        # Write to NetCDF4:
        ###write_list_NetCDFF4(processOutput, filePath, step = ind, append = TRUE, ow = ind == 1, missval = -9, numberOfSteps = NumberOfBootstraps, verbose = FALSE)
        
        # The following test was done to performance for different compression settings on the tobis test project with NumberOfBootstraps = 200:
        # Compression 9:
        #         Old: (time used: 115.822 s)
        #     NetCDF4: (time used: 291.861 s)
        #     FileSize: 27.3 MB
        # Compression 1:
        #         Old: (time used: 133.591 s)
        #     NetCDF4: (time used: 213.191 s)
        #     FileSize: 29.1 MB
        # Compression NA:
        #         Old: (time used: 107.817 s)
        #     NetCDF4: (time used: 170.776 s)
        #     FileSize: 65.9 MB
        # 
        # However, using compression does not work well with appending to the file. The file may actually become larger with compression than without, so we do not use compression at all:
        
        nc <- write_list_as_tables_NetCDFF4(processOutput, filePath, nc = nc, index = ind, dims = dims, nchars = nchars, append = ind > 1, ow = FALSE, missval = -9, compression = NA, verbose = FALSE)
        #print(file.info(filePath)[, "size"])
    }
    
    filePath <- createStoXNetCDF4FileDataType(filePath)
    
    #names(filePath) <- "StoXNetCDF4File"
    return(filePath)
}




prepareBootstrap <- function(projectPath, BootstrapMethodTable, OutputProcesses, NumberOfBootstraps, NumberOfCores, BaselineSeedTable) {
    # Identify the first process set for resampling by the BootstrapMethodTable (here BootstrapMethodTable$ProcessName can be a vector, in which case the table is returned from the first process of these and onwards):
    processesSansProcessData <- getProcessesSansProcessData(projectPath, modelName = "baseline", startProcess = BootstrapMethodTable$ProcessName, endProcess = OutputProcesses, return.processIndex = TRUE, only.valid = TRUE)
    
    # Check the output processes:
    if(length(OutputProcesses) && !all(OutputProcesses %in% processesSansProcessData$processName)) {
        stop("StoX: The following processes specified in OutputProcesses were not recognized: ", paste(setdiff(OutputProcesses, processesSansProcessData$processName), collapse = ", "))
        OutputProcesses <- intersect(OutputProcesses, processesSansProcessData$processName)
    }
    if(!length(OutputProcesses)) {
        stop("StoX: At least one valid process name must be given in OutputProcesses")
    }
    
    # The baseline must have been run until the last output process:
    activeProcessIndex <- getProcessIndexFromProcessID(
        projectPath = projectPath, 
        modelName = "baseline", 
        processID = getActiveProcess(
            projectPath = projectPath, 
            modelName = "baseline"
        )$processID
    )
    
    baselineMustHaveBeenRunTo <- max(processesSansProcessData$processIndex)
    if(activeProcessIndex < baselineMustHaveBeenRunTo) {
       stop("The Baseline model must be run at least until the last output process \"", getProcessNameFromProcessID(projectPath, modelName = "baseline", processID = processesSansProcessData[processIndex == baselineMustHaveBeenRunTo, processID]), "\"")
    }
    
    # Draw the seeds in a table, and then split into a list for each bootstrap run:
    #SeedTable <- data.table::as.data.table(lapply(BootstrapMethodTable$Seed, RstoxBase::getSeedVector, size = NumberOfBootstraps))
    #names(SeedTable) <- BootstrapMethodTable$ProcessName
    #SeedList <- split(SeedTable, seq_len(nrow(SeedTable)))
    NumberOfBootstraps <- NumberOfBootstraps
    SeedList <- drawSeedList(
        table = BootstrapMethodTable, 
        NumberOfBootstraps = NumberOfBootstraps, 
        listOf = "table"
    )
    # Create the replaceDataList input to runProcesses, which defines the seed for each bootstrap run:
    replaceDataList <- createReplaceData(SeedList = SeedList, BootstrapMethodTable = BootstrapMethodTable)
    
    # Scan through the baseline processes to be run and look for processes with the parameter Seed:
    hasSeed <- sapply(processesSansProcessData$functionParameters, function(x) "Seed" %in% names(x))
    # Error if the BaselineSeedTable does not contain exactly the processes using Seed in the Baseline processes to be run:
    if(any(hasSeed)) {
        presentInBaselineButNotInBaselineSeedTable <- setdiff(
            processesSansProcessData$processName[hasSeed], 
            BaselineSeedTable$ProcessName
        )
        if(length(presentInBaselineButNotInBaselineSeedTable)) {
            stop("The BaselineSeedTable must contain Seed for the processes ", paste(processesSansProcessData$processName[hasSeed], collapse = ", "), ".")
        }
    }
    # Construct a list of lists, where each list contains a list of Seed named by the processes using Seed in the Baseline:
    BaselineSeedList <- drawSeedList(
        table = BaselineSeedTable, 
        NumberOfBootstraps = NumberOfBootstraps, 
        listOf = "list"
    )
    
    # Get the number of cores to open:
    NumberOfCores <- RstoxData::getNumberOfCores(NumberOfCores, n  = NumberOfBootstraps)
    
    # Copy the project to the tempdir for each core:
    projecName <- basename(projectPath)
    # As of 2021-05-27 (v3.0.23) make a copy even if running on only one core. This for safety:
    #if(NumberOfCores > 1)  {
    projectPath_copies <- file.path(tempdir(), paste0(projecName, seq_len(NumberOfCores)))
    message("Copying the project ", projecName, " to the ", length(projectPath_copies), " projects \n\t", paste(projectPath_copies, collapse = "\n\t"))
    temp <- RstoxData::mapplyOnCores(
        copyProject, 
        projectPath = projectPath, 
        newProjectPath = projectPath_copies, 
        MoreArgs = list(
            ow = TRUE, 
            empty.memory = c("analysis", "report"), 
            empty.output = TRUE
        ), 
        NumberOfCores = NumberOfCores
    )
    
    # Run the subset of the baseline model:
    bootstrapProgressFile <- getProjectPaths(projectPath, "progressFile")$analysis
    NumberOfBootstrapsFile <- getProjectPaths(projectPath, "NFile")$analysis
    stopBootstrapFile <- getProjectPaths(projectPath, "stopFile")$analysis
    if(file.exists(stopBootstrapFile)) {
        unlink(stopBootstrapFile, force = TRUE, recursive = TRUE)
    }
    
    # Prepare for progress info:
    writeLines(as.character(NumberOfBootstraps), NumberOfBootstrapsFile)
    
    
    # Get the dataTypes:
    processNames <- processesSansProcessData$processName
    processIDs <- unlist(
        mapply(
            getProcessIDFromProcessName, 
            processName = processNames, 
            MoreArgs = list(
                projectPath = projectPath, 
                modelName = "baseline"
            ), 
            only.processID = TRUE, 
            SIMPLIFY = FALSE
        )
    )
    dataTypes <- unlist(
        mapply(
            getDataType, 
            processID = processIDs, 
            MoreArgs = list(
                projectPath = projectPath, 
                modelName = "baseline"
            ), 
            SIMPLIFY = FALSE
        )
    )
    
    
    output <- list(
        processesSansProcessData = processesSansProcessData, 
        NumberOfBootstrapsFile = NumberOfBootstrapsFile, 
        bootstrapProgressFile = bootstrapProgressFile, 
        projectPath_copies = projectPath_copies, 
        stopBootstrapFile = stopBootstrapFile, 
        replaceDataList = replaceDataList, 
        BaselineSeedList = BaselineSeedList, 
        processNames = processNames, 
        dataTypes = dataTypes
    )
    
    
    return(output)
}




# Change noRd to export in StoX 4.0.0:

#' Get bootstrap data saved in a NetCDF4 file
#' 
#' @param nc The open NetCDF4 file object.
#' @param selection Hierarchical list of names of the groups/variables. The last element must be a vector of the variables to return from the table specified by the other elements. E.g., list("ImputeSuperIndividuals", "SuperIndividualsData", c("Stratum", "IndividualAge", "Abundance")) will return a data.table of the three columns "Stratum", "IndividualAge" and "Abundance", added the BootstrapID specified in \code{BootstrapID}.
#' @param BootstrapIDStart,BootstrapIDEnd The start and end bootstrap IDs, i.e., the indices of the bootstrap replicates. The default returns all bootstrap replicates.
#' @param dropList Logical: If FALSE (the default) return a list named by the \code{selection}.
#' 
#' @noRd
#' 
getBootstrapNetCDF4Data <- function(nc, selection = list(), BootstrapIDStart = 1, BootstrapIDEnd = Inf, dropList = FALSE) {
    
    # Get the variables: 
    var <- nc$var
    
    ndims <- lapply(var, "[[", "ndims")
    
    
    if(!length(selection)) {
        warning("No selection of table/variables. Returning NULL. Use selection = NA to get all variables.")
        return(NULL)
    }
    else if(length(selection) == 1 && is.na(selection)) {
        selection <- sapply(nc$var, "[[", "name")
    }
    selectionList <- getSelection(selection)
    listNames <- selectionList$listNames
    requestedVariables <- selectionList$requestedVariables
    requestedVariablesFullName <- selectionList$requestedVariablesFullName
    
    # Get the nrow variable, which determine what to read, and which will be added as BootstrapID to the output table:
    nrowsVariable <- paste(paste(listNames, collapse = "/"), "nrow", sep = "/")
    nrows <- ncdf4::ncvar_get(nc, nrowsVariable)
    start <- 1 + sum(nrows[seq_len(BootstrapIDStart - 1)])
    BootstrapIDEnd <- min(BootstrapIDEnd, length(nrows))
    count <- sum(nrows[seq(BootstrapIDStart, BootstrapIDEnd)])
    
    # Read the variables:   
    list <- lapply(requestedVariablesFullName, function(var) ncdf4::ncvar_get(nc, var, start = if(ndims[[var]] == 2) c(1, start) else start, count = if(ndims[[var]] == 2) c(-1, count) else count))
    # Set names to the variables and combine to a data.table:
    names(list) <- requestedVariables
    table <- data.table::setDT(list)
    
    # Add the BootstrapID:
    BootstrapID <- rep(seq(BootstrapIDStart, BootstrapIDEnd), nrows[seq(BootstrapIDStart, BootstrapIDEnd)])
    data.table::set(table, j = "BootstrapID", value = BootstrapID)
    #table[, BootstrapID := ..BootstrapID]
    
    # Expand to a list like the output from a Bootstrap process:
    if(!dropList) {
        for(sel in rev(listNames)) {
            table <- structure(list(table), names = sel)
        }
    }
    
    
    return(table)
}



getNumberOfBootstrapsFromNetCDF4Data <- function(nc, selection = list()) {
    
    # Get the variables: 
    var <- nc$var
    
    ndims <- lapply(var, "[[", "ndims")
    
    if(!length(selection)) {
        warning("No selection of table/variables. Returning NULL.")
        return(NULL)
    }
    selectionList <- getSelection(selection)
    listNames <- selectionList$listNames
    requestedVariables <- selectionList$requestedVariables
    requestedVariablesFullName <- selectionList$requestedVariablesFullName
    
    # Get the nrow variable,, which determine what to read, and which will be added as BootstrapID to the output table:
    nrowsVariable <- paste(paste(listNames, collapse = "/"), "nrow", sep = "/")
    nrows <- ncdf4::ncvar_get(nc, nrowsVariable)
    NumberOfBootstraps <- length(nrows)
    
    return(NumberOfBootstraps)
}




getSelection <- function(selection) {
    if(!length(selection)) {
        warning("No selection of table/variables. Returning NULL.")
        return(NULL)
    }
    
    # If the selection is given as a list, paste all but the last element to the last element to get full names of the variables:
    if(is.list(selection)) {
        lens <- lengths(selection)
        if(any(utils::head(lens, -1) != 1)) {
            stop("Can only get one table at the time, but possible more than one variables of a table (last element of selection can have length > 1, but the other elements must have length 1).")
        }
        
        # Paste the table name and the variable names given by the selection, to create full paths of the variables:
        requestedVariablesFullName <- do.call(paste, c(selection, list(sep = "/")))
        # Get the names of the list of lists holding the final table to return (process name, data type, table name):
        listNames <- unlist(utils::head(selection, -1))
        listNames <- unlist(strsplit(listNames, "/", fixed = TRUE))
        # The variable names are stored in the last element of the selection:
        requestedVariables <- selection[[length(selection)]]
    }
    else {
        listNames <- unlist(utils::head(strsplit(selection[1], "/", fixed = TRUE)[[1]], -1))
        requestedVariables <- unlist(lapply(strsplit(selection, "/", fixed = TRUE), utils::tail, 1))
        requestedVariablesFullName <- selection
    }
    
    list(
        listNames = listNames,
        requestedVariables = requestedVariables,
        requestedVariablesFullName = requestedVariablesFullName
    )
}

### #' Get the variable names of a BootstrapNetCDF4Data.
### #' 
### #' @inheritParams getBootstrapNetCDF4Data
### #' @param discardNrow Logical: If TRUE discard the nrow variable for each table.
### #' 
### #' @export
### #' 
### getBootstrapNetCDF4Variables <- function(BootstrapNetCDF4Data, discardNrow = TRUE) {
###     #  Open the file:
###     nc <- ncdf4::nc_open(unlist(BootstrapNetCDF4Data))
###     on.exit(ncdf4::nc_close(nc))
###     # Get the variables: 
###     var <- nc$var
###     prec <- sapply(var, "[[", "prec")
###     var <- sapply(var, "[[", "name")
###     
###     output <- data.table::data.table(
###         variable = var, 
###         type = prec
###     )
###     
###     # Discard the nrow variables:
###     if(discardNrow) {
###         output <- subset(output, !endsWith(variable, "/nrow"))
###     }
###     
###     return(output)
### }


### #' Get the process names of the processes stored in a BootstrapNetCDF4Data.
### #' 
### #' For processes with function output as a list with the two elements Data and Resolution, only the Data are used in reports.
### #' 
### #' @inheritParams getBootstrapNetCDF4Data
### #' 
### #' @export
### #' 
### getBootstrapNetCDF4ProcessNames <- function(BootstrapNetCDF4Data) {
###     #  Open the file:
###     nc <- ncdf4::nc_open(unlist(BootstrapNetCDF4Data))
###     on.exit(ncdf4::nc_close(nc))
###     # Get the dataTypes:
###     processNames <- unique(sapply(lapply(lapply(sapply(nc$var, "[[", "name"), strsplit, "/", fixed = TRUE), unlist), utils### ::head, 1))
###     
###     return(processNames)
### }


### #' Get the data of a process stored in a BootstrapNetCDF4Data.
### #' 
### #' For processes with function output as a list with the two elements Data and Resolution, only the Data is returned.
### #' 
### #' @inheritParams getBootstrapNetCDF4Data
### #' 
### #' @export
### #' 
### getBootstrapNetCDF4ProcessNames <- function(BootstrapNetCDF4Data) {
###     #  Open the file:
###     nc <- ncdf4::nc_open(unlist(BootstrapNetCDF4Data))
###     on.exit(ncdf4::nc_close(nc))
###     # Get the dataTypes:
###     processNames <- unique(sapply(lapply(lapply(sapply(nc$var, "[[", "name"), strsplit, "/", fixed = TRUE), unlist), utils::head, 1))
###     
###     return(processNames)
### }


getProcessNameTableNameVariableName <- function(nc) {
    list <- getVariableNameElementsList(nc)
    table <- data.table::rbindlist(lapply(list, as.list))
    data.table::setnames(table, c("ProcessName", "TableName", "VariableName"))
    return(table)
}

getVariableNameElementsList <- function(nc) {
    lapply(lapply(sapply(nc$var, "[[", "name"), strsplit, "/", fixed = TRUE), unlist)
}

getProcessNamesAndTableNames <- function(nc) {
    variableNameElementsList <- getVariableNameElementsList(nc)
    processNamesAndTableNames <- data.table::rbindlist(lapply(lapply(variableNameElementsList, utils::head, 2), as.list))
    names(processNamesAndTableNames) <- c("processName", "tableName")
    processNamesAndTableNames <- unique(processNamesAndTableNames)
    return(processNamesAndTableNames)
}

# Get the datatype of a specific processName in a BootstrapNetCDF4 file:
getDataTypeFromBootstrapNetCDF4 <- function(nc, processName) {
    
    # Get the dataType and processName global attributes:
    processNames <- ncdf4::ncatt_get(nc, varid = 0, attname = "processName")$value
    dataTypes <- ncdf4::ncatt_get(nc, varid = 0, attname = "dataType")$value
    
    if(! processName %in% processNames) {
        warning("The process ", processName, " is not present with a dataType in the file ", BootstrapNetCDF4Data, ". Present processes are ", paste(processNames, collapse = ", "), ".")
        return(NULL)
    }
    atProcecssName <- which(processName == processNames)
    
    if(length(atProcecssName) > 1) {
        stop("The file ", BootstrapNetCDF4Data, " specifies more than one dataType for the process ", processName, ".")
    }
    
    thisDataType <- dataTypes[atProcecssName]
    
    return(thisDataType)
}



### #' Get the data of a process stored in a BootstrapNetCDF4Data.
### #' 
### #' For processes with function output as a list with the two elements Data and Resolution, only the Data is returned.
### #' 
### #' @inheritParams getBootstrapNetCDF4Data
### #' 
### #' @export
### #' 
### getBootstrapNetCDF4FullProcessName <- function(processName, BootstrapNetCDF4Data) {
###     #  Open the file:
###     nc <- ncdf4::nc_open(unlist(BootstrapNetCDF4Data))
###     on.exit(ncdf4::nc_close(nc))
###     # Get the dataTypes:
###     processNames <- unique(sapply(lapply(lapply(sapply(nc$var, "[[", "name"), strsplit, "/", fixed = TRUE), unlist), utils::head, 1))
###     
###     return(processNames)
### }






# Funcion to draw seeds for each process given in the table and each bootstrap run, and reshape the seeds into a list of either data.table with rows 
drawSeedList <- function(table, NumberOfBootstraps, listOf = c("table", "list")) {
    if(!length(table)) {
        SeedList <- vector("list", NumberOfBootstraps)
        if(listOf == "list") {
            SeedList <- lapply(SeedList, as.data.table)
        }
    }
    else {
        SeedTable <- data.table::as.data.table(lapply(table$Seed, RstoxBase::getSeedVector, size = NumberOfBootstraps))
        names(SeedTable) <- table$ProcessName
        SeedList <- split(SeedTable, seq_len(nrow(SeedTable)))
        if(listOf == "list") {
            SeedList <- lapply(SeedList, function(x) structure(as.list(x), names = names(x)))
        }
    }
    
    return(SeedList)
}


rbindlistByName <- function(name, x) {
    subnames <- names(x[[1]][[name]])
    out <- lapply(subnames, rbindSublistByName, name, x)
    names(out) <- subnames
    return(out)
}
rbindSublistByName <- function(subname, name, x) {
    data.table::rbindlist(lapply(x, function(y) y[[name]][[subname]]))
}


# Function to create a list of the ReplaceData input to runProcesses(), 
createReplaceDataSansFunctionName <- function(Seed) {
    lapply(Seed, function(x) list(MoreArgs = list(Seed = x)))
}
addFunctionNameToReplaceData <- function(replaceData, BootstrapMethodTable) {
    out <- lapply(names(replaceData), function(name) 
        c(
            # Add the full name of the function:
            list(FunctionName = paste("RstoxFramework", BootstrapMethodTable[ProcessName == name, ResampleFunction], sep = "::")), 
            replaceData[[name]]
        ))
    # Add the process names to the list to enable replaceDataList in runProcesses():
    names(out) <- names(replaceData)
    return(out)
}
createReplaceData <- function(SeedList, BootstrapMethodTable) {
    replaceDataList <- lapply(SeedList, createReplaceDataSansFunctionName)
    replaceDataList <- lapply(replaceDataList, addFunctionNameToReplaceData, BootstrapMethodTable = BootstrapMethodTable)
    return(replaceDataList)
}


# Define a function to run processes and save the output of the last process to the output folder:
runOneBootstrapSaveOutput <- function(ind, replaceArgsList, replaceDataList, projectPath, projectPath_original, startProcess, endProcess, outputProcessesIDs, outputVariableNames, bootstrapProgressFile, stopBootstrapFile) {
    
    # Stop if the file stopBootstrap.txt exists:
    if(file.exists(stopBootstrapFile)) {
        stop("Bootstrap aborted by the user.")
    }
    
    # Run the part of the baseline that containes the processes to be modified to those to be returned:
    temp <- runFunction(
        what = "runProcesses", 
        args = list(
            projectPath = projectPath, 
            modelName = "baseline", 
            startProcess = startProcess, 
            endProcess = endProcess, 
            save = FALSE, 
            # Be sure to not touch the process data and file output, 
            # the latter mostly for speed if the bootstrap is run on a copy of the project 
            # (see if this is the case in the code of Bootstrap()):
            saveProcessData = FALSE, 
            #  Do not save the output (text) files:
            fileOutput = FALSE, 
            setUseProcessDataToTRUE = FALSE, 
            replaceArgsList = replaceArgsList, 
            replaceDataList = replaceDataList, 
            try = FALSE, 
            msg = FALSE
        ), 
        onlyStoxMessages = FALSE, 
        onlyStoxWarnings = FALSE, 
        maxLength.Message = 10000, 
        maxLength.Warning = 10000, 
        maxLength.Error = 10000, 
        collapseMessages = FALSE, 
        collapseWarnings = FALSE, 
        collapseErrors = FALSE,
        header = NULL
    )
    
    
    ## Issue the messages, warnings and errors, so that these can be picked up:
    #if(length(temp$message)) {
    #    lapply(temp$message, message)
    #}
    #if(length(temp$warning)) {
    #    lapply(temp$warning, warning)
    #}
    #if(length(temp$error)) {
    #    lapply(temp$error, stop)
    #}
    
    #runProcesses(
    #    projectPath, 
    #    modelName = "baseline", 
    #    startProcess = startProcess, 
    #    endProcess = endProcess, 
    #    save = FALSE, 
    #    # Be sure to not touch the process data and file output, 
    #    # the latter mostly for speed if the bootstrap is run on a copy of the project 
    #    # (see if this is the case in the code of Bootstrap()):
    #    saveProcessData = FALSE, 
    #    #  Do not save the output (text) files:
    #    fileOutput = FALSE, 
    #    setUseProcessDataToTRUE = FALSE, 
    #    replaceArgsList = replaceArgsList, 
    #    replaceDataList = replaceDataList, 
    #    try = FALSE, 
    #    msg = FALSE
    #)
    
    
    # Get the requested outputs:
    processOutput <- getModelData(
        projectPath = projectPath, 
        modelName = "baseline", 
        processes = outputProcessesIDs, 
        drop.datatype = FALSE
    )
    
    # If outputVariableNames is given keep only those variables in all tables:
    if(length(outputVariableNames)) {
        processOutput <- lapply(processOutput, function(x) lapply(x, selectRobust, outputVariableNames))
    }
    
    # Add bootstrap IDs to the processOutput:
    for(dataType in names(processOutput)) {
        for(table in names(processOutput[[dataType]])) {
            # This did not work for some reason (https://stackoverflow.com/questions/51877642/adding-a-column-by-reference-to-every-data-table-in-a-list-does-not-stick):
            # processOutput[[dataType]][[table]][, BootstrapID := ..ind]
            # So we do it not by reference:
            processOutput[[dataType]][[table]]$BootstrapID <- ind
        }
    }
    
    # Create a list with the messages, warnings and errors alongside the data:
    processOutput <- list(
        message = temp$message,  
        warning = temp$warning,  
        error = temp$error,  
        processOutput = processOutput
    )
    
    # Add a dot to the progess file:
    cat(".", file = bootstrapProgressFile, append = TRUE)
    
    return(processOutput)
}

# Define a function to run processes and save the output of the last process to the output folder:
runOneBootstrap_NetCDF4 <- function(ind, replaceArgsList, replaceDataList, projectPath, projectPath_original, startProcess, endProcess, outputProcessesIDs, outputVariableNames, bootstrapProgressFile, stopBootstrapFile) {
    
    # Stop if the file stopBootstrap.txt exists:
    if(file.exists(stopBootstrapFile)) {
        stop("Bootstrap aborted by the user.")
    }
    
    # Run the part of the baseline that contains the processes to be modified to those to be returned:
    #runProcesses(
    #    projectPath, 
    #    modelName = "baseline", 
    #    startProcess = startProcess, 
    #    endProcess = endProcess, 
    #    save = FALSE, 
    #    # Be sure to not touch the process data and file output,
    #    # the latter mostly for speed if the bootstrap is run on a copy of the project 
    #    # (see if this is the case in the code of Bootstrap()):
    #    saveProcessData = FALSE, 
    #    #  Do not save the output (text) files:
    #    fileOutput = FALSE, 
    #    setUseProcessDataToTRUE = FALSE, 
    #    replaceArgsList = replaceArgsList, 
    #    replaceDataList = replaceDataList, 
    #    msg = FALSE
    #)
    
    temp <- runFunction(
        what = "runProcesses", 
        args = list(
            projectPath = projectPath, 
            modelName = "baseline", 
            startProcess = startProcess, 
            endProcess = endProcess, 
            save = FALSE, 
            # Be sure to not touch the process data and file output,
            # the latter mostly for speed if the bootstrap is run on a copy of the project 
            # (see if this is the case in the code of Bootstrap()):
            saveProcessData = FALSE, 
            #  Do not save the output (text) files:
            fileOutput = FALSE, 
            setUseProcessDataToTRUE = FALSE, 
            replaceArgsList = replaceArgsList, 
            replaceDataList = replaceDataList, 
            try = FALSE, 
            msg = FALSE
        ), 
        onlyStoxMessages = FALSE, 
        onlyStoxWarnings = FALSE, 
        maxLength.Message = 10000, 
        maxLength.Warning = 10000, 
        maxLength.Error = 10000, 
        collapseMessages = FALSE, 
        collapseWarnings = FALSE, 
        collapseErrors = FALSE,
        header = NULL
    )
    
    
    
    
    # Get the output dimensions for use when writing NetCDF4 later:
    processOutput <- getModelData(
        projectPath = projectPath, 
        modelName = "baseline", 
        processes = outputProcessesIDs, 
        drop.datatype = FALSE
    )
    # If outputVariableNames is given keep only those variables in all tables. This must also be done when writing the final NetCDF4 file:
    if(length(outputVariableNames)) {
        processOutput <- lapply(processOutput, function(x) lapply(x, selectRobust, outputVariableNames))
    }
    dims <- lapply(processOutput, function(x) lapply(x, dim))
    nchars <- lapply(processOutput, function(x) lapply(x, function(y) lapply(y, getMaxNchar)))
    
    # Rename the baseline output data memory folder. This effectively saves the output:
    renameDataFolders(
        projectPath = projectPath, 
        modelName = "baseline", 
        processes = outputProcessesIDs, 
        suffix = ind, 
        restore = TRUE
    )
    
    # Add a dot to the progess file:
    cat(".", file = bootstrapProgressFile, append = TRUE)
    #cat(paste(".", ind, basename(projectPath), "\n"), file = bootstrapProgressFile, append = TRUE)
    
    return(list(
        dims = dims, 
        nchars = nchars, 
        message = temp$message,  
        warning = temp$warning,  
        error = temp$error
    ))
}


# Select variable of a table, ignoring variable names that are not present:
selectRobust <- function(x, var) {
    var <- intersect(var, names(x))
    if(length(var)) {
        x <- subset(x, select = var)
    }
    
    return(x)
}


getMaxNchar <- function(x) {
    if("POSIXct" %in% class(x)) {
        max(nchar(format(x, format = "%Y-%m-%dT%H:%M:%OS3Z")), na.rm = TRUE)
    }
    else {
        max(nchar(x), na.rm = TRUE)
    }
}


renameDataFolders <- function(
    projectPath, 
    modelName, 
    processes, 
    suffix, 
    sep = "_", 
    restore = TRUE
){
    
    # Get the processes to get rename data folder for:
    processTable <- readProcessIndexTable(
        projectPath = projectPath, 
        modelName = modelName, 
        processes = processes
    )
    processID = processTable$processID
            
    # Get the current and new dataModelsFolder:
    outputMemoryDataFolder <- getProjectPaths(projectPath)[["dataModelsFolders"]][modelName]
    outputMemoryDataFolders <- file.path(outputMemoryDataFolder, processID)
    outputMemoryDataFolder_new <- paste(outputMemoryDataFolder, suffix, sep = sep)
    outputMemoryDataFolders_new <- file.path(outputMemoryDataFolder_new, processID)
    
    # Create the new folder:
    dir.create(outputMemoryDataFolder_new)
    
    # Rename:
    file.rename(outputMemoryDataFolders, outputMemoryDataFolders_new)
    
    return(outputMemoryDataFolders_new)
}


### #' Stop a bootstrap run.
### #' 
### #' @inheritParams general_arguments
### #' 
### #' @export
### #' 
### stopBootstrap <- function(projectPath) {
###     write("", file = getProjectPaths(projectPath, "stopBootstrapFile"))
### }
#' Stop runProcesses().
#' 
#' @inheritParams general_arguments
#' 
#' @export
#' 
stopProcesses <- function(projectPath, modelName) {
    stopFile <- getProjectPaths(projectPath, "stopFile")[[modelName]]
    write("", file = stopFile)
}
#' Get process progress
#' 
#' @inheritParams general_arguments
#' @param percent Logical: If TRUE return the progress in percent, otherwise in [0,1].
#' 
#' @export
#' 
progressOfProcesses <- function(projectPath, modelName, percent = FALSE) {
    # Get the paths to the progress and N files:
    ProgressFile <- getProjectPaths(projectPath, "progressFile")[[modelName]]
    NFile <- getProjectPaths(projectPath, "NFile")[[modelName]]
    if(!file.exists(ProgressFile)) {
        return(NA)
    }
    # Read number of dots and compare the N:
    N <- as.numeric(readLines(NFile, warn = FALSE))
    Progress <- readLines(ProgressFile, warn = FALSE)
    n <- lengths(regmatches(Progress, gregexpr(".", Progress, fixed = TRUE)))
    Progress <- n / N
    if(percent) {
        Progress <- Progress * 100
    }
    return(Progress)
}
#' Get process progress
#' 
#' @inheritParams general_arguments
#' @param string.out Logical: If TRUE return a string listing the time remaining and the total estimated time.
#' 
#' @export
#' 
estimateTimeOfProcesses <- function(projectPath, modelName, string.out = TRUE) {
    # Get progress:
    progress <- progressOfProcesses(projectPath, modelName)
    # Read time of NFile:
    NFile <- getProjectPaths(projectPath, "NFile")$analysis
    if(!file.exists(NFile)) {
        return(NA)
    }
    startTime <- file.info(NFile)$ctime
    # And compare to the present time:
    now <- Sys.time()
    timeUsed <- now - startTime
    
    # Predict the total time:
    totalTimePredicted <- timeUsed / progress
    
    # Predict time remaninig:
    remainingTimePredicted <- totalTimePredicted - timeUsed
    
    # Paste to a string message:
    if(string.out) {
        output <- paste0("Estimated ", round(remainingTimePredicted, 1), " ", attr(remainingTimePredicted, "units"), " remaining (", 100 * progress, " percent elapsed of total estimated time ", round(totalTimePredicted, 1), " ", attr(totalTimePredicted, "units"), ")")
    }
    else {
        output <- remainingTimePredicted
    }
    
    
    return(output)
}



#addBootstrapID <- function(x, ind) {
#    if(is.list(x) && !data.table::is.data.table(x)){
#        lapply(x, addBootstrapIDOne, ind = ind)
#    }
#    else {
#        addBootstrapIDOne(x, ind = ind)
#    }
#}
#
#addBootstrapIDOne <- function(x, ind) {
#    x[, BootstrapID := ..ind]
#}




#readBootstrap <- function(projectPath, processName) {
#    # Bootstrap lives in the model "analysis" only:
#    modelName = "analysis"
#    # Get the folder holding the output form the bootstrap:
#    processID <- getProcessIDFromProcessName(projectPath = projectPath, modelName = modelName, processName = processName)
#    folderPath <- getProcessOutputFolder(projectPath = projectPath, modelName = modelName, processID = processID, type = "output")
#    
#    # Return NULL if the folder does not exist:
#    if(!file.exists(folderPath)) {
#        warning("StoX: The process ", folderPath, " does not have an output folder. Has the proccess been run?")
#        return(NULL)
#    }
#    
#    # Get a nested list of all output files from the bootstrap:
#    bootstrapOutputFiles <- getFilesRecursiveWithOrder(folderPath)
#    
#    # Read all files:
#    processOutput <- rapply(
#        bootstrapOutputFiles, 
#        readProcessOutputFile, 
#        how = "replace"
#    )
#    
#}


# # Function to get the name of the bootsrap run, which will be used as the subfolder name hoding the output from the bootstrap:
# getBootstrapRunName <- function(ind, NumberOfBootstraps, prefix = "Run") {
#     paste0(
#         prefix, 
#         formatC(ind, width = NumberOfBootstraps, format = "d", flag = "0")
#     )
# }


#getReplaceData <- function(x, size) {
#    # Get seeds:
#    seeds <- RstoxBase::getSeedVector(x$Seed, size)
#    x_expanded <- data.table::data.table(
#        x[, !"Seed"], 
#        Seed = seeds
#    )
#    
#    # Split rows into a list:
#    x_split <- split(x_expanded, by = "Seed")
#    
#    x_split <- lapply(x_split, function(x) list(x$ResampleFunction, as.list(x[, !c("ResampleFunction")])))
#    
#    return(x_split)
#}




# This function resamples varToResample with replacement by altering the input data. Used in ResampleMeanLengthDistributionData(), ResampleBioticAssignment() and ResampleMeanNASCData():
resampleDataBy_new4.0.0_excludingPSUsThatAreNotResampled <- function(data, seed, varToScale, varToResample, resampleBy) {
    
    # Get the unique resampleBy, and sort in C-locale for consistensy across platforms:
    #uniqueResampleBy <- unique(data[[resampleBy]])
    #uniqueResampleBy <- stringi::stri_sort(unique(data[[resampleBy]]), locale = "C")
    
    # Get the unique resampleBy, and sort in en_US_POSIX for consistensy across platforms:
    uniqueResampleBy <- stringi::stri_sort(unique(data[[resampleBy]]), locale = "en_US_POSIX", na_last = FALSE)
    
    # Build a table of (usually) Stratum and Seed and merge with the MeanLengthDistributionData:
    seedTable <- data.table::data.table(
        resampleBy = uniqueResampleBy, 
        seed = RstoxBase::getSeedVector(seed, size = length(uniqueResampleBy))
    )
    data.table::setnames(seedTable, c(resampleBy, "seed"))
    data <- merge(data, seedTable, by = resampleBy, all = TRUE)
    
    # Resample the data:
    data[, BootstrapSampleFactor := getResampledCount(.SD, seed = seed[1], varToResample = varToResample), by = resampleBy]
    
    # Scale the varToScale
    for(var in varToScale) {
        data[, eval(var) := get(var) * BootstrapSampleFactor]
    }
    
    # Subset out those that have BootstrapSampleFactor = 0:
    data <- subset(data, BootstrapSampleFactor > 0)
    
    data[, seed := NULL]
    
    return(data)
}


# Function to resample Hauls of one subset of the data:
getResampledCount <- function(subData, seed, varToResample) {
    
    # Resample the unique:
    if(length(varToResample) != 1) {
        stop("StoX: varToResample must be a single string naming the variable to resample")
    }
    resampled <- RstoxBase::sampleSorted(unique(subData[[varToResample]]), seed = seed, replace = TRUE)
    # Tabulate the resamples:
    resampleTable <- data.table::as.data.table(table(resampled, useNA = "ifany"))
    # Set an unmistakable name to the counts:
    data.table::setnames(resampleTable, c("resampled","N"), c(varToResample, "BootstrapSampleFactor"))
    
    # Merge the resampled counts into the data:
    # The sort = FALSE is very important, as it retains the order of the data. This should probably be replaced by a more robust solution, e.g. by merging in resampleDataBy():
    count <- merge(subData, resampleTable, by = varToResample, all.x = TRUE, sort = FALSE)
    
    # Insert the new count into varToScale (with NAs replaced by 0):
    count[, BootstrapSampleFactor := ifelse(
        is.na(count$BootstrapSampleFactor), 
        0, 
        count$BootstrapSampleFactor
    )]
    
    return(count$BootstrapSampleFactor)
}


# This function resamples varToResample with replacement by altering the input data. Used in ResampleMeanLengthDistributionData(), ResampleBioticAssignment() and ResampleMeanNASCData():
resampleDataBy <- function(data, seed, varToScale, varToResample, resampleBy) {
    
    # Get the unique resampleBy, and sort in C-locale for consistensy across platforms:
    #uniqueResampleBy <- unique(data[[resampleBy]])
    #uniqueResampleBy <- stringi::stri_sort(unique(data[[resampleBy]]), locale = "C")
    
    # Get the unique resampleBy, and sort in en_US_POSIX for consistensy across platforms:
    uniqueResampleBy <- stringi::stri_sort(unique(data[[resampleBy]]), locale = "en_US_POSIX")
    
    # Build a table of resampleBy and Seed and merge with the MeanLengthDistributionData:
    seedTable <- data.table::data.table(
        resampleBy = uniqueResampleBy, 
        seed = RstoxBase::getSeedVector(seed, size = length(uniqueResampleBy))
    )
    data.table::setnames(seedTable, c(resampleBy, "seed"))
    data <- merge(data, seedTable, by = resampleBy)
    
    # Resample the data:
    for(var in varToScale) {
        data[, eval(var) := resampleOne(.SD, seed = seed[1], varToScale = var, varToResample = varToResample), by = resampleBy]
    }
    
    data[, seed := NULL]
    #return(MeanLengthDistributionData)
}

# Function to resample Hauls of one subset of the data:
resampleOne <- function(subData, seed, varToResample, varToScale) {
    
    # Resample the unique:
    if(length(varToResample) != 1) {
        stop("StoX: varToResample must be a single string naming the variable to resample")
    }
    resampled <- RstoxBase::sampleSorted(unique(subData[[varToResample]]), seed = seed, replace = TRUE)
    # Tabulate the resamples:
    #resampleTable <- table(resampled)
    #if(!length(resampleTable)) {
    #    
    #}
    resampleTable <- data.table::as.data.table(table(resampled, useNA = "ifany"))
    # Set an unmistakable name to the counts:
    data.table::setnames(resampleTable, c("resampled","N"), c(varToResample, "resampledCountWithUniqueName"))
    
    # Merge the resampled counts into the data:
    # The sort = FALSE is very important, as it retains the order of the data. This should probably be replaced by a more robust solution, e.g. by merging in resampleDataBy():
    count <- merge(subData, resampleTable, by = varToResample, all.x = TRUE, sort = FALSE)
    
    # Insert the new count into varToScale (with NAs replaced by 0):
    count[, resampledCountWithUniqueName := ifelse(
        is.na(count$resampledCountWithUniqueName), 
        0, 
        count$resampledCountWithUniqueName
    )]
    
    #count[, eval(varToScale) := lapply(get(varToScale), "*", resampledCountWithUniqueName)]
    count[, eval(varToScale) := resampledCountWithUniqueName * get(varToScale)]
    
    return(count[[varToScale]])
}


#' Resamples biotic PSUs within Stratum in MeanLengthDistributionData
#' 
#' This function resamples biotic PSUs with replacement within each Stratum, changing the MeanLengthDistributionWeight.
#' 
#' @inheritParams RstoxBase::ModelData
#' @param Seed The seed, given as a sinigle initeger.
#' 
#' @export
#' 
ResampleMeanLengthDistributionData <- function(MeanLengthDistributionData, Seed) {
    
    # This function will be renamed to ResampleBioticPSUsInStratum
    
    # Resample PSUs within Strata, modifying the weighting variable of MeanLengthDistributionData:
    MeanLengthDistributionData$Data <- resampleDataBy(
        data = MeanLengthDistributionData$Data, 
        seed = Seed, 
        #varToScale = RstoxBase::getRstoxBaseDefinitions("dataTypeDefinition")[["MeanLengthDistributionData"]]$weighting, 
        varToScale = "WeightedNumber", 
        # If any other values than varToResample = "PSU" and resampleBy = "Stratum" are used in the future, warnings for only one and not all varToResample in resampleBy should be added in RstoxBase!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!:
        varToResample = "PSU", 
        resampleBy = "Stratum"
    )
    
    return(MeanLengthDistributionData)
}



#' Resamples biotic PSUs within Stratum in MeanSpeciesCategoryCatchData
#' 
#' This function resamples biotic PSUs with replacement within each Stratum, changing the MeanSpeciesCategoryCatchData
#' 
#' @inheritParams RstoxBase::ModelData
#' @param Seed The seed, given as a sinigle initeger.
#' 
#' @export
#' 
ResampleMeanSpeciesCategoryCatchData <- function(MeanSpeciesCategoryCatchData, Seed) {
    
    # This function will be renamed to ResampleBioticPSUsInStratum
    
    # Resample PSUs within Strata, modifying the weighting variable of MeanSpeciesCategoryCatchData:
    MeanSpeciesCategoryCatchData$Data <- resampleDataBy(
        data = MeanSpeciesCategoryCatchData$Data, 
        seed = Seed, 
        varToScale = c("TotalCatchWeight", "TotalCatchNumber"), 
        # If any other values than varToResample = "PSU" and resampleBy = "Stratum" are used in the future, warnings for only one and not all varToResample in resampleBy should be added in RstoxBase!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!:
        varToResample = "PSU", 
        resampleBy = "Stratum"
    )
    
    return(MeanSpeciesCategoryCatchData)
}


### #' Resamples biotic PSUs
### #' 
### #' This function resamples biotic PSUs with replacement within each Stratum, changing the MeanLengthDistributionWeight.
### #' 
### #' @param MeanLengthDistributionData The \code{\link[RstoxBase]{MeanLengthDistributionData}} data.
### #' @param Seed The seed, given as a sinigle initeger.
### #' 
### #' @export
### #' 
### ResampleAssignmentLengthDistributionData <- function(AssignmentLengthDistributionData, Seed) {
###     # Resample PSUs within Strata, modifying the weighting variable of AssignmentLengthDistributionData:
###     AssignmentLengthDistributionData <- resampleDataBy(
###         data = AssignmentLengthDistributionData, 
###         seed = Seed, 
###         #varToScale = RstoxBase::getRstoxBaseDefinitions("dataTypeDefinition")[["AssignmentLengthDistributionData"]]$weighting, 
###         varToScale = "WeightedNumber", 
###         varToResample = "PSU", 
###         resampleBy = "Stratum"
###     )
###     
###     return(AssignmentLengthDistributionData)
### }


#' Resamples biotic PSUs
#' 
#' This function resamples biotic PSUs with replacement within each Stratum, changing the MeanLengthDistributionWeight.
#' 
#' @inherit RstoxBase::ProcessData
#' @param Seed The seed, given as a sinigle initeger.
#' 
#' @noRd
#' 
ResampleBioticAssignment <- function(BioticAssignment, Seed) {
    
    # This function will be renamed to ResampleAssignedHaulsInStratum
    
    # Resample Hauls within Strata, modifying the weighting variable of BioticAssignment:
    BioticAssignment <- resampleDataBy(
        #data = BioticAssignment$BioticAssignment, 
        data = BioticAssignment, 
        seed = Seed, 
        varToScale = "WeightingFactor", 
        # If any other values than varToResample = "Haul" and resampleBy = "Stratum" are used in the future, warnings for only one and not all varToResample in resampleBy should be added in RstoxBase!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!:
        varToResample = "Haul", 
        resampleBy = "Stratum"
    )
    
    return(BioticAssignment)
}


#' Resamples biotic PSUs
#' 
#' This function resamples biotic PSUs with replacement within each Stratum, changing the MeanLengthDistributionWeight.
#' 
#' @inherit RstoxBase::ProcessData
#' @param Seed The seed, given as a sinigle initeger.
#' 
#' @export
#' 
ResampleBioticAssignmentByStratum <- function(BioticAssignment, Seed) {
    
    # This function will be renamed to ResampleAssignedHaulsInStratum
    
    # Resample Hauls within Strata, modifying the weighting variable of BioticAssignment:
    BioticAssignment <- resampleDataBy(
        #data = BioticAssignment$BioticAssignment, 
        data = BioticAssignment, 
        seed = Seed, 
        varToScale = "WeightingFactor", 
        # If any other values than varToResample = "Haul" and resampleBy = "Stratum" are used in the future, warnings for only one and not all varToResample in resampleBy should be added in RstoxBase!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!:
        varToResample = "Haul", 
        resampleBy = "Stratum"
    )
    
    return(BioticAssignment)
}

#' Resamples biotic PSUs
#' 
#' This function resamples biotic PSUs with replacement within each Stratum, changing the MeanLengthDistributionWeight.
#' 
#' @inherit RstoxBase::ProcessData
#' @param Seed The seed, given as a sinigle initeger.
#' 
#' @export
#' 
ResampleBioticAssignmentByPSU <- function(BioticAssignment, Seed) {
    
    # This function will be renamed to ResampleAssignedHaulsInStratum
    
    # Resample Hauls within Strata, modifying the weighting variable of BioticAssignment:
    BioticAssignment <- resampleDataBy(
        #data = BioticAssignment$BioticAssignment, 
        data = BioticAssignment, 
        seed = Seed, 
        varToScale = "WeightingFactor", 
        # If any other values than varToResample = "Haul" and resampleBy = "Stratum" are used in the future, warnings for only one and not all varToResample in resampleBy should be added in RstoxBase!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!:
        varToResample = "Haul", 
        resampleBy = "PSU"
    )
    
    return(BioticAssignment)
}



#' Resamples acoustic PSUs
#' 
#' This function resamples acoustic PSUs with replacement within each Stratum, changing the MeanNASC
#' 
#' @inheritParams RstoxBase::ModelData
#' @param Seed The seed, given as a sinigle initeger.
#' 
#' @export
#' 
ResampleMeanNASCData <- function(MeanNASCData, Seed) {
    
    # This function will be renamed to ResampleAcousticPSUsInStratum
    
    # Resample PSUs within Strata, modifying the weighting variable of MeanLengthDistributionData:
    MeanNASCData$Data <- resampleDataBy(
        data = MeanNASCData$Data, 
        seed = Seed, 
        #varToScale = RstoxBase::getRstoxBaseDefinitions("dataTypeDefinition")[["MeanNASC"]]$weighting, 
        varToScale = "NASC", 
        varToResample = "PSU", 
        resampleBy = "Stratum"
    )
    
    return(MeanNASCData)
}



#ResampleIndividualAge <- function(SuperIndividualsData, AgeErrorMatrix, Seed) {
#
#    
#
#    return(SuperIndividualsData)
#}



##################################################
##################################################
#' Report Bootstrap
#' 
#' Reports the sum, mean or other statistics on a variable of the \code{\link{BootstrapData}}.
#' 
#' @param BootstrapData The \code{\link{BootstrapData}} data output from \code{\link{Bootstrap}}.
#' @inheritParams RstoxBase::general_report_arguments
#' @param BaselineProcess A strings naming the baseline process to report from the \code{\link{BootstrapData}}. If a process with 
#' @param AggregationFunction The function to apply to each bootstrap run. This must be a function returning a single value.
#' @param BootstrapReportFunction The function to apply across bootstrap run, such as "cv" or "c".
#' @param Percentages The percentages to report Percentiles for when BootstrapReportFunction = "summaryStox".
#' @param AggregationWeightingVariable The variable to weight by in the \code{AggregationFunction}.
#' @param BootstrapReportWeightingVariable The variable to weight by in the \code{BootstrapReportFunction}.
#'
#' @details This function works in two steps. First, the \code{AggregationFunction} is applied to the \code{TargetVariable} of the table given by \code{BaselineProcess} for each unique combination of the \code{GroupingVariables} and for each bootstrap run. Second, a grid of all possible combinations of the \code{GroupingVariables} is formed and the result from the first step placed onto the grid. This creates 0 for each position in the grid where data from the first step are not present. E.g., if a particularly large fish is found in only one haul, and this haul by random is not selected in a bootstrap run, the \code{TargetVariable} will be 0 to reflect the variability in the data. To complete the second step, the \code{BootstrapReportFunction} is applied over the bootstrap runs for each cell in the grid.
#' 
#' The parameter \code{RemoveMissingValues} should be used with extreme caution. The effect of setting \code{RemoveMissingValues} to TRUE is that missing values (NAs) are removed in both the first and second step. This can be dangerous both in the first and in the second step. E.g., if the Abundance of \code{\link{SuperIndividualsData}} is positive for super-individuals with missing IndividualWeight, then the Biomass of those super-individuals will be missing as well. If one the wants to sum the Biomass by using \code{AggregationFunction} = "sum" one will get NA if \code{RemoveMissingValues} = FALSE. If \code{RemoveMissingValues} = TRUE one will ignore the missing Biomass, and the summed Biomass will only include the super-individuals that have non-missing IndividualWeight, effectively discarding a portion of the observed abundance. The summed Biomass will in this case be underestimated!
#' 
#' In the second step, setting \code{RemoveMissingValues} to TRUE can be even more dangerous, as the only option currently available for the \code{BootstrapReportFunction} is the function RstoxBase::summaryStox(), which includes average and standard deivation which are highly influenced by removing missing data.
#' 
#' Instead of setting \code{RemoveMissingValues} to TRUE, it is advised to apply the function \code{\link{ImputeSuperIndividuals}} to fill in e.g. IndividualWeight where missing. Missing values in the output from \code{\link{ReportBootstrap}} can also be avoided by adding variables to \code{GroupingVariables}, such as adding "Stratum" e.g. if there are strata that are known from Baseline to contain no fish. These strata will then be present but with missing values, but these missing values will not affect other strata if "Stratum" is included in \code{GroupingVariables}. It is also  recommended to include "Survey" and "SpeciesCategory" in the \code{GroupingVariables}, as these are key variables for which summary statistics should rarely be computed across.
#' 
#' @return
#' A \code{\link{ReportBootstrapData}} object.
#' 
#' @export
#' 
ReportBootstrap <- function(
    BootstrapData, 
    BaselineProcess = character(), 
    TargetVariable = character(), 
    TargetVariableUnit = character(), 
    AggregationFunction = RstoxBase::getReportFunctions(getMultiple = FALSE), 
    BootstrapReportFunction = RstoxBase::getReportFunctions(getMultiple = TRUE), 
    Percentages = double(), 
    GroupingVariables = character(), 
    InformationVariables = character(), 
    Filter = character(), 
    RemoveMissingValues = FALSE, 
    AggregationWeightingVariable = character(), 
    BootstrapReportWeightingVariable = character()
) 
{
    
    AggregationFunction <- RstoxData::match_arg_informative(AggregationFunction)
    BootstrapReportFunction <- RstoxData::match_arg_informative(BootstrapReportFunction)
    
    if(!length(BootstrapData)) {
        stop("The Bootstrap process must been run.")
    }
    #if(!length(BootstrapData[[BaselineProcess]])) {
    #    stop("The Baseline process ", BaselineProcess, " is not included in the BootstrapData. Please include it in the parameter OutputProcesses in the Bootstrap process.")
    #}
    
    # Issue a warning if RemoveMissingValues = TRUE:
    if(isTRUE(RemoveMissingValues)) {
        warning(RstoxBase::getRstoxBaseDefinitions("RemoveMissingValuesWarning")(TargetVariable))
    }
    
    
    if(! BaselineProcess %in% names(BootstrapData)) {
        # If the BootstrapData is empty, stop:
        if(!length(BootstrapData)) {
            warning("Empty BootstrapData.")
        }
        else {
            stop("The BaselineProcess ", BaselineProcess, " is not one of the outputs of the Bootstrap. Possible values are ", paste(names(BootstrapData), collapse = ", "), ".")
        }
    }
    else {
        if(is.list(BootstrapData[[BaselineProcess]]) && !data.table::is.data.table(BootstrapData[[BaselineProcess]]) ) {
            if("Data" %in% names(BootstrapData[[BaselineProcess]])) {
                dataType <- attr(BootstrapData[[BaselineProcess]], "dataType")
                BootstrapData[[BaselineProcess]] <- BootstrapData[[BaselineProcess]]$Data
                attr(BootstrapData[[BaselineProcess]], "dataType") <- dataType
            }
            else {
                stop("StoX: For multi-table process requested by the BaselineProcess, the table \"Data\" must be present.")
            }
        }
    }
    
    # Subset the BootstrapData to only the TargetVariable, GroupingVariables, InformationVariables, AggregationWeightingVariable, BootstrapReportWeightingVariable and "BootstrapID"
    toKeep <- c(TargetVariable, GroupingVariables, InformationVariables, AggregationWeightingVariable, BootstrapReportWeightingVariable, "BootstrapID")
    relevantBootstrapData <- BootstrapData[[BaselineProcess]][, ..toKeep]
    
    # Store any dataType attribute:
    dataType <- attr(BootstrapData[[BaselineProcess]], "dataType")
    # Set the unit of the target variable:
    relevantBootstrapData[[TargetVariable]] <- RstoxBase::setUnitRstoxBase(
        relevantBootstrapData[[TargetVariable]], 
        dataType =  dataType, 
        variableName = TargetVariable, 
        unit = TargetVariableUnit
    )
    
    # Free the memory of the large object:
    rm(BootstrapData)
    gc()
    
    
    output <- aggregateRelevantBootstrapData(
        relevantBootstrapData = relevantBootstrapData, 
        AggregationFunction = AggregationFunction, 
        TargetVariable = TargetVariable, 
        GroupingVariables = GroupingVariables, 
        InformationVariables = InformationVariables, 
        RemoveMissingValues = RemoveMissingValues, 
        AggregationWeightingVariable = AggregationWeightingVariable, 
        BootstrapReportFunction = BootstrapReportFunction, 
        BootstrapReportWeightingVariable = BootstrapReportWeightingVariable, 
        Percentages = Percentages, 
        Filter = Filter
    )
    
    
    return(output)
}



aggregateRelevantBootstrapData <- function(
        relevantBootstrapData, 
        AggregationFunction, 
        TargetVariable, 
        GroupingVariables, 
        InformationVariables, 
        RemoveMissingValues, 
        AggregationWeightingVariable, 
        BootstrapReportFunction, 
        BootstrapReportWeightingVariable, 
        Percentages, 
        Filter
) {
    
    # Run the initial aggregation (only applicable for functions with output of length 1):
    output <- RstoxBase::aggregateBaselineDataOneTable(
        stoxData = relevantBootstrapData, 
        TargetVariable = TargetVariable, 
        aggregationFunction = AggregationFunction, 
        GroupingVariables = c(GroupingVariables, "BootstrapID"), 
        InformationVariables = InformationVariables, 
        na.rm = RemoveMissingValues, 
        WeightingVariable = AggregationWeightingVariable
    )
    
    
    # Get the name of the new TargetVariable:
    TargetVariableAfterInitialAggregation <- RstoxBase::getReportFunctionVariableName(
        functionName = AggregationFunction, 
        TargetVariable = TargetVariable
    )
    
    
    # Store the unique combinations of the GroupingVariables from the relevantBootstrapData:
    uniqueGroupingVariablesToKeep <- unique(subset(relevantBootstrapData, select = GroupingVariables))
    setorder(uniqueGroupingVariablesToKeep)
    
    # Run the report function of the bootstraps:
    output <- RstoxBase::aggregateBaselineDataOneTable(
        stoxData = output, 
        TargetVariable = TargetVariableAfterInitialAggregation, 
        aggregationFunction = BootstrapReportFunction, 
        GroupingVariables = GroupingVariables, 
        InformationVariables = InformationVariables, 
        na.rm = RemoveMissingValues, 
        padWithZerosOn = "BootstrapID", 
        WeightingVariable = BootstrapReportWeightingVariable, 
        SpecificationParameter = Percentages, 
        uniqueGroupingVariablesToKeep = uniqueGroupingVariablesToKeep
    )
    
    ### # Discard all rows with combinations of the GroupingVariables that are not present in the relevantBootstrapData:
    ### output <- output[uniqueCombinations, , on = names(uniqueCombinations)]
    
    # Add the unit to the report:
    if(RstoxData::hasUnit(relevantBootstrapData[[TargetVariable]], property = "shortname")) {
        unit <- RstoxData::getUnit(relevantBootstrapData[[TargetVariable]], property = "shortname")
        output <- cbind(output, Unit = unit)
    }
    
    output <- RstoxBase::filterTable(output, filter = Filter)
    
    # Add the grouping and information variables as attributes, for use e.g. when plotting:
    attr(output, "GroupingVariables") <- GroupingVariables
    attr(output, "InformationVariables") <- InformationVariables
    
    return(output)
}









# Change noRd to export in StoX 4.0.0:

##################################################
##################################################
#' Report BootstrapNetCDF4
#' 
#' Reports the sum, mean or other statistics on a variable of the \code{\link{BootstrapNetCDF4Data}}.
#' 
#' @inheritParams ModelData
#' @inheritParams RstoxBase::general_report_arguments
#' @param BaselineProcess A strings naming the baseline process to report from the \code{\link{BootstrapNetCDF4Data}}. If a process with 
#' @param AggregationFunction The function to apply to each bootstrap run. This must be a function returning a single value.
#' @param BootstrapReportFunction The function to apply across bootstrap run, such as "cv" or "c".
#' @param Percentages The percentages to report Percentiles for when BootstrapReportFunction = "summaryStox".
#' @param AggregationWeightingVariable The variable to weight by in the \code{AggregationFunction}.
#' @param BootstrapReportWeightingVariable The variable to weight by in the \code{BootstrapReportFunction}.
#' @param NetCDF4ChunkSize The number of bootstraps to read in at each step in the report generation. Setting this to e.g. 100 will limit the memory occupied by the function to 100 bootstraps, allowing for report generation from practically unlimited nnumber of bootstraps.
#'
#' @details This function works in two steps. First, the \code{AggregationFunction} is applied to the \code{TargetVariable} of the table given by \code{BaselineProcess} for each unique combination of the \code{GroupingVariables} and for each bootstrap run. Second, a grid of all possible combinations of the \code{GroupingVariables} is formed and the result from the first step placed onto the grid. This creates 0 for each position in the grid where data from the first step are not present. E.g., if a particularly large fish is found in only one haul, and this haul by random is not selected in a bootstrap run, the \code{TargetVariable} will be 0 to reflect the variability in the data. To complete the second step, the \code{BootstrapReportFunction} is applied over the bootstrap runs for each cell in the grid.
#' 
#' The parameter \code{RemoveMissingValues} should be used with extreme caution. The effect of setting \code{RemoveMissingValues} to TRUE is that missing values (NAs) are removed in both the first and second step. This can be dangerous both in the first and in the second step. E.g., if the Abundance of \code{\link{SuperIndividualsData}} is positive for super-individuals with missing IndividualWeight, then the Biomass of those super-individuals will be missing as well. If one the wants to sum the Biomass by using \code{AggregationFunction} = "sum" one will get NA if \code{RemoveMissingValues} = FALSE. If \code{RemoveMissingValues} = TRUE one will ignore the missing Biomass, and the summed Biomass will only include the super-individuals that have non-missing IndividualWeight, effectively discarding a portion of the observed abundance. The summed Biomass will in this case be underestimated!
#' 
#' In the second step, setting \code{RemoveMissingValues} to TRUE can be even more dangerous, as the only option currently available for the \code{BootstrapReportFunction} is the function RstoxBase::summaryStox(), which includes average and standard deivation which are highly influenced by removing missing data.
#' 
#' Instead of setting \code{RemoveMissingValues} to TRUE, it is advised to apply the function \code{\link{ImputeSuperIndividuals}} to fill in e.g. IndividualWeight where missing. Missing values in the output from \code{\link{ReportBootstrap}} can also be avoided by adding variables to \code{GroupingVariables}, such as adding "Stratum" e.g. if there are strata that are known from Baseline to contain no fish. These strata will then be present but with missing values, but these missing values will not affect other strata if "Stratum" is included in \code{GroupingVariables}. It is also  recommended to include "Survey" and "SpeciesCategory" in the \code{GroupingVariables}, as these are key variables for which summary statistics should rarely be computed across.
#' 
#' @return
#' A \code{\link{ReportBootstrapNetCDF4Data}} object.
#' 
#' @export
#' 
ReportBootstrapNetCDF4 <- function(
    BootstrapNetCDF4Data, 
    BaselineProcess = character(), 
    TargetVariable = character(), 
    TargetVariableUnit = character(), 
    AggregationFunction = RstoxBase::getReportFunctions(getMultiple = FALSE), 
    BootstrapReportFunction = RstoxBase::getReportFunctions(getMultiple = TRUE), 
    Percentages = double(), 
    GroupingVariables = character(), 
    InformationVariables = character(), 
    Filter = character(), 
    RemoveMissingValues = FALSE, 
    AggregationWeightingVariable = character(), 
    BootstrapReportWeightingVariable = character(), 
    NetCDF4ChunkSize = Inf
) 
{
    
    AggregationFunction <- RstoxData::match_arg_informative(AggregationFunction)
    BootstrapReportFunction <- RstoxData::match_arg_informative(BootstrapReportFunction)
    
    if(!length(BootstrapNetCDF4Data)) {
        stop("The Bootstrap process must been run.")
    }
    
    # Issue a warning if RemoveMissingValues = TRUE:
    if(isTRUE(RemoveMissingValues)) {
        warning(RstoxBase::getRstoxBaseDefinitions("RemoveMissingValuesWarning")(TargetVariable))
    }
    
    #  Open the file:
    nc <- ncdf4::nc_open(unlist(BootstrapNetCDF4Data))
    on.exit(ncdf4::nc_close(nc))
    
    
    # Read the baseline process names:
    processNameAndTableName <- getProcessNameAndTableName(BaselineProcess, nc)
    processNameSlashTableName <- paste(processNameAndTableName, collapse = "/")
    
    
    # Read the relevant variables from the BootstrapNetCDF4Data file:
    toKeep <- c(TargetVariable, GroupingVariables, InformationVariables, AggregationWeightingVariable, BootstrapReportWeightingVariable)
    
    selection <- list(processNameSlashTableName, toKeep)
    
    
    
    # Get the data type:
    dataType <- getDataTypeFromBootstrapNetCDF4(nc, BaselineProcess)
    
    
    # Get the number of bootstraps:
    NumberOfBootstraps <- getNumberOfBootstrapsFromNetCDF4Data(nc, selection = selection)
    bootstrapIDs <- seq_len(NumberOfBootstraps)
    if(length(NetCDF4ChunkSize)) {
        bootstrapIDs <- split(bootstrapIDs, ceiling(bootstrapIDs / NetCDF4ChunkSize))
    }
    
    
    
    # Run the initial aggregation on each bootstrap replicated, reading from the file at each step instead of reading all steps and then aggregating:
    output <- lapply(bootstrapIDs, initialAggregateBootstrapNetCDF4DataOne, 
        nc = nc, 
        selection = selection, 
        AggregationFunction = AggregationFunction, 
        TargetVariable = TargetVariable, 
        TargetVariableUnit = TargetVariableUnit, 
        GroupingVariables = GroupingVariables, 
        InformationVariables = InformationVariables, 
        RemoveMissingValues = RemoveMissingValues, 
        AggregationWeightingVariable = AggregationWeightingVariable, 
        dataType = dataType
    )
    unit <- output[[1]]$unit
    output <-  lapply(output, "[[", "output")
    
    # Rbind the output:
    output <- data.table::rbindlist(output, idcol = "BootstrapID")
    
    
    
    
    # Get the name of the new TargetVariable:
    TargetVariableAfterInitialAggregation <- RstoxBase::getReportFunctionVariableName(
        functionName = AggregationFunction, 
        TargetVariable = TargetVariable
    )
    
    
    # Store the unique combinations of the GroupingVariables from the output:
    uniqueGroupingVariablesToKeep <- unique(subset(output, select = GroupingVariables))
    setorder(uniqueGroupingVariablesToKeep)
    
    # Run the report function of the bootstraps:
    output <- RstoxBase::aggregateBaselineDataOneTable(
        stoxData = output, 
        TargetVariable = TargetVariableAfterInitialAggregation, 
        aggregationFunction = BootstrapReportFunction, 
        GroupingVariables = GroupingVariables, 
        InformationVariables = InformationVariables, 
        na.rm = RemoveMissingValues, 
        padWithZerosOn = "BootstrapID", 
        WeightingVariable = BootstrapReportWeightingVariable, 
        SpecificationParameter = Percentages, 
        uniqueGroupingVariablesToKeep = uniqueGroupingVariablesToKeep
    )
    
    
    # Add the unit to the report:
    if(length(unit)) {
        output <- cbind(output, Unit = unit)
    }
    
    output <- RstoxBase::filterTable(output, filter = Filter)
    
    # Add the grouping and information variables as attributes, for use e.g. when plotting:
    attr(output, "GroupingVariables") <- GroupingVariables
    attr(output, "InformationVariables") <- InformationVariables
    
    return(output)
}


### ReportBootstrapNetCDF4 <- function(
###     BootstrapNetCDF4Data, 
###     BaselineProcess = character(), 
###     TargetVariable = character(), 
###     TargetVariableUnit = character(), 
###     AggregationFunction = RstoxBase::getReportFunctions(getMultiple = FALSE), 
###     BootstrapReportFunction = RstoxBase::getReportFunctions(getMultiple = TRUE), 
###     Percentages = double(), 
###     GroupingVariables = character(), 
###     InformationVariables = character(), 
###     Filter = character(), 
###     RemoveMissingValues = FALSE, 
###     AggregationWeightingVariable = character(), 
###     BootstrapReportWeightingVariable = character()
### ) 
### {
###     
###     AggregationFunction <- RstoxData::match_arg_informative(AggregationFunction)
###     BootstrapReportFunction <- RstoxData::match_arg_informative(BootstrapReportFunction)
###     
###     if(!length(BootstrapNetCDF4Data)) {
###         stop("The Bootstrap process must been run.")
###     }
###     
###     # Issue a warning if RemoveMissingValues = TRUE:
###     if(isTRUE(RemoveMissingValues)) {
###         warning(RstoxBase::getRstoxBaseDefinitions("RemoveMissingValuesWarning")(TargetVariable))
###     }
###     
###     
###     #  Open the file:
###     nc <- ncdf4::nc_open(unlist(BootstrapNetCDF4Data))
###     on.exit(ncdf4::nc_close(nc))
###     
###     
###     # Read the baseline process names:
###     processNameAndTableName <- getProcessNameAndTableName(BaselineProcess, nc)
###     processNameSlashTableName <- paste(processNameAndTableName, collapse = "/")
###     
###     
###     # Read the relevant variables from the BootstrapNetCDF4Data file:
###     toKeep <- c(TargetVariable, GroupingVariables, InformationVariables, AggregationWeightingVariable, BootstrapReportWeightingVariable)
###     
###     
###     relevantBootstrapData <- getBootstrapNetCDF4Data(nc, selection = list(processNameSlashTableName, toKeep), BootstrapIDStart = 1, BootstrapIDEnd = Inf)
###     
###     # Unlist to the specific table:
###     relevantBootstrapData <- relevantBootstrapData[[processNameAndTableName]]
###     
###     # Get the data type:
###     dataType <- getDataTypeFromBootstrapNetCDF4(nc, BaselineProcess)
###     
###     # Set the unit of the target variable:
###     relevantBootstrapData[[TargetVariable]] <- RstoxBase::setUnitRstoxBase(
###         relevantBootstrapData[[TargetVariable]], 
###         dataType =  dataType, 
###         variableName = TargetVariable, 
###         unit = TargetVariableUnit
###     )
###     
###     
###     output <- aggregateRelevantBootstrapData(
###         relevantBootstrapData = relevantBootstrapData, 
###         AggregationFunction = AggregationFunction, 
###         TargetVariable = TargetVariable, 
###         GroupingVariables = GroupingVariables, 
###         InformationVariables = InformationVariables, 
###         RemoveMissingValues = RemoveMissingValues, 
###         AggregationWeightingVariable = AggregationWeightingVariable, 
###         BootstrapReportFunction = BootstrapReportFunction, 
###         BootstrapReportWeightingVariable = BootstrapReportWeightingVariable, 
###         Percentages = Percentages, 
###         Filter = Filter
###     )
###     
###     
###     return(output)
### }



getProcessNameAndTableName <- function(BaselineProcess, nc) {
    # Read the baseline process names:
    processNamesAndTableNames <- getProcessNamesAndTableNames(nc)
    processNames <- unique(processNamesAndTableNames$processName)
    if(! BaselineProcess %in% processNames) {
        # If the BootstrapNetCDF4Data is empty, stop:
        if(!length(processNames)) {
            warning("Empty BootstrapNetCDF4Data.")
            return(NULL)
        }
        else {
            stop("The BaselineProcess ", BaselineProcess, " is not one of the outputs of the Bootstrap. Possible values are ", paste(processNames, collapse = ", "), ".")
        }
    }
    else {
        # Expand the BaselineProcess with the table name, where for multi table outputs with Data and Resolution the Data are used:
        processNamesAndTableNames <- subset(processNamesAndTableNames, processName %in% BaselineProcess)
        
        if(nrow(processNamesAndTableNames) == 1) {
            return(unlist(processNamesAndTableNames))
        }
        else if("Data" %in% processNamesAndTableNames$tableName) {
            return(c(processNamesAndTableNames$processName[1], "Data"))
        }
        else {
            stop("StoX: For multi-table process requested by the BaselineProcess, the table \"Data\" must be present.")
        }
    }
}














initialAggregateBootstrapNetCDF4DataOne <- function(
    bootstrapID, 
    nc, 
    selection, 
    AggregationFunction, 
    TargetVariable, 
    TargetVariableUnit,
    GroupingVariables, 
    InformationVariables, 
    RemoveMissingValues, 
    AggregationWeightingVariable, 
    dataType
) {
    
    # Get the table of current bootstrapID:
    relevantBootstrapNetCDF4DataOne <- getBootstrapNetCDF4Data(nc, selection = selection, BootstrapIDStart = min(bootstrapID), BootstrapIDEnd = max(bootstrapID), dropList = TRUE)
    
    # Set the unit of the target variable:
    relevantBootstrapNetCDF4DataOne[[TargetVariable]] <- RstoxBase::setUnitRstoxBase(
        relevantBootstrapNetCDF4DataOne[[TargetVariable]], 
        dataType =  dataType, 
        variableName = TargetVariable, 
        unit = TargetVariableUnit
    )
    
    # Store the unit for output:
    unit <- RstoxData::getUnit(relevantBootstrapNetCDF4DataOne[[TargetVariable]], property = "shortname")
    
    
    # Run the initial aggregation (only applicable for functions with output of length 1):
    output <- RstoxBase::aggregateBaselineDataOneTable(
        stoxData = relevantBootstrapNetCDF4DataOne, 
        TargetVariable = TargetVariable, 
        aggregationFunction = AggregationFunction, 
        GroupingVariables = GroupingVariables, 
        InformationVariables = InformationVariables, 
        na.rm = RemoveMissingValues, 
        WeightingVariable = AggregationWeightingVariable
    )
    
    output <- list(
        output = output, 
        unit = unit
    )
    
    return(output)
}

    
StoXProject/RstoxFramework documentation built on Oct. 17, 2023, 1:24 p.m.