R/manageProjectData.R

Defines functions getRundates listQueries dropScenarios listScenarios mergeProjects saveProject loadProject

Documented in dropScenarios getRundates listQueries listScenarios loadProject mergeProjects saveProject

################################################################################
#### manageProjectData.R:  Functions for managing project data
################################################################################

#' Load a project data file.
#'
#' This function will load a project data file and return the project data
#' structure.  Project files can be created with \code{\link{addScenario}}.
#'
#' Project data is stored as an R 'save' file with a single variable called
#' 'prjdata'.  If the data file doesn't have the expected variable in it, the
#' function will throw an error.  It also does some minimal checks to ensure
#' that the data is valid, but there is a limit to what it can do.
#' Specifically, it checks to see that the structure loaded is:
#'     1. a list,
#'     2. with at least one element, and
#'     3.   all of those elements are lists,
#'     4.   with at least one element, and
#'     5.     all of those elements are data frames,
#'     6.     with a scenario column, and
#'     7.     at least one other column.
#' If the validation fails, the error message will indicate which of these steps
#' it failed on.
#'
#' Despite these checks, it is possible to construct a data set that passes and
#' yet still contains bad data.  When in doubt load the file directly and check
#' to see that it contains the data you expect it to.
#' @param proj Project to add extracted results to.  Can be either a project
#' data structure or the name of a project data file.  The file will be created
#' if it doesn't already exist.
#' @return The project dataset which may or may not have acutally had to be loaded.
#' @export
loadProject <- function(proj) {

    if(is.character(proj)) {
        projFile <- proj
        if(file.exists(projFile)) {
            #projFile <- normalizePath(projFile)
            if(file.access(projFile, mode=6)!=0) { # 6 == read and write permission
                ## file.access returns 0 on success
                stop("File ", projFile,
                     " exists but lacks either read or write permission.")
            }
            load(projFile)                  # Loads the variable prjdata.
            if(!exists("prjdata", inherits=FALSE)) {
                ## Something went wrong with the load.  Probably projFile exists but
                ## isn't a valid project file.
                message(paste("File", projFile,
                              "does not contain valid project data."))
                message("Try loading the file into an R session and verify that it contains the variable 'prjdata'.")
                stop("Unable to load project file ", projFile)
            }

            stat <- project.valid(prjdata)
            if(stat != 0) {
                stop("Invalid project data in ",
                     projFile,".  Validation failed at step", stat)
            }

            ## Set the 'file' attribute so that if a user attempts to save the project
            ## again it can know where to save it.
            attr(prjdata, 'file') <- projFile
        }
        else {
            prjdata <- list()
            attr(prjdata, 'file') <- projFile
        }
    }
    else if(project.valid(proj) %in% c(0, 2)) {
        # This is an already loaded and valid project so we can just use it as is
        prjdata <- proj
    }
    else {
        stop("loadProject: invalid object passed as proj argument; proj must be a filename or project data object.")
    }

    prjdata
}

#' Save project data to a backing file.
#'
#' Write a project data structure to its data file.  This should rarely be
#' necessary, since most of the functions that modify a project file have an
#' option to write the data back to the file.  Mostly it is intended for
#' internal use, but it's exported in case it's useful to someone for some
#' reason.
#'
#' The project data structure includes the name of its backing file, which is
#' why you can call this function with no file argument.  When you save with an
#' explicit file name, the function returns a version of the data structure with
#' an updated file attribute.  If you want the data structure in the calling
#' frame to have its backing file updated, you have to assign this return value
#' back to the original object; otherwise you will write a copy of the data set,
#' but your working copy will continue to be backed by the original file.
#'
#' Note: when the project is written to disk the file attribute is stripped first
#' so users can safely move, rename, share a project file without any problems.
#'
#' For example:
#' \preformatted{
#' > prj <- loadProject(file1.dat)
#' > saveProject(prj, 'file2.dat')       # prj is still backed by 'file1.dat'
#' ### ... modify prj ...
#' > saveProject(prj)                    # new data written to 'file1.dat'
#' > prj <- saveProject(prj,'file3.dat') # prj now backed by 'file3.dat'
#' ### ... modify prj again ...
#' > saveProject(prj)                    # newest data written to 'file3.dat'
#' }
#' @param prjdata Project data object.
#' @param file Filename to save to.  If \code{NULL}, use the file the project
#' was loaded from.
#' @param compress The compress param to be passed on to \code{base::save}. Given
#' we call this method indirectly from so many methods if not explicitly provided
#' then we check the global options \code{options("rgcam.saved_compressed")} and if
#' that is not set we fall back to the default behavior which continues to be "xz".
#' @export
saveProject <- function(prjdata, file=NULL, compress=NULL) {
    ## validate data first
    stat <- project.valid(prjdata)
    if(stat != 0) {
        stop("saveProject:  invalid project data object, stat=", stat)
    }
    if(is.null(file)) {
        file <- attr(prjdata, 'file')
    }

    # determine if we should compress the project file on save
    # if the user did not explicitly provide a value we will check
    # the global option "rgcam.saved_compressed"
    if(is.null(compress)) {
        compress <- options("rgcam.saved_compressed")[[1]]
        # if it is still not provided we fall back to the default which
        # is "xz"
        if(is.null(compress)) {
            compress = "xz"
        }
    }

    # strip file attribute before saving so the data file can be moved or
    # renamed and not cause any issues when it is reloaded.
    attr(prjdata, 'file') <- NULL
    save(prjdata, file=file, compress=compress)
    # add the file attribute back on so users can continue to use it as
    # before.
    # Note if the file param was specified this is implicitly reseting
    # the file attribute on prjdata
    attr(prjdata, 'file') <- file
    invisible(prjdata)
}

#' Merge a list projects into a single project
#'
#' Users can use this function to collapse multiple project into a single
#' project.  This could be useful for instance to import someone else's
#' data into your project.  A user must explicitly specify the new name
#' the new project will recieve.  In addition they can control what
#' happens with scenario/query collisions with the \code{clobber} param.
#'
#' @param prjname The name of a project data file for the merged project.
#' @param prjlist A list of projects that need to be merged together.  Note
#' each project will be run through \code{\link{loadProject}} in case it has
#' not yet been loaded.
#' @param clobber If \code{TRUE}, overwrite any existing scenario of the same
#' name; otherwise, fail if scenario/query already exists in the data set.
#' @param saveProj A flag to save the project to disk after data has been added.
#' A user may want to avoid it if they are for instance calling this method several
#' times and would prefer to save at the end.  Users can always save at anytime by
#' calling \code{saveProject}.
#' @return The project dataset with the projects merged.
#' @export
mergeProjects <- function(prjname, prjlist, clobber=FALSE, saveProj=TRUE) {
    finalproj <- loadProject(prjname)

    # for loops!
    for(prj in prjlist) {
        prjdata <- loadProject(prj)
        for(scn in names(prjdata)) {
            for(qn in names(prjdata[[scn]])) {
                if(!clobber && !is.null(finalproj[[scn]]) && !is.null(finalproj[[scn]][[qn]])) {
                    warning(paste("Skipping data in",scn,"/",qn,"as clobber is false."))
                } else {
                    finalproj[[scn]][[qn]] <- prjdata[[scn]][[qn]]
                }
            }
        }
    }

    if(saveProj) {
        saveProject(finalproj)
    }

    finalproj
}

#' List the scenarios in a project data set
#'
#' Return the names of the scenarios available in a project data set.  The input
#' can be either the name of a file containing the data set, or the data
#' structure previously loaded from such a file.
#'
#' @param projData The data set to report on.
#' @export
listScenarios <- function(projData) {
    if(is.character(projData)) {
        pd <- load(projData)
    }
    else {
        pd <- projData
    }

    names(pd)
}

#' Remove a scenario from a project data set
#'
#' This function removes one or more scenarios from a project data set and
#' returns the newly modified data.  If this operation is performed on a file,
#' the file is also modified.  If it is performed on a data set that has already
#' been loaded, then the change is passed onto the file only if \code{writeback}
#' is set to TRUE.
#'
#' Technically is is possible to pass a filename for \code{proj} with
#' \code{writeback} set to \code{FALSE}.  This usage will cause the data set to
#' be loaded and filtered without modifying the original, albeit in a somewhat
#' nonintuitive way.
#'
#' @param proj Project data or data file name.
#' @param scenarios Name(s) of the scenario(s) to drop
#' @param invert If \code{TRUE} then delete all scenarios \emph{except} the ones
#' given in \code{scenarios}.
#' @param writeback If \code{TRUE} then write the change into the backing data
#' file.
#' @export
dropScenarios <- function(proj, scenarios, invert=FALSE, writeback=is.character(proj)) {
    writeback <- writeback              # magic!
    if(is.character(proj)) {
        proj <- loadProject(proj)
    }
    else {
        proj <- proj
    }

    for(scen in listScenarios(proj)) {
        if(invert && !(scen %in% scenarios) )
            proj[[scen]] <- NULL
        else if(!invert && scen %in% scenarios)
            proj[[scen]] <- NULL
    }

    if(writeback) {
        saveProject(proj)
    }

    proj
}

#' List the queries available for a scenario
#'
#' Return the names of the queries available for a scenario in a project data
#' set.  Unlike \code{\link{listScenarios}}, this function requires the data set
#' to have been previously loaded, so it cannot take a file name.
#'
#' @param projData The data set to report on.
#' @param scenarios The name(s) of the scenario(s) to report on.  If NULL,
#' report on all of them.
#' @param anyscen If \code{TRUE}, then list queries that are in any scenario.
#' If \code{FALSE}, list queries that are in all scenarios.
#' @export
listQueries <- function(projData, scenarios=NULL, anyscen=TRUE) {
    if(is.character(projData)) {
        stop("This function requires the data set to have been already loaded.")
    }

    if(is.null(scenarios)) {
        scenarios <- listScenarios(projData)
    }
    else {
        scenok <- scenarios %in% names(projData)
        if(!all(scenok)) {
            for(scen in scenarios[!scenok])
                warning('listQueries: Scenario ', scen,
                        ' is not in this data set.')
            scenarios <- scenarios[scenok]
            if(length(scenarios) == 0)
                stop('listQueries: No valid scenarios given.')
        }
    }

    ## fetch the names of the valid scenarios
    sqlist <- lapply(scenarios, function(scen) {names(projData[[scen]])})

    combine <- if(anyscen) union else intersect
    Reduce(combine, sqlist)
}


#' Return the run date for one or more scenarios in a data set.
#'
#' The run dates are recorded in the tables produced by the GCAM Model Interface
#' and copied into the project data set on import.  This function retrieves the
#' run dates for the selected scenarios and returns them as a named vector.
#'
#' @param projData The data set to report on.
#' @param scenarios The names of the scenarios
#' @export
getRundates <- function(projData, scenarios=NULL)
{
    if(is.null(scenarios))
        scenarios <- listScenarios(projData)

    ## Calls like sapply (annoyingly) convert date objects to numeric, losing
    ## the information about the epoch.  This should give us a named vector of
    ## POSIXct objects.
    datevec <-
        lapply(scenarios, function(scen) {attr(projData[[scen]], 'date')}) %>%
            do.call(c,.)
    names(datevec) <- scenarios
    datevec
}

#' Retrieve a query for one or more scenarios
#'
#' Return a data frame with the results for a query for all of the selected
#' scenarios.
#'
#' @param projData The data set to extract from.
#' @param query The name of the query to extract.
#' @param scenarios Vector of scenario names.  If NULL, use all scenarios in the
#' data set.
#' @importFrom dplyr bind_rows
#' @export
getQuery <- function(projData, query, scenarios=NULL) {
    if(is.null(scenarios)) {
        scenarios <- listScenarios(projData)
    }

    if(! query %in% listQueries(projData))
        stop('getQuery: Query ', query,
             ' is not in any scenarios in the data set.')

    queries <- lapply(scenarios, function(s) {projData[[s]][[query]]})

    bind_rows(queries)
}

#' Drop specified queries from scenarios.
#'
#' This function removes the specified queries from a data set.  By default the
#' queries are removed from every scenario they appear in, but the operation can
#' be limited to certain scenarios if desired.  If the operation is performed on
#' a file, the file is also modified.  If it is performed on a data set that has
#' already been loaded, then the change is written back to the file only if
#' \code{writeback} is set to TRUE.  Either way, the modified data set is
#' returned.
#'
#' As with \code{\link{dropScenarios}}, it is possible and perhaps occasionally
#' useful to specify a file but to force the file not to be updated by
#' specifying \code{writeback=FALSE}.
#'
#' @param proj Project data or data file name.
#' @param queries Name(s) of the queries to drop.
#' @param invert If \code{TRUE} then delete all queries \emph{except} the ones
#' in \code{queries}.
#' @param writeback If \code{TRUE} then write the change to the backing data
#' file.
#' @param scenarios Drop the queries only from the specified scenarios (default
#' is to drop from all scenarios).
#' @param invertScenario Drop queries from the scenarios \emph{not} listed in
#' \code{scenarios}.
#' @export
dropQueries <- function(proj, queries, invert=FALSE,
                        writeback=is.character(proj), scenarios=NULL,
                        invertScenario=FALSE) {
    writeback <- writeback
    if(is.character(proj)) {
        proj <- loadProject(proj)
    }

    if(is.null(scenarios)) {
        scenarios <- listScenarios(proj)
    }
    if(invertScenario) {
        allscen <- listScenarios(proj)
        scenarios <- allscen[! allscen %in% scenarios]
    }

    for(scen in scenarios) {
        n <- names(proj[[scen]])
        rd <- attr(proj[[scen]], 'date')
        ## This line picks the queries to keep.  If invert==TRUE, then that's
        ## the queries in the list; otherwise it's the ones not in the list.
        proj[[scen]] <- proj[[scen]][(n %in% queries) == invert]
        ## Have to restore the date attribute
        attr(proj[[scen]], 'date') <- rd
    }

    if(writeback) {
        saveProject(proj)
    }

    proj
}

#' Check whether a project data structure is valid.
#'
#' This function will check the 7 conditions described in
#' \code{\link{loadProject}}.  It will return 0 if they pass, or the number of
#' the first condition to fail otherwise.
#' @param prjdata The data structure to test.
#' @keywords internal
project.valid <- function(prjdata) {
    if(!is.list(prjdata))
        return(1)

    if(length(prjdata) == 0)
        return(2)

    scenario.checks <- sapply(prjdata, scenario.valid)
    if(any(scenario.checks>0))
        return(min(scenario.checks[scenario.checks>0]))
    else
        return(0)
}

#' Check whether a scenario component in a project data structure is valid.
#'
#' This function will check conditions 3-7 for a single scenario component in a
#' project data structure.  It will return the number of the first check to
#' fail, or 0 if all pass.
#'
#' @param scendata The data structure to test
#' @keywords internal
scenario.valid <- function(scendata) {
    if(!is.list(scendata))
        return(3)

    if(length(scendata) == 0)
        return(4)

    query.checks <- sapply(scendata, query.valid)
    if(any(query.checks>0))
        return (min(query.checks[query.checks>0]))
    else
        return(0)
}

#' Check whether a query table in a scenario component structure is valid.
#'
#' This function will check conditions 5-7 for a single query table in a
#' scenario data structure.  It will return the number of the first check to
#' fail, or 0 if all pass.
#' @param querytable The query table to test.
#' @keywords internal
query.valid <- function(querytable) {
    if(!is.data.frame(querytable))
        return(5)

    if(! 'scenario' %in% names(querytable))
        return(6)

    if(ncol(querytable) < 2)
        return(7)
    else
        return(0)
}
JGCRI/rgcam documentation built on July 2, 2022, 10:20 a.m.