# R/constructor.R

# GCAMFigs() --------------------------------------------------------------
#' Constructs figure generating objects
#'
#' An rgcam project file is restructured so that queries are the top layer of a
#' list, with each entry holding that query's data for all scenarios.
#'
#' Any queries that have a "ghg" column are separated by GHG. See
#' breakoutGHGs() for details.
#'
#' The processed project data is assigned a class according to the queries it
#' holds. Each column of the package file "classes.csv" lists the queries that
#' compose one class. Currently throws an error if the project data matches
#' more than one query-class because this would disrupt method dispatch.
#'
#' The final step is to calculate whatever additional queries are possible
#' given those already contained in project data. For example, in the land
#' class, if queries "Land Allocation" and "Ag Production by Crop Type" are
#' present, "Average Yield" will be calculated automatically. See
#' calculateQueries() in script calculations.R for the queries that can be
#' calculated for each class.
#'
#' Note that the set of queries is taken from the first scenario only; queries
#' that appear solely in later scenarios are not picked up.
#'
#' @param proj rgcam project, a list of scenarios
#' @param transf list of functions, each named for query it restructures
#' @return list of dataframes, one entry per query, with the matched
#'   query-class (if any) prepended to its class vector
#' @export
GCAMFigs <- function(proj, transf) {

  # check for missing input. stop() is R's error signal; the previously used
  # error() does not exist, so callers never saw these messages
  if (missing(proj)) {
    stop("Need to provide rgcam project file")
  }
  if (missing(transf)) {
    stop("Need to provide list of transformation functions")
  }
  # proj[[1]] below would otherwise fail with an opaque subscript error
  if (length(proj) == 0) {
    stop("rgcam project file contains no scenarios")
  }

  # grab scenarios and queries from project file
  scenarios <- names(proj)
  queries <- names(proj[[1]]) # TODO: check we have queries for all scenarios

  # pull queries to top-level of list. transformation functions applied to
  # queries
  list.queries <- dataRestructure(proj, transf, scenarios, queries)

  # make groups of GHG's into own queries
  list.queries <- breakoutGHGs(list.queries)

  # locate the class lookup table. system.file() returns "" when the file is
  # not found, and read.csv("") would then try to read from stdin
  lookup.path <- system.file("classes.csv", package = "ValidationFigures")
  if (!nzchar(lookup.path)) {
    stop("Could not find 'classes.csv' in package ValidationFigures")
  }
  # each column holds queries for an individual class
  lookup <- read.csv(lookup.path)

  # classes whose column lists at least one query present in the project data
  # (matched against the query names as they appear in the project file)
  matched <- Filter(
    function(qclass) any(queries %in% lookup[[qclass]]),
    names(lookup)
  )

  # throw error if more than one query-class identified by lookup
  # TODO: allow multiple query-classes ->
  #  execute all relevent methods of calculateQueries()
  #  execute all relevent methods of barchart()
  if (length(matched) > 1) {
    stop("Queries in project file belong to multiple classes! Check system.file('classes.csv')")
  }

  # class(list.queries) now contains the matched query-class (if any)
  # followed by "list"
  class(list.queries) <- c(matched, class(list.queries))

  # add calculated queries to list
  print("Calculating queries...")
  list.queries <- calculateQueries(list.queries)
  print("Done!")

  # return final figure generation object
  list.queries

}

# dataRestructure() -------------------------------------------------------
#' Pulls queries to top level of project data
#'
#' Used in GCAMFigs() to initialize figure generation object. Native rgcam
#' project data is structured as a list of scenarios, where each scenario is a
#' list of queries. Project data needs to be structured as a list of queries,
#' where each query is a single dataframe. Query data for all scenarios needs
#' to be saved as a single data.frame in order to produce plots that compare
#' scenarios.
#'
#' User-provided transformation functions can aggregate over unnecessary
#' columns, reformat columns (eg: split on "_"), or map the keys in a column to
#' a different set of keys with a mapping file.
#'
#' A scenario that does not hold a given query is logged and contributes no
#' rows to that query. An error is raised if a query that is present in a
#' scenario has no matching function in transf.
#'
#' @param proj rgcam project, a list of scenarios
#' @param transf list of functions, each named for query it restructures
#' @param scenarios character vector, scenarios in project data
#' @param queries character vector, queries in project data
#' @return list of dataframes, named by query
dataRestructure <- function(proj, transf, scenarios, queries) {

  print("Pulling queries...")

  # make top level a list of queries, subsuming scenarios into the entries
  list.queries <- lapply(queries, function(query) {

    list.scenarios <- lapply(scenarios, function(scenario) {

      # print log if scenario doesn't hold query; NULL entries are skipped
      # when the scenarios are bound together below
      if (!query %in% names(proj[[scenario]])) {
        print(paste0("...", query, " not found in ", scenario))
        return(NULL)
      }

      # fail with a message that names the query instead of R's generic
      # "attempt to apply non-function"
      transform <- transf[[query]]
      if (!is.function(transform)) {
        stop("No transformation function provided for query '", query, "'")
      }

      # transformed query data for scenario
      print(paste0("...", query, ", ", scenario))
      transform(proj[[scenario]][[query]])
    })

    # bind all scenarios for same query
    bind_rows(list.scenarios)
  })

  print("Queries pulled!")

  # named list of queries
  names(list.queries) <- queries

  list.queries
}

# breakoutGHGs() ----------------------------------------------------------
#' Reformats emissions queries to be grouped by GHG
#'
#' Some pollutant species appear as three versions ("CH4", "CH4_AGR",
#' "CH4_AWB"). This data would originally appear under a query such as
#' "GHG emissions by region". This function reformats list.queries into
#' separate emissions queries for each GHG. "GHG emissions by region (CH4)"
#' would be its own entry in list.queries, while still distinguishing between
#' the three species under the ghg column of that query's dataframe.
#'
#' SO2 appears to have 4 variants: SO2_1, SO2_1_AWB, SO2_2, ..., SO2_4_AWB.
#' These 8 species are all contained in the "GHG emissions by region (SO2)"
#' query after this function is applied to list.queries.
#'
#' The 'base' GHG of a species is everything before its first "_". Rows are
#' grouped by exact equality of their base, so a base such as "CO" does not
#' also pick up "CO2" rows, and "OC" does not pick up "NMVOC" rows.
#'
#' Queries without a "ghg" column are left untouched. A query whose ghg column
#' yields no base at all (no rows, or only NA) is also left untouched. Rows
#' with an NA ghg are not assigned to any group.
#'
#' @param list.queries list of queries, each query a single data.frame
#' @return list of dataframes; each query with a "ghg" column is replaced by
#'   one entry per base GHG, appended to the end of the list
breakoutGHGs <- function(list.queries) {

  # grab list of queries (snapshot taken before the list is modified)
  queries <- names(list.queries)

  for (query in queries) {
    # grab query entry
    df <- list.queries[[query]]

    # only queries with a ghg column are regrouped
    if (!"ghg" %in% names(df)) {
      next
    }

    # base ghg for every row (CH4 is the base for CH4, CH4_AWB, and CH4_AGR)
    species <- as.character(df$ghg)
    row.base <- sub("_.*$", "", species)
    ghgs.base <- unique(row.base[!is.na(row.base)])

    # nothing to break out; keep the original entry rather than dropping it
    if (length(ghgs.base) == 0) {
      next
    }

    # replace single emissions query with multiple emissions queries,
    # separated by GHG
    list.queries[[query]] <- NULL

    # populate list.queries with emissions broken out by ghg.base
    for (ghg.base in ghgs.base) {

      # rows whose base is exactly ghg.base (not a substring/regex match)
      in.group <- !is.na(row.base) & row.base == ghg.base

      # log message showing all ghgs associated w/ ghg.base
      print(paste0(ghg.base, ": ",
                   paste0(unique(species[in.group]), collapse = ", ")))

      # grab rows that pertain to ghg.base
      df.filt <- df[in.group, , drop = FALSE]
      rownames(df.filt) <- NULL

      # construct new emissions query title
      new_query <- paste0(query, " (", ghg.base, ")")

      # drop filtered emissions query into new emissions query title
      list.queries[[new_query]] <- df.filt
    } # end for(ghgs.base) loop

    # log message
    print(paste0("Regrouped GHG's in ", query))

  } # end for(queries) loop

  # return project data
  list.queries
}

# End ---------------------------------------------------------------------
# xavier-gutierrez/validation_figures documentation built on May 24, 2019, 9:58 p.m.