R/pipeline-helpers.R

Defines functions gather_years gdp_korea_deflator gdp_deflator missing_data unprotect_integer_cols protect_integer_cols standardize_iso change_iso_code repeat_add_columns approx_fun fast_left_join left_join_keep_first_only left_join_error_no_match PH_year_value_historical

Documented in approx_fun change_iso_code fast_left_join gather_years gdp_deflator left_join_error_no_match left_join_keep_first_only missing_data protect_integer_cols repeat_add_columns standardize_iso unprotect_integer_cols

# pipeline-helpers.R

# Pipeline shortcuts we use a lot

# Make sure year and value are numeric, and within historical years
PH_year_value_historical <- function(d) {
  year <- value <- NULL                 # silence notes in package check
  d %>%
    mutate(year = as.numeric(year),
           value = as.numeric(value)) %>%
    filter(year %in% HISTORICAL_YEARS)
}


#' left_join_error_no_match
#'
#' A restrictive version of \code{\link{left_join}}.
#'
#' @param d Data frame (typically from pipeline)
#' @param ... Rest of call to \code{\link{left_join}}
#' @param ignore_columns Optional column name(s) to ignore, character vector
#' @return Joined data.
#' @details Restrictive version of dplyr::left_join meant for replacing `match` calls.
# Ensures that number of rows of data doesn't change, and everything has matched data.
#' @export
left_join_error_no_match <- function(d, ..., ignore_columns = NULL) {
  assertthat::assert_that(tibble::is.tibble(d))
  dnames <- names(d)
  drows <- nrow(d)
  d <- left_join(d, ...)
  if(nrow(d) != drows) {
    stop("left_join_no_match: number of rows in data changed")
  }
  names_to_check <- setdiff(names(d), dnames) %>% setdiff(ignore_columns)
  if(any(is.na(d[names_to_check]))) {
    stop("left_join_no_match: NA values in new data columns")
  }
  d
}


#' Compute a left join, taking only the first match.
#'
#' In an ordinary \code{\link{left_join}}, if a row in the left operand has
#' multiple matches in the right operand, you get a copy of the row for each
#' match in the right operand.  Sometimes you want just one arbitrary member of
#' the matching set.  This could be because the right operand is a one-to-many
#' mapping, and you don't care which one you get (but you want only one), or it
#' could be that you're trying to reproduce the behavior of legacy code that
#' uses \code{\link{match}}, which has this behavior.  This function performs
#' such a join.
#'
#' This function performs a left join, except that if the right operand has
#' multiple matches for a row in the left operand, \emph{only} the first match
#' is kept.  \strong{Use this function with caution.}  The results will depend
#' on the order of the rows in the right operand, meaning that seemingly
#' innocuous changes can produce changes in output.  Consider yourself warned.
#'
#' @param x Left table to join
#' @param y Right table to join
#' @param by Vector of id columns.  Unlike in other join variants, these must be
#' supplied explicitly.
#' @return Joined table.  In case of multiple matches, only the first will be
#' included.
left_join_keep_first_only <- function(x, y, by) {
  ## Our strategy is to use "distinct" to filter y to a single element for
  ## each match category, then join that to x.
  . <- NULL                           # silence notes on package check
  ll <- as.list(by)
  names(ll) <- NULL
  do.call(distinct_, c(list(y), ll, list(.keep_all = TRUE))) %>%
    left_join(x, ., by = by)
}


#' Fast left join for large tables
#'
#' The dplyr join functions are a little on the slow side for very large
#' tables.  This version converts its inputs
#' \code{\link[data.table]{data.table}} structures, and uses that package's
#' faster indexing capabilities to do a faster join.
#'
#' Because there is some overhead associated with setting up and indexing the
#' data.table structures, this function is only useful when the right-side table
#' is big enough that the savings in the join to make up for the overhead.
#' Therefore, this function should only be used for joins that are demonstrably
#' causing bottlenecks due to the size of the tables involved.  This version
#' should \emph{never} be the first choice in development.  As a rule of thumb,
#' any join that is taking more than 500ms using the dplyr join functions is a
#' candidate for this function.
#'
#' When using this function, be aware that data.table has some slightly
#' different conventions for handling duplicated columns that are not being
#' joined on.  Suppose we have tables \code{A} and \code{B}, both of which have
#' a column \code{value} that is not being joined on.  Then,
#' \code{AB <- dplyr::left_join(A, B)} will have columns \code{AB$value.x} with
#' the values from table \code{A} and \code{AB$value.y} with the values from
#' table \code{B}.  In \code{AB <- gcamdata::fast_left_join(A, B)}, the
#' corresponding columns will be \code{AB$i.value} for the values from table
#' \code{A}, and \code{AB$value} \emph{(sic)} for the values from table
#' \code{B}.  This function makes no attempt to correct the column names in the
#' result to conform to the dplyr convention, and is therefore not exactly a
#' drop-in replacement for \code{left_join}.  However, it is usually easy enough
#' to make corrections on the returned value.
#'
#' Since this function is intended only for specialized use, we don't provide
#' any of the other join variants like first-only or error-no-match.  The cases
#' where that extra functionality is needed \emph{and} the tables involved are
#' too large for the slower version of join are uncommon enough that they can be
#' handled on a case by case basis.  (That's documentation-speak for "You're on
#' your own.")
#'
#' @param left The left-side table to join.  Any class inheriting from
#' \code{data.frame} is acceptable.
#' @param right The right-side table to join.  Any class inheriting from
#' \code{data.frame} is acceptable.
#' @param by Character vector of column names to join by.
#' @return The left join of \code{left} and \code{right}.  It will be returned
#' as a \code{tbl_df}, irrespective of the type of the inputs.
#' @importFrom data.table data.table
#' @importFrom assertthat assert_that
#' @importFrom tibble as_tibble
fast_left_join <- function(left, right, by) {
  assert_that(is.data.frame(left))
  assert_that(is.data.frame(right))

  ## To key or not to key?  A key is required for the right table, but it is
  ## optional for the left, *provided* that the join columns are in order and
  ## come before the non-join columns.  Keying takes time, but it makes the
  ## join eventually go a little faster.  In the one example we have, it
  ## keying the left table doesn't seem to pay for itself in the join, but
  ## it's possible that depends on the specifics of the input.  For now we
  ## *won't* key, instead opting to reorder the columns of the left table.
  dtl <- data.table(left[ , union(by, names(left))])
  dtr <- data.table(right, key=by)

  as_tibble(dtr[dtl, allow.cartesian=TRUE])
}


#' approx_fun
#'
#' \code{\link{approx}} (interpolation) for use in a dplyr pipeline.
#'
#' @param year Numeric year, in a melted tibble or data frame
#' @param value Numeric value to interpolate
#' @param rule Rule to use; see \code{\link{approx}} and details
#' @details This was \code{gcam_interp} in the original data system.
#' @return Interpolated values.
#' @importFrom assertthat assert_that
#' @export
#' @examples
#' df <- data.frame(year = 1:5, value = c(1, 2, NA, 4, 5))
#' approx_fun(df$year, df$value, rule = 2)
approx_fun <- function(year, value, rule = 1) {
  assert_that(is.numeric(year))
  assert_that(is.numeric(value))

  if(rule == 1 | rule == 2) {
    tryCatch(stats::approx(as.vector(year), value, rule = rule, xout = year)$y,
             error = function(e) NA)

  } else {
    stop("Use fill_exp_decay_extrapolate!")
  }
}

#' repeat_add_columns
#'
#' Repeat a data frame for each entry in a second, binding the columns together.
#'
#' @param x Data frame (tibble) to repeat
#' @param y A copy of \code{x} is created for each row of this tibble
#' @return A repeated \code{x} with columns from \code{y} added.
#' @details This corresponds to \code{repeat_and_add_vector} in the old data system.
#' @importFrom assertthat assert_that
#' @author BBL
#' @export
#' @examples
#' x <- tibble::tibble(x = 1:3)
#' y <- tibble::tibble(y = c(4, 5), z = c(6, 7))
#' repeat_add_columns(x, y)
repeat_add_columns <- function(x, y) {
  UNIQUE_JOIN_FIELD <- NULL           # silence package checks.
  assert_that(tibble::is_tibble(x))
  assert_that(tibble::is_tibble(y))

  x %>%
    mutate(UNIQUE_JOIN_FIELD = 1) %>%
    full_join(mutate(y, UNIQUE_JOIN_FIELD = 1), by = "UNIQUE_JOIN_FIELD") %>%
    select(-UNIQUE_JOIN_FIELD)
}

#' Change a particular ISO code in a table to another code.
#'
#' A few countries have changed their ISO codes over time.  Usually this is because countries
#' split or merged; occasionally it's because they just didn't like the old one.  This function
#' allows us to insert an ISO code change into a pipeline.
#'
#' If you're dealing with a split or a merger, then making this change will generally create
#' duplicate rows, so you should follow it up with an appropriate aggregation.
#'
#' @param d The data to be transformed
#' @param from The old ISO code
#' @param to The replacement ISO code
#' @param col The name of the columns with the ISO codes (default = 'iso')
#' @return Tibble with the old iso code replaced by the new one.
#' @export
change_iso_code <- function(d, from, to, col = "iso") {
  assertthat::assert_that(tibble::is_tibble(d))
  iso <- d[[col]]
  d[[col]] <- if_else(iso == from, to, iso)
  d
}

#' Standardize the ISO column by converting to lower case and renaming to 'iso'
#'
#' Most of the processing functions expect ISO codes to be in lower case and in
#' a column called ISO.
#' @param d The data to be transformed.
#' @param col The column currently containing the iso codes (default = 'iso')
#' @param delete_original Delete original column? Logical
#' @return Tibble with the iso codes converted to lower case. If the column with
#'   the codes was not called 'iso', it will be renamed to 'iso'.
#' @export
standardize_iso <- function(d, col = "iso", delete_original = TRUE) {
  assertthat::assert_that(tibble::is_tibble(d))
  d[["iso"]] <- tolower(d[[col]])
  if(delete_original && col != "iso") {
    # This is surprisingly hard to do using dplyr
    d[[col]] <- NULL
  }
  d
}

#' Protect column names that are bare integers from being interpreted as integers
#'
#' \code{protect_integer_cols} will prepend an 'X' to column names that are bare
#' integers to protect them from being misinterpreted during sensitive
#' operations (see below).  \code{unprotect_integer_cols} will reverse the
#' effect, reverting the column names to their original form.
#'
#' Some of our data frames have column names that are years, such as "2005".  In
#' some cases a column name of this form can be misinterpreted as a column
#' index, the (likely nonexistent) 2005th column in this example.  The *_if
#' functions in dplyr are examples of such circumstances.  Protecting the column
#' names with a leading character allows these functions to perform normally.
#'
#' Much of the data system is expecting years in column names to be bare
#' integers; therefore, you should unprotect the column names as soon as the
#' sensitive operations are complete.
#' @param d The data to have integer column names protected or unprotected
#' @return Tibble with the integer column names protected
#' @export
#' @examples
#' library(magrittr)
#' df <- dplyr::tibble(iso=c('bad','dum'), `2005`=c(123.45, NA), `2050`=c(867, 5309))
#' protect_integer_cols(df) %>%
#'     dplyr::select_if(function(col) {!any(is.na(col))}) %>%
#'     unprotect_integer_cols
protect_integer_cols <- function(d) {
  assertthat::assert_that(tibble::is_tibble(d))
  names(d) <- sub('^([0-9]+)$', 'X\\1', names(d))
  d
}

#' @rdname protect_integer_cols
#' @export
unprotect_integer_cols <- function(d) {
  assertthat::assert_that(tibble::is_tibble(d))
  names(d) <- sub('^X([0-9]+)$', '\\1', names(d))
  d
}


#' missing_data
#'
#' @return A tibble used to signal missing (not created) data
missing_data <- function() {
  tibble(x = NA_real_) %>%
    add_title("Data not created") %>%
    add_units("Data not created") %>%
    add_comments("Data not created") %>%
    add_flags(FLAG_NO_TEST, FLAG_NO_OUTPUT)
}

#' Calculate a gross domestic product (GDP) implicit price deflator between two years.
#'
#' The GDP deflator is a measure of price inflation with respect to a
#' specific base year; it allows us to back out the effects of inflation when we
#' compare prices over time.  This function calculates a deflator given a base
#' year (the year to convert from) and a conversion year (the year to convert
#' to).  To use the deflator, multiply prices in base-year dollars by the deflator; the
#' result will be prices in the converted dollar year.
#'
#' @param year Year to convert TO.
#' @param base_year Year to convert FROM.
#' @return GDP Deflator.  Multiply to convert FROM \code{base_year} dollars TO
#' \code{year} dollars.
#' @source U.S. Bureau of Economic Analysis, Gross domestic product (implicit
#' price deflator) [A191RD3A086NBEA], retrieved from FRED, Federal Reserve Bank
#' of St. Louis; https://fred.stlouisfed.org/series/A191RD3A086NBEA, April 12,
#' 2017
#' @author BBL
#' @export
#' @examples
#' gdp_bil_1990USD <- c(4770, 4779, 4937)
#' gdp_bil_2010USD <- gdp_bil_1990USD * gdp_deflator(2010, base_year = 1990)
gdp_deflator <- function(year, base_year) {
  # This time series is the BEA "A191RD3A086NBEA" product
  # Downloaded April 13, 2017 from https://fred.stlouisfed.org/series/A191RD3A086NBEA
  gdp_years <- 1929:2016
  gdp <- c(9.896, 9.535, 8.555, 7.553, 7.345, 7.749, 7.908, 8.001, 8.347,
           8.109, 8.033, 8.131, 8.68, 9.369, 9.795, 10.027, 10.288, 11.618,
           12.887, 13.605, 13.581, 13.745, 14.716, 14.972, 15.157, 15.298,
           15.559, 16.091, 16.625, 17.001, 17.237, 17.476, 17.669, 17.886,
           18.088, 18.366, 18.702, 19.227, 19.786, 20.627, 21.642, 22.784,
           23.941, 24.978, 26.337, 28.703, 31.361, 33.083, 35.135, 37.602,
           40.706, 44.377, 48.52, 51.53, 53.565, 55.466, 57.24, 58.395,
           59.885, 61.982, 64.392, 66.773, 68.996, 70.569, 72.248, 73.785,
           75.324, 76.699, 78.012, 78.859, 80.065, 81.887, 83.754, 85.039,
           86.735, 89.12, 91.988, 94.814, 97.337, 99.246, 100, 101.221,
           103.311, 105.214, 106.913, 108.828, 109.998, 111.445)
  names(gdp) <- gdp_years

  assert_that(year %in% gdp_years)
  assert_that(base_year %in% gdp_years)

  as.vector(unlist(gdp[as.character(year)] / gdp[as.character(base_year)]))
}

gdp_korea_deflator <- function(year, base_year) {
  # source: ECOS, https://ecos.bok.or.kr/ (accssed; 190603)
  # unit: 2010=100
  gdp_years <- 1953:2018
  gdp <- c(0.251, 0.324, 0.528, 0.696, 0.835, 0.82, 0.827, 0.915, 1.034,
           1.207, 1.567, 2.042, 2.141, 2.452, 2.768, 3.152, 3.596, 4.131,
           4.595, 5.32, 5.997, 7.83, 9.678, 11.739, 13.434, 16.386, 19.422,
           24.206, 28.222, 30.049, 31.506, 32.899, 34.213, 35.908, 37.685,
           40.363, 42.802, 47.128, 51.598, 55.594, 59.039, 63.823, 68.252,
           71.157, 74.049, 77.468, 76.55, 77.382, 80.207, 82.661, 85.468,
           88.018, 88.926, 88.802, 90.931, 93.619, 96.935, 100, 101.585,
           102.645, 103.521, 104.142, 106.636, 108.745, 111.208, 111.565)

  names(gdp) <- gdp_years

  assert_that(year %in% gdp_years)
  assert_that(base_year %in% gdp_years)

  as.vector(unlist(gdp[as.character(year)] / gdp[as.character(base_year)]))
}


#' Helper function: call \code{tidyr::gather} for year-like columns and convert them to integers
#'
#' @param d Data frame to operate on (a tibble)
#' @param value_col Name of the resulting (gathered) value column, string or unquoted column name
#' @param year_pattern Year pattern to match against
#' @return The gathered (reshaped) data frame.
#' @export
gather_years <- function(d, value_col = "value", year_pattern = YEAR_PATTERN) {
  assert_that(is_tibble(d))
  assert_that(is.character(value_col))
  assert_that(is.character(year_pattern))

  . <- year <- value <- NULL  # silence package check notes

  d %>%
    gather(year, value, matches(year_pattern)) %>%
    mutate(year = as.integer(year)) %>%
    stats::setNames(sub("value", value_col, names(.)))
}
rohmin9122/gcam-korea-release documentation built on Nov. 26, 2020, 8:11 a.m.