R/utilities.R

Defines functions default_aggregation_region_table_path replace_join sort_iea_df sample_eta_fu_table_path sample_fu_allocation_table_path sample_iea_data_path prod_tp_eiou_energy_carriers carnot_efficiency extract_TK insert_after meta_cols year_cols starts_with_any_of

Documented in carnot_efficiency default_aggregation_region_table_path extract_TK insert_after meta_cols prod_tp_eiou_energy_carriers replace_join sample_eta_fu_table_path sample_fu_allocation_table_path sample_iea_data_path sort_iea_df starts_with_any_of year_cols

#' Tell if a string starts with any of a vector of strings
#'
#' This function returns \code{TRUE} if \code{x}
#' starts with any of the strings in \code{target} and \code{FALSE} otherwise.
#'
#' This function is vectorized. If \code{x} is a vector or list of strings,
#' the return value has the same length as \code{x} and contains the result
#' of applying the test (does \code{x} start with any of \code{target})
#' for each item in \code{x}.
#' 
#' `target` can be either a vector or a list. 
#' To allow `target` to be a list, 
#' `target` is `unlist()`ed before use.
#'
#' @param x a string (or vector or list of strings)
#' @param target a vector or list of strings
#'
#' @return \code{TRUE} if \code{x} starts with any of the strings in \code{target},
#'         \code{FALSE} otherwise.
#'         If \code{x} is a vector or list of strings, 
#'         the return value is a vector of the same length as \code{x}
#'         containing the result of applying the test to each item in \code{x}.
#'
#' @export
#'
#' @examples
#' starts_with_any_of(x = "prefix - suffix", target = c("a", "b", "prefix"))
#' starts_with_any_of(x = "prefix - suffix", target = c("a", "b", "c"))
#' starts_with_any_of(x = "prefix - suffix", target = "suffix")
#' starts_with_any_of(x = c("Production - Crude", "Production - NG",
#'                          "Exports - Oil", "Exports - Crude"),
#'                    target = c("Production", "Imports"))
starts_with_any_of <- function(x, target){
  sapply(x, FUN = function(one_x){
    any(startsWith(x = one_x, prefix = unlist(target)))
  }) %>%
    magrittr::set_names(NULL)
}


#' Find year columns
#' 
#' It is sometimes helpful to know which columns are years.
#' This function returns a set of indices 
#' (or, optionally, the names) of columns in `.df` that represent years.
#' 
#' The default `year_pattern` is "`^-?\\d+$`", which matches columns whose names
#' have zero or one negative signs followed by any number of digits.
#' 
#' If `.df` is tidy, it may have a "Year" column which is included in the return value.
#' To disable this behavior, set `year = NULL`.
#'
#' @param .df A data frame with years spread to the right in columns.
#' @param year_pattern A regex pattern that identifies years. Default is "`^-?\\d+$`".
#' @param year See `IEATools::iea_cols$year`.
#' @param return_names A boolean which tells whether names are returned instead of column indices. 
#'        Default is `FALSE`.
#'
#' @return a vector of column indices (when `return_names = FALSE`, the default) or 
#'         a vector of column names (when `return_names = TRUE`)
#'         for those columns that represent years.
#' 
#' @export
#'
#' @examples
#' DF <- data.frame(a = c(1, 2), `1967` = c(3, 4), `-42` = c(5, 6), check.names = FALSE)
#' DF %>% year_cols()
#' DF %>% year_cols(return_names = TRUE)
#' DF2 <- data.frame(data.frame(a = c(1, 2), Year = c(1967, 2020)))
#' DF2 %>% year_cols(return_names = TRUE)
year_cols <- function(.df, year_pattern = "^-?\\d+$", year = IEATools::iea_cols$year, return_names = FALSE){
  if (is.numeric(year)) {
    year <- colnames(.df)[year]
  }
  colnames <- names(.df)
  indices <- c(which(grepl(year_pattern, x = colnames)), which(colnames(.df) == year)) 
  if (return_names) {
    return(colnames[indices])
  }
  return(indices)
}


#' Find metadata columns
#' 
#' Determines which columns are metadata columns in an IEA data frame 
#' by subtracting year columns and `cols_to_exclude` from all columns names in `.df`.
#' 
#' A non-tidy data frame will have an `e_dot` column, which is not a metadata column.
#' Thus, the default value for `not_meta` is `IEATools::iea_cols$e_dot`.
#' Overriding `not_meta` will change the default behavior to exclude different columns.
#' 
#' @param .df A (possibly) non-tidy data frame with years spread to the right in columns.
#' @param not_meta A vector of column names or position integers identifying columns to *not*
#'                 include in the return value.
#'                 Default is `IEATools::iea_cols$e_dot`.
#' @param years_to_keep A vector of years to retain. Default is `NULL`, which eliminates all year columns.
#' @param return_names A boolean which tells whether names are returned instead of column indices. 
#'        Default is `FALSE`.
#'
#' @return A vector of string names of metadata columns in `.df`.
#' 
#' @export
#'
#' @examples
#' DF <- data.frame(E.dot = 2020, a = c(1, 2), `1967` = c(3, 4), `-42` = c(5, 6), check.names = FALSE)
#' DF %>% meta_cols()
#' DF %>% meta_cols(return_names = TRUE)
meta_cols <- function(.df, 
                      not_meta = IEATools::iea_cols$e_dot, 
                      years_to_keep = NULL, 
                      return_names = FALSE) {
  column_names <- colnames(.df)
  if (is.numeric(not_meta)) {
    # Convert to character vector
    not_meta <- column_names[not_meta]
  }
  # Assume not_meta is a character vector
  year_columns <- year_cols(.df, return_names = TRUE)
  year_columns_to_remove <- year_columns %>% 
    setdiff(as.character(years_to_keep))
  out <- setdiff(column_names, c(year_columns_to_remove, not_meta))
  
  if (!return_names) {
    out <- which(column_names %in% out, arr.ind = TRUE) %>% as.numeric()
  }
  out
}


#' Insert after an item in a list
#' 
#' It is often helpful to insert an item into a list 
#' after another known item rather than at an index of the list as `base::append()` does.
#' This function provides that functionality.
#' 
#' If there are multiple copies of `after` in `x`, 
#' `values` is inserted after each `after`, unless `.after_all = FALSE`.
#' 
#' The positions at which insertions will occur are determined by the `==` operator.
#' I.e., `values` are inserted in `x` after each position in `x` where `x == after` is true.
#' 
#' Note that `length(after)` must be 1.
#' 
#' If `is.null(after)`, `values` is inserted once at the end of the list.
#'
#' @param x a list into which `values` is to be inserted
#' @param after the object in `x` after which `after` will be inserted
#' @param values the object to be inserted into `x`
#' @param .after_all a boolean telling whether to insert `values` after after all instances of `after` (when `TRUE`, the default)
#'        or only the first instance of `after` (when `FALSE`).
#' @param .equals_function insertion of `values` occurs at `which(.equals_function(x, after))`.
#'        Default is `==`.
#'
#' @return a modified version of `x`
#' 
#' @export
#'
#' @examples
#' insert_after(list("a", "b", "c", "d", "c"), after = "c", values = "1")
insert_after <- function(x, after = NULL, values, .after_all = TRUE, .equals_function = `==`){
  # Assume we insert at the end unless otherwise notified.
  insert_indices <- length(x) + 1
  if (!is.null(after)) {
    assertthat::assert_that(length(after) == 1)
    insert_indices <- which(.equals_function(x, after))
    if (!.after_all) {
      insert_indices <- insert_indices[1]
    }
  }
  # If we insert from the back to the front, we don't need to recalculate 
  # the indices after each insertion.
  insert_indices <- rev(insert_indices)
  for (i in insert_indices) {
    x <- append(x = x, values = values, after = i)
  }
  return(x)
}


#' Extract temperatures (in Kelvin) from heat types
#' 
#' In societal exergy analysis, converting heat to exergy requires knowledge 
#' of the temperature of that heat.
#' This function converts heat types (e.g., "`HTH.600.C`")
#' to temperatures by extracting the temperature (in Kelvin) from the middle of the string.
#' 
#' It is assumed that the heat type has the following structure: 
#' * a single letter (typically, "H", "M", or "L" for high, medium, or low, although the character doesn't matter)
#' * the string "TH." or "TC." (for "temperature heating" or "temperature cooling"), 
#' * the temperature value, and
#' * unit (one of ".C", ".F", ".R", or ".K", indicating ° Celsius, ° Fahrenheit, rankine, or kelvin, respectively).
#' 
#' If `heat_type` does not conform to the pattern shown above, `NA` is the likely result.
#' 
#' @param heat_types a string vector of heat types to be converted to temperatures
#' @param sep the separator between parts of the `heat_types` string. Default is ".".
#'
#' @return a numeric vector of same length as `heat_types` containing temperatures in Kelvin.
#' 
#' @export
#'
#' @examples
#' extract_TK(c("HTH.600.C", "LTH.-20.567.C", "LTH.-40.F", "LTH.-40.C"))
extract_TK <- function(heat_types, sep = "."){
  # # Grab the units
  # lens <- nchar(heat_types)
  # units <- Map(substring, heat_types, first = lens, last = lens) %>% unlist() %>% unname()
  # # Eliminate the leading *TH.
  # temporary <- sub(pattern = "^.TH\\.", replacement = "", x = heat_types)
  # # Eliminate the trailing .C, .F, .R, or .K.
  # temperatures <- suppressWarnings(sub(pattern = "\\.[C|F|R|K]$", replacement = "", x = temporary) %>% as.numeric())
  # # temperatures <- sub(pattern = "\\.[C|F|R|K]$", replacement = "", x = temporary)
  # # string_temperatures <- sub(pattern = "\\..$", replacement = "", x = temporary)
  # convert_to_K <- function(rawT, unit){
  #   if (is.na(rawT)) {
  #     # rawT can't be turned into a numeric.
  #     return(NA_real_)
  #   }
  #   if (unit == "K") {
  #     return(rawT)
  #   }
  #   if (unit == "R") {
  #     return(rawT / 1.8)
  #   }
  #   if (unit == "C") {
  #     return(rawT + 273.15)
  #   }
  #   if (unit == "F") {
  #     return((rawT + 459.67) / 1.8)
  #   }
  #   # If we get here, we had a non-NA rawT, but we don't recognize the unit.
  #   return(NA_real_)
  # }
  # # Convert to K based on unit and return
  # Map(convert_to_K, rawT = temperatures, unit = units) %>% unlist() %>% unname()
  # 
  if (sep == ".") {
    # Be careful with literal dots in the following regex code.
    sep <- paste0("\\", sep)
  }
  # Eliminate the leading *TH<sep> or *TC<sep>
  # If the string does not start with *TH<sep> or *TC<sep>, the leading characters will not be eliminated, and
  # an error will occur later in this function.
  temporary <- sub(pattern = paste0("(^.TH|^.TC)", sep), replacement = "", x = heat_types)
  # Grab the temperatures from the strings, which are everything before the last sep and unit character.
  # So delete everything including and after the last sep.
  temperature_strings <- sub(pattern = paste0(sep, ".$"), replacement = "", x = temporary)
  # Grab the units from the strings, which are everything after the last sep.
  # So delete everything before the last sep.
  unit_strings <- sub(pattern = paste0("^.*", sep), replacement = "", x = temporary)
  convert_to_K <- function(T_string, U_string){
    rawT <- suppressWarnings(T_string %>% as.numeric())
    if (is.na(rawT)) {
      # rawT can't be turned into a numeric.
      return(NA_real_)
    }
    if (U_string == "K") {
      return(rawT)
    }
    if (U_string == "R") {
      return(rawT / 1.8)
    }
    if (U_string == "C") {
      return(rawT + 273.15)
    }
    if (U_string == "F") {
      return((rawT + 459.67) / 1.8)
    }
    # If we get here, we had a non-NA rawT, but we don't recognize the unit.
    return(NA_real_)
  }
  Map(convert_to_K, T_string = temperature_strings, U_string = unit_strings) %>% unlist() %>% unname()
}


#' Calculate Carnot efficiencies from heat types
#' 
#' In societal exergy analysis, converting heat to exergy requires knowledge 
#' of the temperature of that heat and application of the Carnot efficiency.
#' This function first converts heat types (e.g., "HTH.600.C")
#' to temperatures by extracting the temperature from the middle of the string,
#' in a unit-aware manner.
#' Then, the Carnot efficiency is calculated from the temperature of the heat
#' by applying the Carnot efficiency equation: `abs(1 - T_0/T)`,
#' where T_0 and T are expected to be in kelvin units.
#' 
#' When the heat temperature is less than `T_0`, 
#' the Carnot efficiency is calculated as `1 - (heat temperature)/T_0`.
#' 
#' `T_0` can be supplied as a numeric vector of ambient temperatures of
#' same length as `heat_types`.
#'
#' @param heat_types a string vector of heat types of the form "HTH.600.C"
#' @param T_0 dead state temperature in kelvin. Default is `298.15` kelvin (25 C).
#' 
#' @seealso [extract_TK()]
#'
#' @return a numeric vector of Carnot efficiencies of same length as `heat_types`
#' 
#' @export
#'
#' @examples
#' carnot_efficiency(c("HTH.600.C", "MTH.200.C", "MTH.100.C", "LTH.20.C", "LTH.-10.C"))
#' carnot_efficiency("LTH.-30.F")
carnot_efficiency <- function(heat_types, T_0 = 298.15){
  # Calculate temperatures in K from heat_types
  TK <- extract_TK(heat_types)
  # Apply the carnot efficiency equation eta = 1 - T0/TK
  carnot_func <- function(T_kelvin, T0){
    if (is.na(T_kelvin)) {
      return(NA_real_)
    }
    if (T_kelvin > T0) {
      return(1 - T0/T_kelvin)
    }
    1 - T_kelvin/T0
  }
  Map(carnot_func, TK, T_0) %>% unlist()
}


#' Identify all energy types supplied or consumed by Production, Transformation processes, or Energy industry own use
#' 
#' Sometimes, it is helpful to know all types of energy supplied or consumed by Production, Transformation processes, or Energy industry own use.
#' This function (optionally) reads an IEA data file or loads an IEA data frame and builds a named list of energy types 
#' supplied or consumed by Production, Transformation processes, or Energy industry own use.
#' 
#' The names in the returned list are the Production, Transformation processes, or Energy industry own use industries in `iea_df`.
#' The items in the returned list are vectors of energy types produced or consumed by the corresponding industries.
#' 
#' @param file_path The path to the IEA data file (optional).
#' @param iea_df A data frame containing IEA data. Default is `IEATools::load_tidy_iea_df(file_path)`.
#' @param side Refers to the "Consumption" or "Supply" side of Production, Transformation processes, or Energy industry own use. 
#'             One of "Consumption" or "Supply". Default is "Consumption".
#' @param flow_aggregation_point The flow aggregation point column in `iea_df`. Default is "Flow.aggregation.point".
#' @param production The string indicating the production flow. Default is "Production".
#' @param transformation_processes The string indicating the transformation process stage. Default is "Transformation processes".
#' @param eiou The string indicating the energy industry own use flow. Default is "Energy industry own use".
#' @param stage The string indicating the stage for the analysis. One of `production`, `transformation_processes`, or `eiou`. 
#'              Default is `production`.
#' @param e_dot The energy flow rate column in `iea_df`.  Default is "E.dot".
#' @param flow The flow column in `iea_df`. Default is "Flow".
#' @param product The product column in `iea_df`.  Default is "Product".
#'
#' @return a list of string vectors
#' 
#' @export
#'
#' @examples
#' prod_tp_eiou_energy_carriers()
prod_tp_eiou_energy_carriers <- function(file_path = sample_iea_data_path(), 
                                         iea_df = IEATools::load_tidy_iea_df(file_path), 
                                         side = c("Consumption", "Supply"),
                                         flow_aggregation_point = "Flow.aggregation.point", 
                                         production = "Production", 
                                         transformation_processes = "Transformation processes", 
                                         eiou = "Energy industry own use",
                                         stage = c(production, transformation_processes, eiou),
                                         e_dot = "E.dot", 
                                         flow = "Flow", 
                                         product = "Product"){
  stage <- match.arg(stage)
  side <- match.arg(side)
  # First step is to focus on Supply or Consumption based on the side argument
  if (side == "Consumption" & stage == production) {
    # This should give a 0-row table, because Production, by definition, does not use any energy.
    out <- iea_df %>% dplyr::filter(!!as.name(flow) == stage & !!as.name(e_dot) < 0)
  } else if (side == "Supply" & stage == production) {
    out <- iea_df %>% dplyr::filter(!!as.name(flow) == stage & !!as.name(e_dot) > 0)
  } else if (side == "Consumption" & stage == eiou) {
    out <- iea_df %>% dplyr::filter(!!as.name(flow_aggregation_point) == eiou & !!as.name(e_dot) < 0)
  } else if (side == "Consumption" & stage == transformation_processes) {
    out <- iea_df %>% dplyr::filter(!!as.name(flow_aggregation_point) == stage & !!as.name(e_dot) < 0)
  } else if (side == "Supply" & stage == transformation_processes) {
    # Because we match.arg(type) above, we know the caller is interested in Supply here.
    out <- iea_df %>% dplyr::filter(!!as.name(flow_aggregation_point) == stage & !!as.name(e_dot) > 0)
  } else if (side == "Supply" & stage == eiou) {
    # This should give a 0-row table, because, EIOU, by definition, does not produce energy.
    out <- iea_df %>% dplyr::filter(!!as.name(flow_aggregation_point) == eiou & !!as.name(e_dot) > 0)
  }
  # Third step is to keep only the flow and product columns and
  # group by the flow variable which contains the names of the transformation process machines.
  grouped <- out %>% 
    # dplyr::select(!!as.name(flow), !!as.name(product)) %>% 
    dplyr::select(dplyr::all_of(c(flow, product))) %>% 
    unique() %>% 
    dplyr::group_by(!!as.name(flow))
  # Fourth step is to isolate the machines, which are the groups.
  machines <- grouped %>% 
    dplyr::group_keys() %>% 
    unlist() %>% 
    as.vector()
  # Fifth step is to isolate the energy types.
  # The energy types are obtained via the group_map function, with the default value of keep (FALSE),
  # which ensures that the grouping variables are lost.
  grouped %>% 
    dplyr::group_map(function(grp_tbl, key){
      # When we get here, grp_tbl is the rows of iea_df that have the value of flow in common.
      # key is that value of flow.
      grp_tbl %>% 
        unlist() %>% 
        as.vector()
    }) %>% 
    # Lastly, we set the names of each list of energy types to the transformation process with which they are associated,
    # either as inputs or outputs.
    magrittr::set_names(machines)  
}


#' The path to a sample IEA extended energy balances data file in .csv format
#' 
#' The sample data file is in .csv format and contains
#' IEA extended energy balances data for Ghana and South Africa for the years 1971 and 2000.
#' 
#' Permission to include this data was provided in a phone conversation between 
#' Nick Johnstone (IEA Chief Statistician) and Matthew Kuperus Heun (IEATools developer)
#' on Monday, 3 June 2019.
#'
#' @param version The desired version (expressed as the year of release) of sample data. 
#'                Options are `2018--2022`. `2022` is the default. 
#'
#' @return the path to a sample data file.
#' 
#' @export
#'
#' @examples
#' sample_iea_data_path()     # Assumes 2022
#' sample_iea_data_path(2022) # Same
sample_iea_data_path <- function(version = 2022) {
  if (version %in% IEATools::valid_iea_release_years) {
    return(file.path("extdata", paste0("GH-ZA-TJ-Extended-Energy-Balances-sample-", version, ".csv")) |> 
             system.file(package = "IEATools"))
  }
  stop("Only versions 2021 and later are supported in sample_iea_data_path()")
}


#' The path to a filled final-to-useful allocation table
#' 
#' @param version the desired version (expressed as the year of IEA data release) of 
#'                the sample final-to-useful allocation table. 
#'                Options are 2021 (default), 2020, 2019, and 2018. 
#'
#' @return the path to a final-to-useful allocation table
#' 
#' @export
#'
#' @examples
#' sample_fu_allocation_table_path()     # Assumes 2022
#' sample_fu_allocation_table_path(2022) # Same
#' # Returns path for sample allocation table appropriate for other IEA data releases
#' sample_fu_allocation_table_path(2021) 
sample_fu_allocation_table_path <- function(version = 2022) {
  if (version %in% IEATools::valid_iea_release_years) {
    return(file.path("extdata", paste0("GH-ZA-Allocation-sample-", version, ".xlsx")) |>  
             system.file(package = "IEATools"))
  }
  stop("Only versions 2021 and later are supported in sample_fu_allocation_table_path()")
}


#' The path to a filled final-to-useful efficiency table
#'
#' @param version the desired version (expressed as the year of IEA data release) of 
#'                the sample final-to-useful efficiencies table. 
#'                Options are 2021 (default), 2020, 2019, and 2018.
#'
#' @return The path to a final-to-useful efficiencies table.
#' 
#' @export
#'
#' @examples
#' sample_eta_fu_table_path()     # Assumes 2022
#' sample_eta_fu_table_path(2022) # Same
sample_eta_fu_table_path <- function(version = 2022) {
  if (version %in% IEATools::valid_iea_release_years) {
    return(file.path("extdata", paste0("GH-ZA-Efficiency-sample-", version, ".xlsx")) |> 
             system.file(package = "IEATools"))
  }
  stop("Only versions 2021 and later are supported in sample_eta_fu_table_path()")
}


#' Sort an IEA data frame in IEA row order
#' 
#' The IEA data frame to be sorted can be either 
#' (a) tidy (long) where each observation is on its own row and 
#' there is a `year` column present or 
#' (b) wide where year columns are spread to the right.
#' 
#' Sorting is accomplished (by default) using 
#' the values of the arguments `countries`, `methods`, 
#' `energy_type`, `last_stage`, `year` (if present),
#' `ledger_side`, `fap_flows`, and
#' `product`
#' (in that order of precedence).
#' 
#' Years are sorted if the `year` column is present (a tidy data frame).
#' If years are not present, they are assumed to be spread to the right
#' to create a wide data frame. 
#' Wide data frames are sorted in the same order.
#'
#' @param .iea_df the IEA data frame to be sorted
#' @param col_names a list of column names in IEA data frames. Default is `IEATools::iea_cols`.
#' @param country the name of the country column in `.tidy_iea_df`. Default is "Country".
#' @param method the name of the method column in `.tidy_iea_df`. Default is "Method".
#' @param energy_type the name of the energy type column in `.tidy_iea_df`. Default is "Energy.type".
#' @param last_stage the name of the last stage column in `.tidy_iea_df`. Default is "Last.stage".
#' @param year the name of the year column in `.tidy_iea_df`. Default is "Year".
#' @param ledger_side the name of the ledger side column in `.tidy_iea_df`. Default is "Ledger.side".
#' @param flow_aggregation_point the name of the flow aggregation point column in `.tidy_iea_df`. Default is "Flow.aggregation.point".
#' @param flow the name of the flow column in `.tidy_iea_df`. Default is "Flow".
#' @param sep a separator between the flow aggregation point column and the flow column. Used when uniting those two columns internally. Default is "_".
#' @param fap_flow the name of the united flow aggregation point and flow column to be created internally in `.tidy_iea_df`. Default is "Flow.aggregation.point_Flow".
#' @param product the name of the product column in `.tidy_iea_df`. Default is "Product".
#' @param country_order the order in which to sort the `country` column of `.tidy_iea_df`. Default is [countries].
#' @param method_order the order in which to sort the `method` column of `.tidy_iea_df`. Default is [methods].
#' @param energy_type_order the order in which to sort the `energy_type` column of `.tidy_iea_df`. Default is [energy_types].
#' @param last_stage_order the order in which to sort the `last_stage` column of `.tidy_iea_df`. Default is [last_stages].
#' @param ledger_side_iea_order the order in which to sort the `ledger_side` column of `.tidy_iea_df`. Default is [ledger_sides].
#' @param fap_flow_iea_order the order in which to sort the united `flow_aggregation_point` and `flow` columns of `.tidy_iea_df`. Default is [fap_flows].
#' @param product_iea_order the order in which to sort the `product` column of `.tidy_iea_df`. Default is [products].
#' @param .clean_flow the name of an internally-generated column in `.iea_df` that stores 
#'                    a de-specified version of the `flow` column. Default is ".clean_flow".
#' @param .clean_product the name of an internally-generated column in `.iea_df` that stores 
#'                       a de-specified version of the `product` column. Default is ".clean_product".
#'
#' @return a version of `.tidy_iea_df` sorted in IEA order
#' 
#' @export
#'
#' @examples
#' library(dplyr)
#' tidy <- load_tidy_iea_df()
#' # See first and last rows
#' head(tidy)
#' tail(tidy)
#' # Move the first row to the bottom to put everything out of order
#' unsorted <- tidy[-1, ] %>% 
#'   bind_rows(tidy[1, ])
#' head(unsorted)
#' tail(unsorted)
#' # Now sort it
#' sorted <- sort_iea_df(unsorted)
#' head(sorted)
#' tail(sorted)
sort_iea_df <- function(.iea_df,
                        col_names = IEATools::iea_cols,
                        country = col_names$country, 
                        method = col_names$method,
                        energy_type = col_names$energy_type,
                        last_stage = col_names$last_stage, 
                        year = col_names$year,
                        ledger_side = col_names$ledger_side,
                        flow_aggregation_point = col_names$flow_aggregation_point,
                        flow = col_names$flow,
                        sep = "_",
                        fap_flow = paste0(flow_aggregation_point, sep, flow),
                        product = col_names$product,
                        country_order = IEATools::countries,
                        method_order = IEATools::methods,
                        energy_type_order = IEATools::energy_types,
                        last_stage_order = IEATools::last_stages, 
                        ledger_side_iea_order = IEATools::ledger_sides,
                        fap_flow_iea_order = IEATools::fap_flows,
                        product_iea_order = IEATools::products, 
                        .clean_flow = ".clean_flow",
                        .clean_product = ".clean_product") {

  factorized <- .iea_df %>% 
    despecify_col(col = flow, despecified_col = .clean_flow) %>% 
    despecify_col(col = product, despecified_col = .clean_product) %>% 
    dplyr::mutate(
      "{country}" := factor(!!as.name(country), levels = country_order),
      "{method}" := factor(!!as.name(method), levels = method_order),
      "{energy_type}" := factor(!!as.name(energy_type), levels = energy_type_order),
      "{last_stage}" := factor(!!as.name(last_stage), levels = last_stage_order),
      "{ledger_side}" := factor(!!as.name(ledger_side), levels = ledger_side_iea_order),
      "{fap_flow}" := paste0(.data[[flow_aggregation_point]], sep, .data[[.clean_flow]]),
      "{fap_flow}" := factor(!!as.name(fap_flow), levels = fap_flow_iea_order),
      "{.clean_product}" := factor(.data[[.clean_product]], levels = product_iea_order)
    )
  
  if (year %in% names(factorized)) {
    # If we have the year column, we want to include it in the arranging.
    # Likely that we have a long, tidy data frame here.
    sorted <- factorized %>% 
      dplyr::arrange(!!as.name(year), !!as.name(country), !!as.name(method), !!as.name(energy_type), !!as.name(last_stage),
                     !!as.name(fap_flow), !!as.name(.clean_product))
  } else {
    # No year column. This might be a wide data frame here in which years are spread as columns to the right.
    # Exclude year from the arranging.
    sorted <- factorized %>% 
      dplyr::arrange(!!as.name(country), !!as.name(method), !!as.name(energy_type), !!as.name(last_stage),
                     !!as.name(fap_flow), !!as.name(.clean_product))
  }
  
  sorted %>% 
    dplyr::mutate(
      # Remove temporary columns
      !!as.name(fap_flow) := NULL, 
      "{.clean_flow}" := NULL,
      "{.clean_product}" := NULL
    ) %>% 
    # Remove factors from the sorting columns to return columns to character state.
    dplyr::mutate_at(c(country, method, energy_type, last_stage, ledger_side), as.character)
}


#' `full_join` with replacement
#' 
#' Perform a modified `dplyr::full_join()` on `x` and `y`, 
#' returning all columns from `x`, 
#' non-matching rows from `x`,
#' and all rows from `y`.
#' Essentially `replace_join()` replaces matching rows in `x` with corresponding rows from `y`
#' and adds all unmatched rows from `y`.
#' 
#' If `x` contains multiple matching rows, matching rows in `y` are inserted into `x` at each matching location.
#' If `y` contains multiple matching rows, all are inserted into `x` at each matching location.
#' See examples.
#' 
#' Columns of `x` and `y` named in `by` and `replace_col` should not be factors. 
#' 
#' If `replace_col` is not in both `x` and `y`, `x` is returned, unmodified.
#'
#' @param x object on which replace_join will be performed. 
#'        `x` is the data frame in which rows will be replaced by matching rows from `y`.
#' @param y object on which replace_join will be performed. 
#'        `y` is the data frame from which replacement rows are obtained when matching rows are found
#'        and from which unmatching rows are added to the outgoing data frame.
#' @param replace_col the string name of the column (common to both `x` and `y`)
#'        whose values in `y` will be inserted into `x` where row matches are found for the `by` columns.
#'        `replace_col` should not be in `by`.
#'        The default value of `by` ensures that `replace_col` is not in `by`.
#' @param by the string names of columns (common to `x` and `y`) on which matching rows will be determined.
#'        Default is `dplyr::intersect(names(x), names(y)) %>% dplyr::setdiff(replace_col)`.
#'        This default ensures that `replace_col` is not in `by`, as required.
#' @param copy passed to `dplyr::left_join()`. Default value is `FALSE`.
#' @param suffix appended to `replace_col` to form the names of columns created in `x` during the internal `dplyr::left_join()` operation.
#'        Default is `c(".x", ".y")`, same as the default for `dplyr::full_join()`.
#' @param ... passed to `dplyr::full_join()`
#'
#' @return a copy of `x` in which matching `by` rows are replaced by matched rows from `y` and unmatched rows from `y` are added to `x`.
#' 
#' @export
#'
#' @examples
#' DFA <- data.frame(x = c(1, 2), y = c("A", "B"), stringsAsFactors = FALSE)
#' DFB <- data.frame(x = c(2, 3), y = c("C", "D"), stringsAsFactors = FALSE)
#' replace_join(DFA, DFB, replace_col = "y")
#' replace_join(DFB, DFA, replace_col = "y")
#' DFC <- data.frame(x = c(2, 2), y = c("M", "N"), stringsAsFactors = FALSE)
#' replace_join(DFA, DFC, replace_col = "y")
#' replace_join(DFC, DFA, replace_col = "y")
#' DFD <- data.frame(x = c(2, 2), y = c("A", "B"), stringsAsFactors = FALSE)
#' replace_join(DFC, DFD, replace_col = "y")
replace_join <- function(x, y, replace_col, 
                         by = dplyr::intersect(names(x), names(y)) %>% dplyr::setdiff(replace_col), 
                         copy = FALSE, 
                         suffix = c(".x", ".y"), ...) {
  # Ensure that replace_col has length 1.
  assertthat::assert_that(length(replace_col) == 1, 
                          msg = paste0("length(replace_col) is ", length(replace_col), " in replace_join. Must be 1."))
  # Ensure that x and y both have replace_col. If not, just return x.
  if (! (replace_col %in% names(x) & replace_col %in% names(y))) {
    return(x)
  }
  # Check for factors and give error if any columns are factors.
  assertthat::assert_that(!any(lapply(x[c(by, replace_col)], is.factor) %>% unlist()) &
                            !any(lapply(y[c(by, replace_col)], is.factor) %>% unlist()), 
                          msg = "Columns should not contain factors in arguments to replace_join")
  # Ensure that replace_col is not in by.
  assertthat::assert_that(! replace_col %in% by, msg = "replace_col must not be in the by argument to replace_join")
  # Make the names of the .x and .y columns
  .x_col <- paste0(replace_col, suffix[[1]])
  .y_col <- paste0(replace_col, suffix[[2]])
  # Find the columns of y that are not in by or replace_col.
  # These columns need to be removed from y, 
  # else they appear in the output.
  remove_from_y <- dplyr::setdiff(names(y), dplyr::union(by, replace_col))
  trimmed_y <- y
  for (to_remove in remove_from_y) {
    trimmed_y <- trimmed_y %>% 
      dplyr::mutate(
        !!as.name(to_remove) := NULL
      )
  }
  # The algorithm is 
  #   * Do a full_join, which results in .x and .y columns for replace_col.
  #   * keep the .x version if .y is NA
  #   * keep the .y version if .y is not NA
  #   * Delete the .x and .y columns
  dplyr::full_join(x, trimmed_y, copy = copy, suffix = suffix, by = by, ... = ...) %>% 
    dplyr::mutate(
      !!as.name(replace_col) := dplyr::case_when(
        is.na(!!as.name(.y_col)) ~ !!as.name(.x_col),
        TRUE ~ !!as.name(.y_col)
      ), 
      !!as.name(.x_col) := NULL,
      !!as.name(.y_col) := NULL
    )
}


#' The path to the default aggregation table
#' 
#' @param version the desired version (expressed as the year of IEA data release) of 
#'                the aggregation table. 
#'                Options are 2019 (default), and 2020. 
#'
#' @return the path to the default aggregation table for the desired IEA data version
#' 
#' @export
#'
#' @examples
#' default_aggregation_region_table_path()     # Assumes 2019
#' default_aggregation_region_table_path(2019) # Same
#' # Returns path for default aggregation table appropriate for 2020 IEA data release
#' default_aggregation_region_table_path(2020) 
default_aggregation_region_table_path <- function(version = 2019) {
  if (version == 2019) {
    return(file.path("extdata", "aggregation_table_iea_exiobase_2019.xlsx") %>% 
             system.file(package = "IEATools"))
  } else if (version == 2020) {
    return(file.path("extdata", "aggregation_table_iea_exiobase_2020.xlsx") %>% 
             system.file(package = "IEATools"))
  }
  stop("Only 2019, and 2020 are supported in default_aggregation_region_table_path()")
}
# EAR - 29/09/2020
MatthewHeun/IEATools documentation built on Feb. 6, 2024, 3:29 p.m.