R/utils.R

Defines functions end_year_lookup infer_scenario_type infer_sectors_and_technologies stop_if_empty customise_output_path fail_if_input_args_are_missing format_indent_2 format_indent_1 paste_write get_iter_var report_missings report_dropped_company_names report_company_drops report_duplicates report_missing_col_combinations report_all_duplicate_kinds validate_data_has_expected_cols validate_file_exists path_dropbox_2dii degrees dropbox_exists

Documented in customise_output_path degrees dropbox_exists fail_if_input_args_are_missing get_iter_var path_dropbox_2dii report_all_duplicate_kinds report_company_drops report_dropped_company_names report_duplicates report_missing_col_combinations report_missings validate_data_has_expected_cols validate_file_exists

#' Does this computer have a local copy of 2dii's dropbox folder?
#'
#' @family miscellaneous utility functions
#'
#' @return Logical.
#'
#' @examples
#' r2dii.climate.stress.test:::dropbox_exists()
dropbox_exists <- function() {
  fs::dir_exists(path_dropbox_2dii())
}

#' Insert the symbol for degrees
#'
#' @family miscellaneous utility functions
#'
#' @return A string.
#'
#' @examples
#' r2dii.climate.stress.test:::degrees()
#' glue::glue("2{r2dii.climate.stress.test:::degrees()} Investing Initiative")
degrees <- function() {
  "\u00B0"
}

#' Easily access directories in your local copy of 2dii's Dropbox folder
#'
#' These functions create cross-platform paths pointing to 2dii's Dropbox
#' folder:
#' Your projects may need data stored in 2dii's Dropbox folder. Sometimes it is
#' convenient to have your projects close to the data. But, in this case, it is
#' a bad idea because the path to 2dii's Dropbox folder has a problematic space
#' and symbol. (For example, [RStudio's Git pane may not
#' work](https://github.com/2DegreesInvesting/resources/issues/51).) Instead,
#' place your projects somewhere with a sane path, such as
#' `C:/Users/You/git/project/`, and access the data you need with
#' `path_dropbox_2dii()`.
#'
#' @section Setup for a custom Dropbox folder:
#' If the name of your 2dii Dropbox folder is different from the default,
#' you may add this to .Rprofile (see [usethis::edit_r_profile()]):
#'
#' ```
#' options(r2dii_dropbox = "The name of your custom dropbox folder goes here")
#' ````
#'
#' @param ... Character vectors, if any values are `NA`, the result will also be
#'   `NA`.
#'
#' @seealso [degrees()], [fs::path_home()].
#' @family functions to output 2dii paths
#'
#' @return A character string.
path_dropbox_2dii <- function(...) {
  custom <- getOption("r2dii_dropbox")
  default <- sprintf("Dropbox (2%s Investing)", degrees())
  fs::path_home(custom %||% default, ...)
}

#' Validate that a file exists in a given directory
#'
#' Before performing an operation on a file assumed to be found in a given
#' directory, validate this file exists and give indicative error if not.
#'
#' @param path Character vector indicating the directory of a file.
#'
#' @return String holding provided `path`.
#' @export
validate_file_exists <- function(path) {
  valid_file_path <- file.exists(path)

  if (!valid_file_path) {
    rlang::abort(c(
      "Path must point to an existing file.",
      x = glue::glue("Invalid file path: {file.path(path)}."),
      i = "Did you set path to data correctly?."
    ))
  }
  invisible(path)
}

#' Validate that a data frame contains expected columns
#'
#' Validate that all expected columns for an operation are given in a data frame.
#'
#' @param data data frame that is to be validated
#' @param expected_columns Character vector listing the expected columns
#'
#' @return NULL
#' @export
validate_data_has_expected_cols <- function(data,
                                            expected_columns) {
  stopifnot(rlang::is_named(data))
  stopifnot(is.character(expected_columns))

  data_has_expected_columns <-
    all(expected_columns %in% colnames(data))

  if (!data_has_expected_columns) {
    affected_cols <- glue::glue_collapse(sort(setdiff(expected_columns, names(data))), sep = ", ")
    rlang::abort(c(
      "Must include expected columns in data set.",
      x = glue::glue("Missing columns: {affected_cols}."),
      i = "Please check that data have expected columns."
    ))
  }
  invisible()
}


#' Checks data for missings and duplicates
#'
#' Applies consistency checks to data concerning the combinations of columns
#' that should be unique in combination. In concrete:
#'
#' 1. it is checked if there are duplicate rows.
#' 1. it is checked if there are duplicate rows on `composite_unique_cols`.
#'
#'
#' @param data A tibble.
#' @param composite_unique_cols A vector of names of columns that shall be
#'   unique in their combination.
#' @param throw_error Boolean, if TRUE error is thrown on failures, otherwise a
#'   warning.
#'
#' @return input `data`.
report_all_duplicate_kinds <- function(data, composite_unique_cols, throw_error = TRUE) {
  validate_data_has_expected_cols(
    data = data,
    expected_columns = composite_unique_cols
  )

  report_duplicates(
    data = data,
    cols = names(data),
    throw_error = throw_error
  )

  report_duplicates(
    data = dplyr::distinct(data),
    cols = composite_unique_cols,
    throw_error = throw_error
  )

  return(invisible(data))
}

#' Identify and report missing value combinations
#'
#' Checks if all level combinations of `composite_unique_cols` are in`data` and
#' throws a warning on missing combinations.
#' NOTE:
#' 1. a combination of all levels is not necessarily required/useful, make sure
#' to use function only in adequate context.
#' 1. combinations of too many columns/values may exceed memory size.
#' .
#'
#' @inheritParams report_all_duplicate_kinds
#' @param col_names String holding names of columns.
#'
#' @return Returns `data` invisibly.
report_missing_col_combinations <- function(data, col_names, throw_error = FALSE) {
  all_combinations <- data %>%
    tidyr::expand(!!!rlang::syms(col_names))

  missing_rows <- all_combinations %>%
    dplyr::anti_join(data, by = col_names)

  if (nrow(missing_rows) > 0) {
    if (throw_error) {
      stop(paste0("Identified ", nrow(missing_rows), " missing combinations on columns ", paste(col_names, collapse = ", "), "."))
    } else {
      warning(paste0("Identified ", nrow(missing_rows), " missing combinations on columns ", paste(col_names, collapse = ", "), "."), call. = FALSE)
    }
  }

  return(invisible(data))
}

#' Report duplicate rows
#'
#' Reports duplicates in `data` on columns `cols`. Duplicates are reported via a
#' warning.
#'
#' @inheritParams report_all_duplicate_kinds
#' @param cols Cols to check for duplicate combinations on.
#'
#' @return NULL
report_duplicates <- function(data, cols, throw_error = TRUE) {
  duplicates <- data %>%
    dplyr::group_by(!!!rlang::syms(cols)) %>%
    dplyr::filter(dplyr::n() > 1) %>%
    dplyr::select(!!!rlang::syms(cols)) %>%
    dplyr::distinct_all()

  if (nrow(duplicates) > 0) {
    if (throw_error) {
      stop(paste0("Identified ", nrow(duplicates), " duplicates on columns ", paste(cols, collapse = ", "), "."))
    } else {
      warning(paste0("Identified ", nrow(duplicates), " duplicates on columns ", paste(cols, collapse = ", "), "."), call. = FALSE)
    }
  }

  return(invisible())
}

#' Report dropped companies
#'
#' Wrapper to report companies for which all results, or results for some technologies
#' are lsot due to a missing match in financial_data or price_data.
#'
#' @param data_list A list of imported stress test input data.
#' @param log_path String holding path to log file.
#'
#' @return NULL
report_company_drops <- function(data_list, log_path) {
  report_dropped_company_names(
    data_x = data_list$production_data,
    data_y = data_list$financial_data,
    name_y = "financial data",
    merge_cols = c("company_id"),
    log_path = log_path
  )

  report_dropped_company_names(
    data_x = data_list$production_data,
    data_y = data_list$df_price,
    name_y = "price data",
    merge_cols = c("ald_business_unit", "ald_sector", "year"),
    log_path = log_path
  )

  invisible()
}

#' Inner join datasets and report number of dropped rows
#'
#' Function conducts inner join on two datasets and reports number of dropped
#' rows on `data_x`.
#'
#' @param data_x Tibble with data that can be joined to `data_y`.
#' @param data_y Tibble with data that can be joined to `data_x`.
#' @param name_y Name of `data_x`.
#' @param merge_cols Vector holds columns to join on.
#' @param name_x Name of `data_x, defaults to PACTA results.
#' @inheritParams report_company_drops
#'
#' @return The merged dataset.
report_dropped_company_names <- function(data_x, data_y, name_y, merge_cols, name_x = "PACTA results", log_path) {
  data <- data_x %>%
    dplyr::inner_join(
      data_y,
      by = merge_cols
    )

  n_companies_x <- length(unique(data_x$company_name))
  n_companies <- length(unique(data$company_name))

  if (n_companies < n_companies_x) {
    percent_loss <- (n_companies_x - n_companies) * 100 / n_companies_x
    affected_companies <- sort(setdiff(data_x$company_name, data$company_name))
    paste_write(
      format_indent_1(), "When joining", name_x, "on", name_y, "on column(s)", paste0(merge_cols, collapse = ", "), "could not match entries for",
      n_companies_x - n_companies, "out of", n_companies_x, "companies.",
      log_path = log_path
    )
    paste_write(format_indent_2(), "percent loss:", percent_loss, log_path = log_path)
    paste_write(format_indent_2(), "affected companies:", log_path = log_path)
    purrr::walk(affected_companies, function(company) {
      paste_write(format_indent_2(), company, log_path = log_path)
    })
    paste_write("\n", log_path = log_path)
  }
  return(data)
}

#' Report missing
#'
#' Function reports number of missing values per variable.
#'
#' @inheritParams report_all_duplicate_kinds
#' @param data Tibble holding a result data set.
#' @param name_data Name of the data file.
#'
#' @return input `data`.
report_missings <- function(data, name_data, throw_error = FALSE) {
  missings_per_col <- purrr::map_df(data, function(x) sum(is.na(x)))

  has_missings <- rowSums(missings_per_col)

  if (has_missings) {
    cat("Reporting missings on dataset:", name_data, "\n")
    purrr::iwalk(as.list(missings_per_col), function(n_na, name) {
      if (n_na > 0) {
        cat("Counted", n_na, "missings on column", name, "\n")
      }
    })
    cat("\n\n")
  }

  if (throw_error && has_missings) {
    stop(paste0("Missings detected on ", name_data, ", please check dataset."), call. = FALSE)
  }

  invisible(data)
}


#' Get name of iterator variable
#'
#' Uses fallback if no iterator is used. Aborts if > 1 iterator is given.
#'
#' @param args_list Named list of default and provided arguments in function
#'   call to [run_trisk()].
#'
#' @return String holding name of iterator variable.
get_iter_var <- function(args_list) {
  iterate_arg <- purrr::map_int(args_list, length) %>%
    tibble::enframe() %>%
    dplyr::filter(.data$value > 1)

  if (nrow(iterate_arg) == 0) {
    iter_var <- "standard"
  } else if (nrow(iterate_arg) == 1) {
    iter_var <- iterate_arg$name
    if (iter_var %in% setup_vars_lookup) {
      rlang::abort(c(
        "Must not provide more than one value for argument that cannot be iterated",
        x = glue::glue("Arguments with multiple values: {toString(iter_var)}."),
        i = "Please correct your function call"
      ))
    }
  } else {
    rlang::abort(c(
      "Must provide no more than one argument with multiple values.",
      x = glue::glue("Arguments with multiple values: {toString(iterate_arg$name)}."),
      i = "Did you forget to pick only one?"
    ))
  }

  return(iter_var)
}

#' Helper function for logging
#'
#' Wrapper around [write()] that concatenates objects passed in `...` and
#' appends per default.
#' @noRd
paste_write <- function(..., log_path, append = TRUE) {
  text <- paste(...)
  write(text, file = log_path, append = append)
  invisible()
}

# helper functions to indent lines in logfile
format_indent_1 <- function() {
  ">>"
}
format_indent_2 <- function() {
  "  >>"
}


#' Checks if input args are missing
#'
#' Checks if any list entries in `args_list` are symbols. Called with
#' `args_list` as input argument this serves as a proxy for checking is
#' arguments are missing.
#'
#' @param args_list A named list.
#'
#' @return returns `args_list` invisibly.
fail_if_input_args_are_missing <- function(args_list) {
  missings <- purrr::map(args_list, is.symbol) %>%
    purrr::keep(isTRUE)

  if (length(missings) > 0) {
    missing_args <- glue::glue_collapse(names(missings), sep = ", ")
    rlang::abort(
      c(
        "All input arguments need to hold values.",
        x = glue::glue("Missing arguments: {missing_args}."),
        i = "Did you provide all input arguemnts correctly?."
      )
    )
  }

  invisible(args_list)
}

#' Customise output path
#'
#' Checks for existence of provided `output_path` and extends it to hold a
#' subdirectory named after the timestamp and `iter_var`.
#'
#' @param output_path String holding path to st output folder.
#' @param iter_var String holding name of iteration variable.
#' @param shock_scenario String holding name of shock scenario.
#' @param scenario_geography String holding name of scenario geography.
#' @param carbon_price_model String holding name of carbon price model for trisk
#' @param risk_type String holding the risk type.
#' @param financial_stimulus Numeric value describing additional financial support for low carbon companies.
#'
#' @return Path to subdirectory in st output folder.
customise_output_path <- function(output_path, iter_var, shock_scenario, scenario_geography, carbon_price_model, financial_stimulus, risk_type) {
  if (!dir.exists(output_path)) {
    rlang::abort(
      c(
        "Argument output_path must point to an existing directory.",
        x = glue::glue("Invalid file path: {output_path}."),
        i = "Did you set output_path correctly?."
      )
    )
  }

  if (risk_type == "trisk") {
    if (financial_stimulus > 1) {
      timestamp <- paste(format(Sys.time(), "%Y_%m_%d_%H_%M_%S"), iter_var, shock_scenario, scenario_geography, carbon_price_model, financial_stimulus, sep = "_")
    } else {
      timestamp <- paste(format(Sys.time(), "%Y_%m_%d_%H_%M_%S"), iter_var, shock_scenario, scenario_geography, carbon_price_model, sep = "_")
    }
  }

  if (risk_type == "lrisk") {
    timestamp <- paste(format(Sys.time(), "%Y_%m_%d_%H_%M_%S"), iter_var, shock_scenario, scenario_geography, sep = "_")
  }

  output_path_custom <- file.path(output_path, timestamp)

  dir.create(output_path_custom)

  if (risk_type == "trisk") {
    # FIXME: quick solution to avoid empty output dirs in case of failing calculations

    if (financial_stimulus > 1) {
      paste_write("Starting analysis.", log_path = file.path(output_path_custom, paste0("log_file_", iter_var, sep = "_", shock_scenario, sep = "_", scenario_geography, sep = "_", carbon_price_model, sep = "_", financial_stimulus, ".txt")))
    } else {
      paste_write("Starting analysis.", log_path = file.path(output_path_custom, paste0("log_file_", iter_var, sep = "_", shock_scenario, sep = "_", scenario_geography, sep = "_", carbon_price_model, ".txt")))
    }

    paste_write("Starting analysis.", log_path = file.path(output_path_custom, paste0("log_file_", iter_var, sep = "_", shock_scenario, sep = "_", scenario_geography, sep = "_", carbon_price_model, ".txt")))
  }

  if (risk_type == "lrisk") {
    # FIXME: quick solution to avoid empty output dirs in case of failing calculations
    paste_write("Starting analysis.", log_path = file.path(output_path_custom, paste0("log_file_", iter_var, sep = "_", shock_scenario, sep = "_", scenario_geography, ".txt")))
  }

  return(output_path_custom)
}

stop_if_empty <- function(data, data_name) {
  if (nrow(data) == 0) {
    rlang::abort(glue::glue("Stopping calculation, dataset {data_name} is empty."))
  }
  return(invisible(data))
}


#' Infer supported sectors and technologies
#'
#' Function returns supported sectors and technologies for provided combination
#' of `baseline_scenario`, `shock_scenario` and `scenario_geography`.
#'
#' @inheritParams run_trisk
#'
#' @return A list with entries sectors and technologies
#' @noRd
infer_sectors_and_technologies <-
  function(price_data,
           scenario_data,
           production_data,
           baseline_scenario,
           shock_scenario,
           scenario_geography) {


  baseline_type <- scenario_data %>%
    dplyr::distinct(.data$scenario, .data$scenario_type) %>%
    dplyr::filter(
      .data$scenario == !!baseline_scenario & .data$scenario_type == "baseline"
    )

  if (nrow(baseline_type)  == 0){

    available_baselines <- scenario_data %>%
      dplyr::filter(.data$scenario_type == "baseline") %>%
      dplyr::distinct(.data$scenario) %>%
      dplyr::pull()

    rlang::abort(
      c(
        "The selected baseline scenario is not of a baseline type",
        x = glue::glue("baseline scenario: {baseline_scenario}, shock_scenario: {shock_scenario}, scenario_geography: {scenario_geography}"),
        i = glue::glue("Available baseline scenarios : {available_baselines}")
      )
    )
  }


  shock_type <- scenario_data %>%
    dplyr::distinct(.data$scenario, .data$scenario_type) %>%
    dplyr::filter(
      .data$scenario == !!shock_scenario & .data$scenario_type == "shock"
    )

  if (nrow(shock_type)  == 0){

    available_shocks <- scenario_data %>%
      dplyr::filter(.data$scenario_type == "shock") %>%
      dplyr::distinct(.data$scenario) %>%
      dplyr::pull()

    rlang::abort(
      c(
        "The selected shock scenario is not of a shock type",
        x = glue::glue("baseline scenario: {baseline_scenario}, shock_scenario: {shock_scenario}, scenario_geography: {scenario_geography}"),
        i = glue::glue("Available shock scenarios : {available_shocks}")
      )
    )
  }


  available_scenario_data <- scenario_data %>%
    dplyr::distinct(.data$scenario, .data$ald_sector, .data$ald_business_unit, .data$scenario_geography) %>%
    dplyr::filter(.data$scenario %in% c(baseline_scenario, shock_scenario))

  available_scenario_geography_data <- available_scenario_data %>%
    dplyr::filter(.data$scenario_geography == .env$scenario_geography)

  if ((nrow(available_scenario_data) > 0) & (nrow(available_scenario_geography_data) == 0)){
    rlang::abort(
      c(
        "Could not find scenario data matching the provided geography.",
        x = glue::glue("baseline scenario: {baseline_scenario}, shock_scenario: {shock_scenario}, scenario_geography: {scenario_geography}"),
        i = "Please use function geographies_for_sector to identify a valid geography for those scenarios."
      )
    )
  }

  available_price_data <- price_data %>%
    dplyr::distinct(.data$scenario, .data$ald_sector, .data$ald_business_unit) %>%
    dplyr::filter(.data$scenario %in% c(baseline_scenario, shock_scenario))


  scenario_geography_x_ald_sector <- dplyr::inner_join( available_scenario_geography_data, available_price_data)

  if (nrow(scenario_geography_x_ald_sector) != nrow(available_scenario_geography_data) |
      nrow(scenario_geography_x_ald_sector) != nrow(available_price_data) ) {

    anti_joined_rows <- dplyr::bind_rows(
      dplyr::anti_join(available_scenario_geography_data, available_price_data),
      dplyr::anti_join(available_price_data, available_scenario_geography_data)
    )
    # Convert the dataframe to a string
    rows_as_string <- apply(anti_joined_rows, 1, function(x) paste(x, collapse = ", "))
    formatted_rows <- paste(rows_as_string, collapse = "\n")


    rlang::abort(
      c(
        "Could not match all the data points between price and scenario datasets.",
        x = glue::glue("baseline scenario: {baseline_scenario}, shock_scenario: {shock_scenario}, scenario_geography: {scenario_geography}"),
        i = glue::glue("Please solve the mismatch of datapoints that appear on the given perimeter, between price_data_long.csv and Scenarios_AnalysisInput.csv . \n Mismatching rows : \n {formatted_rows}")
      )
    )
  }

  available_production_data <- production_data %>%
    dplyr::distinct(.data$ald_sector, .data$ald_business_unit)

  scenario_geography_x_ald_sector <- dplyr::inner_join( scenario_geography_x_ald_sector, available_production_data)

  sectors_baseline <- scenario_geography_x_ald_sector %>%
    dplyr::filter(.data$scenario == !!baseline_scenario & .data$scenario_geography == !!scenario_geography) %>%
    dplyr::pull(.data$ald_sector)

  sectors_shock <- scenario_geography_x_ald_sector %>%
    dplyr::filter(.data$scenario == !!shock_scenario & .data$scenario_geography == !!scenario_geography) %>%
    dplyr::pull(.data$ald_sector)

  shared_sectors <- dplyr::intersect(sectors_baseline, sectors_shock)

  if (length(shared_sectors) == 0) {
    rlang::abort(
      c(
        "Could not find sectors that are supported for baseline and shock scenario for selected scenario_geography.",
        x = glue::glue("baseline scenario: {baseline_scenario}, shock_scenario: {shock_scenario}, scenario_geography: {scenario_geography}"),
        i = "Please use function scenario_for_sector_x_geography to identify a valid combination."
      )
    )
  }

  technologies_baseline <- scenario_geography_x_ald_sector %>%
    dplyr::filter(.data$scenario == !!baseline_scenario & .data$scenario_geography == !!scenario_geography) %>%
    dplyr::pull(.data$ald_business_unit)

  technologies_shock <- scenario_geography_x_ald_sector %>%
    dplyr::filter(.data$scenario == !!shock_scenario & .data$scenario_geography == !!scenario_geography) %>%
    dplyr::pull(.data$ald_business_unit)

  technologies <- dplyr::intersect(technologies_baseline, technologies_shock)
  technologies_baseline <- scenario_geography_x_ald_sector %>%
    dplyr::filter(.data$scenario == !!baseline_scenario & .data$scenario_geography == !!scenario_geography) %>%
    dplyr::pull(.data$ald_business_unit)

  technologies_shock <- scenario_geography_x_ald_sector %>%
    dplyr::filter(.data$scenario == !!shock_scenario & .data$scenario_geography == !!scenario_geography) %>%
    dplyr::pull(.data$ald_business_unit)

  technologies <- dplyr::intersect(technologies_baseline, technologies_shock)

  return(list(sectors = shared_sectors, technologies = technologies))
}

# TODO THIS FUNCTION IS ONLY USED IN LRISK
# REMOVE ASAP WHEN USING LRISK
infer_scenario_type <- function(baseline_scenario, shock_scenario) {
  if (startsWith(baseline_scenario, "NGFS") &
    startsWith(shock_scenario, "NGFS")) {
    return("is_ngfs")
  }

  if (startsWith(baseline_scenario, "IPR2023_") &
      startsWith(shock_scenario, "IPR2023_")) {
    return("is_ipr")
  }

  if (startsWith(baseline_scenario, "IPR2023Automotive_") &
      startsWith(shock_scenario, "IPR2023Automotive_")) {
    return("is_iprAuto")
  }


  if (startsWith(baseline_scenario, "GECO") &
    startsWith(shock_scenario, "GECO")) {
    return("is_geco")
  }

  if (startsWith(baseline_scenario, "Oxford") &
    startsWith(shock_scenario, "Oxford")) {
    return("is_oxford")
  }

  if (startsWith(baseline_scenario, "WEO") &
    startsWith(shock_scenario, "WEO")) {
    return("is_weo")
  } else {
    rlang::abort(
      c(
        "The chosen baseline and shock scenario cannot be combined with one another",
        x = glue::glue("baseline scenario: {baseline_scenario}, shock_scenario: {shock_scenario}"),
        i = "Use function get_scenario_geography_x_ald_sector(st_input_folder) to find an appropriate pair."
      )
    )
  }
}

end_year_lookup <- function(scenario_type) {
  end_year <- as.numeric(2040)

  if (scenario_type %in% c("is_ngfs", "is_oxford", "is_ipr")) {
    end_year <- as.numeric(2050)
  }

  return(end_year)
}
2DegreesInvesting/r2dii.climate.stress.test documentation built on June 6, 2024, 8:23 a.m.