R/prioritize.R

Defines functions relevel2 prioritize_at prioritize_level set_priority check_duplicated_score1 has_zero_rows prioritize

Documented in prioritize prioritize_level

#' Pick rows where `score` is 1 and `level` per loan is of highest `priority`
#'
#' When multiple perfect matches are found per loan (e.g. a match at
#' `direct_loantaker` level and `ultimate_parent` level), we must prioritize the
#' desired match. By default, the highest `priority` is the most granular match
#' (i.e. `direct_loantaker`).
#'
#' @template ignores-but-preserves-existing-groups
#'
#' @param data A data frame like the validated output of [match_name()]. See
#'  _Details_ on how to validate `data`.
#' @param priority One of:
#'   * `NULL`: defaults to the default level priority as returned by
#'   [prioritize_level()].
#'   * A character vector giving a custom priority.
#'   * A function to apply to the output of [prioritize_level()], e.g. `rev`.
#'   * A quosure-style lambda function, e.g. `~ rev(.x)`.
#'
#' @seealso [match_name()], [prioritize_level()].
#' @family main functions
#'
#' @details
#' **How to validate `data`**
#' @includeRmd vignettes/_validate-matches.md
#'
#' @return A data frame with a single row per loan, where `score` is 1 and
#'   priority level is highest.
#'
#' @export
#'
#' @examples
#' library(dplyr)
#'
#' # styler: off
#' matched <- tribble(
#'   ~sector, ~sector_abcd,  ~score, ~id_loan,                ~level,
#'    "coal",      "coal",       1,     "aa",     "ultimate_parent",
#'    "coal",      "coal",       1,     "aa",    "direct_loantaker",
#'    "coal",      "coal",       1,     "bb", "intermediate_parent",
#'    "coal",      "coal",       1,     "bb",     "ultimate_parent",
#' )
#' # styler: on
#'
#' prioritize_level(matched)
#'
#' # Using default priority
#' prioritize(matched)
#'
#' # Using the reverse of the default priority
#' prioritize(matched, priority = rev)
#'
#' # Same
#' prioritize(matched, priority = ~ rev(.x))
#'
#' # Using a custom priority
#' bad_idea <- c("intermediate_parent", "ultimate_parent", "direct_loantaker")
#'
#' prioritize(matched, priority = bad_idea)
prioritize <- function(data, priority = NULL) {
  if (has_zero_rows(data)) {
    return(data)
  }

  if (all(c("sector_ald", "sector_abcd") %in% names(data))) {

    rlang::abort(
      "too_many_sectors",
      message = glue(
        "Column `sector_ald` is deprecated as of r2dii.match 0.1.0, please use
        `sector_abcd` instead."
      )
    )

  } else if ("sector_ald" %in% names(data)) {

    rlang::warn(
      "deprecated_name",
      message = glue(
        "Column `sector_ald` is deprecated as of r2dii.match 0.1.0, please use
        `sector_abcd` instead."
      )
    )

    data <- dplyr::rename(data, sector_abcd = "sector_ald")
  }

  data %>%
    check_crucial_names(
      c("id_loan", "level", "score", "sector", "sector_abcd")
    ) %>%
    check_duplicated_score1()

  priority <- set_priority(data, priority = priority)

  old_groups <- dplyr::groups(data)
  perfect_matches <- filter(ungroup(data), .data$score == 1L)

  out <- perfect_matches %>%
    group_by(.data$id_loan, .data$sector, .data$sector_abcd) %>%
    prioritize_at(.at = "level", priority = priority) %>%
    ungroup()

  group_by(out, !!!old_groups)
}

has_zero_rows <- function(data) {
  !nrow(data) > 0L
}

check_duplicated_score1 <- function(data) {
  score_1 <- filter(data, .data$score == 1)
  # The only important side effect of this function if to abort duplicated rows
  loan_level <- suppressMessages(select(score_1, all_of(c("id_loan", "level"))))
  is_duplicated <- duplicated(loan_level)

  if (!any(is_duplicated)) {
    return(invisible(data))
  }

  duplicated_rows <- commas(rownames(data)[is_duplicated])
  abort(
    class = "duplicated_score1_by_id_loan_by_level",
    message = glue(
      "`data` where `score` is `1` must be unique by `id_loan` by `level`.
     Duplicated rows: {duplicated_rows}.
     Have you ensured that only one abcd-name per loanbook-name is set to `1`?"
    )
  )
}

set_priority <- function(data, priority) {
  priority <- priority %||% prioritize_level(data)

  if (inherits(priority, "function")) {
    f <- priority
    priority <- f(prioritize_level(data))
  }

  if (inherits(priority, "formula")) {
    f <- rlang::as_function(priority)
    priority <- f(prioritize_level(data))
  }

  known_levels <- sort(unique(data$level))
  unknown_levels <- setdiff(priority, known_levels)
  if (!identical(unknown_levels, character(0))) {
    rlang::warn(
      glue(
        "Ignoring `priority` levels not found in data: \\
        {paste0(unknown_levels, collapse = ', ')}
        Did you mean to use one of: {paste0(known_levels, collapse = ', ')}?"
      )
    )
  }

  priority
}

#' Arrange unique `level` values in default order of `priority`
#'
#' @param data A data frame, commonly the output of [match_name()].
#'
#' @family helpers
#'
#' @return A character vector of the default level priority per loan.
#' @export
#'
#' @examples
#' matched <- tibble::tibble(
#'   level = c(
#'     "intermediate_parent_1",
#'     "direct_loantaker",
#'     "direct_loantaker",
#'     "direct_loantaker",
#'     "ultimate_parent",
#'     "intermediate_parent_2"
#'   )
#' )
#' prioritize_level(matched)
prioritize_level <- function(data) {
  select_chr(
    # Sort sufixes: e.g. intermediate*1, *2, *n
    sort(unique(data$level)),
    tidyselect::matches("direct"),
    tidyselect::matches("intermediate"),
    tidyselect::matches("ultimate")
  )
}

#' Pick rows from a data frame based on a priority set at some columns
#'
#' @param data A data frame.
#' @param .at Most commonly, a character vector of one column name. For more
#'   general usage see the `.vars` argument to [dplyr::arrange_at()].
#' @param priority Most commonly, a character vector of the priority to
#'   re-order the column(x) given by `.at`.
#'
#' @return A data frame, commonly with less rows than the input.
#'
#' @examples
#' library(dplyr)
#'
#' # styler: off
#' data <- tibble::tribble(
#'   ~x,  ~y,
#'    1, "a",
#'    2, "a",
#'    2, "z",
#' )
#' # styler: on
#'
#' data %>% prioritize_at("y")
#'
#' data %>%
#'   group_by(x) %>%
#'   prioritize_at("y")
#'
#' data %>%
#'   group_by(x) %>%
#'   prioritize_at(.at = "y", priority = c("z", "a")) %>%
#'   arrange(x) %>%
#'   ungroup()
#' @noRd
prioritize_at <- function(data, .at, priority = NULL) {
  data %>%
    dplyr::arrange_at(.at, .funs = relevel2, new_levels = priority) %>%
    dplyr::filter(dplyr::row_number() == 1L)
}

relevel2 <- function(f, new_levels) {
  factor(f, levels = c(new_levels, setdiff(levels(f), new_levels)))
}

Try the r2dii.match package in your browser

Any scripts or data that you put into this service are public.

r2dii.match documentation built on Oct. 23, 2023, 5:09 p.m.