dbc: Dictionary-Based Cleaning

Documented in clean_categorical

#' Clean categorical variables within a dataset based on a dictionary of
#' value-replacement pairs
#'
#' @description
#' Applies a dictionary of value-replacement pairs to clean and standardize
#' values of categorical variables. Includes options for text standardization to
#' standardize minor differences in character case, spacing, and punctuation.
#'
#' @inheritParams check_categorical
#' @inheritParams clean_numeric
#'
#' @param x A data frame with one or more columns to clean
#' @param dict_clean Optional dictionary of value-replacement pairs (e.g.
#'   produced by [`check_categorical`]). Must include columns "variable",
#'   "value", "replacement", and, if specified as an argument, all columns
#'   specified by `vars_id`.
#' @param non_allowed_to_missing Logical indicating whether to replace values
#'   that remain non-allowed, even after cleaning and standardization, to NA.
#'   Defaults to TRUE.
#'
#' If no dictionary is provided, will simply standardize columns to match
#' allowed values specified in `dict_allowed`.
#'
#' @return
#' The original data frame `x` but with cleaned versions of the categorical
#' variables specified in argument `dict_allowed`
#'
#' @examples
#' # load example dataset, dictionary of allowed categorical values, and
#' # cleaning dictionary
#' data(ll1)
#' data(dict_categ1)
#' data(clean_categ1)
#'
#' # dictionary-based corrections to categorical vars
#' clean_categorical(
#'   ll1,
#'   dict_allowed = dict_categ1,
#'   dict_clean = clean_categ1
#' )
#'
#' # require exact matching, including character case
#' clean_categorical(
#'   ll1,
#'   dict_allowed = dict_categ1,
#'   dict_clean = clean_categ1,
#'   fn = identity
#' )
#'
#' # apply standardization to dict_allowed but no additional dict-based cleaning
#' clean_categorical(
#'   ll1,
#'   dict_allowed = dict_categ1
#' )
#'
#' @importFrom dplyr `%>%` select filter mutate any_of all_of case_when
#' @importFrom tidyr pivot_longer pivot_wider
#' @importFrom rlang .data .env
#' @export clean_categorical
clean_categorical <- function(x,
                              dict_allowed,
                              dict_clean = NULL,
                              vars_id = NULL,
                              col_allowed_var = "variable",
                              col_allowed_value = "value",
                              non_allowed_to_missing = TRUE,
                              fn = std_text,
                              na = ".na") {

  fn <- match.fun(fn)
  vars <- intersect(unique(dict_allowed[[col_allowed_var]]), names(x))

  # prep x
  x_prep <- x %>%
    mutate(ROWID_TEMP_ = seq_len(nrow(.)), .before = 1) %>%
    reclass_cols(cols = vars, fn = as.character)

  # prep dict_allowed
  dict_allowed_std <- dict_allowed %>%
    select(variable = all_of(col_allowed_var), value = all_of(col_allowed_value))

  # pivot vars to long format
  x_long <- x_prep %>%
    select(all_of("ROWID_TEMP_"), any_of(vars_id), all_of(vars)) %>%
    tidyr::pivot_longer(cols = -any_of(c("ROWID_TEMP_", vars_id)), names_to = "variable") %>%
    mutate(value_std = fn(.data$value))

  # apply dictionary-specified replacements
  if (!is.null(dict_clean)) {

    test_dict(dict_clean, fn, na)

    join_cols <- c(vars_id, "variable", "value_std")

    dict_clean_join <- dict_clean %>%
      mutate(value_std = fn(.data$value)) %>%
      select(any_of(vars_id), all_of(c("variable", "value_std", "replacement"))) %>%
      unique()

    x_long <- x_long %>%
      left_join(dict_clean_join, by = join_cols) %>%
      mutate(
        value = case_when(
          .data$replacement %in% .env$na ~ NA_character_,
          !is.na(.data$replacement) ~ .data$replacement,
          TRUE ~ .data$value
        )
      )
  }

  # pivot corrected numeric vars to wide form
  x_long_wide <- x_long %>%
    tidyr::pivot_wider(id_cols = "ROWID_TEMP_", names_from = "variable", values_from = "value")

  # standardize values and match to dict_allowed
  x_long_wide_match <- x_long_wide %>%
    match_coded(
      dict = dict_allowed_std,
      fn = fn,
      non_allowed_to_missing = non_allowed_to_missing
    )

  # merge corrected vars back into original dataset
  x_out <- x_prep %>%
    left_join_replace(x_long_wide_match, cols_match = "ROWID_TEMP_") %>%
    select(!all_of("ROWID_TEMP_"))

  # return
  return(x_out)
}