llutils: Linelist Utilities

Documented in recode_conditional

#' Conditionally recode values within a data frame column based on one or more
#' dictionaries containing columns to match against, and corresponding
#' replacement values to insert where a match is made
#'
#' @param df A \code{data.frame}-like object
#' @param dict The dictionary. A data frame, or list of data frames, each
#'   containing a column of replacement values, and one or more columns to match
#'   against \code{df} (matching via \code{dplyr::left_join}).
#' @param col_recode Name (character) of the column to be recoded. This column
#'   must be present both within \code{df} and \code{dict}.
#' @param flag_recoded Logical indicating whether to add a column to the output
#'   indicating whether the given row has been matched by a dictionary entry and
#'   therefore recoded. Defaults to FALSE. If TRUE, the new column will be
#'   called \code{<col_recode>_is_recoded}.
#'
#' @return
#' A data.frame corresponding to \code{df}, but with values in column
#' \code{col_recode} recoded based on the given dictionary.
#'
#' If \code{flag_recoded} is TRUE, the output will contain an additional column
#' \code{<col_recode>_is_recoded}, indicating whether the given row has been
#' matched by a dictionary entry and therefore recoded.
#'
#' @examples
#' # dictionaries
#' dict1 <- data.frame(
#'   x1 = c("a", "b", "c", "d"),
#'   y = c("alpha", "bravo", "charlie", "delta"),
#'   stringsAsFactors = FALSE
#' )
#'
#' dict2 <- data.frame(
#'   x1 = c("0", "1", "2", "3"),
#'   y = c("zero", "one", "two", "three"),
#'   stringsAsFactors = FALSE
#' )
#'
#' # data frame with column to be recoded ('y')
#' dat <- data.frame(
#'   x1 = c("a", "c", "c", "d", "b", "0"),
#'   y = c("???", "???", "???", "???", "???", "???"),
#'   stringsAsFactors = FALSE
#' )
#'
#' # recode column "y" based on dictionary 'dict1'
#' recode_conditional(dat, dict = dict1, col_recode = "y")
#'
#' # recode column "y" based on dictionaries 'dict1' and 'dict2'
#' recode_conditional(dat, dict = list(dict1, dict2), col_recode = "y")
#'
#' @importFrom vctrs vec_cast
#' @importFrom lubridate as_date
#' @importFrom dplyr left_join
#' @export recode_conditional
recode_conditional <- function(df, dict, col_recode, flag_recoded = FALSE) {

  # if dict is single df convert to list
  if ("data.frame" %in% class(dict)) {
    dict <- list(dict)
  }

  # number of dictionaries
  n_dict <- length(dict)

  # name of replacement columns, and replacement check columns
  cols_replace <- paste0("NC_REPLACE_", col_recode, "_", seq_len(n_dict))
  cols_repcheck <- paste0("NC_REPCHECK_", seq_len(n_dict))
  cols_dictrow <- paste0("NC_DICTROW_", seq_len(n_dict))

  # for each dictionary...
  for(i in seq_len(n_dict)) {

    # check dict[[i]] for non-matching columns
    names_df <- names(df)
    names_dict <- names(dict[[i]])

    if (!all(is.element(names_dict, names_df))) {
      warning("The following column(s) of dict[[", i, "]] are not found in df: ",
              paste(names_dict[!is.element(names_dict, names_df)], collapse = ", "),
              call. = FALSE)
    }

    # take unique rows of dictionary, to prevent adding new rows to df
    # TODO: perhaps add optional warning here
    dict[[i]] <- unique(dict[[i]])

    # reclass dictionary to match df
    class_from <- class(dict[[i]][[col_recode]])
    class_to <- class(df[[col_recode]])

    if (class_from == "character" & class_to == "Date") {
      dict[[i]][[col_recode]] <- lubridate::as_date(dict[[i]][[col_recode]])
    } else {
      dict[[i]][[col_recode]] <- vctrs::vec_cast(
        dict[[i]][[col_recode]],
        to = df[[col_recode]]
      )
    }

    # rename dict column
    names(dict[[i]])[names(dict[[i]]) == col_recode] <- cols_replace[i]

    # add column of row identifiers to dict[[i]]
    dict[[i]][[cols_dictrow[i]]] <- seq_len(nrow(dict[[i]]))

    # left-join dict[[i]] to df
    cols_common <- base::intersect(names(df), names(dict[[i]]))
    df <- dplyr::left_join(df, dict[[i]], by = cols_common)

    # replace non-matches with FALSE
    df[[cols_repcheck[i]]] <- ifelse(is.na(df[[cols_dictrow[i]]]), FALSE, TRUE)

    # recode
    df[[col_recode]][df[[cols_repcheck[i]]]] <- df[[cols_replace[i]]][df[[cols_repcheck[i]]]]
  }

  # matrix of logicals indicating whether row matched with dictionary
  check_mat <- as.matrix(df[,cols_repcheck])

  # which rows matched by multiple dict entries?
  multiple_matches <- which(apply(check_mat, 1, function(x) length(which(x)) > 1))

  if (length(multiple_matches) > 0) {

    # do matching entries having conflicting replacement values?
    match_conflict_lgl <- apply(df[multiple_matches,cols_replace], 1, function(x) length(unique(x)) > 1)

    multiple_match_conflict <- multiple_matches[match_conflict_lgl]

    # if conflicting matches, print error
    if (any(match_conflict_lgl)) {
      df_conflicts <- df[multiple_matches,cols_dictrow]

      # format conflicting dictionary rows for printing
      l_i <- lapply(seq_len(nrow(df_conflicts)), function (i) which(!is.na(df_conflicts[i,])))
      l_names <- lapply(l_i, function(i) paste0("dict", i))
      l_rows <- lapply(seq_len(nrow(df_conflicts)), function(i) df_conflicts[i, as.vector(!is.na(df_conflicts[i,]))])

      conflicts_print <- mapply(function(x, y) paste0(x, "[", y, ",]"), l_names, l_rows, SIMPLIFY = FALSE)
      conflicts_print <- vapply(conflicts_print, function(x) paste0(x, collapse = ", "), "")
      conflicts_print <- paste0(seq_along(conflicts_print), ". ", conflicts_print, "\n")

      stop(length(multiple_match_conflict),
           " line(s) matched by multiple dictionary entries with conflicting replacement values:\n",
           conflicts_print,
           call. = FALSE)
    }
  }

  cols_keep <- !names(df) %in% c(cols_replace, cols_repcheck, cols_dictrow)
  df_out <- df[,cols_keep]

  if (flag_recoded) {
    flag_name <- paste(col_recode, "is_recoded", sep = "_")
    df_out[[flag_name]] <- apply(check_mat, 1, any)
  }

  return(df_out)
}