#' Conditionally recode values within a data frame column based on one or more
#' dictionaries containing columns to match against, and corresponding
#' replacement values to insert where a match is made
#'
#' @param df A \code{data.frame}-like object
#' @param dict The dictionary. A data frame, or list of data frames, each
#' containing a column of replacement values, and one or more columns to match
#' against \code{df} (matching via \code{dplyr::left_join}).
#' @param col_recode Name (character) of the column to be recoded. This column
#' must be present both within \code{df} and \code{dict}.
#' @param flag_recoded Logical indicating whether to add a column to the output
#' indicating whether the given row has been matched by a dictionary entry and
#' therefore recoded. Defaults to FALSE. If TRUE, the new column will be
#' called \code{<col_recode>_is_recoded}.
#'
#' @return
#' A data.frame corresponding to \code{df}, but with values in column
#' \code{col_recode} recoded based on the given dictionary.
#'
#' If \code{flag_recoded} is TRUE, the output will contain an additional column
#' \code{<col_recode>_is_recoded}, indicating whether the given row has been
#' matched by a dictionary entry and therefore recoded.
#'
#' @examples
#' # dictionaries
#' dict1 <- data.frame(
#' x1 = c("a", "b", "c", "d"),
#' y = c("alpha", "bravo", "charlie", "delta"),
#' stringsAsFactors = FALSE
#' )
#'
#' dict2 <- data.frame(
#' x1 = c("0", "1", "2", "3"),
#' y = c("zero", "one", "two", "three"),
#' stringsAsFactors = FALSE
#' )
#'
#' # data frame with column to be recoded ('y')
#' dat <- data.frame(
#' x1 = c("a", "c", "c", "d", "b", "0"),
#' y = c("???", "???", "???", "???", "???", "???"),
#' stringsAsFactors = FALSE
#' )
#'
#' # recode column "y" based on dictionary 'dict1'
#' recode_conditional(dat, dict = dict1, col_recode = "y")
#'
#' # recode column "y" based on dictionaries 'dict1' and 'dict2'
#' recode_conditional(dat, dict = list(dict1, dict2), col_recode = "y")
#'
#' @importFrom vctrs vec_cast
#' @importFrom lubridate as_date
#' @importFrom dplyr left_join
#' @export recode_conditional
recode_conditional <- function(df, dict, col_recode, flag_recoded = FALSE) {
# if dict is single df convert to list
if ("data.frame" %in% class(dict)) {
dict <- list(dict)
}
# number of dictionaries
n_dict <- length(dict)
# name of replacement columns, and replacement check columns
cols_replace <- paste0("NC_REPLACE_", col_recode, "_", seq_len(n_dict))
cols_repcheck <- paste0("NC_REPCHECK_", seq_len(n_dict))
cols_dictrow <- paste0("NC_DICTROW_", seq_len(n_dict))
# for each dictionary...
for(i in seq_len(n_dict)) {
# check dict[[i]] for non-matching columns
names_df <- names(df)
names_dict <- names(dict[[i]])
if (!all(is.element(names_dict, names_df))) {
warning("The following column(s) of dict[[", i, "]] are not found in df: ",
paste(names_dict[!is.element(names_dict, names_df)], collapse = ", "),
call. = FALSE)
}
# take unique rows of dictionary, to prevent adding new rows to df
# TODO: perhaps add optional warning here
dict[[i]] <- unique(dict[[i]])
# reclass dictionary to match df
class_from <- class(dict[[i]][[col_recode]])
class_to <- class(df[[col_recode]])
if (class_from == "character" & class_to == "Date") {
dict[[i]][[col_recode]] <- lubridate::as_date(dict[[i]][[col_recode]])
} else {
dict[[i]][[col_recode]] <- vctrs::vec_cast(
dict[[i]][[col_recode]],
to = df[[col_recode]]
)
}
# rename dict column
names(dict[[i]])[names(dict[[i]]) == col_recode] <- cols_replace[i]
# add column of row identifiers to dict[[i]]
dict[[i]][[cols_dictrow[i]]] <- seq_len(nrow(dict[[i]]))
# left-join dict[[i]] to df
cols_common <- base::intersect(names(df), names(dict[[i]]))
df <- dplyr::left_join(df, dict[[i]], by = cols_common)
# replace non-matches with FALSE
df[[cols_repcheck[i]]] <- ifelse(is.na(df[[cols_dictrow[i]]]), FALSE, TRUE)
# recode
df[[col_recode]][df[[cols_repcheck[i]]]] <- df[[cols_replace[i]]][df[[cols_repcheck[i]]]]
}
# matrix of logicals indicating whether row matched with dictionary
check_mat <- as.matrix(df[,cols_repcheck])
# which rows matched by multiple dict entries?
multiple_matches <- which(apply(check_mat, 1, function(x) length(which(x)) > 1))
if (length(multiple_matches) > 0) {
# do matching entries having conflicting replacement values?
match_conflict_lgl <- apply(df[multiple_matches,cols_replace], 1, function(x) length(unique(x)) > 1)
multiple_match_conflict <- multiple_matches[match_conflict_lgl]
# if conflicting matches, print error
if (any(match_conflict_lgl)) {
df_conflicts <- df[multiple_matches,cols_dictrow]
# format conflicting dictionary rows for printing
l_i <- lapply(seq_len(nrow(df_conflicts)), function (i) which(!is.na(df_conflicts[i,])))
l_names <- lapply(l_i, function(i) paste0("dict", i))
l_rows <- lapply(seq_len(nrow(df_conflicts)), function(i) df_conflicts[i, as.vector(!is.na(df_conflicts[i,]))])
conflicts_print <- mapply(function(x, y) paste0(x, "[", y, ",]"), l_names, l_rows, SIMPLIFY = FALSE)
conflicts_print <- vapply(conflicts_print, function(x) paste0(x, collapse = ", "), "")
conflicts_print <- paste0(seq_along(conflicts_print), ". ", conflicts_print, "\n")
stop(length(multiple_match_conflict),
" line(s) matched by multiple dictionary entries with conflicting replacement values:\n",
conflicts_print,
call. = FALSE)
}
}
cols_keep <- !names(df) %in% c(cols_replace, cols_repcheck, cols_dictrow)
df_out <- df[,cols_keep]
if (flag_recoded) {
flag_name <- paste(col_recode, "is_recoded", sep = "_")
df_out[[flag_name]] <- apply(check_mat, 1, any)
}
return(df_out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.