#' Produce a dictionary of non-valid categorical values within a dataset, for
#' use in subsequent data cleaning
#'
#' @description
#' Values are compared against a user-provided dictionary specifying the allowed
#' values of each categorical variable, after text standardization to account
#' for minor differences in character case, spacing, and punctuation.
#'
#' The resulting cleaning dictionary can then be manually reviewed to fill in
#' appropriate replacement values for each non-valid categorical value, or a
#' missing-value keyword indicating that the value should be converted to `NA`.
#'
#' @inheritParams check_numeric
#' @inherit check_numeric return
#'
#' @param dict_allowed Dictionary of allowed values for each variable of
#' interest. Must include columns for "variable" and "value" (the names of
#' which can be modified with args `col_allowed_var` and `col_allowed_value`).
#' @param col_allowed_var Name of column in `dict_allowed` giving variable name
#' (defaults to "variable")
#' @param col_allowed_value Name of column in `dict_allowed` giving allowed
#' values (defaults to "value")
#' @param fn Function to standardize raw values in both the dataset and
#' dictionary prior to comparing, to account for minor variation in character
#' case, spacing, punctuation, etc. Defaults to [`std_text`]. To omit the
#' standardization step can use e.g. `as.character` or an identity function
#' `function(x) x`.
#' @param allow_na Logical indicating whether missing values should always be
#' treated as 'allowed' even if not explicitly specified in `dict_allowed`.
#' Defaults to `TRUE`.
#'
#' @examples
#' # load example dataset, and dictionary of allowed categorical values
#' data(ll1)
#' data(dict_categ1)
#'
#' # basic output
#' check_categorical(ll1, dict_allowed = dict_categ1)
#'
#' @importFrom dplyr `%>%` select filter mutate any_of all_of case_when if_else
#' distinct arrange bind_rows left_join anti_join
#' @importFrom tidyr pivot_longer
#' @importFrom rlang .data .env
#' @export check_categorical
check_categorical <- function(x,
dict_allowed,
dict_clean = NULL,
vars_id = NULL,
col_allowed_var = "variable",
col_allowed_value = "value",
fn = std_text,
allow_na = TRUE,
na = ".na",
populate_na = FALSE,
return_all = FALSE) {
fn <- match.fun(fn)
vars <- intersect(unique(dict_allowed[[col_allowed_var]]), names(x))
# pivot numeric vars to long format
x_long <- x %>%
reclass_cols(cols = vars, fn = as.character) %>%
select(any_of(vars_id), all_of(vars)) %>%
tidyr::pivot_longer(cols = -any_of(vars_id), names_to = "variable")
# standardize
x_long_std <- x_long %>%
mutate(value = suppressWarnings(fn(.data$value)))
# apply existing dictionary-based corrections, if specified
if (!is.null(dict_clean)) {
# prep dict
dict_clean_std <- dict_clean %>%
select(any_of(vars_id), all_of(c("variable", "value", "replacement"))) %>%
filter(!is.na(.data$replacement)) %>%
mutate(
replacement = case_when(
replacement %in% .env$na ~ .env$na,
TRUE ~ suppressWarnings(fn(.data$replacement))
)
)
# apply corrections
x_long_std <- x_long_std %>%
left_join(dict_clean_std, by = c(vars_id, "variable", "value")) %>%
mutate(
value = if_else(!is.na(.data$replacement), .data$replacement, .data$value),
value = if_else(.data$replacement %in% .env$na, NA_character_, .data$value)
)
} else {
x_long_std$replacement <- NA_character_
}
# filter to non-valid and non-replaced
dict_allowed_std <- dict_allowed %>%
select(variable = all_of(col_allowed_var), value = all_of(col_allowed_value)) %>%
mutate(value = suppressWarnings(fn(.data$value)))
x_nonvalid <- x_long_std %>%
anti_join(dict_allowed_std, by = c("variable", "value")) %>%
filter(is.na(.data$replacement))
if (allow_na) {
x_nonvalid <- x_nonvalid %>%
filter(!is.na(.data$value))
}
# prep for output
replacement_prepopulate <- ifelse(populate_na, na, NA_character_)
x_out <- x_nonvalid %>%
select(
any_of(vars_id),
all_of(c("variable", "value"))
) %>%
arrange(.data$variable) %>%
distinct() %>%
mutate(
replacement = .env$replacement_prepopulate,
new = TRUE
)
# add original rows of dict to output
if (return_all & !is.null(dict_clean)) {
x_out_new <- x_out %>%
anti_join(dict_clean, by = c(vars_id, "variable", "value"))
x_out <- dict_clean %>%
mutate(new = as.logical(NA)) %>%
bind_rows(x_out_new)
}
# return
return(x_out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.