#' Creates a preparation map
#'
#' @param data_to_encode `data.frame()` \cr Data.frame of data to encode.
#' @param group_variables `character(1)` \cr Name of the variable to target
#' encode. Should be column names of `data_to_encode`.
#' @param outcome_variable `character(1)` \cr Target to encode.
#' @export
target_encode_create <- function(
data_to_encode,
group_variables,
outcome_variable
){
assertthat::assert_that(all(group_variables %in% names(data_to_encode)))
assertthat::assert_that(outcome_variable %in% names(data_to_encode))
res <- data.frame()
for (variable in group_variables){
aux_data <- data_to_encode %>%
dplyr::select(tidyselect::one_of(variable, outcome_variable)) %>%
dplyr::rename(group = !!dplyr::sym(variable)) %>%
dplyr::group_by(group) %>%
dplyr::summarize(
group_variable = variable,
positive = sum(!!dplyr::sym(outcome_variable)),
n = dplyr::n()
)
suppressWarnings(
res <- dplyr::bind_rows(res, aux_data) %>%
tibble::as_tibble()
)
}
res$group <- as.factor(res$group)
return(dplyr::arrange(res, group_variable))
}
#' Applies a preparation map
#'
#' @param preparation_map `data.frame()` \cr Preparation map formatted as the
#' expected output from target_encode_create.
#' @param group_variables `character()` \cr On which categorical variables to
#' apply the target encoding?
#' @param outcome_variable `character()` \cr Name of the column representing
#' the boolean outcome.
#' @param data `data.frame()` \cr Data to apply the target encode preparation
#' map to.
#' @param holdout_type `character(1)` \cr The holdout type used. Must be one of: "LeaveOneOut", "None".
#' @param prior_sample_size `integer(1)` \cr If non-zero, then a bayesian
#' blended average is computed. The prior_sample_size defines the weight of
#' the population mean compared to the sample mean.
#' @param noise_level `float(1)`\cr The amount of random noise added to the
#' target encoding. This helps prevent overfitting. Defaults to 0.01.
#' @param seed `integer(1)` \cr Seed for reproductible random values.
#'
#' @export
target_encode_apply <- function(
data,
group_variables,
outcome_variable,
preparation_map,
holdout_type = "none",
prior_sample_size = 30L,
noise_level = 0,
seed = 1793
){
set.seed(seed)
assertthat::assert_that(
holdout_type %in% c("leave_one_out", "none"),
msg = 'holdout_type should either be "leave_one_out" or "none"'
)
assertthat::assert_that(
all(group_variables %in% names(data)),
msg = paste0(
"Variables are missing from data input frame : ",
setdiff(group_variables, names(data))
)
)
assertthat::assert_that(
all(group_variables %in% preparation_map$group_variable),
msg = paste0(
"Variables are missing from preparation map : ",
setdiff(group_variables, preparation_map$group_variable)
)
)
res <- data
for (variable in group_variables){
preparation_map_excerpt <- preparation_map %>%
dplyr::filter(group_variable == variable) %>%
dplyr::select(-group_variable)
res <- res %>%
dplyr::left_join(
preparation_map_excerpt,
by = setNames("group", variable)
)
if (holdout_type == "leave_one_out"){
res <- res %>%
dplyr::mutate(
positive = positive - outcome,
n = n - 1
)
}
te_name <- paste("target", "encode", variable, sep = "_")
if (prior_sample_size > 0){
pop_mean <- sum(preparation_map_excerpt[["positive"]]) /
sum(preparation_map_excerpt[["n"]])
res[[te_name]] <- ( prior_sample_size * pop_mean + res[["positive"]]) /
(prior_sample_size + res[["n"]])
} else {
res[[te_name]] <- ifelse(
res[["n"]] <= 0,
NA,
res[["positive"]] / res[["n"]]
)
}
if (noise_level != 0) {
res[[te_name]] <- res[[te_name]] + noise_level * runif(nrow(res))
}
res <- res %>%
dplyr::select(-positive, -n)
}
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.