fte: Feature Target Encoding for classification

Documented in target_encode_apply target_encode_create

#' Creates a preparation map
#'
#' @param data_to_encode `data.frame()` \cr Data.frame of data to encode.
#' @param group_variables `character(1)` \cr Name of the variable to target
#' encode. Should be column names of `data_to_encode`.
#' @param outcome_variable `character(1)` \cr Target to encode.
#' @export
target_encode_create <- function(
  data_to_encode,
  group_variables,
  outcome_variable
  ){

  assertthat::assert_that(all(group_variables %in%  names(data_to_encode)))
  assertthat::assert_that(outcome_variable %in%  names(data_to_encode))
  res <- data.frame()
  for (variable in group_variables){
    aux_data <- data_to_encode %>%
      dplyr::select(tidyselect::one_of(variable, outcome_variable)) %>%
      dplyr::rename(group = !!dplyr::sym(variable)) %>%
      dplyr::group_by(group) %>%
      dplyr::summarize(
        group_variable = variable,
        positive = sum(!!dplyr::sym(outcome_variable)),
        n = dplyr::n()
      )

      suppressWarnings(
        res <- dplyr::bind_rows(res, aux_data) %>%
          tibble::as_tibble()
      )
  }
  res$group <- as.factor(res$group)
  return(dplyr::arrange(res, group_variable))
}


#' Applies a preparation map
#'
#' @param preparation_map `data.frame()` \cr Preparation map formatted as the
#' expected output from target_encode_create.
#' @param group_variables `character()` \cr On which categorical variables to
#' apply the target encoding?
#' @param outcome_variable `character()` \cr Name of the column representing
#' the boolean outcome.
#' @param data `data.frame()` \cr Data to apply the target encode preparation
#' map to.
#' @param holdout_type `character(1)` \cr The holdout type used. Must be one of: "LeaveOneOut", "None".
#' @param prior_sample_size `integer(1)` \cr If non-zero, then a bayesian
#'   blended average is computed. The prior_sample_size defines the weight of
#'   the population mean compared to the sample mean.
#' @param noise_level `float(1)`\cr  The amount of random noise added to the
#'   target encoding. This helps prevent overfitting. Defaults to 0.01.
#' @param seed `integer(1)` \cr Seed for reproductible random values.
#'
#' @export
target_encode_apply <- function(
  data,
  group_variables,
  outcome_variable,
  preparation_map,
  holdout_type = "none",
  prior_sample_size = 30L,
  noise_level = 0,
  seed = 1793
  ){
  set.seed(seed)

  assertthat::assert_that(
    holdout_type %in% c("leave_one_out", "none"),
    msg = 'holdout_type should either be "leave_one_out" or "none"'
  )
  assertthat::assert_that(
    all(group_variables %in% names(data)),
    msg = paste0(
      "Variables are missing from data input frame : ",
      setdiff(group_variables, names(data))
    )
  )
  assertthat::assert_that(
    all(group_variables %in% preparation_map$group_variable),
    msg = paste0(
      "Variables are missing from preparation map : ",
      setdiff(group_variables, preparation_map$group_variable)
    )
  )

  res  <- data
  for (variable in group_variables){
    preparation_map_excerpt  <- preparation_map %>%
      dplyr::filter(group_variable == variable) %>%
      dplyr::select(-group_variable)
    res <- res %>%
      dplyr::left_join(
        preparation_map_excerpt,
        by = setNames("group", variable)
      )

    if (holdout_type == "leave_one_out"){
      res  <- res %>%
        dplyr::mutate(
          positive = positive - outcome,
          n = n - 1
        )
    }

    te_name <- paste("target", "encode", variable, sep = "_")

    if (prior_sample_size > 0){
     pop_mean <- sum(preparation_map_excerpt[["positive"]]) /
       sum(preparation_map_excerpt[["n"]])
     res[[te_name]] <- ( prior_sample_size * pop_mean + res[["positive"]]) /
       (prior_sample_size + res[["n"]])
    } else {
      res[[te_name]] <- ifelse(
        res[["n"]] <= 0,
        NA,
        res[["positive"]] / res[["n"]]
      )
    }

    if (noise_level != 0) {
      res[[te_name]]  <- res[[te_name]] + noise_level * runif(nrow(res))
    }
    res  <- res %>%
      dplyr::select(-positive, -n)
  }
  return(res)
}