R/conjugate_encoder.R

#' conjugate_encoder
#'
#' This function encodes categorical variables by fitting a posterior
#' distribution to the target variable y for each category, using a
#' known conjugate prior. The resulting mean(s) of the posterior
#' distribution for each category are used as the encodings.
#'
#' @param X_train A `tibble` or `data.frame` representing the training data set
#' containing some categorical features/columns.
#' @param X_test A `tibble` or `data.frame` representing the test set,
#' containing the same categorical features/columns as the training set.
#' @param y A numeric vector or character vector representing the target
#' variable. If the objective is "binary", then the vector should only contain
#' two unique values.
#' @param cat_columns A character vector containing the names of the
#' categorical columns in the tibble that should be encoded.
#' @param prior_params A list of named parameters specifying the assumed
#' prior distribution. For regression, this requires a list with four named
#' values: mu, vega, alpha, and beta. All must be real numbers; vega, alpha,
#' and beta must be greater than 0, while mu may be negative. For binary
#' classification, this requires a list with two named values: alpha and
#' beta. Both must be real numbers greater than 0.
#' @param objective A string, either "regression" or "binary", specifying
#' the problem type. Default is "regression".
#'
#' @return A list containing the processed training and test sets, in which
#' the named categorical columns are replaced with their encodings. For
#' regression, the encoder adds one additional dimension per encoded column
#' to the original training set, since the assumed posterior distribution is
#' two dimensional (each column yields a mean and a variance encoding).
#' @export
#'
#' @examples conjugate_encoder(
#' X_train = mtcars,
#' y = mtcars$mpg,
#' cat_columns = c("cyl", "vs"),
#' prior_params = list(mu = 3, vega = 5, alpha = 3, beta = 3),
#' objective = "regression")
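#'
#' # Illustrative only: a binary-objective call with a held-out test set;
#' # mtcars$am contains exactly two unique values.
#' conjugate_encoder(
#' X_train = mtcars[1:25, ],
#' X_test = mtcars[26:32, ],
#' y = mtcars$am[1:25],
#' cat_columns = c("cyl", "vs"),
#' prior_params = list(alpha = 2, beta = 2),
#' objective = "binary")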
conjugate_encoder <- function(X_train, X_test = NULL, y, cat_columns,
                              prior_params, objective = "regression") {

  # Check inputs for incorrect values.

    if (!objective %in% c("regression", "binary")) {
      stop("Objective must be regression or binary.")
    }

    if (!all(cat_columns %in% colnames(X_train))) {
      stop("One or more specified columns do not exist in the training set.")
    }

    if (objective == "regression") {

      if (!setequal(c("vega", "mu", "alpha", "beta"), names(prior_params))) {
        stop("Missing a required prior parameter.")
      }

      if (prior_params$vega <= 0 || prior_params$alpha <= 0 ||
          prior_params$beta <= 0) {
        stop("Invalid prior specification. Note that vega, alpha, and beta
             must all be greater than 0.")
      }

      if (!is.numeric(y)) {
        stop("The target variable should be numeric.")
      }

      # Initialize lists and grab the prior parameters necessary for the
      # encodings.
      encodings_list <- vector("list", length(cat_columns) + 1)
      encodings_list[[1]] <- X_train
      mu <- prior_params$mu
      alpha <- prior_params$alpha
      vega <- prior_params$vega
      beta <- prior_params$beta
      n <- nrow(X_train)

      for (i in seq_along(cat_columns)) {

        column <- cat_columns[i]

        # Calculate the conditional means and conditional variances for each
        # category.

        conditionals <- X_train %>%
          dplyr::bind_cols(target = y) %>%
          dplyr::group_by(!!rlang::sym(column)) %>%
          dplyr::summarize(conditional_mean = mean(target),
                           conditional_variances = stats::var(target))

        if (any(is.na(conditionals$conditional_variances))) {
          warning("NAs fitted for expected variance. The variance of a single
                  data point does not exist. Make sure the specified columns
                  are truly categorical.")
        }

        # Calculates the posterior parameters of the distribution, given the
        # observed conditional means and conditional variances.
        # These are just closed form formulas.
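        # In standard notation, with ybar and s^2 the per-category
        # conditional mean and variance and n the number of training rows,
        # the normal-inverse-gamma update below is:
        #   mu_post    = (vega * mu + n * ybar) / (vega + n)
        #   alpha_post = alpha + n / 2
        #   beta_post  = beta + n * s^2 / 2
        #                + (n * vega / (vega + n)) * (ybar - mu)^2 / 2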

        mu_post <- (vega * mu + n * conditionals$conditional_mean) / (vega + n)
        alpha_post <- alpha + n / 2
        beta_post <- beta + 0.5 * n * conditionals$conditional_variances +
          (n * vega) / (vega + n) * ((conditionals$conditional_mean - mu)^2 / 2)

        # Calculate the encodings which are just the marginal means of each
        # random variable of the posterior.
        # The posterior distribution is two dimensional (X, Y) and so we will
        # get 2 columns of encodings.
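        # The mean encoding is mu_post; the variance encoding is the mean of
        # an inverse-gamma(alpha_post, beta_post) distribution,
        # beta_post / (alpha_post - 1), which exists only when
        # alpha_post = alpha + n / 2 exceeds 1.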

        posterior_expected_var <- beta_post / (alpha_post - 1)
        all_encodings <- conditionals %>%
          dplyr::bind_cols(
            tibble::tibble(
              encoding_mu = mu_post,
              encoding_sigma = posterior_expected_var)) %>%
          dplyr::select(-conditional_mean, -conditional_variances)

        # Changing the names of the learned encodings here so I can reference
        # them later when joining.

        names(all_encodings)[2] <- paste(column, "encoded_mean", sep = "_")
        names(all_encodings)[3] <- paste(column, "encoded_var", sep = "_")

        # i + 1 here since we set the first item in the list to the original
        # train set. This is so that I can use reduce on the list with
        # left_join to join the encodings for each category back to the
        # training set without looping.

        encodings_list[[i + 1]] <- all_encodings

      }

      # Join all of the learned encodings for each feature back to the original
      # training set. Then, drop the original categorical columns.
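      # For example (illustrative, with two encoded features):
      #   purrr::reduce(list(X, enc_a, enc_b), dplyr::left_join)
      # is equivalent to:
      #   dplyr::left_join(dplyr::left_join(X, enc_a), enc_b)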
      train_processed <- purrr::reduce(
        encodings_list,
        function(x, y) dplyr::left_join(x, y, by = NULL)) %>%
        dplyr::select(-dplyr::all_of(cat_columns))

      # Provided X_test isn't NULL, encode the test set.

      if (!is.null(X_test)) {

        # Set the first element of the list containing all of the learned
        # encodings from the training set to the test set now. This again
        # lets us recursively call left_join on each item to bind the
        # encodings for each categorical column back to the test set.
        encodings_list[[1]] <- X_test

        # This is the recursive call to left_join. It's the same thing as a
        # loop where the result of each left_join in the "loop" is carried
        # forwards.
        test_encodings_means <- purrr::reduce(
          encodings_list, function(x, y) dplyr::left_join(x, y, by = NULL)) %>%
          dplyr::select(dplyr::contains("encoded_mean"))

        # If there are test categories that we didn't see in our training
        # set, our best guess is the prior marginal mean of the 1st random
        # variable.
        test_encodings_means[is.na(test_encodings_means)] <- mu

        # Same thing as above, but for the 2nd random variable of the
        # posterior distribution.
        test_encodings_var <- purrr::reduce(
          encodings_list, function(x, y) dplyr::left_join(x, y, by = NULL)) %>%
          dplyr::select(dplyr::contains("encoded_var"))

        # Again, set categories that are in the test set but not in training
        # to the prior mean of the 2nd random variable, beta / (alpha - 1).
        test_encodings_var[is.na(test_encodings_var)] <- beta / (alpha - 1)

        test_processed <- dplyr::bind_cols(X_test, test_encodings_means) %>%
          dplyr::bind_cols(test_encodings_var) %>%
          dplyr::select(-dplyr::all_of(cat_columns))

      }

    } else {

      # More checks for the binary case. If the user supplies a vector that has
      # more than 2 unique values, stop.

      if (length(unique(y)) != 2) {
        stop("Only binary classification is supported; the target must
             contain exactly two unique values.")
      }

      if (!setequal(c("alpha", "beta"), names(prior_params))) {
        stop("Missing a required prior parameter, or
             invalid parameters specified.")
      }

      if (prior_params$alpha <= 0 || prior_params$beta <= 0) {
        stop("Invalid prior specification. Note that alpha and beta must
             both be greater than 0.")
      }

      # Encode any factor variables or character vectors to numeric 0/1's.
      if (is.factor(y) || is.character(y)) {

        all_unique_values <- unique(y)
        target_new <- dplyr::case_when(y == all_unique_values[1] ~ 0,
                                       y == all_unique_values[2] ~ 1)

      } else {
        target_new <- y
      }

      # Grab the needed parameters and initialize the list that will store
      # the encodings.

      encodings_list <- vector("list", length(cat_columns) + 1)
      encodings_list[[1]] <- X_train
      alpha <- prior_params$alpha
      beta <- prior_params$beta
      n <- nrow(X_train)

      for (i in seq_along(cat_columns)) {

        column <- cat_columns[i]

        # Same thing as in the regression case, but the posterior here is less
        # complicated (it's one dimension here, not two as above).
        # So we just need the total number of "1"'s.
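        # In standard notation, with k the per-category count of successes
        # and n the number of training rows, the beta-binomial update below
        # is:
        #   alpha_post = alpha + k
        #   beta_post  = beta + n - k
        # and the encoding is the posterior mean,
        # alpha_post / (alpha_post + beta_post).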
        conditionals <- X_train %>%
          dplyr::bind_cols(target = target_new) %>%
          dplyr::group_by(!!rlang::sym(column)) %>%
          dplyr::summarize(conditional_success = sum(target))

        # Calculate the posterior parameters.
        alpha_post <- alpha + conditionals$conditional_success
        beta_post <- beta + n - conditionals$conditional_success

        # Get the encodings, which are the means of the posterior beta
        # distributions.
        posterior_expected_val <- alpha_post / (alpha_post + beta_post)
        all_encodings <- conditionals %>%
          dplyr::bind_cols(
            tibble::tibble(encoded = posterior_expected_val)) %>%
          dplyr::select(-conditional_success)

        names(all_encodings)[2] <- paste(column, "encoded", sep = "_")

        encodings_list[[i + 1]] <- all_encodings

      }

      # Recursively join the learned encodings back to the training set.

      train_processed <- purrr::reduce(
        encodings_list,
        function(x, y) dplyr::left_join(x, y, by = NULL)) %>%
        dplyr::select(-dplyr::all_of(cat_columns))

      if (!is.null(X_test)) {

        encodings_list[[1]] <- X_test

        # Recursive joining on the test set, if it exists.
        test_encodings <- purrr::reduce(
          encodings_list, function(x, y) dplyr::left_join(x, y, by = NULL)) %>%
          dplyr::select(dplyr::contains("encoded"))

        # If there exists a category that we didn't see in our training set,
        # use the prior mean, alpha / (alpha + beta).
        test_encodings[is.na(test_encodings)] <- alpha / (alpha + beta)

        test_processed <- dplyr::bind_cols(X_test, test_encodings) %>%
          dplyr::select(-dplyr::all_of(cat_columns))

      }

    }

  # Output.
  if (!is.null(X_test)) {
    out <- list("train" = train_processed, "test" = test_processed)
  } else {
    out <- list("train" = train_processed)
  }

  out

}