R/targetencoder.R
In h2o: R Interface for the 'H2O' Scalable Machine Learning Platform

Documented in h2o.targetencoder

# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details) 
#'
# -------------------------- Target Encoder -------------------------- #
#'
#' Transformation of a categorical variable with a mean value of the target variable
#'
#' @param x (Optional) A vector containing the names or indices of the predictor variables to use in building the model.
#'        If x is missing, then all columns except y are used.
#' @param y The name or column index of the response variable in the data. 
#'        The response must be either a numeric or a categorical/factor variable. 
#'        If the response is numeric, then a regression model will be trained, otherwise it will train a classification model.
#' @param training_frame Id of the training data frame.
#' @param model_id Destination id for this model; auto-generated if not specified.
#' @param fold_column Column with cross-validation fold index assignment per observation.
#' @param columns_to_encode List of categorical columns or groups of categorical columns to encode. When groups of columns are specified,
#'        each group is encoded as a single column (interactions are created internally).
#' @param keep_original_categorical_columns \code{Logical}. If true, the original non-encoded categorical features will remain in the result frame.
#'        Defaults to TRUE.
#' @param blending \code{Logical}. If true, enables blending of posterior probabilities (computed for a given categorical value)
#'        with prior probabilities (computed on the entire set). This helps mitigate the effect of categorical
#'        values with small cardinality. The blending effect can be tuned using the `inflection_point` and `smoothing`
#'        parameters. Defaults to FALSE.
#' @param inflection_point Inflection point of the sigmoid used to blend probabilities (see `blending` parameter). For a given
#'        categorical value, if it appears fewer than `inflection_point` times in a data sample, then the influence of the
#'        posterior probability will be smaller than the prior. Defaults to 10.
#' @param smoothing Smoothing factor corresponds to the inverse of the slope at the inflection point on the sigmoid used to blend
#'        probabilities (see `blending` parameter). If smoothing tends towards 0, then the sigmoid used for blending
#'        turns into a Heaviside step function. Defaults to 20.
#' @param data_leakage_handling Data leakage handling strategy used to generate the encoding. Supported options are:
#'        1) "none" (default) - no holdout, using the entire training frame.
#'        2) "leave_one_out" - current row's response value is subtracted from the per-level frequencies pre-calculated
#'        on the entire training frame.
#'        3) "k_fold" - encodings for a fold are generated based on out-of-fold data.
#'         Must be one of: "leave_one_out", "k_fold", "none", "LeaveOneOut", "KFold", "None". Defaults to None.
#' @param noise The amount of noise to add to the encoded column. Use 0 to disable noise, and -1 (=AUTO) to let the algorithm
#'        determine a reasonable amount of noise. Defaults to 0.01.
#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
#'        Defaults to -1 (time-based random number).
#' @param ... Mainly used for backwards compatibility, to allow deprecated parameters.
#' @examples
#' \dontrun{
#' library(h2o)
#' h2o.init()
#' #Import the titanic dataset
#' f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
#' titanic <- h2o.importFile(f)
#' 
#' # Set response as a factor
#' response <- "survived"
#' titanic[response] <- as.factor(titanic[response])
#' 
#' # Split the dataset into train and test
#' splits <- h2o.splitFrame(data = titanic, ratios = .8, seed = 1234)
#' train <- splits[[1]]
#' test <- splits[[2]]
#' 
#' # Choose which columns to encode
#' encode_columns <- c("home.dest", "cabin", "embarked")
#' 
#' # Train a TE model
#' te_model <- h2o.targetencoder(x = encode_columns,
#'                               y = response, 
#'                               training_frame = train,
#'                               fold_column = "pclass", 
#'                               data_leakage_handling = "KFold")
#' 
#' # New target encoded train and test sets
#' train_te <- h2o.transform(te_model, train)
#' test_te <- h2o.transform(te_model, test)
#' }
#' @export
h2o.targetencoder <- function(x,
                              y,
                              training_frame,
                              model_id = NULL,
                              fold_column = NULL,
                              columns_to_encode = NULL,
                              keep_original_categorical_columns = TRUE,
                              blending = FALSE,
                              inflection_point = 10,
                              smoothing = 20,
                              data_leakage_handling = c("leave_one_out", "k_fold", "none", "LeaveOneOut", "KFold", "None"),
                              noise = 0.01,
                              seed = -1,
                              ...)
{
  # Map deprecated arguments received through `...` onto their replacements.
  # An explicitly supplied replacement wins over its deprecated alias; any
  # other name in `...` is rejected as an unused argument.
  varargs <- list(...)
  for (arg in names(varargs)) {
    if (arg == 'k') {
      warning("argument 'k' is deprecated; please use 'inflection_point' instead.")
      if (missing(inflection_point)) inflection_point <- varargs$k else warning("ignoring 'k' as 'inflection_point' was also provided.")
    } else if (arg == 'f') {
      warning("argument 'f' is deprecated; please use 'smoothing' instead.")
      if (missing(smoothing)) smoothing <- varargs$f else warning("ignoring 'f' as 'smoothing' was also provided.")
    } else if (arg == 'noise_level') {
      warning("argument 'noise_level' is deprecated; please use 'noise' instead.")
      if (missing(noise)) noise <- varargs$noise_level else warning("ignoring 'noise_level' as 'noise' was also provided.")
    } else {
      # Collapse the value first so a multi-element argument yields a single,
      # readable message instead of several strings glued together by stop().
      stop(paste("unused argument", arg, "=", paste(varargs[[arg]], collapse = ", ")))
    }
  }
  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)

  # Validate other required args
  # If x is missing, then assume user wants to use all columns as features.
  if (missing(x)) {
    if (is.numeric(y)) {
      # Enumerate column indices directly; col() would first materialize a
      # full nrow x ncol index matrix only to be de-duplicated by setdiff().
      x <- setdiff(seq_len(ncol(training_frame)), y)
    } else {
      x <- setdiff(colnames(training_frame), y)
    }
  }

  # Validate other args
  # A single column name is wrapped in a list so that every entry of
  # columns_to_encode has the same "group of columns" shape.
  if (!missing(columns_to_encode))
    columns_to_encode <- lapply(columns_to_encode, function(group) if (is.character(group) && length(group) == 1) list(group) else group)

  # Build parameter list to send to model builder
  parms <- list()
  args <- .verify_dataxy(training_frame, x, y)
  # The fold column must not be reported as ignored, otherwise it could not be used for fold assignment.
  if( !missing(fold_column) && !is.null(fold_column)) args$x_ignore <- args$x_ignore[!( fold_column == args$x_ignore )]
  parms$ignored_columns <- args$x_ignore
  parms$response_column <- args$y
  parms$training_frame <- training_frame

  # Only parameters the caller actually supplied (or that were filled in from a
  # deprecated alias above) are added to the parameter list.
  if (!missing(model_id))
    parms$model_id <- model_id
  if (!missing(fold_column))
    parms$fold_column <- fold_column
  if (!missing(columns_to_encode))
    parms$columns_to_encode <- columns_to_encode
  if (!missing(keep_original_categorical_columns))
    parms$keep_original_categorical_columns <- keep_original_categorical_columns
  if (!missing(blending))
    parms$blending <- blending
  if (!missing(inflection_point))
    parms$inflection_point <- inflection_point
  if (!missing(smoothing))
    parms$smoothing <- smoothing
  if (!missing(data_leakage_handling))
    parms$data_leakage_handling <- data_leakage_handling
  if (!missing(noise))
    parms$noise <- noise
  if (!missing(seed))
    parms$seed <- seed

  # Error check and build model
  model <- .h2o.modelJob('targetencoder', parms, h2oRestApiVersion=3, verbose=FALSE)
  return(model)
}
# Internal builder for segment-wise Target Encoder training.
#
# Assembles the same model parameter list as h2o.targetencoder (without
# model_id), plus the segment-specific settings (segment_columns,
# segment_models_id, parallelism), and hands both lists to
# .h2o.segmentModelsJob. Deprecated arguments 'k', 'f' and 'noise_level' are
# accepted through `...` and mapped to 'inflection_point', 'smoothing' and
# 'noise'; any other extra argument raises an error.
# Returns the value produced by .h2o.segmentModelsJob.
.h2o.train_segments_targetencoder <- function(x,
                                              y,
                                              training_frame,
                                              fold_column = NULL,
                                              columns_to_encode = NULL,
                                              keep_original_categorical_columns = TRUE,
                                              blending = FALSE,
                                              inflection_point = 10,
                                              smoothing = 20,
                                              data_leakage_handling = c("leave_one_out", "k_fold", "none", "LeaveOneOut", "KFold", "None"),
                                              noise = 0.01,
                                              seed = -1,
                                              segment_columns = NULL,
                                              segment_models_id = NULL,
                                              parallelism = 1,
                                              ...)
{
  # Map deprecated arguments received through `...` onto their replacements.
  # An explicitly supplied replacement wins over its deprecated alias.
  varargs <- list(...)
  for (arg in names(varargs)) {
    if (arg == 'k') {
      warning("argument 'k' is deprecated; please use 'inflection_point' instead.")
      if (missing(inflection_point)) inflection_point <- varargs$k else warning("ignoring 'k' as 'inflection_point' was also provided.")
    } else if (arg == 'f') {
      warning("argument 'f' is deprecated; please use 'smoothing' instead.")
      if (missing(smoothing)) smoothing <- varargs$f else warning("ignoring 'f' as 'smoothing' was also provided.")
    } else if (arg == 'noise_level') {
      warning("argument 'noise_level' is deprecated; please use 'noise' instead.")
      if (missing(noise)) noise <- varargs$noise_level else warning("ignoring 'noise_level' as 'noise' was also provided.")
    } else {
      # Collapse the value first so a multi-element argument yields a single,
      # readable message instead of several strings glued together by stop().
      stop(paste("unused argument", arg, "=", paste(varargs[[arg]], collapse = ", ")))
    }
  }
  # formally define variables that were excluded from function parameters
  model_id <- NULL
  verbose <- NULL
  destination_key <- NULL
  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)

  # Validate other required args
  # If x is missing, then assume user wants to use all columns as features.
  if (missing(x)) {
    if (is.numeric(y)) {
      # Enumerate column indices directly; col() would first materialize a
      # full nrow x ncol index matrix only to be de-duplicated by setdiff().
      x <- setdiff(seq_len(ncol(training_frame)), y)
    } else {
      x <- setdiff(colnames(training_frame), y)
    }
  }

  # Validate other args
  # A single column name is wrapped in a list so that every entry of
  # columns_to_encode has the same "group of columns" shape.
  if (!missing(columns_to_encode))
    columns_to_encode <- lapply(columns_to_encode, function(group) if (is.character(group) && length(group) == 1) list(group) else group)

  # Build parameter list to send to model builder
  parms <- list()
  args <- .verify_dataxy(training_frame, x, y)
  # The fold column must not be reported as ignored, otherwise it could not be used for fold assignment.
  if( !missing(fold_column) && !is.null(fold_column)) args$x_ignore <- args$x_ignore[!( fold_column == args$x_ignore )]
  parms$ignored_columns <- args$x_ignore
  parms$response_column <- args$y
  parms$training_frame <- training_frame

  # Only parameters the caller actually supplied (or that were filled in from a
  # deprecated alias above) are added to the parameter list.
  if (!missing(fold_column))
    parms$fold_column <- fold_column
  if (!missing(columns_to_encode))
    parms$columns_to_encode <- columns_to_encode
  if (!missing(keep_original_categorical_columns))
    parms$keep_original_categorical_columns <- keep_original_categorical_columns
  if (!missing(blending))
    parms$blending <- blending
  if (!missing(inflection_point))
    parms$inflection_point <- inflection_point
  if (!missing(smoothing))
    parms$smoothing <- smoothing
  if (!missing(data_leakage_handling))
    parms$data_leakage_handling <- data_leakage_handling
  if (!missing(noise))
    parms$noise <- noise
  if (!missing(seed))
    parms$seed <- seed

  # Build segment-models specific parameters
  # parallelism is always sent; the other two only when supplied by the caller.
  segment_parms <- list()
  if (!missing(segment_columns))
    segment_parms$segment_columns <- segment_columns
  if (!missing(segment_models_id))
    segment_parms$segment_models_id <- segment_models_id
  segment_parms$parallelism <- parallelism

  # Error check and build segment models
  segment_models <- .h2o.segmentModelsJob('targetencoder', segment_parms, parms, h2oRestApiVersion=3)
  return(segment_models)
}