
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details) 
#'
# -------------------------- AdaBoost Model -------------------------- #
#'
#' Build an AdaBoost model
#' 
#' Builds an AdaBoost model on an H2OFrame.
#'
#' @param x (Optional) A vector containing the names or indices of the predictor variables to use in building the model.
#'        If x is missing, then all columns except y are used.
#' @param y The name or column index of the response variable in the data. 
#'        The response must be either a numeric or a categorical/factor variable. 
#'        If the response is numeric, then a regression model will be trained, otherwise it will train a classification model.
#' @param training_frame Id of the training data frame.
#' @param model_id Destination id for this model; auto-generated if not specified.
#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE.
#' @param categorical_encoding Encoding scheme for categorical features. Must be one of: "AUTO", "Enum", "OneHotInternal", "OneHotExplicit",
#'        "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited". Defaults to AUTO.
#' @param weights_column Column with observation weights. Giving an observation a weight of zero is equivalent to excluding it from
#'        the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
#'        weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the
#'        data frame. This is typically the number of times a row is repeated, but non-integer values are supported as
#'        well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If
#'        you set weight = 0 for a row, the prediction returned for that row will be zero, which is incorrect; to get
#'        accurate predictions, remove all rows with weight == 0.
#' @param nlearners Number of AdaBoost weak learners. Defaults to 50.
#' @param weak_learner Choose a weak learner type. Must be one of: "AUTO", "DRF", "GLM", "GBM", "DEEP_LEARNING".
#'        Defaults to AUTO, which means DRF.
#' @param learn_rate Learning rate (from 0.0 to 1.0). Defaults to 0.5.
#' @param weak_learner_params Customized parameters for the weak_learner algorithm, e.g. list(ntrees=3, max_depth=2, histogram_type='UniformAdaptive').
#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
#'        Defaults to -1 (time-based random number).
#' @return Creates an \linkS4class{H2OModel} object of the right type.
#' @seealso \code{\link{predict.H2OModel}} for prediction
#' @examples
#' \dontrun{
#' library(h2o)
#' h2o.init()
#' 
#' # Import the prostate dataset
#' f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv"
#' data <- h2o.importFile(f)
#' 
#' # Set predictors and response; set response as a factor
#' data["CAPSULE"] <- as.factor(data["CAPSULE"])
#' predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON")
#' response <- "CAPSULE"
#' 
#' # Train the AdaBoost model
#' h2o_adaboost <- h2o.adaBoost(x = predictors, y = response, training_frame = data, seed = 1234)
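#'
#' # A minimal sketch (not part of the original example): pass custom options to the
#' # chosen weak learner via weak_learner_params; ntrees and max_depth are the
#' # tree-learner options shown in the parameter documentation above
#' h2o_adaboost_drf <- h2o.adaBoost(x = predictors, y = response, training_frame = data,
#'                                  weak_learner = "DRF",
#'                                  weak_learner_params = list(ntrees = 3, max_depth = 2),
#'                                  seed = 1234)
#'
#' # Score the trained model on the training frame (see also predict.H2OModel)
#' pred <- h2o.predict(h2o_adaboost, newdata = data)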
#' }
#' @export
h2o.adaBoost <- function(x,
                         y,
                         training_frame,
                         model_id = NULL,
                         ignore_const_cols = TRUE,
                         categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
                         weights_column = NULL,
                         nlearners = 50,
                         weak_learner = c("AUTO", "DRF", "GLM", "GBM", "DEEP_LEARNING"),
                         learn_rate = 0.5,
                         weak_learner_params = NULL,
                         seed = -1)
{
  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)

  # Validate other required args
  # If x is missing, then assume user wants to use all columns as features.
  if (missing(x)) {
     if (is.numeric(y)) {
         x <- setdiff(col(training_frame), y)
     } else {
         x <- setdiff(colnames(training_frame), y)
     }
  }

  # Build parameter list to send to model builder
  parms <- list()
  parms$training_frame <- training_frame
  args <- .verify_dataxy(training_frame, x, y)
  parms$ignored_columns <- args$x_ignore
  parms$response_column <- args$y

  if (!missing(model_id))
    parms$model_id <- model_id
  if (!missing(ignore_const_cols))
    parms$ignore_const_cols <- ignore_const_cols
  if (!missing(categorical_encoding))
    parms$categorical_encoding <- categorical_encoding
  if (!missing(weights_column))
    parms$weights_column <- weights_column
  if (!missing(nlearners))
    parms$nlearners <- nlearners
  if (!missing(weak_learner))
    parms$weak_learner <- weak_learner
  if (!missing(learn_rate))
    parms$learn_rate <- learn_rate
  if (!missing(seed))
    parms$seed <- seed

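  # weak_learner_params is a named R list; serialize it to a JSON string so it can
  # be sent to the backend as a single parameter value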
  if (!missing(weak_learner_params))
      parms$weak_learner_params <- as.character(toJSON(weak_learner_params, pretty = TRUE, auto_unbox = TRUE))

  # Error check and build model
  model <- .h2o.modelJob('adaboost', parms, h2oRestApiVersion=3, verbose=FALSE)
  return(model)
}
.h2o.train_segments_adaboost <- function(x,
                                         y,
                                         training_frame,
                                         ignore_const_cols = TRUE,
                                         categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
                                         weights_column = NULL,
                                         nlearners = 50,
                                         weak_learner = c("AUTO", "DRF", "GLM", "GBM", "DEEP_LEARNING"),
                                         learn_rate = 0.5,
                                         weak_learner_params = NULL,
                                         seed = -1,
                                         segment_columns = NULL,
                                         segment_models_id = NULL,
                                         parallelism = 1)
{
  # formally define variables that were excluded from function parameters
  model_id <- NULL
  verbose <- NULL
  destination_key <- NULL
  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)

  # Validate other required args
  # If x is missing, then assume user wants to use all columns as features.
  if (missing(x)) {
     if (is.numeric(y)) {
         x <- setdiff(col(training_frame), y)
     } else {
         x <- setdiff(colnames(training_frame), y)
     }
  }

  # Build parameter list to send to model builder
  parms <- list()
  parms$training_frame <- training_frame
  args <- .verify_dataxy(training_frame, x, y)
  parms$ignored_columns <- args$x_ignore
  parms$response_column <- args$y

  if (!missing(ignore_const_cols))
    parms$ignore_const_cols <- ignore_const_cols
  if (!missing(categorical_encoding))
    parms$categorical_encoding <- categorical_encoding
  if (!missing(weights_column))
    parms$weights_column <- weights_column
  if (!missing(nlearners))
    parms$nlearners <- nlearners
  if (!missing(weak_learner))
    parms$weak_learner <- weak_learner
  if (!missing(learn_rate))
    parms$learn_rate <- learn_rate
  if (!missing(seed))
    parms$seed <- seed

  if (!missing(weak_learner_params))
      parms$weak_learner_params <- as.character(toJSON(weak_learner_params, pretty = TRUE, auto_unbox = TRUE))

  # Build segment-models specific parameters
  segment_parms <- list()
  if (!missing(segment_columns))
    segment_parms$segment_columns <- segment_columns
  if (!missing(segment_models_id))
    segment_parms$segment_models_id <- segment_models_id
  segment_parms$parallelism <- parallelism

  # Error check and build segment models
  segment_models <- .h2o.segmentModelsJob('adaboost', segment_parms, parms, h2oRestApiVersion=3)
  return(segment_models)
}
