
# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details) 
#'
# -------------------------- ANOVA GLM -------------------------- #
#'
#' H2O ANOVAGLM calculates the Type III SS (sums of squares) used to evaluate the contributions of individual predictors
#' and their interactions to a model. Predictors or interactions with negligible contributions to the model will have
#' high p-values, while those with larger contributions will have low p-values.
#' 
#'
#' @param x (Optional) A vector containing the names or indices of the predictor variables to use in building the model.
#'        If x is missing, then all columns except y are used.
#' @param y The name or column index of the response variable in the data. 
#'        The response must be either a numeric or a categorical/factor variable. 
#'        If the response is numeric, then a regression model will be trained, otherwise it will train a classification model.
#' @param training_frame Id of the training data frame.
#' @param model_id Destination id for this model; auto-generated if not specified.
#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
#'        Defaults to -1 (time-based random number).
#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE.
#' @param score_each_iteration \code{Logical}. Whether to score during each iteration of model training. Defaults to FALSE.
#' @param offset_column Offset column. This will be added to the combination of columns before applying the link function.
#' @param weights_column Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from
#'        the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
#'        weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the
#'        data frame. This is typically the number of times a row is repeated, but non-integer values are supported as
#'        well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If
#'        you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get
#'        an accurate prediction, remove all rows with weight == 0.
#' @param family Family. Use binomial for classification with logistic regression; the others are for regression problems. Must be
#'        one of: "AUTO", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "poisson", "gamma", "tweedie",
#'        "negativebinomial". Defaults to AUTO.
#' @param tweedie_variance_power Tweedie variance power. Defaults to 0.
#' @param tweedie_link_power Tweedie link power. Defaults to 1.
#' @param theta Theta. Defaults to 0.
#' @param solver AUTO will set the solver based on the given data and the other parameters. IRLSM is fast on problems with a
#'        small number of predictors and for lambda search with L1 penalty; L_BFGS scales better for datasets with many
#'        columns. Must be one of: "AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT",
#'        "GRADIENT_DESCENT_LH", "GRADIENT_DESCENT_SQERR". Defaults to IRLSM.
#' @param missing_values_handling Handling of missing values. Either MeanImputation, Skip or PlugValues. Must be one of: "MeanImputation",
#'        "Skip", "PlugValues". Defaults to MeanImputation.
#' @param plug_values Plug values (a single-row frame containing values that will be used to impute missing values of the
#'        training/validation frame; use in conjunction with missing_values_handling = PlugValues).
#' @param compute_p_values \code{Logical}. Request p-values computation; p-values work only with the IRLSM solver and no regularization.
#'        Defaults to TRUE.
#' @param standardize \code{Logical}. Standardize numeric columns to have zero mean and unit variance. Defaults to TRUE.
#' @param non_negative \code{Logical}. Restrict coefficients (not intercept) to be non-negative. Defaults to FALSE.
#' @param max_iterations Maximum number of iterations. Defaults to 0.
#' @param link Link function. Must be one of: "family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit".
#'        Defaults to family_default.
#' @param prior Prior probability for y==1. To be used only for logistic regression, and only if the data has been sampled and
#'        the mean of the response does not reflect reality. Defaults to 0.
#' @param alpha Distribution of regularization between the L1 (Lasso) and L2 (Ridge) penalties. A value of 1 for alpha
#'        represents Lasso regression, a value of 0 produces Ridge regression, and anything in between specifies the
#'        amount of mixing between the two. Default value of alpha is 0 when SOLVER = 'L-BFGS'; 0.5 otherwise.
#' @param lambda Regularization strength. Defaults to c(0.0).
#' @param lambda_search \code{Logical}. Use lambda search starting at lambda max; the given lambda is then interpreted as lambda min.
#'        Defaults to FALSE.
#' @param stopping_rounds Early stopping based on convergence of stopping_metric. Stop if the simple moving average of length k of the
#'        stopping_metric does not improve for k (:= stopping_rounds) scoring events (0 to disable). Defaults to 0.
#' @param stopping_metric Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score
#'        for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python
#'        client. Must be one of: "AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "AUCPR",
#'        "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing". Defaults to
#'        AUTO.
#' @param early_stopping \code{Logical}. Stop early when there is no more relative improvement on train or validation (if provided).
#'        Defaults to FALSE.
#' @param stopping_tolerance Relative tolerance for the metric-based stopping criterion (stop if relative improvement is not at least this
#'        much). Defaults to 0.001.
#' @param balance_classes \code{Logical}. Balance training data class counts via over/under-sampling (for imbalanced data). Defaults to
#'        FALSE.
#' @param class_sampling_factors Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling factors will
#'        be automatically computed to obtain class balance during training. Requires balance_classes.
#' @param max_after_balance_size Maximum relative size of the training data after balancing class counts (can be less than 1.0). Requires
#'        balance_classes. Defaults to 5.0.
#' @param max_runtime_secs Maximum allowed runtime in seconds for model training. Use 0 to disable. Defaults to 0.
#' @param save_transformed_framekeys \code{Logical}. Set to TRUE to save the keys of the transformed predictors and interaction columns. Defaults to FALSE.
#' @param highest_interaction_term Limit the order of interaction terms: 2 means interactions between two columns only, 3 between three columns, and
#'        so on. The algorithm default is 2. Defaults to 0.
#' @param nparallelism Number of models to build in parallel. Adjust according to your system. Defaults to 4.
#' @param type Refers to the SS type: 1, 2, 3, or 4. Currently only type 3 is supported. Defaults to 0.
#' @examples
#' \dontrun{
#' h2o.init()
#' 
#' # Run ANOVA GLM of VOL ~ CAPSULE + RACE
#' prostate_path <- system.file("extdata", "prostate.csv", package = "h2o")
#' prostate <- h2o.uploadFile(path = prostate_path)
#' prostate$CAPSULE <- as.factor(prostate$CAPSULE)
#' model <- h2o.anovaglm(y = "VOL", x = c("CAPSULE","RACE"), training_frame = prostate)
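#' 
#' # A minimal follow-up sketch: display the model output, which should include
#' # the Type III SS analysis table with per-predictor p-values
#' summary(model)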
#' 
#' }
#' @export
h2o.anovaglm <- function(x,
                         y,
                         training_frame,
                         model_id = NULL,
                         seed = -1,
                         ignore_const_cols = TRUE,
                         score_each_iteration = FALSE,
                         offset_column = NULL,
                         weights_column = NULL,
                         family = c("AUTO", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "poisson", "gamma", "tweedie", "negativebinomial"),
                         tweedie_variance_power = 0,
                         tweedie_link_power = 1,
                         theta = 0,
                         solver = c("AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT", "GRADIENT_DESCENT_LH", "GRADIENT_DESCENT_SQERR"),
                         missing_values_handling = c("MeanImputation", "Skip", "PlugValues"),
                         plug_values = NULL,
                         compute_p_values = TRUE,
                         standardize = TRUE,
                         non_negative = FALSE,
                         max_iterations = 0,
                         link = c("family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit"),
                         prior = 0,
                         alpha = NULL,
                         lambda = c(0.0),
                         lambda_search = FALSE,
                         stopping_rounds = 0,
                         stopping_metric = c("AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "AUCPR", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"),
                         early_stopping = FALSE,
                         stopping_tolerance = 0.001,
                         balance_classes = FALSE,
                         class_sampling_factors = NULL,
                         max_after_balance_size = 5.0,
                         max_runtime_secs = 0,
                         save_transformed_framekeys = FALSE,
                         highest_interaction_term = 0,
                         nparallelism = 4,
                         type = 0)
{
  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)

  # Validate other required args
  # If x is missing, then assume user wants to use all columns as features.
  if (missing(x)) {
     if (is.numeric(y)) {
         # y is a column index, so build x from the column indices
         # (seq_len(ncol(...)) rather than col(...), which would materialize a full index matrix)
         x <- setdiff(seq_len(ncol(training_frame)), y)
     } else {
         # y is a column name, so build x from the column names
         x <- setdiff(colnames(training_frame), y)
     }
  }

  # Build parameter list to send to model builder
  parms <- list()
  parms$training_frame <- training_frame
  args <- .verify_dataxy(training_frame, x, y)
  parms$ignored_columns <- args$x_ignore
  parms$response_column <- args$y

  if (!missing(model_id))
    parms$model_id <- model_id
  if (!missing(seed))
    parms$seed <- seed
  if (!missing(ignore_const_cols))
    parms$ignore_const_cols <- ignore_const_cols
  if (!missing(score_each_iteration))
    parms$score_each_iteration <- score_each_iteration
  if (!missing(offset_column))
    parms$offset_column <- offset_column
  if (!missing(weights_column))
    parms$weights_column <- weights_column
  if (!missing(family))
    parms$family <- family
  if (!missing(tweedie_variance_power))
    parms$tweedie_variance_power <- tweedie_variance_power
  if (!missing(tweedie_link_power))
    parms$tweedie_link_power <- tweedie_link_power
  if (!missing(theta))
    parms$theta <- theta
  if (!missing(solver))
    parms$solver <- solver
  if (!missing(missing_values_handling))
    parms$missing_values_handling <- missing_values_handling
  if (!missing(plug_values))
    parms$plug_values <- plug_values
  if (!missing(compute_p_values))
    parms$compute_p_values <- compute_p_values
  if (!missing(standardize))
    parms$standardize <- standardize
  if (!missing(non_negative))
    parms$non_negative <- non_negative
  if (!missing(max_iterations))
    parms$max_iterations <- max_iterations
  if (!missing(link))
    parms$link <- link
  if (!missing(prior))
    parms$prior <- prior
  if (!missing(alpha))
    parms$alpha <- alpha
  if (!missing(lambda))
    parms$lambda <- lambda
  if (!missing(lambda_search))
    parms$lambda_search <- lambda_search
  if (!missing(stopping_rounds))
    parms$stopping_rounds <- stopping_rounds
  if (!missing(stopping_metric))
    parms$stopping_metric <- stopping_metric
  if (!missing(early_stopping))
    parms$early_stopping <- early_stopping
  if (!missing(stopping_tolerance))
    parms$stopping_tolerance <- stopping_tolerance
  if (!missing(balance_classes))
    parms$balance_classes <- balance_classes
  if (!missing(class_sampling_factors))
    parms$class_sampling_factors <- class_sampling_factors
  if (!missing(max_after_balance_size))
    parms$max_after_balance_size <- max_after_balance_size
  if (!missing(max_runtime_secs))
    parms$max_runtime_secs <- max_runtime_secs
  if (!missing(save_transformed_framekeys))
    parms$save_transformed_framekeys <- save_transformed_framekeys
  if (!missing(highest_interaction_term))
    parms$highest_interaction_term <- highest_interaction_term
  if (!missing(nparallelism))
    parms$nparallelism <- nparallelism
  if (!missing(type))
    parms$type <- type

  # Error check and build model
  model <- .h2o.modelJob('anovaglm', parms, h2oRestApiVersion=3, verbose=FALSE)
  return(model)
}
.h2o.train_segments_anovaglm <- function(x,
                                         y,
                                         training_frame,
                                         seed = -1,
                                         ignore_const_cols = TRUE,
                                         score_each_iteration = FALSE,
                                         offset_column = NULL,
                                         weights_column = NULL,
                                         family = c("AUTO", "gaussian", "binomial", "fractionalbinomial", "quasibinomial", "poisson", "gamma", "tweedie", "negativebinomial"),
                                         tweedie_variance_power = 0,
                                         tweedie_link_power = 1,
                                         theta = 0,
                                         solver = c("AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT", "GRADIENT_DESCENT_LH", "GRADIENT_DESCENT_SQERR"),
                                         missing_values_handling = c("MeanImputation", "Skip", "PlugValues"),
                                         plug_values = NULL,
                                         compute_p_values = TRUE,
                                         standardize = TRUE,
                                         non_negative = FALSE,
                                         max_iterations = 0,
                                         link = c("family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit"),
                                         prior = 0,
                                         alpha = NULL,
                                         lambda = c(0.0),
                                         lambda_search = FALSE,
                                         stopping_rounds = 0,
                                         stopping_metric = c("AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "AUCPR", "lift_top_group", "misclassification", "mean_per_class_error", "custom", "custom_increasing"),
                                         early_stopping = FALSE,
                                         stopping_tolerance = 0.001,
                                         balance_classes = FALSE,
                                         class_sampling_factors = NULL,
                                         max_after_balance_size = 5.0,
                                         max_runtime_secs = 0,
                                         save_transformed_framekeys = FALSE,
                                         highest_interaction_term = 0,
                                         nparallelism = 4,
                                         type = 0,
                                         segment_columns = NULL,
                                         segment_models_id = NULL,
                                         parallelism = 1)
{
  # formally define variables that were excluded from function parameters
  model_id <- NULL
  verbose <- NULL
  destination_key <- NULL
  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)

  # Validate other required args
  # If x is missing, then assume user wants to use all columns as features.
  if (missing(x)) {
     if (is.numeric(y)) {
         # y is a column index, so build x from the column indices
         # (seq_len(ncol(...)) rather than col(...), which would materialize a full index matrix)
         x <- setdiff(seq_len(ncol(training_frame)), y)
     } else {
         # y is a column name, so build x from the column names
         x <- setdiff(colnames(training_frame), y)
     }
  }

  # Build parameter list to send to model builder
  parms <- list()
  parms$training_frame <- training_frame
  args <- .verify_dataxy(training_frame, x, y)
  parms$ignored_columns <- args$x_ignore
  parms$response_column <- args$y

  if (!missing(seed))
    parms$seed <- seed
  if (!missing(ignore_const_cols))
    parms$ignore_const_cols <- ignore_const_cols
  if (!missing(score_each_iteration))
    parms$score_each_iteration <- score_each_iteration
  if (!missing(offset_column))
    parms$offset_column <- offset_column
  if (!missing(weights_column))
    parms$weights_column <- weights_column
  if (!missing(family))
    parms$family <- family
  if (!missing(tweedie_variance_power))
    parms$tweedie_variance_power <- tweedie_variance_power
  if (!missing(tweedie_link_power))
    parms$tweedie_link_power <- tweedie_link_power
  if (!missing(theta))
    parms$theta <- theta
  if (!missing(solver))
    parms$solver <- solver
  if (!missing(missing_values_handling))
    parms$missing_values_handling <- missing_values_handling
  if (!missing(plug_values))
    parms$plug_values <- plug_values
  if (!missing(compute_p_values))
    parms$compute_p_values <- compute_p_values
  if (!missing(standardize))
    parms$standardize <- standardize
  if (!missing(non_negative))
    parms$non_negative <- non_negative
  if (!missing(max_iterations))
    parms$max_iterations <- max_iterations
  if (!missing(link))
    parms$link <- link
  if (!missing(prior))
    parms$prior <- prior
  if (!missing(alpha))
    parms$alpha <- alpha
  if (!missing(lambda))
    parms$lambda <- lambda
  if (!missing(lambda_search))
    parms$lambda_search <- lambda_search
  if (!missing(stopping_rounds))
    parms$stopping_rounds <- stopping_rounds
  if (!missing(stopping_metric))
    parms$stopping_metric <- stopping_metric
  if (!missing(early_stopping))
    parms$early_stopping <- early_stopping
  if (!missing(stopping_tolerance))
    parms$stopping_tolerance <- stopping_tolerance
  if (!missing(balance_classes))
    parms$balance_classes <- balance_classes
  if (!missing(class_sampling_factors))
    parms$class_sampling_factors <- class_sampling_factors
  if (!missing(max_after_balance_size))
    parms$max_after_balance_size <- max_after_balance_size
  if (!missing(max_runtime_secs))
    parms$max_runtime_secs <- max_runtime_secs
  if (!missing(save_transformed_framekeys))
    parms$save_transformed_framekeys <- save_transformed_framekeys
  if (!missing(highest_interaction_term))
    parms$highest_interaction_term <- highest_interaction_term
  if (!missing(nparallelism))
    parms$nparallelism <- nparallelism
  if (!missing(type))
    parms$type <- type

  # Build segment-models specific parameters
  segment_parms <- list()
  if (!missing(segment_columns))
    segment_parms$segment_columns <- segment_columns
  if (!missing(segment_models_id))
    segment_parms$segment_models_id <- segment_models_id
  segment_parms$parallelism <- parallelism

  # Error check and build segment models
  segment_models <- .h2o.segmentModelsJob('anovaglm', segment_parms, parms, h2oRestApiVersion=3)
  return(segment_models)
}
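
# A minimal usage sketch for segmented ANOVA GLM training (illustrative only;
# the helper above is internal and is normally reached through h2o.train_segments()).
# The data-loading steps mirror the h2o.anovaglm() example above; the segment
# column and predictors here are illustrative choices:
#
#   prostate <- h2o.uploadFile(system.file("extdata", "prostate.csv", package = "h2o"))
#   prostate$CAPSULE <- as.factor(prostate$CAPSULE)
#   segment_models <- .h2o.train_segments_anovaglm(y = "VOL", x = c("CAPSULE", "AGE"),
#                                                  training_frame = prostate,
#                                                  segment_columns = "RACE")
#   as.data.frame(segment_models)  # one ANOVA GLM model per RACE segment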
