R/isolationforest.R
In h2o: R Interface for the 'H2O' Scalable Machine Learning Platform

Documented in h2o.isolationForest

# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details) 
#'
# -------------------------- isolationforest -------------------------- #
#'
#' Trains an Isolation Forest model
#'
#' @param training_frame Id of the training data frame.
#' @param x A vector containing the \code{character} names of the predictors in the model.
#' @param model_id Destination id for this model; auto-generated if not specified.
#' @param score_each_iteration \code{Logical}. Whether to score during each iteration of model training. Defaults to FALSE.
#' @param score_tree_interval Score the model after every so many trees. Disabled if set to 0. Defaults to 0.
#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE.
#' @param ntrees Number of trees. Defaults to 50.
#' @param max_depth Maximum tree depth (0 for unlimited). Defaults to 8.
#' @param min_rows Fewest allowed (weighted) observations in a leaf. Defaults to 1.
#' @param max_runtime_secs Maximum allowed runtime in seconds for model training. Use 0 to disable. Defaults to 0.
#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
#'        Defaults to -1 (time-based random number).
#' @param build_tree_one_node \code{Logical}. Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.
#'        Defaults to FALSE.
#' @param mtries Number of variables randomly sampled as candidates at each split. If set to -1, defaults to (number of
#'        predictors)/3. Defaults to -1.
#' @param sample_size Number of randomly sampled observations used to train each Isolation Forest tree. Only one of parameters
#'        sample_size and sample_rate should be defined. If sample_rate is defined, sample_size will be ignored.
#'        Defaults to 256.
#' @param sample_rate Rate of randomly sampled observations used to train each Isolation Forest tree. Needs to be in range from 0.0
#'        to 1.0. If set to -1, sample_rate is disabled and sample_size will be used instead. Defaults to -1.
#' @param col_sample_rate_change_per_level Relative change of the column sampling rate for every level (must be > 0.0 and <= 2.0). Defaults to 1.
#' @param col_sample_rate_per_tree Column sample rate per tree (from 0.0 to 1.0). Defaults to 1.
#' @param categorical_encoding Encoding scheme for categorical features. Must be one of: "AUTO", "Enum", "OneHotInternal", "OneHotExplicit",
#'        "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited". Defaults to AUTO.
#' @param stopping_rounds Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the
#'        stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable). Defaults to 0.
#' @param stopping_metric Metric to use for early stopping (AUTO: logloss for classification, deviance for regression and anomaly_score
#'        for Isolation Forest). Note that custom and custom_increasing can only be used in GBM and DRF with the Python
#'        client. Must be one of: "AUTO", "anomaly_score". Defaults to AUTO.
#' @param stopping_tolerance Relative tolerance for metric-based stopping criterion (stop if relative improvement is not at least this
#'        much). Defaults to 0.01.
#' @param export_checkpoints_dir Automatically export generated models to this directory.
#' @param contamination Contamination ratio - the proportion of anomalies in the input dataset. If undefined (-1) the predict function
#'        will not mark observations as anomalies and only anomaly score will be returned. Defaults to -1 (undefined).
#' @param validation_frame Id of the validation data frame.
#' @param validation_response_column (experimental) Name of the response column in the validation frame. Response column should be binary and
#'        indicate not anomaly/anomaly.
#' @examples
#' \dontrun{
#' library(h2o)
#' h2o.init()
#' 
#' # Import the cars dataset
#' f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv"
#' cars <- h2o.importFile(f)
#' 
#' # Set the predictors
#' predictors <- c("displacement", "power", "weight", "acceleration", "year")
#' 
#' # Train the IF model
#' cars_if <- h2o.isolationForest(x = predictors, training_frame = cars,
#'                                seed = 1234, stopping_metric = "anomaly_score",
#'                                stopping_rounds = 3, stopping_tolerance = 0.1)
#' }
#' @export
h2o.isolationForest <- function(training_frame,
                                x,
                                model_id = NULL,
                                score_each_iteration = FALSE,
                                score_tree_interval = 0,
                                ignore_const_cols = TRUE,
                                ntrees = 50,
                                max_depth = 8,
                                min_rows = 1,
                                max_runtime_secs = 0,
                                seed = -1,
                                build_tree_one_node = FALSE,
                                mtries = -1,
                                sample_size = 256,
                                sample_rate = -1,
                                col_sample_rate_change_per_level = 1,
                                col_sample_rate_per_tree = 1,
                                categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
                                stopping_rounds = 0,
                                stopping_metric = c("AUTO", "anomaly_score"),
                                stopping_tolerance = 0.01,
                                export_checkpoints_dir = NULL,
                                contamination = -1,
                                validation_frame = NULL,
                                validation_response_column = NULL) {
  # Both frame arguments go through .validate.H2OFrame before anything else;
  # only the training frame is flagged as required.
  training_frame <- .validate.H2OFrame(training_frame, required = TRUE)
  validation_frame <- .validate.H2OFrame(validation_frame, required = FALSE)

  # The parameter list starts with the training frame. Every other entry is
  # added only when the caller explicitly supplied the matching argument, so
  # omitted arguments never appear in the list handed to .h2o.modelJob.
  params <- list()
  params$training_frame <- training_frame

  # `x` names the predictors; the columns reported as `cols_ignore` by
  # .verify_datacols are forwarded as `ignored_columns`.
  if (!missing(x)) {
    params$ignored_columns <- .verify_datacols(training_frame, x)$cols_ignore
  }

  if (!missing(model_id)) {
    params$model_id <- model_id
  }
  if (!missing(score_each_iteration)) {
    params$score_each_iteration <- score_each_iteration
  }
  if (!missing(score_tree_interval)) {
    params$score_tree_interval <- score_tree_interval
  }
  if (!missing(ignore_const_cols)) {
    params$ignore_const_cols <- ignore_const_cols
  }
  if (!missing(ntrees)) {
    params$ntrees <- ntrees
  }
  if (!missing(max_depth)) {
    params$max_depth <- max_depth
  }
  if (!missing(min_rows)) {
    params$min_rows <- min_rows
  }
  if (!missing(max_runtime_secs)) {
    params$max_runtime_secs <- max_runtime_secs
  }
  if (!missing(seed)) {
    params$seed <- seed
  }
  if (!missing(build_tree_one_node)) {
    params$build_tree_one_node <- build_tree_one_node
  }
  if (!missing(mtries)) {
    params$mtries <- mtries
  }
  if (!missing(sample_size)) {
    params$sample_size <- sample_size
  }
  if (!missing(sample_rate)) {
    params$sample_rate <- sample_rate
  }
  if (!missing(col_sample_rate_change_per_level)) {
    params$col_sample_rate_change_per_level <- col_sample_rate_change_per_level
  }
  if (!missing(col_sample_rate_per_tree)) {
    params$col_sample_rate_per_tree <- col_sample_rate_per_tree
  }
  if (!missing(categorical_encoding)) {
    params$categorical_encoding <- categorical_encoding
  }
  if (!missing(stopping_rounds)) {
    params$stopping_rounds <- stopping_rounds
  }
  if (!missing(stopping_metric)) {
    params$stopping_metric <- stopping_metric
  }
  if (!missing(stopping_tolerance)) {
    params$stopping_tolerance <- stopping_tolerance
  }
  if (!missing(export_checkpoints_dir)) {
    params$export_checkpoints_dir <- export_checkpoints_dir
  }
  if (!missing(contamination)) {
    params$contamination <- contamination
  }
  # NOTE(review): validation_frame was reassigned above, and base R documents
  # missing() as reliable only before the argument is altered. Assigning NULL
  # to a list element adds nothing, so an omitted frame still leaves the list
  # unchanged here.
  if (!missing(validation_frame)) {
    params$validation_frame <- validation_frame
  }
  if (!missing(validation_response_column)) {
    params$validation_response_column <- validation_response_column
  }

  # Hand the assembled parameters to the model-building job and return
  # whatever it produces.
  .h2o.modelJob("isolationforest", params, h2oRestApiVersion = 3, verbose = FALSE)
}
# Internal builder for Isolation Forest segment models.
#
# Takes the same model arguments as h2o.isolationForest (except model_id) plus
# three segment-specific ones: segment_columns, segment_models_id and
# parallelism. It assembles two lists -- the model parameters and the
# segment settings -- and passes both to .h2o.segmentModelsJob, whose result
# is returned unchanged.
.h2o.train_segments_isolationforest <- function(training_frame,
                                                x,
                                                score_each_iteration = FALSE,
                                                score_tree_interval = 0,
                                                ignore_const_cols = TRUE,
                                                ntrees = 50,
                                                max_depth = 8,
                                                min_rows = 1,
                                                max_runtime_secs = 0,
                                                seed = -1,
                                                build_tree_one_node = FALSE,
                                                mtries = -1,
                                                sample_size = 256,
                                                sample_rate = -1,
                                                col_sample_rate_change_per_level = 1,
                                                col_sample_rate_per_tree = 1,
                                                categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
                                                stopping_rounds = 0,
                                                stopping_metric = c("AUTO", "anomaly_score"),
                                                stopping_tolerance = 0.01,
                                                export_checkpoints_dir = NULL,
                                                contamination = -1,
                                                validation_frame = NULL,
                                                validation_response_column = NULL,
                                                segment_columns = NULL,
                                                segment_models_id = NULL,
                                                parallelism = 1) {
  # Names that are not function parameters here are still given a formal
  # (NULL) definition; none of them is read again in this function.
  model_id <- verbose <- destination_key <- NULL

  # Both frame arguments go through .validate.H2OFrame before anything else;
  # only the training frame is flagged as required.
  training_frame <- .validate.H2OFrame(training_frame, required = TRUE)
  validation_frame <- .validate.H2OFrame(validation_frame, required = FALSE)

  # Model parameters: the training frame is always present, everything else
  # is added only when the caller explicitly supplied the argument.
  params <- list()
  params$training_frame <- training_frame

  # `x` names the predictors; the columns reported as `cols_ignore` by
  # .verify_datacols are forwarded as `ignored_columns`.
  if (!missing(x)) {
    params$ignored_columns <- .verify_datacols(training_frame, x)$cols_ignore
  }

  if (!missing(score_each_iteration)) {
    params$score_each_iteration <- score_each_iteration
  }
  if (!missing(score_tree_interval)) {
    params$score_tree_interval <- score_tree_interval
  }
  if (!missing(ignore_const_cols)) {
    params$ignore_const_cols <- ignore_const_cols
  }
  if (!missing(ntrees)) {
    params$ntrees <- ntrees
  }
  if (!missing(max_depth)) {
    params$max_depth <- max_depth
  }
  if (!missing(min_rows)) {
    params$min_rows <- min_rows
  }
  if (!missing(max_runtime_secs)) {
    params$max_runtime_secs <- max_runtime_secs
  }
  if (!missing(seed)) {
    params$seed <- seed
  }
  if (!missing(build_tree_one_node)) {
    params$build_tree_one_node <- build_tree_one_node
  }
  if (!missing(mtries)) {
    params$mtries <- mtries
  }
  if (!missing(sample_size)) {
    params$sample_size <- sample_size
  }
  if (!missing(sample_rate)) {
    params$sample_rate <- sample_rate
  }
  if (!missing(col_sample_rate_change_per_level)) {
    params$col_sample_rate_change_per_level <- col_sample_rate_change_per_level
  }
  if (!missing(col_sample_rate_per_tree)) {
    params$col_sample_rate_per_tree <- col_sample_rate_per_tree
  }
  if (!missing(categorical_encoding)) {
    params$categorical_encoding <- categorical_encoding
  }
  if (!missing(stopping_rounds)) {
    params$stopping_rounds <- stopping_rounds
  }
  if (!missing(stopping_metric)) {
    params$stopping_metric <- stopping_metric
  }
  if (!missing(stopping_tolerance)) {
    params$stopping_tolerance <- stopping_tolerance
  }
  if (!missing(export_checkpoints_dir)) {
    params$export_checkpoints_dir <- export_checkpoints_dir
  }
  if (!missing(contamination)) {
    params$contamination <- contamination
  }
  # NOTE(review): validation_frame was reassigned above, and base R documents
  # missing() as reliable only before the argument is altered. Assigning NULL
  # to a list element adds nothing, so an omitted frame still leaves the list
  # unchanged here.
  if (!missing(validation_frame)) {
    params$validation_frame <- validation_frame
  }
  if (!missing(validation_response_column)) {
    params$validation_response_column <- validation_response_column
  }

  # Segment settings: the two identifiers are optional, while parallelism is
  # always included (its default of 1 is used when the caller omits it).
  segment_params <- list()
  if (!missing(segment_columns)) {
    segment_params$segment_columns <- segment_columns
  }
  if (!missing(segment_models_id)) {
    segment_params$segment_models_id <- segment_models_id
  }
  segment_params$parallelism <- parallelism

  # Hand both lists to the segment-models job and return its result.
  .h2o.segmentModelsJob("isolationforest", segment_params, params, h2oRestApiVersion = 3)
}