# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py
# Copyright 2016 H2O.ai;  Apache License Version 2.0 (see LICENSE for details) 
#'
# -------------------------- extendedisolationforest -------------------------- #
#'
#' Trains an Extended Isolation Forest model
#'
#' @param training_frame Id of the training data frame.
#' @param x A vector containing the \code{character} names of the predictors in the model.
#' @param model_id Destination id for this model; auto-generated if not specified.
#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE.
#' @param categorical_encoding Encoding scheme for categorical features. Must be one of: "AUTO", "Enum", "OneHotInternal", "OneHotExplicit",
#'        "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited". Defaults to AUTO.
#' @param score_each_iteration \code{Logical}. Whether to score during each iteration of model training. Defaults to FALSE.
#' @param score_tree_interval Score the model after every so many trees. Disabled if set to 0. Defaults to 0.
#' @param ntrees Number of Extended Isolation Forest trees. Defaults to 100.
#' @param sample_size Number of randomly sampled observations used to train each Extended Isolation Forest tree. Defaults to 256.
#' @param extension_level Maximum is N - 1 (N = numCols). Minimum is 0. Extended Isolation Forest with extension_level = 0 behaves like
#'        Isolation Forest. Defaults to 0.
#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
#'        Defaults to -1 (time-based random number).
#' @param disable_training_metrics \code{Logical}. Disable calculating training metrics (expensive on large datasets). Defaults to TRUE.
#' @examples
#' \dontrun{
#' library(h2o)
#' h2o.init()
#' 
#' # Import the prostate dataset
#' p <- h2o.importFile(path="https://raw.github.com/h2oai/h2o/master/smalldata/logreg/prostate.csv")
#' 
#' # Set the predictors
#' predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON")
#' 
#' # Build an Extended Isolation forest model
#' model <- h2o.extendedIsolationForest(x = predictors,
#'                                      training_frame = p,
#'                                      model_id = "eif.hex",
#'                                      ntrees = 100,
#'                                      sample_size = 256,
#'                                      extension_level = length(predictors) - 1)
#' 
#' # Calculate score
#' score <- h2o.predict(model, p)
#'
#' # Anomaly score: a number in [0, 1], explicitly defined in Equation (1) of the
#' # Extended Isolation Forest paper and in paragraph '2 Isolation and Isolation Trees'
#' # of the Isolation Forest paper
#' anomaly_score <- score$anomaly_score
#'
#' # Average path length of the point in the Isolation Trees, from root to leaf
#' mean_length <- score$mean_length
#' }
#' @export
h2o.extendedIsolationForest <- function(training_frame,
                                        x,
                                        model_id = NULL,
                                        ignore_const_cols = TRUE,
                                        categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
                                        score_each_iteration = FALSE,
                                        score_tree_interval = 0,
                                        ntrees = 100,
                                        sample_size = 256,
                                        extension_level = 0,
                                        seed = -1,
                                        disable_training_metrics = TRUE)
{
  # Validate the required training_frame first, then other frame args: each should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)

  # Build parameter list to send to model builder
  parms <- list()
  parms$training_frame <- training_frame
  if (!missing(x))
    parms$ignored_columns <- .verify_datacols(training_frame, x)$cols_ignore

  if (!missing(model_id))
    parms$model_id <- model_id
  if (!missing(ignore_const_cols))
    parms$ignore_const_cols <- ignore_const_cols
  if (!missing(categorical_encoding))
    parms$categorical_encoding <- categorical_encoding
  if (!missing(score_each_iteration))
    parms$score_each_iteration <- score_each_iteration
  if (!missing(score_tree_interval))
    parms$score_tree_interval <- score_tree_interval
  if (!missing(ntrees))
    parms$ntrees <- ntrees
  if (!missing(sample_size))
    parms$sample_size <- sample_size
  if (!missing(extension_level))
    parms$extension_level <- extension_level
  if (!missing(seed))
    parms$seed <- seed
  if (!missing(disable_training_metrics))
    parms$disable_training_metrics <- disable_training_metrics

  # Error check and build model
  model <- .h2o.modelJob('extendedisolationforest', parms, h2oRestApiVersion=3, verbose=FALSE)
  return(model)
}
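
# Illustrative sketch (not part of the generated bindings): typical downstream use of
# the scoring frame shown in the roxygen example above. `p` and `predictors` are the
# objects from that example; the 0.8 threshold is an arbitrary assumption chosen only
# for demonstration.
#
#   model  <- h2o.extendedIsolationForest(x = predictors, training_frame = p,
#                                         extension_level = length(predictors) - 1)
#   scores <- h2o.predict(model, p)
#   # Keep only the rows whose anomaly score exceeds the assumed threshold
#   outliers <- p[scores$anomaly_score > 0.8, ]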
.h2o.train_segments_extendedisolationforest <- function(training_frame,
                                                        x,
                                                        ignore_const_cols = TRUE,
                                                        categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
                                                        score_each_iteration = FALSE,
                                                        score_tree_interval = 0,
                                                        ntrees = 100,
                                                        sample_size = 256,
                                                        extension_level = 0,
                                                        seed = -1,
                                                        disable_training_metrics = TRUE,
                                                        segment_columns = NULL,
                                                        segment_models_id = NULL,
                                                        parallelism = 1)
{
  # Formally define variables that were excluded from the function parameters
  model_id <- NULL
  verbose <- NULL
  destination_key <- NULL
  # Validate the required training_frame first, then other frame args: each should be a valid key or an H2OFrame object
  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)

  # Build parameter list to send to model builder
  parms <- list()
  parms$training_frame <- training_frame
  if (!missing(x))
    parms$ignored_columns <- .verify_datacols(training_frame, x)$cols_ignore

  if (!missing(ignore_const_cols))
    parms$ignore_const_cols <- ignore_const_cols
  if (!missing(categorical_encoding))
    parms$categorical_encoding <- categorical_encoding
  if (!missing(score_each_iteration))
    parms$score_each_iteration <- score_each_iteration
  if (!missing(score_tree_interval))
    parms$score_tree_interval <- score_tree_interval
  if (!missing(ntrees))
    parms$ntrees <- ntrees
  if (!missing(sample_size))
    parms$sample_size <- sample_size
  if (!missing(extension_level))
    parms$extension_level <- extension_level
  if (!missing(seed))
    parms$seed <- seed
  if (!missing(disable_training_metrics))
    parms$disable_training_metrics <- disable_training_metrics

  # Build segment-models specific parameters
  segment_parms <- list()
  if (!missing(segment_columns))
    segment_parms$segment_columns <- segment_columns
  if (!missing(segment_models_id))
    segment_parms$segment_models_id <- segment_models_id
  segment_parms$parallelism <- parallelism

  # Error check and build segment models
  segment_models <- .h2o.segmentModelsJob('extendedisolationforest', segment_parms, parms, h2oRestApiVersion=3)
  return(segment_models)
}
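
# Illustrative sketch (an assumption, not generated code): .h2o.train_segments_extendedisolationforest
# is an internal helper; in user code, segment training is typically reached through the
# package's h2o.train_segments() wrapper. Reusing `p` and `predictors` from the roxygen
# example above, with "RACE" as a demonstration-only segment column:
#
#   seg_models <- .h2o.train_segments_extendedisolationforest(
#                     training_frame = p,
#                     x = predictors,
#                     ntrees = 100,
#                     extension_level = length(predictors) - 1,
#                     segment_columns = "RACE",
#                     parallelism = 2)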
