#' @title Building H2O Grids
#' @description Creates wide range of machine learning models and perform grid search using
#' `H2O`'s \code{h2o.grid} function implemented with `nano` objects.
#' @param nano nano object to store model in. If not specified, a new nano object will be
#' created the results will be stored in the new nano object.
#' @param response a character. Target variable for model.
#' @param algo a character. Algorithm of model to be built.
#' @param data a data.frame containing data to train model. May also contain testing and
#' holdout data, in which case, the `train_test` must be specified.
#' @param test a data.frame containing testing dataset. If this is provided, the `train_test`,
#' `fold_column` and `nfolds` arguments cannot be used.
#' @param train_test a character. Variable in `data` which contains split for training,
#' testing and holdout datasets (optional). Can only have the values: "train", "test",
#' "holdout".
#' @param grid_id a character. Unique id for created grid.
#' @param ignore_vars vector of characters. Variables in the dataset which should not be used
#' for modelling. Note, if any of `train_test`, `weight_column` or `fold_column` arguments
#' are specified, those variables will be automatically included in `ignore_vars`.
#' @param weight_column a character. Column name in `data` containing weights if used.
#' @param fold_column a character. Column name in `data` containing fold assignments if used.
#' If this is provided, the `test` and `nfolds` arguments cannot be used. The `train_test`
#' argument can be used, however it cannot contain the values "test".
#' @param nfolds a numeric. Number of folds used in cross-validation. If this is provided, the
#' `test` and `nfolds` arguments cannot be used. The `train_test` argument can be used,
#' however it cannot contain the values "test".
#' @param thresh a numeric. Cutoff of number of unique values in response variable to
#' determine whether performing classification or regression. Default value is 10.
#' @param monotone_constraints a list. Mapping between variable names in `data` to values
#' +1 or -1. Use +1 to enforce an increasing constraint while use -1 for a decreasing
#' constraint. Constraints are only valid for numerical columns.
#' @param hyper_params a list. Contains model hyper-parameters for hyper-parameter tuning. The
#' possible hyper-parameters that can be used depends on the algorithm. Look at the `H2O`
#' functions for the specific algorithm to see full details on the available hyper-parameters:
#' h2o.gbm, h2o.glm, h2o.kmeans, h2o.deepLearning.
#' @param strategy a character. Specify whether to perform a Cartesian or random grid search.
#' Only required when more than 1 combination of hyper-parameters are entered.
#' @param max_models a numeric. Maximum number of models to be built.
#' @param max_runtime_secs a numeric. Maximum amount of time (sec) spent building models.
#' @param stopping_metric a character. Metric used to determine whether to terminate fitting
#' process. This is used for sorting the fitted models and determining the best model in the
#' grid. The default for regression models is `deviance` while the for classification models
#' is `logloss`.
#' @param stopping_tolerance a numeric. Minimum threshold for the `stopping_metric` to
#' improve by to continue the fitting process.
#' @param stopping_rounds a numeric. Number of rounds in which if the `stopping_metric` has
#' not at least improved by the `stopping_tolerance`, then terminate fitting process.
#' @param plots a logical. Whether to produce plots.
#' @param alarm a logical. Whether to beep when function has finished running.
#' @param quiet a logical. Whether to print messages to the console.
#' @param seed a numeric.
#' @param grid_description a character. Optional description of grid. Can be later accessed by
#' \code{nano$grid[[grid_no]]@meta$description}.
#' @param ... further parameters to pass to \code{h2o.grid} depending on `algo`.
#' @return nano object with new entry filled with grid produced.
#' @details This function used `H2O`'s \code{h2o.grid} function to easily and quickly build
#' difference machine learning models and perform grid search for hyper-parameter tuning. To
#' perform hyper-parameter tuning, input the desired hyper-parameters in the `hyper_params`
#' argument and set the range of values to build the models on. Importantly, an active H2O
#' connection is required (i.e. run \code{h2o.init()})) before using this function.
#'
#' For more details, please see the documentation for \code{h2o.grid}.
#' @examples
#' \dontrun{
#' if(interactive()){
#' library(h2o)
#' library(nano)
#'
#' h2o.init()
#'
#' # import dataset
#' data(property_prices)
#' # prepare data for modelling
#' data_all <- nano::data_prep(data = property_prices,
#' response = "sale_price",
#' split_or_fold = 0.7,
#' holdout_ratio = 0.1)
#' data <- data_all$data
#'
#' # create models and perform hyper-parameter tuning on ntrees
#' nano <- nano_grid(data = data,
#' response = "sale_price",
#' algo = "drf", # random forest
#' train_test = "data_id",
#' ignore_vars = "data_id",
#' hyper_params = list(ntrees = 1:5),
#' strategy = "RandomDiscrete", # random grid search
#' stopping_metric = "mse",
#' stopping_tolerance = 1e-3,
#' stopping_rounds = 5)
#'
#' }
#' }
#' @rdname nano_grid
#' @export
nano_grid <- function (nano = nano::create_nano(), response, algo, data, test,
train_test = NA, grid_id = paste0("grid_", nano$n_model + 1),
ignore_vars = c(), weight_column = NULL, fold_column = NULL,
nfolds = NA, thresh = 10, monotone_constraints = c(),
hyper_params = NULL, strategy = "RandomDiscrete", max_models = 10,
max_runtime_secs = 60 * 10, stopping_metric = NULL,
stopping_tolerance = 0, stopping_rounds = 0, plots = TRUE,
alarm = TRUE, quiet = FALSE, save = FALSE, subdir = NA,
project = "ML Project", seed = 628, grid_description = "", ...) {
if (class(nano) != "nano") {
stop("`nano` must be a nano object.",
call. = FALSE)
}
if (!missing(test) & !is.na(train_test)) {
stop("Cannot specify `train_test` column and also provide a `test` dataset. Please choose only one of these options.",
call. = FALSE)
}
if (!missing(fold_column) & !is.na(nfolds)) {
stop("Cannot specify `fold_column` and also set the `nfolds` parameter. Please choose only one of these options.",
call. = FALSE)
}
if ((!missing(test) | "test" %in% data[[train_test]]) & (!missing(fold_column) | !is.na(nfolds))) {
stop("Cannot specify both testing dataset and cross-validation methods. Please choose only one of these methods.",
call. = FALSE)
}
if (!is.na(train_test) & !all(unique(data[[train_test]]) %in% c("train", "test", "holdout"))) {
stop("`train_test` column must contain the values `train`, `test` or `holdout`.",
call. = FALSE)
}
if (!algo %in% c("gbm", "xgboost") & !is.null(monotone_constraints)) {
stop("Can only specify `monotone_constraints` for gbm and xgboost algorithmns.",
call. = FALSE)
}
if (!all(names(monotone_constraints) %in% names(data))){
stop("Names of `monotone_constraints` must be variables in `data`.",
call. = FALSE)
}
if (!is.character(grid_description)) {
stop("`grid_description` must be character type.",
call. = FALSE)
}
nano:::tic(id = paste0("nano_", grid_id))
on.exit(nano:::toc(id = paste0("nano_", grid_id), msg = "Process duration:",
quiet = quiet))
if (!quiet) message(paste(Sys.time(), "| Started process..."))
# increase nano space if required
nano <- nano:::nano_increase(nano)
# determine type of model
res_levels <- unique(data[[which(colnames(data) == response)]])
model_type <- ifelse(length(res_levels) <= thresh, "Classification", "Regression")
# for classification model, convert response variable to factor type
if (model_type == "Classification") data[[response]] <- as.factor(data[[response]])
if (!quiet) message("MODEL TYPE: ", model_type)
## create training and testing datasets
# initialise testing dataset
if (missing(test)) test <- NA
# split data in training and testing
train <- data.table::copy(data.table::as.data.table(data))
if (!is.na(train_test)) {
train <- data[get(train_test) == "train"]
test <- data[get(train_test) == "test"]
# if "test" not specified in train_test then set test to NA
if (nrow(test) == 0) test <- NA
}
# if nfolds specified, assign folds to rows
if (!is.na(nfolds)) {
fold <- caret::createFolds(as.vector(train[[response]]), k = nfolds,
list = FALSE)
train[, fold := as.factor(fold)]
fold_column <- "fold"
nfolds <- NA
}
# determine variables to ignore in training models
ignore_vars <- c(ignore_vars,
if (!missing(train_test)) train_test,
if (!missing(weight_column)) weight_column,
if (!missing(fold_column)) fold_column)
# default stopping metric
if (is.null(stopping_metric)) {
if (model_type == "Regression") {
stopping_metric = "deviance"
} else {
stopping_metric = "logloss"
}
}
# search criteria
if (strategy == "Cartesian") {
max_models = NULL
max_runtime_secs = NULL
}
search_criteria <- list(strategy = strategy,
max_models = max_models,
max_runtime_secs = max_runtime_secs)
search_criteria <- search_criteria[!sapply(search_criteria, is.null)]
if (!quiet)
message(sprintf(">>> Iterating until %s models or %s seconds...",
max_models, max_runtime_secs))
# fit models
params <- list(...)
grid <- do.call(h2o::h2o.grid, c(list(x = setdiff(names(train), c(response, ignore_vars)),
y = response,
algorithm = algo,
grid_id = grid_id,
training_frame = nano:::quiet(as.h2o(train)),
search_criteria = search_criteria,
stopping_tolerance = stopping_tolerance,
stopping_rounds = stopping_rounds,
stopping_metric = stopping_metric,
seed = seed),
list(validation_frame = nano:::quiet(h2o::as.h2o(test)))[data.table::is.data.table(test)],
list(weights_column = weight_column)[!is.null(weight_column)],
list(hyper_params = hyper_params)[!is.null(hyper_params)],
list(fold_column = fold_column)[!is.null(fold_column)],
list(monotone_constraints = monotone_constraints)[!is.null(monotone_constraints)],
params[length(params) > 0]
))
# sort the grid models by stopping metric and select best model
grid <- h2o.getGrid(grid_id, sort_by = stopping_metric, decreasing = FALSE)
# print leaderboard
if (nrow(grid@summary_table) == 0) {
stop("NO MODELS TRAINED. Please set max_models to at least 1 or increase max_time")
} else {
if (!quiet) {
message(paste("- EUREKA: Succesfully generated",
nrow(grid@summary_table), "models"))
if (!quiet) print(head(grid@summary_table, 10))
}
}
# recreate dataset to input into nano object
if (missing(train_test) & !"data_id" %in% names(data)) {
# is train_test column is not specified, create data_id column
# and combine datasets together
train[, data_id := "train"]
data <- data.table::copy(train)
if (data.table::is.data.table(test)) {
test[, data_id := "test"]
data <- rbind(data, test)
}
} else {
if (!"data_id" %in% names(data)) {
# rename train_test to data_id for consistency
names(data)[names(data) == train_test] <- "data_id"
if (!quiet) {
message(paste0("Renamed ", train_test, " to data_id."))
}
}
}
# rename fold_column to fold for consistency
if (!missing(fold_column)) {
if (fold_column != "fold") {
names(data)[names(data) == fold_column] <- "fold"
if (!quiet) {
message("Renaming ", fold_column, " to fold.")
}
}
}
# add models to nano object
nano$n_model <- nano$n_model + 1
nano$grid[[nano$n_model]] <- nano:::create_Grid(grid)
nano$model[[nano$n_model]] <- h2o::h2o.getModel(grid@model_ids[[1]])
nano$metric[[nano$n_model]] <- nano:::model_metrics(nano$model[[nano$n_model]], data)
nano$data[[nano$n_model]] <- data.table::copy(data)
nano$meta[[nano$n_model]] <- nano:::model_meta(nano$model[[nano$n_model]],
h2o::as.h2o(train))
nano$grid[[nano$n_model]]@meta$description <- grid_description
# if (!is.null(hyper_params)) {
# nano$grid[[paste0("grid_", nano$n_model)]]@meta$tune_hyper_params <- hyper_params
# }
# rename elements in nano object
names(nano$grid)[nano$n_model] <- paste0("grid_" , nano$n_model)
names(nano$model)[nano$n_model] <- paste0("model_" , nano$n_model)
names(nano$metric)[nano$n_model] <- paste0("metric_", nano$n_model)
names(nano$data)[nano$n_model] <- paste0("data_" , nano$n_model)
names(nano$meta)[nano$n_model] <- paste0("meta_" , nano$n_model)
return(nano)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.