#' SRxgboost_h2o_automl
#'
#' Runs several models with h2o.automl.
#' Install latestest version of h2o:
#' install.packages("h2o", type="source",
#' repos=(c("http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R")))
#'
#' @param y character
#' @param data_train data.frame
#' @param data_test data.frame
#' @param metric character: "AUTO", "RMSE", "MAE", "RMSLE", "AUC", "AUCPR", "logloss",
#' "lift_top_group", "misclassification", "mean_per_class_error"
#' @param kfold integer
#' @param max_runtime_sec integer
#' @param run_shap boolean
#'
#' @return writes a lot of files
#'
#' @export
SRxgboost_h2o_automl <- function(y = NULL,
data_train = NULL,
data_test = NULL,
metric = NULL,
kfold = 5,
max_runtime_sec = 5 * 60,
run_shap = FALSE,
nthreads = NULL,
max_mem_size = "16g") {
# check lauf ends with ".csv"
if (!grepl('.csv$', lauf)) lauf <- paste0(lauf, ".csv")
#
# create output dir
path_output_ <- paste0(path_output, gsub(".csv", "/", lauf))
if (!dir.exists(path_output_)) {dir.create(path_output_, showWarnings = FALSE, recursive = TRUE)}
#
# start h2o
# Error: cannot open the connection, Permission !!!
Sys.setenv(https_proxy = "")
Sys.setenv(http_proxy = "")
Sys.setenv(http_proxy_user = "")
Sys.setenv(https_proxy_user = "")
if (is.null(nthreads)) {
if (exists("n_cores")) {
nthreads <- n_cores
} else {
nthreads <- parallel::detectCores() - 1
}
}
h2o::h2o.init(nthreads = nthreads, max_mem_size = max_mem_size) # always verbose!
#
# set y as.factor for classification
if (length(unique(data_train[, y])) == 2 & class(data_train[, y]) != "factor") {
data_train[, y] <- as.factor(data_train[, y])
}
#
# define x, y and convert df to H2OFrame
train_h2o <- h2o::as.h2o(data_train)
if (!is.null(data_test)) test_h2o <- h2o::as.h2o(data_test) # [1:1000, ]
x <- setdiff(names(train_h2o), y)
#
# set default metric if not defined
if (is.null(metric)) {
metric <- dplyr::case_when(length(unique(data_train[, y])) == 2 ~ "AUC",
length(unique(data_train[, y])) > 2 ~ "RMSE")
}
#
#
#
### Automatic Machine Learning
#
aml <- h2o::h2o.automl(x = x, y = y,
training_frame = train_h2o,
nfolds = kfold,
max_runtime_secs = max_runtime_sec,
stopping_metric = metric,
sort_metric = metric,
stopping_rounds = 20,
keep_cross_validation_predictions = TRUE,
keep_cross_validation_models = TRUE,
seed = 12345)
#
# Extract leaderboard and leader model
lb <- aml@leaderboard; print(lb)
saveRDS(as.data.frame(lb), paste0(path_output_, "leaderboard.rds"))
lb_leader <- aml@leader; lb_leader
saveRDS(lb_leader, paste0(path_output_, "leaderboard_leader.rds"))
h2o::h2o.saveModel(lb_leader, path = path_output_, force = TRUE)
# h2o::h2o.download_mojo(lb_leader, path = path_output_)
#
lb %>%
as.data.frame() %>%
dplyr::slice(1:min(20, dplyr::n() - 5), (dplyr::n() - 4):dplyr::n()) %>%
# dplyr::mutate(model_id = substr(model_id, 1, regexpr("_AutoML_", model_id) - 1) %>%
# gsub("_", " ", .)) %>%
dplyr::mutate(model_id = stats::reorder(model_id, rmse)) %>%
# dplyr::mutate(model_id = stats::reorder(model_id,
# ifelse(lb_leader@parameters[["distribution"]],
# rmse, auc))) %>%
reshape2::melt(id = "model_id") %>%
dplyr::filter(variable %in% c("auc", "logloss", "mean_per_class_error", "aucpr",
"rmse", "mae", "rmsle")) %>%
dplyr::mutate(variable = gsub("mean_per_class_error", "mean per\nclass error", variable)) %>%
ggplot2::ggplot(ggplot2::aes(x = model_id, y = value)) +
ggplot2::geom_bar(stat = "identity", na.rm = TRUE) +
ggplot2::geom_text(ggplot2::aes(label = round(value, 3)), na.rm = TRUE,
position = ggplot2::position_stack(vjust = 0.5)) +
# ggplot2::scale_y_continuous(breaks = scales::pretty_breaks(5)) +
ggplot2::facet_grid(.~variable, scales = "free") + # , space = "free_x"
ggplot2::coord_flip()
ggplot2::ggsave(paste0(path_output_, "leaderboard.png"), width = 9.92, height = 5.3)
#
#
#
### save some results of topmodel
#
# save topmodel (without StackedEnsemble)
topmodel <- as.data.frame(aml@leaderboard$model_id) %>%
dplyr::filter(!grepl("StackedEnsemble", model_id)) %>%
dplyr::slice(1) %>%
dplyr::pull(model_id)
topmodel <- match(topmodel, as.data.frame(aml@leaderboard$model_id)$model_id)
model <- h2o::h2o.getModel(aml@leaderboard[topmodel, 1])
h2o::h2o.saveModel(model, path = path_output_, force = TRUE)
#
# Variable importance
varimp <- h2o::h2o.varimp(model)
saveRDS(varimp, paste0(path_output_, "varimp_topmodel.rds"))
#
varimp %>%
dplyr::arrange(-percentage) %>%
dplyr::slice(1:min(30, nrow(.))) %>%
ggplot2::ggplot(ggplot2::aes(x = stats::reorder(variable, percentage), y = percentage)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(x = "Variable", y = "Importance (%)") +
ggplot2::scale_y_continuous(labels = scales::percent, breaks = scales::pretty_breaks(5)) +
ggplot2::coord_flip()
ggplot2::ggsave(paste0(path_output_, "varimp_plot_topmodel.png"), width = 9.92, height = 5.3)
#
# partial dependence plots TODO !!!
# tbd
#
# Shap values (not supported for "multinomial" as of 2020-06-27)
# if (run_shap & lb_leader@parameters[["distribution"]] != "multinomial") { # 2025-02-23
if (run_shap) {
shap_TRAIN <- as.data.frame(h2o::h2o.predict_contributions(model, train_h2o))
saveRDS(shap_TRAIN, paste0(path_output_, "shap_TRAIN_topmodel.rds"))
if (!is.null(data_test)) {
shap_TEST <- as.data.frame(h2o::h2o.predict_contributions(model, test_h2o))
saveRDS(shap_TEST, paste0(path_output_, "shap_TEST_topmodel.rds"))
}
# plots TODO !!!
# tbd
}
# save OOF CV train predictions
pred_OOF <- as.data.frame(h2o::h2o.cross_validation_holdout_predictions(model))
saveRDS(pred_OOF, paste0(path_output_, "predictions_OOF_topmodel.rds"))
pred_TRAIN <- h2o::h2o.cross_validation_predictions(model)
saveRDS(pred_TRAIN, paste0(path_output_, "predictions_TRAIN_topmodel.rds"))
#
# save OOF predictions
# pred_OOF <- as.data.frame(h2o::h2o.cross_validation_holdout_predictions(lb_leader))
# saveRDS(pred_OOF, paste0(path_output_, "predictions_OOF.rds"))
#
# make predictions on test
if (!is.null(data_test)) {
pred_TEST <- as.data.frame(h2o::h2o.predict(lb_leader, test_h2o))
saveRDS(pred_TEST, paste0(path_output_, "predictions_TEST.rds"))
utils::write.table(pred_TEST, paste0(path_output_, "predictions_TEST",
" train", round(lb_leader@model[["training_metrics"]]@metrics[[metric]], 4),
" test", round(lb_leader@model[["cross_validation_metrics"]]@metrics[[metric]], 4), ".csv"),
row.names = FALSE, sep = ",", dec = ".")
}
#
# clean up
suppressWarnings(rm(train_h2o, test_h2o, x, y, aml, lb, lb_leader, pred_TRAIN, pred_OOF, pred_TEST,
shap_TRAIN, shap_TEST, varimp, metric, topmodel, path_output_)); invisible(gc())
h2o::h2o.shutdown(prompt = FALSE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.