R/model_functions.R

####################################################################
#' Automated H2O's AutoML
#'
#' This function lets the user create a robust and fast model, using
#' H2O's AutoML function. The result is a list with the best model,
#' its parameters, datasets, performance metrics, variables
#' importance, and plots. Read more about the \code{h2o_automl()} pipeline
#' \href{https://laresbernardo.github.io/lares/articles/h2o_automl.html}{here}.
#'
#' @section List of algorithms:
#' \href{https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html}{-> Read more here}
#' \describe{
#'   \item{DRF}{Distributed Random Forest, including Random Forest (RF)
#'   and Extremely-Randomized Trees (XRT)}
#'   \item{GLM}{Generalized Linear Model}
#'   \item{XGBoost}{eXtreme Gradient Boosting}
#'   \item{GBM}{Gradient Boosting Machine}
#'   \item{DeepLearning}{Fully-connected multi-layer artificial neural network}
#'   \item{StackedEnsemble}{Stacked Ensemble}
#' }
#'
#' @section Methods:
#' \describe{
#'   \item{print}{Use \code{print} method to print models stats and summary}
#'   \item{plot}{Use \code{plot} method to plot results using \code{mplot_full()}}
#' }
#'
#' @family Machine Learning
#' @inheritParams h2o::h2o.automl
#' @param df Dataframe. Dataframe containing all your data, including
#' the dependent variable labeled as \code{'tag'}. If you want to define
#' which variable should be used instead, use the \code{y} parameter.
#' @param y Variable or Character. Name of the dependent variable or response.
#' @param ignore Character vector. Columns to force the model to ignore.
#' @param train_test Character. If needed, \code{df}'s column name with 'test'
#' and 'train' values to split
#' @param split Numeric. Value between 0 and 1 to split as train/test
#' datasets. Value is for training set. Set value to 1 to train with all
#' available data and test with same data (cross-validation will still be
#' used when training). If \code{train_test} is set, value will be overwritten
#' with its real split rate.
#' @param weight Column with observation weights. Giving some observation a
#' weight of zero is equivalent to excluding it from the dataset; giving an
#' observation a relative weight of 2 is equivalent to repeating that
#' row twice. Negative weights are not allowed.
#' @param target Value. Which is your target positive value? If
#' set to \code{'auto'}, the target with largest \code{mean(score)} will be
#' selected. Change the value to overwrite. Only used in binary
#' categorical models.
#' @param balance Boolean. Auto-balance train dataset with under-sampling?
#' @param impute Boolean. Fill \code{NA} values with MICE?
#' @param no_outliers Boolean/Numeric. Remove \code{y}'s outliers from the dataset?
#' Will remove those values that are farther than n standard deviations from
#' the dependent variable's mean (Z-score). Set to \code{TRUE} for default (3)
#' or numeric to set a different multiplier.
#' @param unique_train Boolean. Keep only unique row observations for training data?
#' @param center,scale Boolean. Using the base function scale, do you wish
#' to center and/or scale all numerical values?
#' @param thresh Integer. Threshold for selecting binary or regression
#' models: this number is the threshold of unique values we should
#' have in \code{'tag'} (more than: regression; less than: classification)
#' @param seed Integer. Set a seed for reproducibility. AutoML can only
#' guarantee reproducibility if max_models is used because max_time is
#' resource limited.
#' @param max_models,max_time Numeric. Max number of models and seconds
#' you wish for the function to iterate. Note that \code{max_models}
#' guarantees reproducibility while \code{max_time} does not (it depends
#' entirely on your machine's computational power).
#' @param start_clean Boolean. Erase everything in the current h2o
#' instance before we start to train models? You may want to keep other models
#' or not. To group results into a custom common AutoML project, you may
#' use \code{project_name} argument.
#' @param exclude_algos,include_algos Vector of character strings. Algorithms
#' to skip or include during the model-building phase. Set NULL to ignore.
#' When both are defined, only \code{include_algos} will be valid.
#' @param plots Boolean. Create plots objects?
#' @param alarm Boolean. Ping (sound) when done. Requires \code{beepr}.
#' @param quiet Boolean. Quiet all messages, warnings, recommendations?
#' @param print Boolean. Print summary when process ends?
#' @param save Boolean. Do you wish to save/export results into your
#' working directory?
#' @param subdir Character. In which directory do you wish to save
#' the results? Working directory as default.
#' @param project Character. Your project's name
#' @param ... Additional parameters on \code{h2o::h2o.automl}
#' @return List. Trained model, predicted scores and datasets used, performance
#' metrics, parameters, importance data.frame, seed, and plots when \code{plots=TRUE}.
#' @examples
#' \dontrun{
#' # CRAN
#' data(dft) # Titanic dataset
#' dft <- subset(dft, select = -c(Ticket, PassengerId, Cabin))
#'
#' # Classification: Binomial - 2 Classes
#' r <- h2o_automl(dft, y = Survived, max_models = 1, impute = FALSE, target = "TRUE", alarm = FALSE)
#'
#' # Let's see all the stuff we have inside:
#' lapply(r, names)
#'
#' # Classification: Multi-Categorical - 3 Classes
#' r <- h2o_automl(dft, Pclass, ignore = c("Fare", "Cabin"), max_time = 30, plots = FALSE)
#'
#' # Regression: Continuous Values
#' r <- h2o_automl(dft, y = "Fare", ignore = c("Pclass"), exclude_algos = NULL, quiet = TRUE)
#' print(r)
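#'
#' # Plot the full results dashboard (see "Methods" section above)
#' plot(r)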
#'
#' # WITH PRE-DEFINED TRAIN/TEST DATAFRAMES
#' splits <- msplit(dft, size = 0.8)
#' splits$train$split <- "train"
#' splits$test$split <- "test"
#' df <- rbind(splits$train, splits$test)
#' r <- h2o_automl(df, "Survived", max_models = 1, train_test = "split")
#' }
#' @export
h2o_automl <- function(df, y = "tag",
                       ignore = NULL,
                       train_test = NA,
                       split = 0.7,
                       weight = NULL,
                       target = "auto",
                       balance = FALSE,
                       impute = FALSE,
                       no_outliers = TRUE,
                       unique_train = TRUE,
                       center = FALSE,
                       scale = FALSE,
                       thresh = 10,
                       seed = 0,
                       nfolds = 5,
                       max_models = 3,
                       max_time = 10 * 60,
                       start_clean = FALSE,
                       exclude_algos = c("StackedEnsemble", "DeepLearning"),
                       include_algos = NULL,
                       plots = TRUE,
                       alarm = TRUE,
                       quiet = FALSE,
                       print = TRUE,
                       save = FALSE,
                       subdir = NA,
                       project = "AutoML Results",
                       verbosity = NULL,
                       ...) {
  try_require("h2o")
  tic(id = "h2o_automl")
  on.exit(toc(id = "h2o_automl", msg = "Process duration:", quiet = quiet))

  if (!quiet) message(paste(Sys.time(), "| Started process..."))

  quiet(h2o.init(nthreads = -1, port = 54321))

  df <- as.data.frame(df)
  y <- gsub('"', "", as_label(enquo(y)))

  # PROCESS THE DATA
  processed <- model_preprocess(
    df,
    y = y,
    train_test = train_test,
    split = split,
    weight = weight,
    target = target,
    balance = balance,
    impute = impute,
    no_outliers = no_outliers,
    unique_train = unique_train,
    center = center,
    scale = scale,
    thresh = thresh,
    seed = seed,
    quiet = quiet
  )

  # PROCESSED DATA: TRAIN AND TEST
  df <- processed$data
  train <- df[processed$train_index, ]
  test <- df[-processed$train_index, ]
  if (nrow(test) == 0) test <- train

  # MODEL TYPE (based on inputs + thresh value)
  model_type <- processed$model_type

  # ALGORITHMS
  if (length(exclude_algos) > 0 && length(include_algos) == 0 && !quiet) {
    message(paste("- ALGORITHMS: excluded", vector2text(exclude_algos)))
  }
  if (length(include_algos) > 0) {
    if (!quiet) message(paste("- ALGORITHMS: included", vector2text(include_algos)))
    # When both are defined, only include_algos is used
    exclude_algos <- NULL
  }

  # START FRESH?
  if (!quiet && !isTRUE(start_clean)) {
    message(paste(
      "- CACHE: Previous models are not being erased.",
      "You may use 'start_clean' [clear] or 'project_name' [join]"
    ))
  }
  if (start_clean) quiet(h2o.removeAll())

  # INFORMATIVE MSG ON FLOW's UI
  flow <- "http://localhost:54321/flow/index.html"
  if (!quiet && Sys.getenv("HOSTNAME") == "") {
    message("- UI: You may check results using H2O Flow's interactive platform: ", flow)
  }

  # RUN AUTOML
  if (!quiet) {
    message(sprintf(">>> Iterating until %s models or %s seconds...", max_models, max_time))
  }
  training <- .quiet_h2o(as.h2o(train), quiet = TRUE)
  aml <- .quiet_h2o(h2o.automl(
    x = colnames(df)[!colnames(df) %in% c("tag", ignore)],
    y = "tag",
    training_frame = training,
    weights_column = weight,
    max_runtime_secs = max_time,
    max_models = max_models,
    exclude_algos = exclude_algos,
    include_algos = include_algos,
    nfolds = nfolds,
    # project_name = project,
    seed = seed,
    verbosity = verbosity,
    ...
  ), quiet = quiet)

  if (nrow(aml@leaderboard) == 0) {
    warning("NO MODELS TRAINED. Please set max_models to at least 1 and increase max_time")
  } else {
    if (!is.nan(aml@leaderboard[1, 2])) {
      if (!quiet) {
        message(paste("- EUREKA: Succesfully generated", nrow(aml@leaderboard), "models"))
        if (print) print(head(aml@leaderboard, 3))
      }
    }
  }

  # GET RESULTS AND PERFORMANCE
  results <- h2o_results(
    aml, test, train, y,
    which = 1,
    model_type = model_type,
    target = target,
    split = split,
    plots = plots,
    project = project,
    ignore = ignore,
    seed = seed,
    quiet = quiet
  )

  if (save) {
    export_results(results, subdir = subdir, thresh = thresh)
    if (!quiet) message("- EXPORT: Results and model files exported succesfully!")
  }

  if (!quiet && print) print(results)

  if (alarm && !quiet) {
    try_require("beepr", stop = FALSE)
    try(beep())
  }

  attr(results, "type") <- "h2o_automl"
  return(results)
}

#' @rdname h2o_automl
#' @aliases h2o_automl
#' @param x h2o_automl object
#' @export
plot.h2o_automl <- function(x, ...) {
  if (!inherits(x, "h2o_automl")) {
    stop("Object must be class h2o_automl")
  }
  if ("plots" %in% names(x)) {
    x$plots$dashboard
  } else {
    invisible(mplot_full(
      tag = x$scores_test$tag,
      score = x$scores_test$score,
      multis = select(x$scores_test, -.data$tag, -.data$score)
    ))
  }
}

#' @rdname h2o_automl
#' @aliases h2o_automl
#' @param importance Boolean. Print important variables?
#' @export
print.h2o_automl <- function(x, importance = TRUE, ...) {
  if (!inherits(x, "h2o_automl")) {
    stop("Object must be class h2o_automl")
  }
  aux <- list()
  selected <- which(as.vector(x$leaderboard$model_id) == x$model_name)
  n_models <- nrow(x$leaderboard)
  data_points <- nrow(x$datasets$global)
  split <- round(100 * x$split)

  if (x$type == "Classification") {
    cats <- filter(x$datasets$global, grepl("train", .data$train_test)) %>%
      .[, x$y] %>%
      unique() %>%
      nrow()
    x$type <- sprintf("%s (%s classes)", x$type, cats)
  }

  aux[["met"]] <- glued(
    "Test metrics:
{v2t({met}, sep = '\n', quotes = FALSE)}",
    met = paste(
      "  ",
      names(x$metrics$metrics), "=",
      signif(x$metrics$metrics, 5)
    )
  )

  if ("importance" %in% names(x) && importance == TRUE) {
    if (nrow(x$importance) > 0) {
      aux[["imp"]] <- glued(
        "Most important variables:
{v2t({imp}, sep = '\n', quotes = FALSE)}",
        imp = paste(
          "  ",
          x$importance %>% head(5) %>%
            mutate(label = sprintf(
              "%s (%s)",
              .data$variable,
              formatNum(100 * .data$importance, 1, pos = "%")
            )) %>%
            pull(.data$label)
        )
      )
    }
  }

  print(glued("
    Model ({selected}/{n_models}): {x$model_name}
    Dependent Variable: {x$y}
    Type: {x$type}
    Algorithm: {toupper(x$algorithm)}
    Split: {split}% training data (of {data_points} observations)
    Seed: {x$seed}

    {aux$met}

    {aux$imp}"))
}


####################################################################
#' Automated H2O's AutoML Results
#'
#' This is an auxiliary function to calculate predictions and results
#' when using the \code{h2o_automl()} function.
#'
#' @inheritParams h2o_automl
#' @param h2o_object H2O Leaderboard (H2OFrame/H2OAutoML) or Model (h2o)
#' @param test,train Dataframe. Must have the same columns
#' @param which Integer. Which model to select from leaderboard
#' @param model_type Character. Select "Classification" or "Regression"
#' @param ignore Character vector. Columns to ignore
#' @param leaderboard H2O's Leaderboard. Passed when using
#' \code{h2o_selectmodel()}, as it contains a plain model and no leaderboard.
#' @return List. Trained model, predicted scores and datasets used, performance
#' metrics, parameters, importance data.frame, seed, and plots when \code{plots=TRUE}.
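#' @examples
#' \dontrun{
#' # A hedged sketch, assuming `r` is an existing h2o_automl() output:
#' # recompute results for its leader model, reusing the stored datasets
#' res <- h2o_results(
#'   r$model,
#'   test = filter(r$datasets$global, grepl("test", train_test)),
#'   train = filter(r$datasets$global, grepl("train", train_test)),
#'   y = r$y, model_type = r$type, seed = r$seed
#' )
#' }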
#' @export
h2o_results <- function(h2o_object, test, train, y = "tag", which = 1,
                        model_type, target = "auto", split = 0.7,
                        ignore = NULL, quiet = FALSE,
                        project = "ML Project", seed = 0,
                        leaderboard = list(),
                        plots = TRUE,
                        ...) {
  # MODEL TYPE
  types <- c("Classification", "Regression")
  check_opts(model_type, types)
  thresh <- ifelse(model_type == types[1], 10000, 0)

  # When using h2o_selectmodel()
  if ("train_test" %in% colnames(test)) {
    colnames(test)[colnames(test) == y] <- "tag"
    colnames(train)[colnames(train) == y] <- "tag"
    test <- test[, 1:(which(colnames(test) == "train_test") - 1)]
    train <- train[, 1:(which(colnames(train) == "train_test") - 1)]
    split <- round(nrow(train) / (nrow(train) + nrow(test)), 2)
  }

  # GLOBAL DATAFRAME FROM TEST AND TRAIN
  if (!all(colnames(train) %in% colnames(test))) {
    stop("All columns from train data must be present on test data as well!")
  }
  if (split == 1) {
    global <- train %>% mutate(train_test = "train_test")
  } else {
    global <- data.frame(test) %>%
      bind_rows(train) %>%
      mutate(train_test = c(rep("test", nrow(test)), rep("train", nrow(train))))
  }
  colnames(global)[colnames(global) == "tag"] <- y
  if (model_type == "Classification") {
    cats <- unique(global[, colnames(global) == y])
  } else {
    cats <- "None"
  }

  # SELECT MODEL FROM h2o_automl()
  if (any(c("H2OFrame", "H2OAutoML") %in% class(h2o_object))) {
    # Note: Best model from leaderboard is which = 1
    m <- h2o.getModel(as.vector(h2o_object@leaderboard$model_id[which]))
    if (!quiet) message(paste("SELECTED MODEL:", as.vector(m@model_id)))
  } else {
    m <- h2o_object
  }

  # VARIABLES IMPORTANCES
  # https://docs.h2o.ai/h2o/latest-stable/h2o-docs/variable-importance.html
  if (sum(grepl("Stacked", as.vector(m@model_id))) > 0) {
    stacked <- TRUE
    if (!quiet) message("- NOTE: No importance features for Stacked Ensemble Models")
  } else {
    stacked <- FALSE
  }
  if (!stacked) {
    imp <- data.frame(h2o.varimp(m)) %>%
      {
        if ("names" %in% colnames(.)) {
          rename(., "variable" = "names", "importance" = "coefficients")
        } else {
          .
        }
      } %>%
      {
        if ("percentage" %in% colnames(.)) {
          rename(., "importance" = "percentage")
        } else {
          .
        }
      }
    noimp <- if (nrow(imp) > 0) {
      dplyr::filter(imp, .data$importance < 1 / (nrow(imp) * 4)) %>%
        arrange(desc(.data$importance))
    } else {
      imp
    }
    if (nrow(noimp) > 0) {
      topn <- noimp %>%
        ungroup() %>%
        slice(1:8)
      # Avoid shadowing base::which and the 'which' argument
      noimp_vars <- vector2text(topn$variable, quotes = FALSE)
      if (nrow(noimp) > 8) {
        noimp_vars <- paste(noimp_vars, "and", nrow(noimp) - 8, "other...")
      }
      if (!quiet) {
        message(paste("- NOTE: The following variables were the least important:", noimp_vars))
      }
    }
  }

  # GET PREDICTIONS
  if (!quiet) message(paste0(">>> Running predictions for ", y, "..."))
  predictions <- .quiet_h2o(h2o_predict_model(global, m), quiet = TRUE)
  global <- cbind(global, predictions)
  # h2o replaces spaces with dots in column names; restore the spaces
  if (sum(grepl(" ", cats)) > 0) {
    colnames(global) <- str_replace_all(colnames(global), "\\.", " ")
  }

  # For performance metrics
  scores_test <- .get_scores(
    predictions, test,
    model_type = model_type,
    target = target,
    cats = cats
  )
  multis <- scores_test$multis
  scores <- scores_test$scores

  # # Used for train metrics
  # scores_train <- .get_scores(
  #   predictions, train,
  #   model_type = model_type,
  #   target = target, cats = cats)
  # scores_tr <- scores_train$scores
  # multis_tr <- scores_train$multis
  # scores_train <- data.frame(tag = as.vector(train$tag), scores_tr)

  # GET ALL RESULTS INTO A LIST
  results <- list()
  results[["model"]] <- m
  results[["y"]] <- y
  results[["scores_test"]] <- data.frame(tag = as.vector(test$tag), scores)
  results[["metrics"]] <- model_metrics(
    tag = results$scores_test$tag,
    score = results$scores_test$score,
    multis = multis,
    thresh = thresh,
    target = target,
    model_name = as.vector(m@model_id),
    plots = plots
  )
  cvresults <- m@model$cross_validation_metrics_summary
  if (!is.null(cvresults)) {
    results$metrics[["cv_metrics"]] <- as_tibble(
      data.frame(
        metric = rownames(cvresults),
        mutate_all(cvresults, list(~ as.numeric(as.character(.))))
      )
    )
  }
  if (model_type == "Classification") {
    if (length(cats) == 2) {
      results$metrics[["max_metrics"]] <- data.frame(
        m@model$cross_validation_metrics@metrics$max_criteria_and_metric_scores
      )
    }
    if (length(cats) > 2) {
      results$metrics[["hit_ratio"]] <- data.frame(
        m@model$cross_validation_metrics@metrics$hit_ratio_table
      )
    }
  }
  # results[["parameters"]] <- m@parameters[
  #   sapply(m@parameters, function(x) length(x) == 1)] %>%
  #   bind_rows() %>% tidyr::gather(key = "parameter")
  results[["parameters"]] <- m@parameters
  if (!stacked) results[["importance"]] <- imp
  results[["datasets"]] <- list(
    global = as_tibble(global),
    test = filter(global, grepl("test", .data$train_test))
  )
  # results[["metrics_train"]] <- model_metrics(
  #   tag = scores_train$tag,
  #   score = scores_train$score,
  #   multis = multis_tr,
  #   thresh = thresh,
  #   target = target,
  #   model_name = as.vector(m@model_id),
  #   type = "train")
  results[["scoring_history"]] <- as_tibble(m@model$scoring_history)
  results[["categoricals"]] <- list_cats(filter(global, grepl("train", .data$train_test)))
  results[["type"]] <- model_type
  results[["split"]] <- split
  if (model_type == "Classification") {
    results[["threshold"]] <- thresh
  }
  results[["model_name"]] <- as.vector(m@model_id)
  results[["algorithm"]] <- m@algorithm
  if (any(c("H2OFrame", "H2OAutoML") %in% class(h2o_object))) {
    results[["leaderboard"]] <- h2o_object@leaderboard
  }
  if (length(leaderboard) > 0) {
    results[["leaderboard"]] <- leaderboard
  }
  results[["project"]] <- project
  results[["y"]] <- y
  results[["ignored"]] <- ignore
  results[["seed"]] <- seed
  results[["h2o"]] <- h2o.getVersion()

  if (plots) {
    if (!quiet) message(">>> Generating plots...")
    plots <- list()
    plots[["dashboard"]] <- mplot_full(
      tag = results$scores_test$tag,
      score = results$scores_test$score,
      multis = multis,
      thresh = thresh,
      subtitle = results$project,
      model_name = results$model_name,
      plot = FALSE
    )
    plots[["metrics"]] <- results$metrics$plots
    results$metrics$plots <- NULL
    if (length(multis) > 1) {
      plots[["top_cats"]] <- mplot_topcats(
        tag = results$scores_test$tag,
        score = results$scores_test$score,
        multis = multis,
        model_name = results$model_name
      )
    }
    if (!stacked) {
      plots[["importance"]] <- mplot_importance(
        var = results$importance$variable,
        imp = results$importance$importance,
        model_name = results$model_name,
        subtitle = results$project
      )
    }
    results$plots <- plots
  }

  attr(results, "type") <- "h2o_automl"
  class(results) <- c("h2o_automl", class(results))

  return(results)
}

.get_scores <- function(predictions,
                        traintest,
                        model_type,
                        target = "auto",
                        cats) {
  type <- deparse(substitute(traintest))

  # Train or Test data (predictions stack test rows first, then train rows)
  nrows <- nrow(traintest)
  if (type == "test") {
    scores <- predictions[1:nrows, ]
  }
  if (type == "train") {
    scores <- predictions[(nrow(predictions) - nrows + 1):nrow(predictions), ]
  }

  # Selected target value
  if (target != "auto" && length(cats) == 2) {
    scores <- target_set(
      tag = as.vector(traintest$tag),
      score = scores[, 2],
      target = target,
      quiet = TRUE
    )$df
  }

  # Multis object and standard predictions output
  multis <- NA
  if (model_type == "Classification") {
    if (length(cats) == 2) {
      scores <- select_if(scores, is.numeric) %>% .[, 1]
    } else {
      colnames(scores)[1] <- "score"
      multis <- select(scores, -.data$score)
    }
  } else {
    scores <- data.frame(score = as.vector(scores))
  }
  ret <- list(scores = scores, multis = multis)
  return(ret)
}


####################################################################
#' Select Model from h2o_automl's Leaderboard
#'
#' Select which model from the \code{h2o_automl()} leaderboard to use.
#'
#' @family Machine Learning
#' @family Tools
#' @inheritParams h2o_automl
#' @param results \code{h2o_automl()} object.
#' @param which_model Integer. Which model from the leaderboard you wish to use?
#' @return H2O processed model
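#' @examples
#' \dontrun{
#' # A hedged sketch: train a few models, then pick the 2nd best one
#' r <- h2o_automl(dft, y = "Survived", max_models = 3)
#' r2 <- h2o_selectmodel(r, which_model = 2)
#' }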
#' @export
h2o_selectmodel <- function(results, which_model = 1, quiet = FALSE, ...) {
  check_attr(results, attr = "type", check = "h2o_automl")

  # Select model (Best one by default)
  ntop <- nrow(results$leaderboard)
  if (which_model > ntop) {
    stop("Select a valid model ID. Range: 1 to ", ntop)
  }
  model_id <- as.vector(results$leaderboard$model_id[which_model])
  if (!quiet) message("Model selected: ", model_id)
  m <- h2o.getModel(model_id)
  d <- results$datasets

  # Calculate everything
  output <- h2o_results(
    m,
    test = d$test,
    train = filter(d$global, grepl("test", .data$train_test)),
    y = results$y,
    which = which_model,
    model_type = results$type,
    project = results$project,
    leaderboard = results$leaderboard,
    seed = results$seed,
    quiet = TRUE,
    ...
  )

  if (!quiet) print(output)
  return(output)
}


####################################################################
#' Export h2o_automl's Results
#'
#' Export RDS, TXT, POJO, MOJO and all results from \code{h2o_automl()}.
#'
#' @family Machine Learning
#' @family Tools
#' @param results \code{h2o_automl} or \code{h2o} model
#' @param thresh Integer. Threshold for selecting binary or regression
#' models: this number is the threshold of unique values we should
#' have in 'tag' (more than: regression; less than: classification)
#' @param which Character vector. Select which file formats to export.
#' Possible values: txt, csv, rds, binary, mojo, plots. You may also
#' use dev (txt, csv, rds) or production (binary, mojo), or simply leave
#' the default to export everything.
#' @param note Character. Add a note to the txt file. Useful when lots of
#' models are trained and saved, to remember which one is which.
#' @param subdir Character. In which directory do you wish to save
#' the results?
#' @param save Boolean. Do you wish to save/export results?
#' @param seed Numeric. For reproducible results and random splits.
#' @return No return value, called for side effects.
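#' @examples
#' \dontrun{
#' # A hedged sketch: export only the RDS file into a hypothetical "h2o_exports"
#' r <- h2o_automl(dft, y = "Survived", max_models = 1)
#' export_results(r, which = "rds", subdir = "h2o_exports")
#' }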
#' @export
export_results <- function(results,
                           thresh = 10,
                           which = c(
                             "txt", "csv", "rds",
                             "binary", "mojo", "plots",
                             "dev", "production"
                           ),
                           note = NA,
                           subdir = NA,
                           save = TRUE,
                           seed = 0) {
  if (save) {
    quiet(h2o.init(nthreads = -1, port = 54321))

    pass <- !is.null(attr(results, "type"))
    if (!pass) results <- list(model = results)
    stopifnot(grepl("H2O", class(results$model)))
    name <- ifelse(pass, results$model_name, results$model@model_id)
    subdir <- paste0(ifelse(is.na(subdir), "", subdir), "/", name)
    if (substr(subdir, 1, 1) == "/") subdir <- substr(subdir, 2, nchar(subdir))

    # Directory to save all our results
    dir <- file.path(subdir)
    message(paste("Export directory:", dir))
    if (!dir.exists(dir)) {
      message("Creating directory: ", subdir)
      dir.create(dir, recursive = TRUE)
    }

    if ("dev" %in% which) which <- unique(c(which, "txt", "csv", "rds"))
    if ("production" %in% which) which <- unique(c(which, "binary", "mojo"))

    if ("txt" %in% which || !is.na(note)[1] && pass) {
      on.exit(set.seed(seed))
      results_txt <- list(
        "Project" = results$project,
        "Note" = note,
        "Model Type" = results$type,
        "Algorithm" = results$algorithm,
        "Model name" = name,
        "Train/Test" = table(results$datasets$global$train_test),
        "Metrics Glossary" = results$metrics$dictionary,
        "Test Metrics" = results$metrics$metrics,
        "Test Metrics by labels" = if (length(results$metrics$metrics_tags) > 1) {
          results$metrics$metrics_tags
        } else {
          "NA"
        },
        "Test's Confusion Matrix" = if (length(results$metrics$confusion_matrix) > 1) {
          results$metrics$confusion_matrix
        } else {
          NULL
        },
        "Predicted Variable" = results$y,
        "Ignored Variables" = results$ignored,
        "Variables Importance" = results$importance,
        "H2O Global Results" = results$model,
        "Leaderboard" = results$leaderboard,
        "Data examples" = data.frame(sample_n(results$datasets$global, 10)),
        "Seed" = results$seed,
        "H20 Version" = results$h2o
      )
      if (is.na(note)[1]) results_txt$Note <- NULL
      capture.output(results_txt, file = paste0(dir, "/", name, ".txt"))
      cats <- lapply(results$categoricals, data.frame)
      aux <- cats[names(cats)[!names(cats) %in% results$ignored]]
      capture.output(aux, file = paste0(dir, "/", name, "_cats.txt"))
      message(">>> Summary text files saved...")
    }

    # Export CSV with predictions and datasets
    if ("csv" %in% which && pass) {
      write.csv(results$datasets$global,
        paste0(dir, "/", name, ".csv"),
        row.names = FALSE
      )
      message(">>> CSV file exported...")
    }

    # Export Results List
    if ("rds" %in% which) {
      saveRDS(results, file = paste0(dir, "/", name, ".rds"))
      message(">>> RDS file exported...")
    }

    # Export Model as POJO && MOJO for Production
    if ("mojo" %in% which) {
      h2o.download_mojo(results$model, path = dir, get_genmodel_jar = TRUE)
      message(">>> MOJO (zip + jar files) exported...")
    }

    # if (pojo) h2o.download_pojo(results$model, path = dir)

    # Export Binary
    if ("binary" %in% which) {
      h2o.saveModel(results$model, path = dir, force = TRUE)
      message(">>> Binary file saved...")
    }

    if ("plots" %in% which && "plots" %in% names(results) && pass) {
      message(">>> Saving plots...")
      # Metrics plots
      aux <- results$plots$metrics
      for (i in seq_along(aux)) {
        export_plot(aux[[i]],
          name = names(aux)[i],
          width = 8, height = 6, res = 300,
          subdir = paste0(subdir, "/Plots"),
          quiet = TRUE
        )
      }
      # Other plots
      aux <- results$plots
      aux$metrics <- NULL
      for (i in seq_along(aux)) {
        export_plot(aux[[i]],
          name = names(aux)[i],
          width = 8, height = 6, res = 300,
          subdir = paste0(subdir, "/Plots"),
          quiet = TRUE
        )
      }
      message(">>> Plots saved...")
    }
    message(paste("Succesfully exported files:", vector2text(which)))
  }
}


####################################################################
#' Split a dataframe for training and testing sets
#'
#' This function automatically splits a dataframe into train and
#' test datasets. You can define a seed to get the same results
#' every time (a default value is provided). You can also prevent it
#' from printing the split counter result.
#'
#' @family Machine Learning
#' @family Tools
#' @param df Dataframe
#' @param size Numeric. Split rate value, between 0 and 1. If set to
#' 1, the train and test set will be the same.
#' @param seed Integer. Seed for random split
#' @param print Boolean. Print summary results?
#' @return List with both datasets, summary, and split rate.
#' @examples
#' data(dft) # Titanic dataset
#' splits <- msplit(dft, size = 0.7, seed = 123)
#' names(splits)
#' @export
msplit <- function(df, size = 0.7, seed = 0, print = TRUE) {
  if (size <= 0 || size > 1) stop("Set size parameter to a value >0 and <=1")

  on.exit(set.seed(seed))
  df <- data.frame(df)
  if (size == 1) train <- test <- df

  if (size < 1 && size > 0) {
    ind <- sample(seq_len(nrow(df)), size = floor(size * nrow(df)))
    train <- df[ind, ]
    test <- df[-ind, ]
  } else {
    ind <- seq_len(nrow(df))
  }

  train_size <- dim(train)
  test_size <- dim(test)
  summary <- rbind(train_size, test_size)[, 1]

  if (print == TRUE) print(summary)

  sets <- list(
    train = train,
    test = test,
    summary = summary,
    split_size = size,
    train_index = ind
  )

  return(sets)
}


####################################################################
#' Set Target Value in Target Variable
#'
#' This function detects or forces the target value when predicting
#' a categorical binary model. This is an auxiliary function.
#'
#' @param tag Vector. Real known label
#' @param score Vector. Predicted value or model's result
#' @param target Value. Which is your target positive value? If
#' set to 'auto', the target with largest mean(score) will be
#' selected. Change the value to overwrite. Only used in binary
#' categorical models.
#' @param quiet Boolean. Do not show message for auto target?
#' @return List. Contains original data.frame \code{df} and
#' \code{which} with the target variable.
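#' @examples
#' # Minimal sketch with toy labels and scores
#' tags <- c("yes", "no", "yes", "no")
#' scores <- c(0.8, 0.3, 0.7, 0.2)
#' target_set(tags, scores, target = "yes")$which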
#' @export
target_set <- function(tag, score, target = "auto", quiet = FALSE) {
  df <- data.frame(tag = tag, score = score)

  # Validate inputs
  if (!is.numeric(score) || is.numeric(tag)) {
    stop("Your tag must be categorical. Your score must be numerical.")
  }

  # Get mean scores for each tag
  means <- df %>%
    group_by(.data$tag) %>%
    summarise(mean = mean(.data$score))
  auto <- means$tag[means$mean == max(means$mean)]
  if (length(auto) > 1) {
    auto <- if (any(auto %in% target)) target else auto[1]
  }
  if (target == "auto") {
    target <- auto
  }
  if (!target %in% unique(df$tag)) {
    stop(paste(
      "Your target value", target, "is not valid.",
      "Possible other values:", vector2text(unique(df$tag))
    ))
  }
  if (!quiet) message(paste("Target value:", target))
  # If the forced target value is the "lower scores" value, invert scores
  if (auto != target) df$score <- df$score * (-1) + 1
  ret <- list(df = df, which = target)
  return(ret)
}


####################################################################
#' Iterate Seeds on AutoML
#'
#' This function lets the user iterate and search for the best seed. Note that
#' if the results change a lot, you have high variance in your data.
#'
#' @family Machine Learning
#' @inheritParams h2o_automl
#' @param tries Integer. Number of iterations
#' @param ... Additional arguments passed to \code{h2o_automl}
#' @return data.frame with performance results, one row per seed tried.
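#' @examples
#' \dontrun{
#' # A hedged sketch: compare 3 seeds using tiny, quick models
#' seeds <- iter_seeds(dft, y = "Survived", tries = 3, max_models = 1)
#' head(seeds)
#' }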
#' @export
iter_seeds <- function(df, y, tries = 10, ...) {
  seeds <- data.frame()
  for (i in 1:tries) {
    model <- h2o_automl(df, y, seed = i, quiet = TRUE, ...)
    seeds <- rbind(seeds, cbind(seed = i, model$metrics$metrics))
    # Sort by the first performance metric (2nd column), best first
    seeds <- seeds[order(seeds[[2]], decreasing = TRUE), ]
    statusbar(i, tries, seeds[1, 1])
  }
  return(seeds)
}

.quiet_h2o <- function(..., quiet = TRUE) {
  # Temporarily suppress H2O's progress bars while evaluating the expression
  if (quiet) {
    h2o.no_progress()
    on.exit(h2o.show_progress(), add = TRUE)
  }
  eval(...)
}