#' @title Create ICE
#' @description Creates partial dependency plots (PDPs) from h2o models stored i nano
#' objects.
#' @param model_no the positions of each model in the list of models in the nano object for which
#' the PDP should be calculated. If not entered, the last model is taken by default.
#' @param vars a character vector of variables to create PDPs off.
#' @param row_index a numeric vector of dataset rows numbers to be used to calculate PDPs. To
#' use entire dataset, set to -1.
#' @param plot a logical specifying whether the variable importance should be plotted.
#' @param subtitle subtitle for the plot.
#' @param save a logical specifying whether the plot should be saved into working directory.
#' @param subdir sub directory in which the plot should be saved.
#' @param file_name file name of the saved plot.
#' @return nano object with variable importance of specified models calculated. Also returns a
#' plot if \code{plot = TRUE}.
#' @details Function first checks if the variable importance of the specified model has already
#' been calculated (by checking in the list \code{nano$varimp}). If it has not been calculated,
#' then the variable importance will be calculated and the relevant slot in \code{nano$varimp}
#' will be filled out.
#'
#' If \code{plot = TRUE}, a plot of the variable importance will also be returned. The plot can
#' be saved in a subfolder of the working directory by using the `save` and `subdir` arguments.
#' @examples
#' \dontrun{
#' if(interactive()){
#' library(h2o)
#' library(nano)
#'
#' h2o.init()
#'
#' # import dataset
#' data(property_prices)
#' train <- as.h2o(property_prices)
#'
#' # set the response and predictors
#' response <- "sale_price"
#' var <- setdiff(colnames(property_prices), response)
#'
#' # build grids
#' grid_1 <- h2o.grid(x = var,
#' y = response,
#' training_frame = train,
#' algorithm = "randomForest",
#' hyper_params = list(ntrees = 1:2),
#' nfolds = 3,
#' seed = 628)
#'
#' grid_2 <- h2o.grid(x = var,
#' y = response,
#' training_frame = train,
#' algorithm = "randomForest",
#' hyper_params = list(ntrees = 3:4),
#' nfolds = 3,
#' seed = 628)
#'
#'
#' obj <- create_nano(grid = list(grid_1, grid_2),
#' data = list(property_prices), # since underlying dataset is the same
#' ) # since model is not entered, will take best model from grids
#'
#' # calculate ICE
#' obj <- nano_ice(nano = obj, model_no = 1:2, vars <- c("lot_size", "income"))
#'
#' }
#' }
#' @rdname nano_ice
#' @export
nano_ice <- function (nano, model_no = NA, vars, quantiles = seq(0, 1, 0.1),
max_levels = 30, target = NULL, plot = TRUE, subtitle = NA,
save = FALSE, subdir = NA, file_name) {
if (class(nano) != "nano") {
stop("`nano` must be a nano object.",
call. = FALSE)
}
if (!all(is.na(model_no))) {
if (!is.integer(as.integer(model_no))) {
stop("`model_no` must be numeric.",
call. = FALSE)
}
if (min(model_no) <= 0) {
stop("`model_no` must be greater than 0",
call. = FALSE)
}
if (max(model_no) > nano$n_model) {
stop("`model_no` cannot be greater than number of models in `nano`.",
call. = FALSE)
}
}
if (missing(vars)) {
stop("`vars` must be entered, there are no defaults.",
call.= FALSE)
}
if (!is.character(vars)) {
stop("`vars` must be a vector of character.",
call. = FALSE)
}
# if model_no not entered, then use last model as default
if (all(is.na(model_no))) model_no = nano$n_model
for (i in model_no) {
if (!all(vars %in% nano$model[[i]]@parameters$x)) {
stop("`vars` must be predictors in each of the specified models.",
call. = FALSE)
}
}
if (plot & length(model_no) > 1) {
for (i in 1:(length(model_no) - 1)) {
if (nano$model[[model_no[i]]]@parameters$y != nano$model[[model_no[i + 1]]]@parameters$y) {
warning("the response variable of each of the specified models are not the same hence plots will not be produced.",
call. = FALSE)
}
}
}
# check which models do not already have PDP calculated
models <- list()
vars_inc <- list()
data <- list()
model_inc <- c()
j <- 1
for (i in model_no) {
if (all(is.na(nano$ice[[i]]))) {
new_vars <- vars
} else {
if (grepl("Regression", as.vector(class(nano$model[[model_no[1]]])))) {
label <- paste0(" - ", quantiles * 100, "th Percentile")
} else {
label <- paste0(" - Level: ",
levels(nano$data[[model_no[1]]][[nano$model[[model_no[1]]]@parameters$y]]))
}
new_vars <- c()
for (var in vars) {
key <- paste0(var, label)
if (!all(key %in% nano$ice[[i]]$key)) new_vars <- c(new_vars, var)
}
}
if (length(new_vars) > 0) {
models[[j]] <- nano$model[[i]]
vars_inc[[j]] <- new_vars
data[[j]] <- nano$data[[i]]
model_inc <- c(model_inc, i)
j <- j + 1
}
# write message for pdps which have already been created
vars_prev_inc <- setdiff(vars, new_vars)
if (length(vars_prev_inc) > 0) {
message("For model_", model_no[i], ", ices have already been created for the following variables:")
for (var in vars_prev_inc) {
message(" ", var)
}
}
}
if (length(models) > 0) {
## need to calculate ICEs for required models and variables
# variables which need to be calculated for all models
vars_multi <- Reduce(intersect, vars_inc)
# remaining variables for each model
vars_single <- lapply(vars_inc, function(x) setdiff(x, vars_multi))
# calculate ICEs for variables common across all models
if (length(vars_multi) > 0) {
ice_multi <- nano::nano_multi_ice(models = models,
data = data,
vars = vars_multi,
quantiles = quantiles)
# add calculated ICEs to nano object
for (i in 1:length(models)) {
model_no <- model_inc[i]
if (all(is.na(nano$ice[[model_no]]))) {
nano$ice[[model_no]] <- ice_multi[[i]]
names(nano$ice)[model_no] <- paste0("ice_", model_no)
} else {
nano$ice[[model_no]] <- rbind(nano$ice[[model_no]], ice_multi[[i]])
}
}
}
# for each model, calculate pdps for remaining variables
if (sum(sapply(vars_single, length)) > 0) {
index <- lapply(vars_single, length) > 0
models_single <- models[index]
vars_single <- vars_single[index]
data_single <- data[index]
model_inc_single <- model_inc[index]
for (i in 1:length(models_single)) {
ice_single <- nano::nano_single_ice(model = models_single[[i]],
data = data_single[[i]],
vars = vars_single[[i]],
quantiles = quantiles)
model_no <- model_inc_single[i]
# add newly calculated ICEs to nano object
if (all(is.na(nano$ice[[model_no]]))) {
nano$ice[[model_no]] <- ice_single
names(nano$ice)[model_no] <- paste0("ice_", model_no)
} else {
nano$ice[[model_no]] <- rbind(nano$ice[[model_no]], ice_single)
}
}
}
}
# calculate PDPs which will be used in plots
if (plot == TRUE) {
nano <- nano::nano_pdp(nano = nano,
model_no = model_no,
vars = vars,
plot = FALSE)
}
return(nano)
# # create plots
# p <- ggplot2::ggplot(ggplot2::aes(x = .data[[col_name]],
# y = .data$mean_response, color = .data$name, text = .data$text),
# data = results) + stat_count_or_bin(!is.numeric(newdata[[column]]),
# ggplot2::aes(x = .data[[col_name]], y = (.data$..count../max(.data$..count..)) *
# diff(y_range)/1.61), position = ggplot2::position_nudge(y = y_range[[1]] -
# 0.05 * diff(y_range)), alpha = 0.2, inherit.aes = FALSE,
# data = as.data.frame(newdata[[column]])) + geom_point_or_line(!is.numeric(newdata[[column]]),
# ggplot2::aes(group = .data$name)) + geom_point_or_line(!is.numeric(newdata[[column]]),
# if (is.factor(pdp[[col_name]])) {
# ggplot2::aes(shape = "Partial Dependence",
# group = "Partial Dependence")
# }
# else {
# ggplot2::aes(linetype = "Partial Dependence",
# group = "Partial Dependence")
# }, data = pdp, color = "black") + ggplot2::geom_rug(ggplot2::aes(x = .data[[col_name]],
# y = NULL, text = NULL), sides = "b", alpha = 0.1,
# color = "black", data = stats::setNames(as.data.frame(newdata[[column]]),
# col_name)) + ggplot2::labs(y = "Response",
# title = sprintf("Individual Conditional Expectations on \"%s\"%s\nfor Model: \"%s\"",
# col_name, if (is.null(target)) {
# ""
# }
# else {
# sprintf(" with Target = \"%s\"", target)
# }, model@model_id)) + ggplot2::scale_color_viridis_d(option = "plasma") +
# ggplot2::scale_linetype_manual(values = c(`Partial Dependence` = "dashed")) +
# ggplot2::scale_y_continuous(expand = ggplot2::expansion(mult = c(0,
# 0.05))) + ggplot2::theme_bw() + ggplot2::theme(legend.title = ggplot2::element_blank(),
# axis.text.x = ggplot2::element_text(angle = if (h2o.isfactor(newdata[[col_name]]))
# 45
# else 0, hjust = 1), plot.margin = margin, plot.title = ggplot2::element_text(hjust = 0.5))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.