# tune_GridSearch.R
# ::rtemis::
# 2025 EDG rtemis.org
#' \pkg{rtemis} internal: Grid Search for Hyperparameter Tuning of \pkg{rtemis} Learners
#'
#' Train models using a combination of parameter values for model selection
#'
#' Note that weights, if defined (and not NULL), should be passed directly to
#' `tune_GridSearch`, as they need to be resampled along `x`, and should not be
#' included in the tuning grid. `ifw` and `ifw_type` should be set as part of
#' `hyperparameters` and will be passed on to the learner.
#' Includes special cases for algorithms that estimate a hyperparameter internally
#' during training, e.g. GLMNET (optimal lambda) and LightGBM (best_iter), which must
#' be extracted from each model and aggregated across resamples.
#'
#' @param x data.frame or similar: Training set.
#' @param hyperparameters Hyperparameters: Hyperparameters, including values to tune.
#' @param tuner_parameters TunerParameters: Tuner parameters.
#' @param weights Numeric vector: Case weights; resampled along `x`.
#' @param save_mods Logical: If TRUE, store each trained model in the output.
#' @param verbosity Integer: Verbosity level.
#' @param parallel_type Character: "none", "future", or "mirai"; backend used to train grid models.
#'
#' @return GridSearch object.
#'
#' @author EDG
#' @keywords internal
#' @noRd
tune_GridSearch <- function(
x,
hyperparameters,
tuner_parameters,
weights = NULL,
save_mods = FALSE,
verbosity = 1L,
parallel_type = "none"
) {
check_is_S7(hyperparameters, Hyperparameters)
check_is_S7(tuner_parameters, TunerParameters)
stopifnot(needs_tuning(hyperparameters))
  # Dependencies ----
  if (parallel_type == "mirai") {
    check_dependencies("mirai")
  } else {
    # progressr provides progress reporting on the non-mirai paths.
    check_dependencies(c("future", "future.apply", "progressr"))
  }
# Intro ----
start_time <- intro(
newline_pre = TRUE,
caller = "tune_GridSearch",
verbosity = verbosity - 1L
)
# Arguments ----
algorithm <- hyperparameters@algorithm
# Make Grid ----
grid_params <- get_params_need_tuning(hyperparameters)
n_params <- length(grid_params)
n_resamples <- tuner_parameters[["resampler_parameters"]][["n"]]
search_type <- tuner_parameters[["search_type"]]
  # expand_grid converts NULL to "null" so that grid expansion works.
param_grid <- expand_grid(grid_params, stringsAsFactors = FALSE)
param_grid <- cbind(param_combo_id = seq_len(NROW(param_grid)), param_grid)
n_param_combinations <- NROW(param_grid)
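  # Illustration (hypothetical values): with
  # grid_params = list(alpha = c(0, 1), lambda = "null"), param_grid is:
  #   param_combo_id alpha lambda
  # 1              1     0   null
  # 2              2     1   null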
res_param_grid <- expand_grid(
c(list(resample_id = seq_len(n_resamples)), grid_params),
stringsAsFactors = FALSE
)
  if (search_type == "randomized") {
    # Keep all resamples for a random subset of parameter combinations;
    # resample_id varies fastest, so combination `id` occupies rows
    # (id - 1) * n_resamples + 1:n_resamples of res_param_grid.
    sampled_combos <- sort(sample(
      n_param_combinations,
      round(tuner_parameters[["randomize_p"]] * n_param_combinations)
    ))
    rows <- as.vector(outer(
      seq_len(n_resamples),
      (sampled_combos - 1L) * n_resamples,
      `+`
    ))
    res_param_grid <- res_param_grid[rows, ]
    # Re-index param_grid so downstream param_combo_id bookkeeping stays consistent.
    param_grid <- param_grid[sampled_combos, , drop = FALSE]
    n_param_combinations <- length(sampled_combos)
    param_grid[["param_combo_id"]] <- seq_len(n_param_combinations)
  }
  n_res_x_comb <- NROW(res_param_grid)
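  # Illustration (hypothetical values): with n_resamples = 3 and
  # sampled_combos = c(2, 5), rows kept are 4:6 and 13:15, i.e. all three
  # resamples of parameter combinations 2 and 5.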
# Resamples ----
res <- resample(
x = x,
parameters = tuner_parameters[["resampler_parameters"]],
verbosity = verbosity
)
# learner1 ----
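  # learner1 trains one model for one row of res_param_grid: it subsets the
  # resample's training and validation cases, updates the hyperparameters with
  # that row's parameter values, and returns the resulting metrics.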
if (parallel_type != "mirai") {
ptn <- progressr::progressor(steps = NROW(res_param_grid))
}
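  # Progress is reported through progressr; callers can surface it with, e.g.,
  # progressr::handlers(global = TRUE) or by wrapping the call in
  # progressr::with_progress().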
learner1 <- function(
index,
x,
res,
res_param_grid,
hyperparameters,
weights,
verbosity,
save_mods,
n_res_x_comb
) {
if (verbosity > 0L) {
msg2(
"Running grid line #",
hilite(index),
"/",
NROW(res_param_grid),
"...",
caller = "tune_GridSearch",
sep = ""
)
}
res1 <- res[[res_param_grid[index, "resample_id"]]]
dat_train1 <- x[res1, ]
weights1 <- weights[res1]
dat_valid1 <- x[-res1, ]
hyperparams1 <- hyperparameters
hyperparams1 <- update(
hyperparams1,
as.list(res_param_grid[index, 2:NCOL(res_param_grid), drop = FALSE]),
tuned = -9L # Hyperparameters are being tuned
)
mod1 <- do_call(
"train",
args = list(
x = dat_train1,
dat_validation = dat_valid1,
algorithm = hyperparams1@algorithm,
hyperparameters = hyperparams1,
weights = weights1,
verbosity = verbosity - 1L
)
)
out1 <- list(
id = index,
resample_id = res_param_grid[index, "resample_id"],
metrics_training = mod1@metrics_training,
metrics_validation = mod1@metrics_validation,
type = mod1@type,
hyperparameters = hyperparams1
)
# Algorithm-specific params ----
# => add to hyperparameters
if (algorithm == "GLMNET") {
out1[["hyperparameters"]]@hyperparameters[["lambda.min"]] <- mod1@model[[
"lambda.min"
]]
out1[["hyperparameters"]]@hyperparameters[["lambda.1se"]] <- mod1@model[[
"lambda.1se"
]]
}
if (algorithm == "LightGBM") {
      # Check that best_iter is meaningful; otherwise issue a message and fall back to 100L.
best_iter <- mod1@model[["best_iter"]]
if (is.null(best_iter) || best_iter == -1 || best_iter == 0) {
info(bold(italic(
"best_iter returned from lightgbm:",
best_iter,
" - setting to 100L"
)))
best_iter <- 100L
}
out1[["hyperparameters"]]@hyperparameters[["best_iter"]] <- best_iter
}
# if (algorithm %in% c("LINAD", "LINOA")) {
# out1$est.n.leaves <- mod1$mod$n.leaves
# }
# if (algorithm == "LIHADBoost") {
# out1$sel.n.steps <- mod1$mod$selected.n.steps
# }
if (save_mods) {
out1[["mod1"]] <- mod1
}
if (parallel_type != "mirai") {
ptn(sprintf("Tuning resample %i/%i", index, n_res_x_comb))
}
out1
} # /learner1
# Train Grid ----
if (verbosity > 0L) {
msg2(
hilite("Tuning", algorithm, "by", search_type, "grid search.")
)
msg20(
hilite(n_param_combinations),
ngettext(
n_param_combinations,
" parameter combination x ",
" parameter combinations x "
),
hilite(n_resamples),
" resamples: ",
hilite(n_res_x_comb),
" models total",
" (",
      R.version[["platform"]],
")."
)
}
if (parallel_type == "none") {
grid_run <- lapply(
X = seq_len(n_res_x_comb),
FUN = learner1,
x = x,
res = res,
hyperparameters = hyperparameters,
res_param_grid = res_param_grid,
weights = weights,
verbosity = verbosity,
save_mods = save_mods,
n_res_x_comb = n_res_x_comb
)
} else if (parallel_type == "future") {
grid_run <- future.apply::future_lapply(
X = seq_len(n_res_x_comb),
FUN = learner1,
x = x,
res = res,
hyperparameters = hyperparameters,
res_param_grid = res_param_grid,
weights = weights,
verbosity = verbosity,
save_mods = save_mods,
n_res_x_comb = n_res_x_comb,
future.seed = TRUE,
future.globals = FALSE # See: https://github.com/futureverse/globals/issues/93
)
} else if (parallel_type == "mirai") {
grid_run <- mirai::mirai_map(
.x = seq_len(n_res_x_comb),
.f = learner1,
.args = list(
x = x,
res = res,
hyperparameters = hyperparameters,
res_param_grid = res_param_grid,
weights = weights,
verbosity = verbosity,
save_mods = save_mods,
n_res_x_comb = n_res_x_comb
)
)
}
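  # All branches produce one learner1 output per row of res_param_grid; only the
  # backend differs. Note the mirai branch returns unresolved mirai objects,
  # which are collected below.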
# Metric ----
type <- supervised_type(x)
metric <- tuner_parameters@parameters[["metric"]]
maximize <- tuner_parameters@parameters[["maximize"]]
if (is.null(metric)) {
if (type == "Classification") {
metric <- "Balanced_Accuracy"
} else if (type == "Regression") {
metric <- "MSE"
} else {
metric <- "Concordance"
}
tuner_parameters@parameters[["metric"]] <- metric
}
if (is.null(maximize)) {
maximize <- metric %in%
c("Accuracy", "Balanced_Accuracy", "Concordance", "Rsq", "r")
tuner_parameters@parameters[["maximize"]] <- maximize
}
select_fn <- if (maximize) which.max else which.min
verb <- if (maximize) "maximize" else "minimize"
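  # e.g., for Classification with no metric specified: metric = "Balanced_Accuracy",
  # maximize = TRUE, and select_fn = which.max.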
# Aggregate ----
# Average test errors
# if using mirai, wait for all to finish
  if (parallel_type == "mirai") {
    # Wait for all mirai to resolve and collect results.
    # .progress_cli is a mirai collection option; the local NULL assignment
    # appeases R CMD check's undefined-global note.
    .progress_cli <- NULL
    grid_run <- grid_run[.progress_cli]
  }
if (type %in% c("Regression", "Survival")) {
metrics_training_all <- as.data.frame(t(sapply(
grid_run,
function(r) unlist(r[["metrics_training"]]@metrics)
)))
metrics_validation_all <- as.data.frame(t(sapply(
grid_run,
function(r) unlist(r[["metrics_validation"]]@metrics)
)))
} else if (type == "Classification") {
metrics_training_all <- as.data.frame(t(sapply(
grid_run,
function(r) unlist(r[["metrics_training"]]@metrics[["Overall"]])
)))
metrics_validation_all <- as.data.frame(t(sapply(
grid_run,
function(r) unlist(r[["metrics_validation"]]@metrics[["Overall"]])
)))
}
metrics_validation_all[["param_combo_id"]] <- rep(
seq_len(n_param_combinations),
each = n_resamples
)
metrics_training_all[["param_combo_id"]] <- rep(
seq_len(n_param_combinations),
each = n_resamples
)
metrics_training_by_combo_id <- aggregate(
. ~ param_combo_id,
data = metrics_training_all,
FUN = tuner_parameters[["metrics_aggregate_fn"]]
)
metrics_validation_by_combo_id <- aggregate(
. ~ param_combo_id,
data = metrics_validation_all,
FUN = tuner_parameters[["metrics_aggregate_fn"]]
)
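  # Illustration: with metrics_aggregate_fn = mean, aggregate() yields one row
  # per param_combo_id holding the column-wise mean of that combination's
  # metrics across its n_resamples runs.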
tune_results <- list(
param_grid = param_grid,
metrics_training = metrics_training_by_combo_id,
metrics_validation = metrics_validation_by_combo_id
)
# Algorithm-specific collection ----
# N of iterations is the one hyperparameter that may be determined
# automatically, we therefore need to extract it and average it
## GLMNET ----
if (algorithm == "GLMNET") {
if (is.null(grid_params[["lambda"]])) {
# if lambda was NULL, cv.glmnet was run and optimal lambda was estimated
# For each i in grid_run, get grid_run[[i]]$hyperparameters[[grid_run[[i]]$hyperparameters$which_lambda_cv]]
if (verbosity > 1L) {
info("Extracting best lambda from GLMNET models...")
}
lambda_cv2 <- data.frame(
lambda = sapply(
grid_run,
function(x) {
x[["hyperparameters"]][[x[["hyperparameters"]][[
"which_lambda_cv"
]]]]
}
)
)
lambda_cv2[["param_combo_id"]] <- rep(
1:n_param_combinations,
each = n_resamples
)
lambda_by_param_combo_id <- aggregate(
lambda ~ param_combo_id,
lambda_cv2,
tuner_parameters[["metrics_aggregate_fn"]]
)
    # Replace the "null" lambda placeholder in tune_results$param_grid with
    # the aggregated cross-validated lambda.
stopifnot(tune_results[["param_grid"]][["lambda"]] == "null")
param_grid[["lambda"]] <- tune_results[["param_grid"]][[
"lambda"
]] <- lambda_by_param_combo_id[["lambda"]]
}
} # /GLMNET
## LightGBM ----
if (algorithm == "LightGBM") {
if (is.null(grid_params[["nrounds"]])) {
if (verbosity > 1L) {
info("Extracting best N of iterations from LightGBM models...")
}
nrounds_cv <- data.frame(
nrounds = sapply(grid_run, \(x) x[["hyperparameters"]][["best_iter"]])
)
nrounds_cv[["param_combo_id"]] <- rep(
seq_len(n_param_combinations),
each = n_resamples
)
nrounds_by_param_combo_id <- aggregate(
nrounds ~ param_combo_id,
nrounds_cv,
tuner_parameters[["metrics_aggregate_fn"]]
)
    # Replace the "null" nrounds placeholder in tune_results$param_grid with
    # the aggregated best_iter across resamples.
stopifnot(tune_results[["param_grid"]][["nrounds"]] == "null")
param_grid[["nrounds"]] <- tune_results[["param_grid"]][["nrounds"]] <-
as.integer(round(nrounds_by_param_combo_id[["nrounds"]]))
}
} # /LightGBM
## GBM, H2OGBM ----
# if (algorithm %in% c("H2OGBM", "GBM", "GBM3")) {
# est.n.trees.all <- data.frame(n.trees = plyr::laply(
# grid_run,
# function(x) x$est.n.trees
# ))
# est.n.trees.all$param_combo_id <- rep(seq_len(n_param_combinations), each = n_resamples)
# est.n.trees.by.param_combo_id <- aggregate(
# n.trees ~ param_combo_id, est.n.trees.all,
# metrics_aggregate_fn
# )
# tune_results <- cbind(
# n.trees = round(est.n.trees.by.param_combo_id$n.trees),
# tune_results
# )
# n_params <- n_params + 1
# } # /GBM, H2OGBM
## XGBoost ----
# if (algorithm == "XGBoost") {
# if (verbosity > 1L) {
# msg2(hilite("Extracting best N of iterations from XGBoost models..."))
# }
# est.nrounds.all <- data.frame(nrounds = plyr::laply(
# grid_run,
# \(m) m$best_iteration
# ))
# est.nrounds.all$param_combo_id <- rep(seq_len(n_param_combinations),
# each = n_resamples
# )
# est.nrounds.by.param_combo_id <- aggregate(
# nrounds ~ param_combo_id, est.nrounds.all,
# metrics_aggregate_fn
# )
# tune_results <- cbind(
# nrounds = round(est.nrounds.by.param_combo_id$nrounds),
# tune_results
# )
# n_params <- n_params + 1
  # } # /XGBoost
## LINAD ----
# if (algorithm %in% c("LINAD", "LINOA")) {
# if (verbosity > 1L) {
# info("Extracting best N leaves from LINAD models...")
# }
# est.n.leaves.all <- data.frame(n.leaves = plyr::laply(
# grid_run,
# \(x) ifelse(length(x$est.n.leaves) == 0, 1, x$est.n.leaves)
# ))
# est.n.leaves.all$param_combo_id <- rep(seq_len(n_param_combinations),
# each = n_resamples
# )
# est.n.leaves.by.param_combo_id <- aggregate(
# n.leaves ~ param_combo_id, est.n.leaves.all,
# metrics_aggregate_fn
# )
# tune_results <- cbind(
# n.leaves =
# round(est.n.leaves.by.param_combo_id$n.leaves), tune_results
# )
# n_params <- n_params + 1
# } # /LINAD, LINOA
## LIHADBoost ----
# if (algorithm == "LIHADBoost") {
# if (verbosity > 1L) {
# msg2(hilite("Extracting best N steps from LIHADBoost models..."))
# }
# est.n.steps.all <- data.frame(n.steps = plyr::laply(
# grid_run,
# \(x) x$sel.n.steps
# ))
# est.n.steps.all$param_combo_id <- rep(seq_len(n_param_combinations),
# each = n_resamples
# )
# est.n.steps.by.param_combo_id <- aggregate(
# n.steps ~ param_combo_id, est.n.steps.all,
# metrics_aggregate_fn
# )
# tune_results <- cbind(
# n.steps = round(est.n.steps.by.param_combo_id$n.steps),
# tune_results
# )
# n_params <- n_params + 1
# } # /LIHADBoost
# Consider explicitly sorting hyperparam values in increasing order,
# so that in case of tie, lowest value is chosen -
# if that makes sense, e.g. n.leaves, etc.
best_param_combo_id <- as.integer(
tune_results[["metrics_validation"]][
select_fn(tune_results[["metrics_validation"]][[metric]]),
1
]
)
best_param_combo <- as.list(param_grid[best_param_combo_id, -1, drop = FALSE])
if (verbosity > 0L) {
message()
msg2(hilite(paste0("Best parameters to ", paste(verb, metric), ":")))
print_tune_finding(grid_params, best_param_combo)
}
# Outro ----
# Since this is always called from within `train()`, we don't want to print "Completed..."
  outro(start_time, verbosity = verbosity - 1L)
# => add optional mods field to GridSearch
# if (save_mods) mods <- grid_run
GridSearch(
hyperparameters = hyperparameters,
tuner_parameters = tuner_parameters,
tuning_results = list(
param_grid = param_grid,
training = metrics_training_by_combo_id,
validation = metrics_validation_by_combo_id
),
best_hyperparameters = best_param_combo
)
} # /rtemis::tune_GridSearch
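
# Usage sketch (illustrative, not a tested example; assumes Hyperparameters and
# TunerParameters objects built with the package's constructors):
# gs <- tune_GridSearch(
#   x = dat_training,
#   hyperparameters = hyperparameters, # e.g. includes alpha = c(0, 0.5, 1)
#   tuner_parameters = tuner_parameters,
#   verbosity = 1L
# )
# gs@best_hyperparameters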
# Print tuning results ----
# Print set of search values and best value in the form {1, 3, 5} => 3
# for each hyperparameter that was tuned.
print_tune_finding <- function(grid_params, best_param_combo) {
# Make list of search values and best value
tfl <- lapply(seq_along(grid_params), function(i) {
paste0(
"{",
paste(grid_params[[i]], collapse = ", "),
"}",
" => ",
bold(best_param_combo[[names(grid_params)[i]]])
)
})
names(tfl) <- names(grid_params)
printls(tfl, print_class = FALSE)
} # /rtemis::print_tune_finding
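
# Example (hypothetical values):
# print_tune_finding(
#   grid_params = list(alpha = c(0, 0.5, 1)),
#   best_param_combo = list(alpha = 0.5)
# )
# ## Output: alpha: {0, 0.5, 1} => 0.5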