#' lightgbm - parameter tuning and model selection with k-fold cross-validation
#' and grid search
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param params Parameter grid generated by [cv_param_grid()].
#' @param n_folds Number of folds. Default is 5.
#' @param n_threads The number of parallel threads. For optimal speed,
#' match this to the number of physical CPU cores, not logical threads.
#' See the respective model documentation for more details. Default is 1.
#' @param seed Random seed for reproducibility.
#' @param verbose Show progress?
#'
#' @return
#' A list of class `"cv_params"` containing `df` (the complete tuning grid
#' with the cross-validated AUC for each parameter combination), `metric`
#' (the highest AUC), and the best `num_iterations`, `max_depth`, and
#' `learning_rate`.
#'
#' @export
#'
#' @examplesIf is_installed_lightgbm()
#' sim_data <- msaenet::msaenet.sim.binomial(
#'   n = 100,
#'   p = 10,
#'   rho = 0.6,
#'   coef = rnorm(5, mean = 0, sd = 10),
#'   snr = 1,
#'   p.train = 0.8,
#'   seed = 42
#' )
#'
#' params <- suppressWarnings(
#'   cv_lightgbm(
#'     sim_data$x.tr,
#'     sim_data$y.tr,
#'     params = cv_param_grid(
#'       n_iterations = c(100, 200),
#'       max_depth = c(3, 5),
#'       learning_rate = c(0.1, 0.5)
#'     ),
#'     n_folds = 5,
#'     n_threads = 1,
#'     seed = 42,
#'     verbose = FALSE
#'   )
#' )
#'
#' params$df
cv_lightgbm <- function(
    x, y,
    params = cv_param_grid(),
    n_folds = 5,
    n_threads = 1,
    seed = 42,
    verbose = TRUE) {
  params <- map_params_lightgbm(params)

  # Assign each observation to one of the n_folds cross-validation folds
  set.seed(seed)
  nrow_x <- nrow(x)
  index <- sample(rep_len(1L:n_folds, nrow_x))

  # Full tuning grid; "metric" will hold the cross-validated AUC
  df_grid <- expand.grid(
    "num_iterations" = params$num_iterations,
    "max_depth" = params$max_depth,
    "learning_rate" = params$learning_rate,
    "metric" = NA
  )
  nrow_grid <- nrow(df_grid)

  x <- as.matrix(x)

  pb <- progress_bar$new(
    format = " searching grid [:bar] :percent in :elapsed",
    total = nrow_grid * n_folds, clear = FALSE, width = 60
  )

  # Evaluate each parameter combination with k-fold cross-validation
  for (j in 1L:nrow_grid) {
    ypred <- matrix(NA, ncol = 2L, nrow = nrow_x)
    for (i in 1L:n_folds) {
      if (verbose) pb$tick()
      xtrain <- x[index != i, , drop = FALSE]
      ytrain <- y[index != i]
      xtest <- x[index == i, , drop = FALSE]
      ytest <- y[index == i]
      fit <- lightgbm_train(
        data = xtrain,
        label = ytrain,
        params = list(
          objective = "binary",
          learning_rate = df_grid[j, "learning_rate"],
          num_iterations = df_grid[j, "num_iterations"],
          max_depth = df_grid[j, "max_depth"],
          num_leaves = 2^(df_grid[j, "max_depth"]) - 1,
          num_threads = n_threads
        ),
        verbose = -1
      )
      # Collect held-out predictions so AUC is computed over all folds at once
      ypredvec <- predict(fit, xtest)
      ypred[index == i, 1L] <- ytest
      ypred[index == i, 2L] <- ypredvec
    }
    colnames(ypred) <- c("y.real", "y.pred")
    df_grid[j, "metric"] <- as.numeric(pROC::auc(ypred[, "y.real"], ypred[, "y.pred"], quiet = TRUE))
  }

  # Best parameter combination (first row in case of ties)
  best_row <- which.max(df_grid$metric)
  best_metric <- df_grid$metric[best_row]
  best_num_iterations <- df_grid$num_iterations[best_row]
  best_max_depth <- df_grid$max_depth[best_row]
  best_learning_rate <- df_grid$learning_rate[best_row]

  structure(
    list(
      "df" = df_grid,
      "metric" = best_metric,
      "num_iterations" = best_num_iterations,
      "max_depth" = best_max_depth,
      "learning_rate" = best_learning_rate
    ),
    class = c("cv_params", "cv_lightgbm")
  )
}
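
# A minimal usage sketch (an illustration, not part of the package source):
# refit a final model on the full training set with lightgbm::lightgbm(),
# plugging in the best parameters returned by cv_lightgbm(). `sim_data` and
# `params` refer to the objects created in the example above; the package
# itself fits models through its internal lightgbm_train() wrapper, so the
# exact call it makes (and the lightgbm() interface across versions) may
# differ slightly.
#
# fit_final <- lightgbm::lightgbm(
#   data = as.matrix(sim_data$x.tr),
#   label = sim_data$y.tr,
#   params = list(
#     objective = "binary",
#     learning_rate = params$learning_rate,
#     max_depth = params$max_depth,
#     num_leaves = 2^params$max_depth - 1,
#     num_threads = 1
#   ),
#   nrounds = params$num_iterations,
#   verbose = -1
# )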