#' xgboost - parameter tuning and model selection with k-fold cross-validation
#' and grid search
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param params Parameter grid generated by [cv_param_grid()].
#' @param n_folds Number of folds. Default is 5.
#' @param n_threads The number of parallel threads. For optimal speed,
#' match this to the number of physical CPU cores, not threads.
#' See respective model documentation for more details. Default is 1.
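#' (A quick way to check the physical core count is, for example,
#' `parallel::detectCores(logical = FALSE)`.)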
#' @param seed Random seed for reproducibility.
#' @param verbose Show progress?
#'
#' @return A list of class `"cv_params"` containing the complete tuning grid
#' with the cross-validated AUC for each combination (`df`), and the best
#' parameter combination (`nrounds`, `max_depth`, `eta`) together with its
#' AUC value (`metric`).
#'
#' @importFrom pROC auc
#' @importFrom progress progress_bar
#'
#' @export
#'
#' @examplesIf is_installed_xgboost()
#' sim_data <- msaenet::msaenet.sim.binomial(
#'   n = 100,
#'   p = 10,
#'   rho = 0.6,
#'   coef = rnorm(5, mean = 0, sd = 10),
#'   snr = 1,
#'   p.train = 0.8,
#'   seed = 42
#' )
#'
#' params <- cv_xgboost(
#'   sim_data$x.tr,
#'   sim_data$y.tr,
#'   params = cv_param_grid(
#'     n_iterations = c(100, 200),
#'     max_depth = c(3, 5),
#'     learning_rate = c(0.1, 0.5)
#'   ),
#'   n_folds = 5,
#'   n_threads = 1,
#'   seed = 42,
#'   verbose = FALSE
#' )
#'
#' params$df
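#'
#' # The returned list also stores the winning combination directly
#' params$nrounds
#' params$max_depth
#' params$eta
#' params$metric
#'
#' # A possible next step (sketch): refit on the full training set with the
#' # selected parameters, reusing the xgboost_dmatrix()/xgboost_train()
#' # wrappers that the tuner itself calls
#' dtrain <- xgboost_dmatrix(sim_data$x.tr, label = sim_data$y.tr)
#' fit <- xgboost_train(
#'   params = list(
#'     objective = "binary:logistic",
#'     eval_metric = "auc",
#'     max_depth = params$max_depth,
#'     eta = params$eta
#'   ),
#'   data = dtrain,
#'   nrounds = params$nrounds,
#'   nthread = 1
#' )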
cv_xgboost <- function(
    x, y,
    params = cv_param_grid(),
    n_folds = 5,
    n_threads = 1,
    seed = 42,
    verbose = TRUE) {
  params <- map_params_xgboost(params)

  # Randomly assign each observation to one of the n_folds folds
  set.seed(seed)
  nrow_x <- nrow(x)
  index <- sample(rep_len(1L:n_folds, nrow_x))

  # All candidate parameter combinations, with a column for the CV metric
  df_grid <- expand.grid(
    "nrounds" = params$nrounds,
    "max_depth" = params$max_depth,
    "eta" = params$eta,
    "metric" = NA
  )
  nrow_grid <- nrow(df_grid)

  x <- as.matrix(x)

  pb <- progress_bar$new(
    format = " searching grid [:bar] :percent in :elapsed",
    total = nrow_grid * n_folds, clear = FALSE, width = 60
  )
  # Evaluate each parameter combination by k-fold cross-validation
  for (j in 1L:nrow_grid) {
    ypred <- matrix(NA, ncol = 2L, nrow = nrow_x)
    for (i in 1L:n_folds) {
      if (verbose) pb$tick()

      # Split the data into training and held-out folds
      xtrain <- x[index != i, , drop = FALSE]
      ytrain <- y[index != i]
      xtest <- x[index == i, , drop = FALSE]
      ytest <- y[index == i]

      xtrain <- xgboost_dmatrix(xtrain, label = ytrain)
      xtest <- xgboost_dmatrix(xtest)

      fit <- xgboost_train(
        params = list(
          objective = "binary:logistic",
          eval_metric = "auc",
          max_depth = df_grid[j, "max_depth"],
          eta = df_grid[j, "eta"]
        ),
        data = xtrain,
        nrounds = df_grid[j, "nrounds"],
        nthread = n_threads
      )

      # Collect out-of-fold predictions for the held-out observations
      ypredvec <- predict(fit, xtest)
      ypred[index == i, 1L] <- ytest
      ypred[index == i, 2L] <- ypredvec
    }
    colnames(ypred) <- c("y.real", "y.pred")
    # AUC computed on the pooled out-of-fold predictions
    df_grid[j, "metric"] <- as.numeric(pROC::auc(ypred[, "y.real"], ypred[, "y.pred"], quiet = TRUE))
  }
  # Select the parameter combination with the highest cross-validated AUC
  best_row <- which.max(df_grid$metric)
  best_metric <- df_grid$metric[best_row]
  best_nrounds <- df_grid$nrounds[best_row]
  best_eta <- df_grid$eta[best_row]
  best_max_depth <- df_grid$max_depth[best_row]

  structure(
    list(
      "df" = df_grid,
      "metric" = best_metric,
      "nrounds" = best_nrounds,
      "eta" = best_eta,
      "max_depth" = best_max_depth
    ),
    class = c("cv_params", "cv_xgboost")
  )
}