#' catboost - parameter tuning and model selection with k-fold cross-validation
#' and grid search
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param params Parameter grid generated by [cv_param_grid()].
#' @param n_folds Number of folds. Default is 5.
#' @param n_threads The number of parallel threads. For optimal speed,
#' match this to the number of physical CPU cores, not threads.
#' See respective model documentation for more details. Default is 1.
#' @param seed Random seed for reproducibility.
#' @param verbose Show progress?
#'
#' @return
#' A data frame containing the complete tuning grid and the AUC values,
#' with the best parameter combination and the highest AUC value.
#'
#' @export
#'
#' @examplesIf is_installed_catboost()
#' sim_data <- msaenet::msaenet.sim.binomial(
#' n = 100,
#' p = 10,
#' rho = 0.6,
#' coef = rnorm(5, mean = 0, sd = 10),
#' snr = 1,
#' p.train = 0.8,
#' seed = 42
#' )
#'
#' params <- cv_catboost(
#' sim_data$x.tr,
#' sim_data$y.tr,
#' params = cv_param_grid(
#' n_iterations = c(100, 200),
#' max_depth = c(3, 5),
#' learning_rate = c(0.1, 0.5)
#' ),
#' n_folds = 5,
#' n_threads = 1,
#' seed = 42,
#' verbose = FALSE
#' )
#'
#' params$df
cv_catboost <- function(
x, y,
params = cv_param_grid(),
n_folds = 5,
n_threads = 1,
seed = 42,
verbose = TRUE) {
params <- map_params_catboost(params)
set.seed(seed)
nrow_x <- nrow(x)
index <- sample(rep_len(1L:n_folds, nrow_x))
df_grid <- expand.grid(
"iterations" = params$iterations,
"depth" = params$depth,
"metric" = NA
)
nrow_grid <- nrow(df_grid)
# x <- as.matrix(x) # uncomment to use non-categorical features
pb <- progress_bar$new(
format = " searching grid [:bar] :percent in :elapsed",
total = nrow_grid * n_folds, clear = FALSE, width = 60
)
for (j in 1L:nrow_grid) {
ypred <- matrix(NA, ncol = 2L, nrow = nrow_x)
for (i in 1L:n_folds) {
if (verbose) pb$tick()
xtrain <- x[index != i, , drop = FALSE]
ytrain <- y[index != i]
xtest <- x[index == i, , drop = FALSE]
ytest <- y[index == i]
train_pool <- catboost_load_pool(data = xtrain, label = ytrain)
test_pool <- catboost_load_pool(data = xtest, label = NULL)
fit <- catboost_train(
train_pool,
test_pool = NULL,
params = list(
loss_function = "Logloss",
iterations = df_grid[j, "iterations"],
depth = df_grid[j, "depth"],
logging_level = "Silent",
thread_count = n_threads
)
)
ypredvec <- catboost_predict(fit, pool = test_pool, prediction_type = "Probability")
ypred[index == i, 1L] <- ytest
ypred[index == i, 2L] <- ypredvec
}
colnames(ypred) <- c("y.real", "y.pred")
df_grid[j, "metric"] <- as.numeric(pROC::auc(ypred[, "y.real"], ypred[, "y.pred"], quiet = TRUE))
}
best_row <- which.max(df_grid$metric)
best_metric <- df_grid$metric[best_row]
best_iterations <- df_grid$iterations[best_row]
best_depth <- df_grid$depth[best_row]
structure(
list(
"df" = df_grid,
"metric" = best_metric,
"iterations" = best_iterations,
"depth" = best_depth
),
class = c("cv_params", "cv_catboost")
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.