#' Train models with the Bayesian Optimization algorithm
#'
#' Bayesian Optimization can take a relatively long time: the larger the `iters.n`
#' parameter, the longer the run. However, to obtain model parameters noticeably
#' better than the defaults, it is suggested to set `iters.n` to at least 20.
#' Larger datasets also increase the runtime of Bayesian Optimization.
#'
#' @param train_data A training dataset for the models, created by the `prepare_data()` function.
#' @param y A string that indicates the target column name for regression or classification.
#' Either `y` or the pair `time`, `status` has to be provided.
#' @param time A string that indicates the time column name for the survival analysis task.
#' Either `y` or the pair `time`, `status` has to be provided.
#' @param status A string that indicates the status column name for the survival analysis task.
#' Either `y` or the pair `time`, `status` has to be provided.
#' @param test_data A test dataset for the models, created by the `prepare_data()` function.
#' @param engine A vector of tree-based models to be trained. Possible
#' values are: `ranger`, `xgboost`, `decision_tree`, `lightgbm`, `catboost`. It is
#' ignored for the survival analysis task.
#' @param type A string that determines whether the Machine Learning task is
#' `binary_clf`, `multiclass`, `regression`, or `survival`.
#' @param parallel A logical value; if set to TRUE, the function uses parallel computing.
#' By default set to FALSE.
#' @param iters.n The number of optimization iterations performed by the `bayesOpt()` function.
#' @param bayes_info A list with two values controlling the verbosity of the Bayesian
#' Optimization process. The first value, `verbose`, has 3 levels: 0 - no output;
#' 1 - describes what is happening and whether a local optimum can be reached; 2 - additionally
#' reports the recent and the best scores. The second value, `plotProgress`,
#' is a logical value indicating whether the progress of the Bayesian
#' Optimization should be plotted. WARNING: a plot is created after each step, so
#' it might be computationally expensive. Both arguments come from the
#' `ParBayesianOptimization` package, and they only matter if the global `verbose` is set to TRUE.
#' The default value is: `list(verbose = 0, plotProgress = FALSE)`.
#' @param return_params A logical value; if set to TRUE, the optimized model parameters are returned alongside the models.
#' @param verbose A logical value; if set to TRUE, the function provides all information about
#' the process, and if FALSE it provides none.
#'
#' @return Trained models with optimized parameters. If `return_params` is `TRUE`,
#' the tuned training parameters are also returned in the same list as the models.
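#'
#' @examples
#' \dontrun{
#' # A minimal sketch of a call, assuming `data_train` and `data_test` are data
#' # frames already processed by `prepare_data()`; the dataset and the target
#' # column name 'Price' are illustrative assumptions, not part of this function.
#' models <- train_models_bayesopt(train_data = data_train,
#'                                 y = 'Price',
#'                                 test_data = data_test,
#'                                 engine = c('ranger', 'xgboost'),
#'                                 type = 'regression',
#'                                 iters.n = 20,
#'                                 return_params = TRUE)
#' }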
#' @export
train_models_bayesopt <- function(train_data,
y,
time,
status,
test_data,
engine,
type,
parallel = FALSE,
iters.n = 7,
bayes_info = list(verbose = 0, plotProgress = FALSE),
return_params = FALSE,
verbose = TRUE) {
  if (!is.numeric(iters.n) || as.integer(iters.n) != iters.n) {
    verbose_cat(crayon::red('\u2716'), 'The number of Bayesian Optimization iterations must be an integer. \n\n', verbose = verbose)
    stop('The number of Bayesian Optimization iterations must be an integer.')
  }
if (iters.n <= 0) {
verbose_cat(crayon::green('\u2714'), 'Bayesian Optimization was turned off. \n', verbose = verbose)
return(NULL)
}
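  # Placeholders for the trained models and, optionally, their tuned parameters.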
ranger_model <- NULL
xgboost_model <- NULL
decision_tree_model <- NULL
lightgbm_model <- NULL
catboost_model <- NULL
models_params <- NULL
models_params$ranger_params <- NULL
models_params$xgboost_params <- NULL
models_params$decision_tree_params <- NULL
models_params$lightgbm_params <- NULL
models_params$catboost_params <- NULL
if (type == 'survival') {
verbose_cat(' ', crayon::green('\u2714'), 'rfsrc: Starting training procedure.\n', verbose = verbose)
t0 <- as.numeric(Sys.time())
fitness_fun_rfsrc <- function(ntree, nodesize, nsplit) {
# Our optimized metric is the Brier score.
model <- randomForestSRC::rfsrc(
formula = as.formula(paste0('Surv(',time,',', status,') ~ .')),
# We use ranger data, as rfsrc doesn't need preprocessing.
data = train_data$ranger_data,
na.action = 'na.omit',
ntree = ntree,
nodesize = nodesize,
nsplit = nsplit,
splitrule = "logrankscore"
)
# Brier Score
pred <- randomForestSRC::predict.rfsrc(model, test_data$ranger_data)
predictions <- pred$survival
ordered_times <- model$time.interest
      # Index of the median event time, used as the Brier score evaluation point.
      median_idx <- ceiling(length(ordered_times) / 2)
surv_object <- survival::Surv(test_data$ranger_data[[time]], test_data$ranger_data[[status]])
med_time <- median(ordered_times)
max_metric <- -SurvMetrics::Brier(object = surv_object, pre_sp = predictions[, median_idx], t_star = med_time)
return(list(Score = as.numeric(max_metric)))
}
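    # Search space for the rfsrc hyperparameters; the `L` suffix makes
    # `bayesOpt()` treat a bound as integer-valued.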
bounds <- list(
ntree = c(5L, 1000L),
nodesize = c(5L, 30L),
nsplit = c(1L, 100L)
)
bayes <- NULL
tryCatch(
expr = {
if (verbose) {
capture.output(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_rfsrc,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = bayes_info[[1]],
parallel = parallel,
plotProgress = bayes_info[[2]]))
} else {
capture.output(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_rfsrc,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = 0,
parallel = parallel))
}
},
error = function(e) {
print(e)
}
)
if (is.null(bayes)) {
rfsrc_model <- randomForestSRC::rfsrc(
formula = as.formula(paste0('Surv(',time,',', status,') ~ .')),
data = train_data$ranger_data,
splitrule = 'logrankscore'
)
verbose_cat(' ', crayon::red('\u2716'), 'rfsrc: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
} else {
if (return_params == TRUE) {
models_params$ranger_params$ntree <- as.integer(ParBayesianOptimization::getBestPars(bayes)$ntree)
models_params$ranger_params$nodesize <- as.integer(ParBayesianOptimization::getBestPars(bayes)$nodesize)
models_params$ranger_params$nsplit <- as.integer(ParBayesianOptimization::getBestPars(bayes)$nsplit)
}
rfsrc_model <- randomForestSRC::rfsrc(
formula = as.formula(paste0('Surv(',time,',', status,') ~ .')),
data = train_data$ranger_data,
na.action = 'na.omit',
ntree = as.integer(ParBayesianOptimization::getBestPars(bayes)$ntree),
nodesize = as.integer(ParBayesianOptimization::getBestPars(bayes)$nodesize),
nsplit = as.integer(ParBayesianOptimization::getBestPars(bayes)$nsplit),
splitrule = 'logrankscore'
)
verbose_cat(' ', crayon::green('\u2714'), 'rfsrc: Bayesian Optimization was successful!\n', verbose = verbose)
}
t1 <- as.numeric(Sys.time())
verbose_cat(' ', crayon::green('\u2714'), 'rfsrc: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
    if (return_params == TRUE) {
      return_list <- list(
        rfsrc_bayes = rfsrc_model,
        models_params = models_params
      )
    } else {
      return_list <- list(
        rfsrc_bayes = rfsrc_model
      )
    }
return(return_list)
}
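  # For non-survival tasks, tune each requested engine in turn.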
  for (i in seq_along(engine)) {
if (engine[i] == 'ranger') {
verbose_cat(' ', crayon::green('\u2714'), 'ranger: Starting training procedure.\n', verbose = verbose)
t0 <- as.numeric(Sys.time())
if (type == 'regression') {
classification <- FALSE
probability <- FALSE
} else if (type %in% c('binary_clf', 'multiclass')) {
classification <- TRUE
probability <- TRUE
      } else {
        verbose_cat('Incorrect task type.', verbose = verbose)
        stop('Incorrect task type.')
      }
fitness_fun_ranger <- function(num.trees, min.node.size, max.depth, sample.fraction) {
model <- ranger::ranger(
dependent.variable.name = y,
data = train_data$ranger_data,
num.trees = num.trees,
min.node.size = min.node.size,
max.depth = max.depth,
sample.fraction = sample.fraction,
classification = classification,
probability = probability
)
if (type == 'regression') {
preds <- ranger::predictions(predict(model, test_data$ranger_data))
} else if (type == 'binary_clf') {
preds <- predict(model, test_data$ranger_data)$predictions[, 2]
} else if (type == 'multiclass') {
predicts <- ranger::predictions(predict(model, test_data$ranger_data))
preds <- c()
for (j in 1:nrow(predicts)) {
preds <- c(preds, which.max(unname(predicts[j, ])))
}
}
observed <- test_data$ranger_data[, y]
max_metric <- NULL
if (type == 'regression') {
max_metric <- - model_performance_rmse(preds, observed) # rmse
} else if (type == 'binary_clf') {
y_levels <- levels(factor(train_data$ranger_data[, y]))
preds <- factor(1 * (preds > 0.5), levels = c(0, 1), labels = y_levels)
max_metric <- mean(preds == observed) # accuracy
} else if (type == 'multiclass') {
max_metric <- mean(preds == observed) # accuracy
}
return(list(Score = as.numeric(max_metric)))
}
bounds <- list(num.trees = c(5L, 1000L),
min.node.size = c(1L, 10L),
max.depth = c(1L, 100L),
sample.fraction = c(0.25, 0.75))
bayes <- NULL
tryCatch(
expr = {
if (verbose) {
suppressWarnings(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_ranger,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = bayes_info[[1]],
parallel = parallel,
plotProgress = bayes_info[[2]]))
} else {
suppressWarnings(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_ranger,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = 0,
parallel = parallel))
}
},
error = function(e) {
print(e)
}
)
if (is.null(bayes)) {
ranger_model <- ranger::ranger(
dependent.variable.name = y,
data = train_data$ranger_data,
classification = classification,
probability = probability
)
verbose_cat(' ', crayon::red('\u2716'), 'ranger: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
} else {
if (return_params == TRUE) {
models_params$ranger_params$num.trees <- as.integer(ParBayesianOptimization::getBestPars(bayes)$num.trees)
models_params$ranger_params$min.node.size <- as.integer(ParBayesianOptimization::getBestPars(bayes)$min.node.size)
models_params$ranger_params$max.depth <- as.integer(ParBayesianOptimization::getBestPars(bayes)$max.depth)
models_params$ranger_params$sample.fraction <- ParBayesianOptimization::getBestPars(bayes)$sample.fraction
}
ranger_model <- ranger::ranger(
dependent.variable.name = y,
data = train_data$ranger_data,
num.trees = as.integer(ParBayesianOptimization::getBestPars(bayes)$num.trees),
min.node.size = as.integer(ParBayesianOptimization::getBestPars(bayes)$min.node.size),
max.depth = as.integer(ParBayesianOptimization::getBestPars(bayes)$max.depth),
sample.fraction = ParBayesianOptimization::getBestPars(bayes)$sample.fraction,
classification = classification,
probability = probability
)
verbose_cat(' ', crayon::green('\u2714'), 'ranger: Bayesian Optimization was successful!\n', verbose = verbose)
}
t1 <- as.numeric(Sys.time())
verbose_cat(' ', crayon::green('\u2714'), 'ranger: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
}
else if (engine[i] == 'xgboost') {
verbose_cat(' ', crayon::green('\u2714'), 'xgboost: Starting training procedure.\n', verbose = verbose)
t0 <- as.numeric(Sys.time())
if (type == 'binary_clf') {
objective <- 'binary:logistic'
eval_metric <- 'auc'
params <- list(objective = objective, eval_metric = eval_metric)
} else if (type == 'regression') {
objective <- 'reg:squarederror'
eval_metric <- 'rmse'
params <- list(objective = objective, eval_metric = eval_metric)
} else if (type == 'multiclass') {
objective <- 'multi:softprob'
eval_metric <- 'merror'
        num_class <- length(unique(as.vector(train_data$ranger_data[[y]])))
params <- list(objective = objective, eval_metric = eval_metric, num_class = num_class)
      } else {
        verbose_cat('Incorrect task type.', verbose = verbose)
        stop('Incorrect task type.')
      }
fitness_fun_xgboost <- function(nrounds, eta, subsample, gamma, max_depth) {
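        # Note: the "- 1" below shifts the 1-based factor codes to the 0-based
        # labels that xgboost expects for classification tasks.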
capture.output(
model <- xgboost::xgboost(
train_data$xgboost_data,
nrounds = nrounds,
verbose = 1,
label = as.vector(as.numeric(train_data$ranger_data[[y]])) - 1,
params = params,
eta = eta,
subsample = subsample,
gamma = gamma,
max_depth = max_depth))
if (type %in% c('binary_clf', 'regression')) {
preds <- predict(model, test_data$xgboost_data, type = 'prob')
} else if (type == 'multiclass') {
predicts <- predict(model, test_data$xgboost_data)
predicts <- matrix(predicts, ncol = length(unique(test_data$ranger_data[[y]])), byrow = TRUE)
preds <- c()
for (j in 1:nrow(predicts)) {
preds <- c(preds, which.max(unname(predicts[j, ])))
}
}
observed <- test_data$ranger_data[, y]
max_metric <- NULL
if (type == 'regression') {
max_metric <- - model_performance_rmse(preds, observed) # rmse
} else if (type == 'binary_clf') {
y_levels <- levels(factor(train_data$ranger_data[, y]))
preds <- factor(1 * (preds > 0.5), levels = c(0, 1), labels = y_levels)
max_metric <- mean(preds == observed) # accuracy
} else if (type == 'multiclass') {
max_metric <- mean(preds == observed) # accuracy
}
return(list(Score = as.numeric(max_metric)))
}
bounds <- list(nrounds = c(5L, 100L),
eta = c(0.01, 0.5),
subsample = c(0.7, 1),
gamma = c(0, 10),
max_depth = c(1L, 10L))
bayes <- NULL
tryCatch(
expr = {
if (verbose) {
suppressWarnings(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_xgboost,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = bayes_info[[1]],
parallel = parallel,
plotProgress = bayes_info[[2]]))
} else {
suppressWarnings(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_xgboost,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = 0,
parallel = parallel))
}
},
error = function(e) {
print(e)
}
)
if (is.null(bayes)) {
capture.output(xgboost_model <- xgboost::xgboost(train_data$xgboost_data,
label = as.vector(as.numeric(train_data$ranger_data[[y]])) - 1,
params = params,
nrounds = 5000,
verbose = 1))
verbose_cat(' ', crayon::red('\u2716'), 'xgboost: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
} else {
if (return_params == TRUE) {
models_params$xgboost_params$nrounds <- as.integer(ParBayesianOptimization::getBestPars(bayes)$nrounds)
models_params$xgboost_params$eta <- ParBayesianOptimization::getBestPars(bayes)$eta
models_params$xgboost_params$subsample <- ParBayesianOptimization::getBestPars(bayes)$subsample
models_params$xgboost_params$gamma <- ParBayesianOptimization::getBestPars(bayes)$gamma
models_params$xgboost_params$max_depth <- as.integer(ParBayesianOptimization::getBestPars(bayes)$max_depth)
}
capture.output(
xgboost_model <- xgboost::xgboost(train_data$xgboost_data,
label = as.vector(as.numeric(train_data$ranger_data[[y]])) - 1,
verbose = 1,
params = params,
nrounds = as.integer(ParBayesianOptimization::getBestPars(bayes)$nrounds),
eta = ParBayesianOptimization::getBestPars(bayes)$eta,
subsample = ParBayesianOptimization::getBestPars(bayes)$subsample,
gamma = ParBayesianOptimization::getBestPars(bayes)$gamma,
max_depth = as.integer(ParBayesianOptimization::getBestPars(bayes)$max_depth)))
verbose_cat(' ', crayon::green('\u2714'), 'xgboost: Bayesian Optimization was successful!\n', verbose = verbose)
}
t1 <- as.numeric(Sys.time())
verbose_cat(' ', crayon::green('\u2714'), 'xgboost: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
}
else if (engine[i] == 'decision_tree') {
verbose_cat(' ', crayon::green('\u2714'), 'decision_tree: Starting training procedure.\n', verbose = verbose)
t0 <- as.numeric(Sys.time())
form <- as.formula(paste0(y, ' ~.'))
fitness_fun_decision_tree <- function(minsplit, minprob, maxdepth, nresample) {
model <- partykit::ctree(form, data = train_data$decision_tree_data,
minsplit = minsplit,
minprob = minprob,
maxdepth = maxdepth,
nresample = nresample
)
if (type %in% c('binary_clf', 'regression')) {
preds <- predict(model, test_data$decision_tree_data)
} else if (type == 'multiclass') {
predicts <- unname(predict(model, test_data$decision_tree_data, type = 'prob'))
predicts <- matrix(predicts, ncol = length(unique(test_data$ranger_data[[y]])), byrow = TRUE)
preds <- c()
for (j in 1:nrow(predicts)) {
preds <- c(preds, which.max(unname(predicts[j, ])))
}
}
observed <- test_data$ranger_data[, y]
max_metric <- NULL
if (type == 'regression') {
max_metric <- - model_performance_rmse(preds, observed) # rmse
} else if (type == 'binary_clf') {
preds <- unname(preds)
max_metric <- mean(preds == observed) # accuracy
} else if (type == 'multiclass') {
max_metric <- mean(preds == observed) # accuracy
}
return(list(Score = as.numeric(max_metric)))
}
bounds <- list(minsplit = c(1L, 60L),
minprob = c(0.01, 1),
maxdepth = c(1L, 20L),
nresample = c(1L, 1000L))
bayes <- NULL
tryCatch(
expr = {
if (verbose) {
suppressWarnings(suppressMessages(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_decision_tree,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = bayes_info[[1]],
parallel = parallel,
plotProgress = bayes_info[[2]])))
} else {
suppressWarnings(suppressMessages(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_decision_tree,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = 0,
                                                         parallel = parallel)))
}
},
error = function(e) {
print(e)
}
)
if (is.null(bayes)) {
decision_tree_model <- partykit::ctree(form, data = train_data$decision_tree_data)
verbose_cat(' ', crayon::red('\u2716'), 'decision_tree: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
} else {
if (return_params == TRUE) {
models_params$decision_tree_params$minsplit <- as.integer(ParBayesianOptimization::getBestPars(bayes)$minsplit)
models_params$decision_tree_params$minprob <- ParBayesianOptimization::getBestPars(bayes)$minprob
models_params$decision_tree_params$maxdepth <- as.integer(ParBayesianOptimization::getBestPars(bayes)$maxdepth)
models_params$decision_tree_params$nresample <- as.integer(ParBayesianOptimization::getBestPars(bayes)$nresample)
}
decision_tree_model <- partykit::ctree(form,
data = train_data$decision_tree_data,
minsplit = as.integer(ParBayesianOptimization::getBestPars(bayes)$minsplit),
minprob = ParBayesianOptimization::getBestPars(bayes)$minprob,
maxdepth = as.integer(ParBayesianOptimization::getBestPars(bayes)$maxdepth),
nresample = as.integer(ParBayesianOptimization::getBestPars(bayes)$nresample))
verbose_cat(' ', crayon::green('\u2714'), 'decision_tree: Bayesian Optimization was successful!\n', verbose = verbose)
}
t1 <- as.numeric(Sys.time())
verbose_cat(' ', crayon::green('\u2714'), 'decision_tree: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
}
else if (engine[i] == 'lightgbm') {
verbose_cat(' ', crayon::green('\u2714'), 'lightgbm: Starting training procedure.\n', verbose = verbose)
t0 <- as.numeric(Sys.time())
fitness_fun_lightgbm <- function(learning_rate, num_leaves, num_iterations) {
if (type == 'binary_clf') {
obj <- 'binary'
metric <- 'accuracy'
params <- list(objective = obj, metric = metric, boosting = 'gbdt')
} else if (type == 'multiclass') {
obj <- 'multiclass'
params <- list(objective = obj, num_class = length(unique(as.vector(train_data$ranger_data[[y]]))))
} else if (type == 'regression') {
obj <- 'regression'
params <- list(objective = obj)
}
params <- append(params, c(
learning_rate = learning_rate,
num_leaves = as.integer(num_leaves),
num_iterations = as.integer(num_iterations)
))
model <- lightgbm::lgb.train(params = params,
data = train_data$lightgbm_data,
verbose = 0)
if (type %in% c('binary_clf', 'regression')) {
preds <- predict(model, test_data$lightgbm_data)
} else if (type == 'multiclass') {
predicts <- predict(model, test_data$lightgbm_data)
preds <- c()
for (j in 1:nrow(predicts)) {
preds <- c(preds, which.max(unname(predicts[j, ])))
}
}
observed <- test_data$ranger_data[, y]
max_metric <- NULL
if (type == 'regression') {
max_metric <- - model_performance_rmse(preds, observed) # rmse
} else if (type == 'binary_clf') {
y_levels <- levels(factor(train_data$ranger_data[, y]))
preds <- factor(1 * (preds > 0.5), levels = c(0, 1), labels = y_levels)
max_metric <- mean(preds == observed) # accuracy
} else if (type == 'multiclass') {
max_metric <- mean(preds == observed) # accuracy
}
return(list(Score = as.numeric(max_metric)))
}
bounds <- list(learning_rate = c(0.01, 0.5),
num_leaves = c(2L, 50L),
num_iterations = c(5L, 100L))
bayes <- NULL
tryCatch(
expr = {
if (verbose) {
suppressWarnings(suppressMessages(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_lightgbm,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = bayes_info[[1]],
parallel = FALSE,
plotProgress = bayes_info[[2]])))
} else {
suppressWarnings(suppressMessages(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_lightgbm,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = 0,
parallel = FALSE)))
}
},
error = function(e) {
print(e)
})
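      # Rebuild the base `params`, as the copy inside the fitness function was
      # local to its scope.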
if (type == 'binary_clf') {
obj <- 'binary'
metric <- 'accuracy'
params <- list(objective = obj, metric = metric, boosting = 'gbdt')
} else if (type == 'multiclass') {
obj <- 'multiclass'
params <- list(objective = obj, num_class = length(unique(as.vector(train_data$ranger_data[[y]]))))
} else if (type == 'regression') {
obj <- 'regression'
params <- list(objective = obj)
}
if (is.null(bayes)) {
lightgbm_model <- lightgbm::lgb.train(params = params,
data = train_data$lightgbm_data,
verbose = -1)
verbose_cat(' ', crayon::red('\u2716'), 'lightgbm: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
} else {
if (return_params == TRUE) {
models_params$lightgbm_params$learning_rate <- ParBayesianOptimization::getBestPars(bayes)$learning_rate
models_params$lightgbm_params$num_leaves <- as.integer(ParBayesianOptimization::getBestPars(bayes)$num_leaves)
models_params$lightgbm_params$num_iterations <- as.integer(ParBayesianOptimization::getBestPars(bayes)$num_iterations)
}
        params <- append(params, c(
learning_rate = ParBayesianOptimization::getBestPars(bayes)$learning_rate,
num_leaves = as.integer(ParBayesianOptimization::getBestPars(bayes)$num_leaves),
num_iterations = as.integer(ParBayesianOptimization::getBestPars(bayes)$num_iterations)))
lightgbm_model <- lightgbm::lgb.train(params = params,
data = train_data$lightgbm_data,
verbose = -1)
verbose_cat(' ', crayon::green('\u2714'), 'lightgbm: Bayesian Optimization was successful!\n', verbose = verbose)
}
t1 <- as.numeric(Sys.time())
verbose_cat(' ', crayon::green('\u2714'), 'lightgbm: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
}
else if (engine[i] == 'catboost') {
verbose_cat(' ', crayon::green('\u2714'), 'catboost: Starting training procedure.\n', verbose = verbose)
t0 <- as.numeric(Sys.time())
fitness_fun_catboost <- function(iterations, border_count, depth, learning_rate, min_data_in_leaf) {
if (type == 'binary_clf') {
obj <- 'Logloss'
params <- list(loss_function = obj, logging_level = 'Silent')
} else if (type == 'multiclass') {
obj <- 'MultiClass'
params <- list(loss_function = obj, logging_level = 'Silent')
} else if (type == 'regression') {
obj <- 'RMSE'
params <- list(loss_function = obj, logging_level = 'Silent')
}
params <- append(params, c(
iterations = as.integer(iterations),
border_count = as.integer(border_count),
depth = as.integer(depth),
learning_rate = learning_rate,
min_data_in_leaf = as.integer(min_data_in_leaf)))
capture.output(model <- catboost::catboost.train(train_data$catboost_data, params = params))
if (type == 'binary_clf') {
preds <- (catboost::catboost.predict(model,
test_data$catboost_data,
prediction_type = 'Probability'))
} else if (type == 'regression') {
preds <- (catboost::catboost.predict(model,
test_data$catboost_data,
prediction_type = 'RawFormulaVal'))
} else if (type == 'multiclass') {
predicts <- catboost::catboost.predict(model,
test_data$catboost_data,
prediction_type = 'Probability')
preds <- c()
for (j in 1:nrow(predicts)) {
preds <- c(preds, which.max(unname(predicts[j, ])))
}
}
observed <- test_data$ranger_data[, y]
max_metric <- NULL
if (type == 'regression') {
max_metric <- - model_performance_rmse(preds, observed) # rmse
} else if (type == 'binary_clf') {
y_levels <- levels(factor(train_data$ranger_data[, y]))
preds <- factor(1 * (preds > 0.5), levels = c(0, 1), labels = y_levels)
max_metric <- mean(preds == observed) # accuracy
} else if (type == 'multiclass') {
max_metric <- mean(preds == observed) # accuracy
}
return(list(Score = as.numeric(max_metric)))
}
bounds <- list(iterations = c(5L, 100L),
border_count = c(64L, 1024L),
depth = c(2L, 16L),
learning_rate = c(0.01, 0.5),
min_data_in_leaf = c(1L, 10L))
bayes <- NULL
tryCatch(
expr = {
if (verbose) {
suppressWarnings(suppressMessages(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_catboost,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = bayes_info[[1]],
parallel = FALSE,
plotProgress = bayes_info[[2]])))
} else {
suppressWarnings(suppressMessages(
bayes <- ParBayesianOptimization::bayesOpt(FUN = fitness_fun_catboost,
bounds = bounds,
initPoints = length(bounds) + 5,
iters.n = iters.n,
verbose = 0,
parallel = FALSE)))
}
},
error = function(e) {
print(e)
}
)
if (type == 'binary_clf') {
obj <- 'Logloss'
params <- list(loss_function = obj, logging_level = 'Silent')
} else if (type == 'multiclass') {
obj <- 'MultiClass'
params <- list(loss_function = obj, logging_level = 'Silent')
} else if (type == 'regression') {
        obj <- 'RMSE'
params <- list(loss_function = obj, logging_level = 'Silent')
}
if (is.null(bayes)) {
capture.output(catboost_model <- catboost::catboost.train(train_data$catboost_data, params = params))
verbose_cat(' ', crayon::red('\u2716'), 'catboost: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
} else {
if (return_params == TRUE) {
models_params$catboost_params$iterations <- as.integer(ParBayesianOptimization::getBestPars(bayes)$iterations)
models_params$catboost_params$border_count <- as.integer(ParBayesianOptimization::getBestPars(bayes)$border_count)
models_params$catboost_params$depth <- as.integer(ParBayesianOptimization::getBestPars(bayes)$depth)
models_params$catboost_params$learning_rate <- ParBayesianOptimization::getBestPars(bayes)$learning_rate
models_params$catboost_params$min_data_in_leaf <- as.integer(ParBayesianOptimization::getBestPars(bayes)$min_data_in_leaf)
}
        params <- append(params, c(
iterations = as.integer(ParBayesianOptimization::getBestPars(bayes)$iterations),
border_count = as.integer(ParBayesianOptimization::getBestPars(bayes)$border_count),
depth = as.integer(ParBayesianOptimization::getBestPars(bayes)$depth),
learning_rate = ParBayesianOptimization::getBestPars(bayes)$learning_rate,
min_data_in_leaf = as.integer(ParBayesianOptimization::getBestPars(bayes)$min_data_in_leaf)))
capture.output(catboost_model <- catboost::catboost.train(train_data$catboost_data, params = params))
verbose_cat(' ', crayon::green('\u2714'), 'catboost: Bayesian Optimization was successful!\n', verbose = verbose)
}
t1 <- as.numeric(Sys.time())
verbose_cat(' ', crayon::green('\u2714'), 'catboost: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
}
}
if (return_params == TRUE) {
# To remove models that are NULL.
return_list <- list(
ranger_bayes = ranger_model,
xgboost_bayes = xgboost_model,
decision_tree_bayes = decision_tree_model,
lightgbm_bayes = lightgbm_model,
catboost_bayes = catboost_model,
models_params = models_params
)
to_rm <- c()
for (i in 1:length(return_list)) {
if (is.null(return_list[[i]])) {
to_rm <- c(to_rm, i)
}
}
if (!is.null(to_rm)) {
return_list <- return_list[-to_rm]
}
return(return_list)
  } else {
# To remove models that are NULL.
return_list <- list(
ranger_bayes = ranger_model,
xgboost_bayes = xgboost_model,
decision_tree_bayes = decision_tree_model,
lightgbm_bayes = lightgbm_model,
catboost_bayes = catboost_model
)
to_rm <- c()
for (i in 1:length(return_list)) {
if (is.null(return_list[[i]])) {
to_rm <- c(to_rm, i)
}
}
if (!is.null(to_rm)) {
return_list <- return_list[-to_rm]
}
return(return_list)
}
}