# R/train_models_bayesopt.R

#' Train models with Bayesian Optimization algorithm
#'
#' Bayesian Optimization takes a relatively long time: the larger the `iters.n`
#' parameter, the longer it takes. However, if you want model parameters better than
#' the defaults, it is suggested to set `iters.n` to at least 20.
#' The larger the dataset, the longer Bayesian Optimization takes as well.
#'
#' @param train_data Training data for models, created by the `prepare_data()` function.
#' @param y A string that indicates a target column name for regression or classification.
#' Either `y` or the pair `time`, `status` can be used.
#' @param time A string that indicates a time column name for the survival analysis task.
#' Either `y` or the pair `time`, `status` can be used.
#' @param status A string that indicates a status column name for the survival analysis task.
#' Either `y` or the pair `time`, `status` can be used.
#' @param test_data Test data for models, created by the `prepare_data()` function.
#' @param engine A vector of tree-based models that shall be created. Possible
#' values are: `ranger`, `xgboost`, `decision_tree`, `lightgbm`, `catboost`. It doesn't
#' matter for the survival analysis task.
#' @param type A string that determines whether the Machine Learning task is
#' `binary_clf`, `regression`, or `survival`.
#' @param parallel A logical value; if set to TRUE, the function uses parallel computing.
#' By default set to FALSE.
#' @param iters.n The number of optimization iterations run by the `bayesOpt()` function.
#' @param bayes_info A list with two values determining the verbosity of the Bayesian
#' Optimization process. The first value is `verbose`, with 3 levels: 0 - no output;
#' 1 - describes what is happening and whether a local optimum can be reached; 2 - additionally
#' provides information about the recent and the best scores. The second value is
#' `plotProgress`, a logical value indicating whether the progress of the Bayesian
#' Optimization should be plotted. WARNING: it creates a plot after each step, so
#' it might be computationally expensive. Both arguments come from the
#' `ParBayesianOptimization` package. They only matter if the global `verbose` is set to TRUE.
#' Default values are: list(verbose = 0, plotProgress = FALSE).
#' @param return_params A logical value; if set to TRUE, optimized model parameters are returned.
#' @param verbose A logical value; if set to TRUE, provides all information about
#' the process; if FALSE, gives none.
#'
#' @return Trained models with optimized parameters. If `return_params` is `TRUE`, the
#' training parameters are also returned in one list with the models.
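#' @examples
#' # A minimal usage sketch (not run). The exact `prepare_data()` calls below are
#' # an assumption -- check its signature before copying this verbatim.
#' \dontrun{
#' data(iris)
#' iris_bin         <- iris[iris$Species != 'setosa', ]
#' iris_bin$Species <- droplevels(iris_bin$Species)
#' idx       <- sample(nrow(iris_bin), 0.8 * nrow(iris_bin))
#' train_set <- prepare_data(iris_bin[idx, ],  y = 'Species', engine = c('ranger', 'xgboost'))
#' test_set  <- prepare_data(iris_bin[-idx, ], y = 'Species', engine = c('ranger', 'xgboost'))
#' models <- train_models_bayesopt(
#'   train_data    = train_set,
#'   y             = 'Species',
#'   time          = NULL,   # only used for survival tasks
#'   status        = NULL,   # only used for survival tasks
#'   test_data     = test_set,
#'   engine        = c('ranger', 'xgboost'),
#'   type          = 'binary_clf',
#'   iters.n       = 20,
#'   return_params = TRUE
#' )
#' models$models_params$ranger_params  # optimized ranger hyperparameters
#' }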
#' @export
train_models_bayesopt <- function(train_data,
                                  y,
                                  time,
                                  status,
                                  test_data,
                                  engine,
                                  type,
                                  parallel = FALSE,
                                  iters.n = 7,
                                  bayes_info = list(verbose = 0, plotProgress = FALSE),
                                  return_params = FALSE,
                                  verbose = TRUE) {
  if (!is.numeric(iters.n) || as.integer(iters.n) != iters.n) {
    verbose_cat(crayon::red('\u2716'), 'The number of Bayesian Optimization iterations must be an integer. \n\n', verbose = verbose)
    stop('The number of Bayesian Optimization iterations must be an integer.')
  }
  if (iters.n <= 0) {
    verbose_cat(crayon::green('\u2714'), 'Bayesian Optimization was turned off. \n', verbose = verbose)
    return(NULL)
  }

  ranger_model        <- NULL
  xgboost_model       <- NULL
  decision_tree_model <- NULL
  lightgbm_model      <- NULL
  catboost_model      <- NULL

  # Filled per engine when return_params == TRUE; stays NULL otherwise.
  models_params <- NULL

  if (type == 'survival') {
    verbose_cat('   ', crayon::green('\u2714'), 'rfsrc: Starting training procedure.\n', verbose = verbose)
    t0 <- as.numeric(Sys.time())
    fitness_fun_rfsrc <- function(ntree, nodesize, nsplit) {

      # Our optimized metric is the Brier score.

      model <- randomForestSRC::rfsrc(
        formula   = as.formula(paste0('Surv(',time,',', status,') ~ .')),
        # We use ranger data, as rfsrc doesn't need preprocessing.
        data      = train_data$ranger_data,
        na.action = 'na.omit',
        ntree     = ntree,
        nodesize  = nodesize,
        nsplit    = nsplit,
        splitrule = "logrankscore"
      )
      # Brier Score
      pred          <- randomForestSRC::predict.rfsrc(model, test_data$ranger_data)
      predictions   <- pred$survival
      ordered_times <- model$time.interest
      median_idx    <- ceiling(length(ordered_times) / 2) # middle time point; avoids a fractional index
      surv_object   <- survival::Surv(test_data$ranger_data[[time]], test_data$ranger_data[[status]])
      med_time      <- median(ordered_times)
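      # bayesOpt() maximizes the returned Score, so the Brier score (lower is better) is negated.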
      max_metric    <- -SurvMetrics::Brier(object = surv_object, pre_sp = predictions[, median_idx], t_star = med_time)

      return(list(Score = as.numeric(max_metric)))
    }

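    # Integer bounds (the L suffix) make bayesOpt() treat these hyperparameters as
    # integers, while plain numeric bounds are sampled continuously.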
    bounds <- list(
      ntree    = c(5L, 1000L),
      nodesize = c(5L, 30L),
      nsplit   = c(1L, 100L)
    )

    bayes <- NULL
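    # Seed the Gaussian process with a few more random initial points than there are
    # tuned hyperparameters, then run iters.n optimization iterations.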
    tryCatch(
      expr = {
        if (verbose) {
          suppressWarnings(
          bayes <- ParBayesianOptimization::bayesOpt(FUN          = fitness_fun_rfsrc,
                                                     bounds       = bounds,
                                                     initPoints   = length(bounds) + 5,
                                                     iters.n      = iters.n,
                                                     verbose      = bayes_info[[1]],
                                                     parallel     = parallel,
                                                     plotProgress = bayes_info[[2]]))
        } else {
          capture.output(
          bayes <- ParBayesianOptimization::bayesOpt(FUN        = fitness_fun_rfsrc,
                                                     bounds     = bounds,
                                                     initPoints = length(bounds) + 5,
                                                     iters.n    = iters.n,
                                                     verbose    = 0,
                                                     parallel   = parallel))
        }
      },
      error = function(e) {
        print(e)
      }
    )

    if (is.null(bayes)) {
      rfsrc_model <- randomForestSRC::rfsrc(
        formula   = as.formula(paste0('Surv(',time,',', status,') ~ .')),
        data      = train_data$ranger_data,
        splitrule = 'logrankscore'
      )
      verbose_cat('   ', crayon::red('\u2716'), 'rfsrc: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
    } else {
      if (return_params == TRUE) {
        models_params$ranger_params$ntree    <- as.integer(ParBayesianOptimization::getBestPars(bayes)$ntree)
        models_params$ranger_params$nodesize <- as.integer(ParBayesianOptimization::getBestPars(bayes)$nodesize)
        models_params$ranger_params$nsplit   <- as.integer(ParBayesianOptimization::getBestPars(bayes)$nsplit)
      }
      rfsrc_model <- randomForestSRC::rfsrc(
        formula   = as.formula(paste0('Surv(',time,',', status,') ~ .')),
        data      = train_data$ranger_data,
        na.action = 'na.omit',
        ntree     = as.integer(ParBayesianOptimization::getBestPars(bayes)$ntree),
        nodesize  = as.integer(ParBayesianOptimization::getBestPars(bayes)$nodesize),
        nsplit    = as.integer(ParBayesianOptimization::getBestPars(bayes)$nsplit),
        splitrule = 'logrankscore'
      )
      verbose_cat('   ', crayon::green('\u2714'), 'rfsrc: Bayesian Optimization was successful!\n', verbose = verbose)
    }
    t1 <- as.numeric(Sys.time())
    verbose_cat('   ', crayon::green('\u2714'), 'rfsrc: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)

    if (return_params == TRUE) {
      return_list <- list(
        rfsrc_bayes   = rfsrc_model,
        models_params = models_params
      )
    } else {
      return_list <- list(
        rfsrc_bayes = rfsrc_model
      )
    }
    return(return_list)
  }

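  # For non-survival tasks, run the tuning procedure for each requested engine in turn.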
  for (i in seq_along(engine)) {
    if (engine[i] == 'ranger') {
      verbose_cat('   ', crayon::green('\u2714'), 'ranger: Starting training procedure.\n', verbose = verbose)
      t0 <- as.numeric(Sys.time())
      if (type == 'regression') {
        classification <- FALSE
        probability    <- FALSE
      } else if (type %in% c('binary_clf', 'multiclass')) {
        classification <- TRUE
        probability    <- TRUE
      } else {
        stop('Incorrect task type.')
      }

      fitness_fun_ranger <- function(num.trees, min.node.size, max.depth, sample.fraction) {

        model <- ranger::ranger(
          dependent.variable.name = y,
          data            = train_data$ranger_data,
          num.trees       = num.trees,
          min.node.size   = min.node.size,
          max.depth       = max.depth,
          sample.fraction = sample.fraction,
          classification  = classification,
          probability     = probability
        )

        if (type == 'regression') {
          preds    <- ranger::predictions(predict(model, test_data$ranger_data))
        } else if (type == 'binary_clf') {
          preds    <- predict(model, test_data$ranger_data)$predictions[, 2]
        } else if (type == 'multiclass') {
          predicts <- ranger::predictions(predict(model, test_data$ranger_data))
          preds <- apply(predicts, 1, which.max) # row-wise argmax: predicted class index
        }
        observed   <- test_data$ranger_data[, y]
        max_metric <- NULL

        if (type == 'regression') {
          max_metric <- - model_performance_rmse(preds, observed) # rmse
        } else if (type == 'binary_clf') {
          y_levels   <- levels(factor(train_data$ranger_data[, y]))
          preds      <- factor(1 * (preds > 0.5), levels = c(0, 1), labels = y_levels)
          max_metric <- mean(preds == observed) # accuracy
        } else if (type == 'multiclass') {
          max_metric <- mean(preds == observed) # accuracy
        }

        return(list(Score = as.numeric(max_metric)))
      }

      bounds <- list(num.trees       = c(5L, 1000L),
                     min.node.size   = c(1L, 10L),
                     max.depth       = c(1L, 100L),
                     sample.fraction = c(0.25, 0.75))

      bayes <- NULL
      tryCatch(
        expr = {
          if (verbose) {
            suppressWarnings(
            bayes <- ParBayesianOptimization::bayesOpt(FUN          = fitness_fun_ranger,
                                                       bounds       = bounds,
                                                       initPoints   = length(bounds) + 5,
                                                       iters.n      = iters.n,
                                                       verbose      = bayes_info[[1]],
                                                       parallel     = parallel,
                                                       plotProgress = bayes_info[[2]]))
          } else {
            suppressWarnings(
            bayes <- ParBayesianOptimization::bayesOpt(FUN        = fitness_fun_ranger,
                                                       bounds     = bounds,
                                                       initPoints = length(bounds) + 5,
                                                       iters.n    = iters.n,
                                                       verbose    = 0,
                                                       parallel   = parallel))
          }
        },
        error = function(e) {
          print(e)
        }
      )

      if (is.null(bayes)) {
        ranger_model <- ranger::ranger(
          dependent.variable.name = y,
          data            = train_data$ranger_data,
          classification  = classification,
          probability     = probability
        )
        verbose_cat('   ', crayon::red('\u2716'), 'ranger: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
      } else {
        if (return_params == TRUE) {
          models_params$ranger_params$num.trees       <- as.integer(ParBayesianOptimization::getBestPars(bayes)$num.trees)
          models_params$ranger_params$min.node.size   <- as.integer(ParBayesianOptimization::getBestPars(bayes)$min.node.size)
          models_params$ranger_params$max.depth       <- as.integer(ParBayesianOptimization::getBestPars(bayes)$max.depth)
          models_params$ranger_params$sample.fraction <- ParBayesianOptimization::getBestPars(bayes)$sample.fraction
        }
        ranger_model <- ranger::ranger(
          dependent.variable.name = y,
          data            = train_data$ranger_data,
          num.trees       = as.integer(ParBayesianOptimization::getBestPars(bayes)$num.trees),
          min.node.size   = as.integer(ParBayesianOptimization::getBestPars(bayes)$min.node.size),
          max.depth       = as.integer(ParBayesianOptimization::getBestPars(bayes)$max.depth),
          sample.fraction = ParBayesianOptimization::getBestPars(bayes)$sample.fraction,
          classification  = classification,
          probability     = probability
        )
        verbose_cat('   ', crayon::green('\u2714'), 'ranger: Bayesian Optimization was successful!\n', verbose = verbose)
      }
      t1 <- as.numeric(Sys.time())
      verbose_cat('   ', crayon::green('\u2714'), 'ranger: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
    }
    else if (engine[i] == 'xgboost') {
      verbose_cat('   ', crayon::green('\u2714'), 'xgboost: Starting training procedure.\n', verbose = verbose)
      t0 <- as.numeric(Sys.time())
      if (type == 'binary_clf') {
        objective   <- 'binary:logistic'
        eval_metric <- 'auc'
        params      <- list(objective = objective, eval_metric = eval_metric)
      } else if (type == 'regression') {
        objective   <- 'reg:squarederror'
        eval_metric <- 'rmse'
        params      <- list(objective = objective, eval_metric = eval_metric)
      } else if (type == 'multiclass') {
        objective   <- 'multi:softprob'
        eval_metric <- 'merror'
        num_class   <- length(unique(as.vector(train_data$ranger_data[[y]])))
        params      <- list(objective = objective, eval_metric = eval_metric, num_class = num_class)
      } else {
        stop('Incorrect task type.')
      }

      # NOTE: classification targets are assumed to be factors, so their integer codes
      # minus one give 0-based labels; for regression the raw target is used unshifted.
      xgb_label <- if (type == 'regression') {
        as.numeric(train_data$ranger_data[[y]])
      } else {
        as.numeric(train_data$ranger_data[[y]]) - 1
      }

      fitness_fun_xgboost <- function(nrounds, eta, subsample, gamma, max_depth) {
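        # capture.output() swallows xgboost's per-round training log so tuning stays quiet.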
        capture.output(
          model <- xgboost::xgboost(
            train_data$xgboost_data,
            nrounds   = nrounds,
            verbose   = 1,
            label     = xgb_label,
            params    = params,
            eta       = eta,
            subsample = subsample,
            gamma     = gamma,
            max_depth = max_depth))

        if (type %in% c('binary_clf', 'regression')) {
          preds    <- predict(model, test_data$xgboost_data, type = 'prob')
        } else if (type == 'multiclass') {
          predicts <- predict(model, test_data$xgboost_data)
          predicts <- matrix(predicts, ncol = num_class, byrow = TRUE)
          preds    <- apply(predicts, 1, which.max) # row-wise argmax
        }
        observed   <- test_data$ranger_data[, y]
        max_metric <- NULL

        if (type == 'regression') {
          max_metric <- - model_performance_rmse(preds, observed) # rmse
        } else if (type == 'binary_clf') {
          y_levels   <- levels(factor(train_data$ranger_data[, y]))
          preds      <- factor(1 * (preds > 0.5), levels = c(0, 1), labels = y_levels)
          max_metric <- mean(preds == observed) # accuracy
        } else if (type == 'multiclass') {
          max_metric <- mean(preds == observed) # accuracy
        }

        return(list(Score = as.numeric(max_metric)))
      }
      bounds <- list(nrounds   = c(5L, 100L),
                     eta       = c(0.01, 0.5),
                     subsample = c(0.7, 1),
                     gamma     = c(0, 10),
                     max_depth = c(1L, 10L))

      bayes <- NULL
      tryCatch(
        expr = {
          if (verbose) {
            suppressWarnings(
            bayes <- ParBayesianOptimization::bayesOpt(FUN          = fitness_fun_xgboost,
                                                       bounds       = bounds,
                                                       initPoints   = length(bounds) + 5,
                                                       iters.n      = iters.n,
                                                       verbose      = bayes_info[[1]],
                                                       parallel     = parallel,
                                                       plotProgress = bayes_info[[2]]))
          } else {
            suppressWarnings(
            bayes <- ParBayesianOptimization::bayesOpt(FUN        = fitness_fun_xgboost,
                                                       bounds     = bounds,
                                                       initPoints = length(bounds) + 5,
                                                       iters.n    = iters.n,
                                                       verbose    = 0,
                                                       parallel   = parallel))
          }
        },
        error = function(e) {
          print(e)
        }
      )

      if (is.null(bayes)) {
        capture.output(xgboost_model <- xgboost::xgboost(train_data$xgboost_data,
                                                        label     = xgb_label,
                                                        params    = params,
                                                        nrounds   = 5000,
                                                        verbose   = 1))
        verbose_cat('   ', crayon::red('\u2716'), 'xgboost: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
      } else {
        if (return_params == TRUE) {
          models_params$xgboost_params$nrounds   <- as.integer(ParBayesianOptimization::getBestPars(bayes)$nrounds)
          models_params$xgboost_params$eta       <- ParBayesianOptimization::getBestPars(bayes)$eta
          models_params$xgboost_params$subsample <- ParBayesianOptimization::getBestPars(bayes)$subsample
          models_params$xgboost_params$gamma     <- ParBayesianOptimization::getBestPars(bayes)$gamma
          models_params$xgboost_params$max_depth <- as.integer(ParBayesianOptimization::getBestPars(bayes)$max_depth)
        }
        capture.output(
          xgboost_model <- xgboost::xgboost(train_data$xgboost_data,
                                            label     = xgb_label,
                                            verbose   = 1,
                                            params    = params,
                                            nrounds   = as.integer(ParBayesianOptimization::getBestPars(bayes)$nrounds),
                                            eta       = ParBayesianOptimization::getBestPars(bayes)$eta,
                                            subsample = ParBayesianOptimization::getBestPars(bayes)$subsample,
                                            gamma     = ParBayesianOptimization::getBestPars(bayes)$gamma,
                                            max_depth = as.integer(ParBayesianOptimization::getBestPars(bayes)$max_depth)))
        verbose_cat('   ', crayon::green('\u2714'), 'xgboost: Bayesian Optimization was successful!\n', verbose = verbose)
      }
      t1 <- as.numeric(Sys.time())
      verbose_cat('   ', crayon::green('\u2714'), 'xgboost: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
    }
    else if (engine[i] == 'decision_tree') {
      verbose_cat('   ', crayon::green('\u2714'), 'decision_tree: Starting training procedure.\n', verbose = verbose)
      t0      <- as.numeric(Sys.time())
      form    <- as.formula(paste0(y, ' ~.'))
      fitness_fun_decision_tree <- function(minsplit, minprob, maxdepth, nresample) {
        model <- partykit::ctree(form, data = train_data$decision_tree_data,
                                 minsplit   = minsplit,
                                 minprob    = minprob,
                                 maxdepth   = maxdepth,
                                 nresample  = nresample
        )

        if (type %in% c('binary_clf', 'regression')) {
          preds    <- predict(model, test_data$decision_tree_data)
        } else if (type == 'multiclass') {
          # predict.party() already returns an n x k matrix of class probabilities;
          # reshaping it again would scramble the rows.
          predicts <- predict(model, test_data$decision_tree_data, type = 'prob')
          preds    <- apply(predicts, 1, which.max) # row-wise argmax
        }

        observed   <- test_data$ranger_data[, y]
        max_metric <- NULL

        if (type == 'regression') {
          max_metric <- - model_performance_rmse(preds, observed) # rmse
        } else if (type == 'binary_clf') {
          preds      <- unname(preds)
          max_metric <- mean(preds == observed) # accuracy
        } else if (type == 'multiclass') {
          max_metric <- mean(preds == observed) # accuracy
        }

        return(list(Score = as.numeric(max_metric)))
      }

      bounds <- list(minsplit  = c(1L, 60L),
                     minprob   = c(0.01, 1),
                     maxdepth  = c(1L, 20L),
                     nresample = c(1L, 1000L))

      bayes <- NULL
      tryCatch(
        expr = {
          if (verbose) {
            suppressWarnings(suppressMessages(
            bayes <- ParBayesianOptimization::bayesOpt(FUN          = fitness_fun_decision_tree,
                                                       bounds       = bounds,
                                                       initPoints   = length(bounds) + 5,
                                                       iters.n      = iters.n,
                                                       verbose      = bayes_info[[1]],
                                                       parallel     = parallel,
                                                       plotProgress = bayes_info[[2]])))
          } else {
            suppressWarnings(suppressMessages(
            bayes <- ParBayesianOptimization::bayesOpt(FUN        = fitness_fun_decision_tree,
                                                       bounds     = bounds,
                                                       initPoints = length(bounds) + 5,
                                                       iters.n    = iters.n,
                                                       verbose    = 0,
                                                       parallel   = parallel)))
          }
        },
        error = function(e) {
          print(e)
        }
      )

      if (is.null(bayes)) {
        decision_tree_model <- partykit::ctree(form, data = train_data$decision_tree_data)
        verbose_cat('   ', crayon::red('\u2716'), 'decision_tree: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
      } else {
        if (return_params == TRUE) {
          models_params$decision_tree_params$minsplit  <- as.integer(ParBayesianOptimization::getBestPars(bayes)$minsplit)
          models_params$decision_tree_params$minprob   <- ParBayesianOptimization::getBestPars(bayes)$minprob
          models_params$decision_tree_params$maxdepth  <- as.integer(ParBayesianOptimization::getBestPars(bayes)$maxdepth)
          models_params$decision_tree_params$nresample <- as.integer(ParBayesianOptimization::getBestPars(bayes)$nresample)
        }
        decision_tree_model <- partykit::ctree(form,
                                               data      = train_data$decision_tree_data,
                                               minsplit  = as.integer(ParBayesianOptimization::getBestPars(bayes)$minsplit),
                                               minprob   = ParBayesianOptimization::getBestPars(bayes)$minprob,
                                               maxdepth  = as.integer(ParBayesianOptimization::getBestPars(bayes)$maxdepth),
                                               nresample = as.integer(ParBayesianOptimization::getBestPars(bayes)$nresample))
        verbose_cat('   ', crayon::green('\u2714'), 'decision_tree: Bayesian Optimization was successful!\n', verbose = verbose)
      }
      t1 <- as.numeric(Sys.time())
      verbose_cat('   ', crayon::green('\u2714'), 'decision_tree: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
    }
    else if (engine[i] == 'lightgbm') {
      verbose_cat('   ', crayon::green('\u2714'), 'lightgbm: Starting training procedure.\n', verbose = verbose)
      t0                  <- as.numeric(Sys.time())
      fitness_fun_lightgbm <- function(learning_rate, num_leaves, num_iterations) {

        if (type == 'binary_clf') {
          obj    <- 'binary'
          metric <- 'accuracy'
          params <- list(objective = obj, metric = metric, boosting = 'gbdt')
        } else if (type == 'multiclass') {
          obj    <- 'multiclass'
          params <- list(objective = obj, num_class = length(unique(as.vector(train_data$ranger_data[[y]]))))
        } else if (type == 'regression') {
          obj    <- 'regression'
          params <- list(objective = obj)
        }
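        # Tuned values are passed to lgb.train() through its params list.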
        params <- append(params, c(
          learning_rate  = learning_rate,
          num_leaves     = as.integer(num_leaves),
          num_iterations = as.integer(num_iterations)
          ))
        model <- lightgbm::lgb.train(params  = params,
                                     data    = train_data$lightgbm_data,
                                     verbose = 0)

        if (type %in% c('binary_clf', 'regression')) {
          preds    <- predict(model, test_data$lightgbm_data)
        } else if (type == 'multiclass') {
          predicts <- predict(model, test_data$lightgbm_data)
          preds    <- apply(predicts, 1, which.max) # row-wise argmax
        }
        observed   <- test_data$ranger_data[, y]
        max_metric <- NULL

        if (type == 'regression') {
          max_metric <- - model_performance_rmse(preds, observed) # rmse
        } else if (type == 'binary_clf') {
          y_levels   <- levels(factor(train_data$ranger_data[, y]))
          preds      <- factor(1 * (preds > 0.5), levels = c(0, 1), labels = y_levels)
          max_metric <- mean(preds == observed) # accuracy
        } else if (type == 'multiclass') {
          max_metric <- mean(preds == observed) # accuracy
        }
        return(list(Score = as.numeric(max_metric)))
      }

      bounds <- list(learning_rate  = c(0.01, 0.5),
                     num_leaves     = c(2L, 50L),
                     num_iterations = c(5L, 100L))

      bayes <- NULL

      tryCatch(
        expr = {
          if (verbose) {
            suppressWarnings(suppressMessages(
              bayes <- ParBayesianOptimization::bayesOpt(FUN          = fitness_fun_lightgbm,
                                                         bounds       = bounds,
                                                         initPoints   = length(bounds) + 5,
                                                         iters.n      = iters.n,
                                                         verbose      = bayes_info[[1]],
                                                         parallel     = FALSE,
                                                         plotProgress = bayes_info[[2]])))
          } else {
            suppressWarnings(suppressMessages(
            bayes <- ParBayesianOptimization::bayesOpt(FUN        = fitness_fun_lightgbm,
                                                       bounds     = bounds,
                                                       initPoints = length(bounds) + 5,
                                                       iters.n    = iters.n,
                                                       verbose    = 0,
                                                       parallel   = FALSE)))
          }
        },
        error = function(e) {
          print(e)
        }
      )

      if (type == 'binary_clf') {
        obj    <- 'binary'
        metric <- 'accuracy'
        params <- list(objective = obj, metric = metric, boosting = 'gbdt')
      } else if (type == 'multiclass') {
        obj    <- 'multiclass'
        params <- list(objective = obj, num_class = length(unique(as.vector(train_data$ranger_data[[y]]))))
      } else if (type == 'regression') {
        obj    <- 'regression'
        params <- list(objective = obj)
      }
      if (is.null(bayes)) {
        lightgbm_model <- lightgbm::lgb.train(params  = params,
                                              data    = train_data$lightgbm_data,
                                              verbose = -1)
        verbose_cat('   ', crayon::red('\u2716'), 'lightgbm: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
      } else {
        if (return_params == TRUE) {
          models_params$lightgbm_params$learning_rate  <- ParBayesianOptimization::getBestPars(bayes)$learning_rate
          models_params$lightgbm_params$num_leaves     <- as.integer(ParBayesianOptimization::getBestPars(bayes)$num_leaves)
          models_params$lightgbm_params$num_iterations <- as.integer(ParBayesianOptimization::getBestPars(bayes)$num_iterations)
        }
        params <- append(params, c(
          learning_rate  = ParBayesianOptimization::getBestPars(bayes)$learning_rate,
          num_leaves     = as.integer(ParBayesianOptimization::getBestPars(bayes)$num_leaves),
          num_iterations = as.integer(ParBayesianOptimization::getBestPars(bayes)$num_iterations)))

        lightgbm_model <- lightgbm::lgb.train(params  = params,
                                              data    = train_data$lightgbm_data,
                                              verbose = -1)
        verbose_cat('   ', crayon::green('\u2714'), 'lightgbm: Bayesian Optimization was successful!\n', verbose = verbose)
      }
      t1 <- as.numeric(Sys.time())
      verbose_cat('   ', crayon::green('\u2714'), 'lightgbm: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
    }
    else if (engine[i] == 'catboost') {
      verbose_cat('   ', crayon::green('\u2714'), 'catboost: Starting training procedure.\n', verbose = verbose)
      t0                   <- as.numeric(Sys.time())
      fitness_fun_catboost <- function(iterations, border_count, depth, learning_rate, min_data_in_leaf) {
        if (type == 'binary_clf') {
          obj    <- 'Logloss'
          params <- list(loss_function = obj, logging_level = 'Silent')
        } else if (type == 'multiclass') {
          obj    <- 'MultiClass'
          params <- list(loss_function = obj, logging_level = 'Silent')
        } else if (type == 'regression') {
          obj    <- 'RMSE'
          params <- list(loss_function = obj, logging_level = 'Silent')
        }

        params <- append(params, c(
          iterations       = as.integer(iterations),
          border_count     = as.integer(border_count),
          depth            = as.integer(depth),
          learning_rate    = learning_rate,
          min_data_in_leaf = as.integer(min_data_in_leaf)))

        capture.output(model <- catboost::catboost.train(train_data$catboost_data, params = params))

        if (type == 'binary_clf') {
          preds <- (catboost::catboost.predict(model,
                                               test_data$catboost_data,
                                               prediction_type = 'Probability'))
        } else if (type == 'regression') {
          preds <- (catboost::catboost.predict(model,
                                               test_data$catboost_data,
                                               prediction_type = 'RawFormulaVal'))
        } else if (type == 'multiclass') {
          predicts <- catboost::catboost.predict(model,
                                                 test_data$catboost_data,
                                                 prediction_type = 'Probability')
          preds <- apply(predicts, 1, which.max) # row-wise argmax
        }


        observed     <- test_data$ranger_data[, y]
        max_metric   <- NULL
        if (type == 'regression') {
          max_metric <- - model_performance_rmse(preds, observed) # rmse
        } else if (type == 'binary_clf') {
          y_levels   <- levels(factor(train_data$ranger_data[, y]))
          preds      <- factor(1 * (preds > 0.5), levels = c(0, 1), labels = y_levels)
          max_metric <- mean(preds == observed) # accuracy
        } else if (type == 'multiclass') {
          max_metric <- mean(preds == observed) # accuracy
        }

        return(list(Score = as.numeric(max_metric)))
      }

      bounds <- list(iterations       = c(5L, 100L),
                     border_count     = c(64L, 1024L),
                     depth            = c(2L, 16L),
                     learning_rate    = c(0.01, 0.5),
                     min_data_in_leaf = c(1L, 10L))

      bayes <- NULL
      tryCatch(
        expr = {
          if (verbose) {
            suppressWarnings(suppressMessages(
            bayes <- ParBayesianOptimization::bayesOpt(FUN          = fitness_fun_catboost,
                                                       bounds       = bounds,
                                                       initPoints   = length(bounds) + 5,
                                                       iters.n      = iters.n,
                                                       verbose      = bayes_info[[1]],
                                                       parallel     = FALSE,
                                                       plotProgress = bayes_info[[2]])))
          } else {
            suppressWarnings(suppressMessages(
            bayes <- ParBayesianOptimization::bayesOpt(FUN        = fitness_fun_catboost,
                                                       bounds     = bounds,
                                                       initPoints = length(bounds) + 5,
                                                       iters.n    = iters.n,
                                                       verbose    = 0,
                                                       parallel   = FALSE)))
          }
        },
        error = function(e) {
          print(e)
        }
      )

      if (type == 'binary_clf') {
        obj    <- 'Logloss'
        params <- list(loss_function = obj, logging_level = 'Silent')
      } else if (type == 'multiclass') {
        obj    <- 'MultiClass'
        params <- list(loss_function = obj, logging_level = 'Silent')
      } else if (type == 'regression') {
        obj    <- 'RMSE' # match the loss optimized during tuning
        params <- list(loss_function = obj, logging_level = 'Silent')
      }

      if (is.null(bayes)) {
        capture.output(catboost_model <- catboost::catboost.train(train_data$catboost_data, params = params))
        verbose_cat('   ', crayon::red('\u2716'), 'catboost: Bayesian Optimization failed! The model has default parameters.\n', verbose = verbose)
      } else {
        if (return_params == TRUE) {
          models_params$catboost_params$iterations       <- as.integer(ParBayesianOptimization::getBestPars(bayes)$iterations)
          models_params$catboost_params$border_count     <- as.integer(ParBayesianOptimization::getBestPars(bayes)$border_count)
          models_params$catboost_params$depth            <- as.integer(ParBayesianOptimization::getBestPars(bayes)$depth)
          models_params$catboost_params$learning_rate    <- ParBayesianOptimization::getBestPars(bayes)$learning_rate
          models_params$catboost_params$min_data_in_leaf <- as.integer(ParBayesianOptimization::getBestPars(bayes)$min_data_in_leaf)
        }
        params <- append(params, c(
          iterations       = as.integer(ParBayesianOptimization::getBestPars(bayes)$iterations),
          border_count     = as.integer(ParBayesianOptimization::getBestPars(bayes)$border_count),
          depth            = as.integer(ParBayesianOptimization::getBestPars(bayes)$depth),
          learning_rate    = ParBayesianOptimization::getBestPars(bayes)$learning_rate,
          min_data_in_leaf = as.integer(ParBayesianOptimization::getBestPars(bayes)$min_data_in_leaf)))

        capture.output(catboost_model <- catboost::catboost.train(train_data$catboost_data, params = params))
        verbose_cat('   ', crayon::green('\u2714'), 'catboost: Bayesian Optimization was successful!\n', verbose = verbose)
      }
      t1 <- as.numeric(Sys.time())
      verbose_cat('   ', crayon::green('\u2714'), 'catboost: It took', round(t1 - t0, 2), 'seconds. \n', verbose = verbose)
    }
  }

  return_list <- list(
    ranger_bayes        = ranger_model,
    xgboost_bayes       = xgboost_model,
    decision_tree_bayes = decision_tree_model,
    lightgbm_bayes      = lightgbm_model,
    catboost_bayes      = catboost_model
  )
  if (return_params == TRUE) {
    return_list$models_params <- models_params
  }
  # Drop the engines that were not requested (their slots are still NULL).
  return_list <- Filter(Negate(is.null), return_list)
  return(return_list)
}