R/xgb_cv_opt.R

##' Bayesian Optimization for XGBoost (Cross Validation)
##'
##' @title Bayesian Optimization for XGBoost (Cross Validation)
##' @param data a data.frame/data.table of training data, or a sparse matrix (\code{dgCMatrix})
##' @param label the unquoted name of the label column (when \code{data} is a data.frame), or a label vector (when \code{data} is a sparse matrix)
##' @param objectfun Specify the learning task and the corresponding learning objective
##' \itemize{
##'     \item \code{reg:linear} linear regression (the xgboost default).
##'     \item \code{reg:logistic} logistic regression.
##'     \item \code{binary:logistic} logistic regression for binary classification. Output probability.
##'     \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
##'     \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
##'     \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
##'     \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
##'   }
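##'   For example, a two-class task would use \code{objectfun = "binary:logistic"}; the
##'   example below uses \code{objectfun = "multi:softmax"} together with \code{classes}.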
##' @param evalmetric evaluation metric for validation data. Users can pass a self-defined function. Default: the metric is assigned according to the objective (rmse for regression, error for classification, mean average precision for ranking).
##' \itemize{
##'   \item \code{error} binary classification error rate
##'   \item \code{rmse} Root mean square error
##'   \item \code{logloss} negative log-likelihood function
##'   \item \code{auc} Area under curve
##'   \item \code{merror} Exact matching error, used to evaluate multi-class classification
##' }
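##'   For multiclass objectives, \code{mlogloss} or \code{merror} are typical choices (the example below uses \code{mlogloss}).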
##' @param n_folds number of folds for cross-validation
##' @param eta_range The range of eta (default is \code{c(0.1, 1L)})
##' @param max_depth_range The range of max_depth (default is \code{c(4L, 6L)})
##' @param nrounds_range The range of nrounds (default is \code{c(70, 160L)})
##' @param subsample_range The range of the subsample rate (default is \code{c(0.1, 1L)})
##' @param bytree_range The range of the colsample_bytree rate (default is \code{c(0.4, 1L)})
##' @param init_points Number of randomly chosen points at which to sample the
##'   target function before the Bayesian optimization fits the Gaussian process.
##' @param n_iter Total number of times the Bayesian optimization is to be repeated.
##' @param acq Acquisition function type to be used. Can be "ucb", "ei" or "poi".
##' \itemize{
##'   \item \code{ucb} GP Upper Confidence Bound
##'   \item \code{ei} Expected Improvement
##'   \item \code{poi} Probability of Improvement
##' }
##' @param kappa Tunable parameter kappa of the GP Upper Confidence Bound, balancing exploitation against exploration;
##'   increasing kappa makes the optimization favour exploration.
##' @param eps Tunable parameter epsilon of Expected Improvement and Probability of Improvement, balancing exploitation against exploration;
##'   increasing epsilon makes the sampled hyperparameters more spread out across the whole range.
##' @param optkernel Kernel (aka correlation function) for the underlying Gaussian Process. This parameter should be a list
##'   that specifies the type of correlation function along with the smoothness parameter. Popular choices are squared exponential (default) or Matern 5/2.
##' @param classes Number of classes. Use only with multiclass objectives.
##' @param seed Random seed (default is 0).
##'
##' @return A list of Bayesian optimization results is returned, with the score measured by the metric given in \code{evalmetric}:
##' \itemize{
##'   \item \code{Best_Par} a named vector of the best hyperparameter set found
##'   \item \code{Best_Value} the value of metrics achieved by the best hyperparameter set
##'   \item \code{History} a \code{data.table} of the Bayesian optimization history
##'   \item \code{Pred} a \code{data.table} with the cross-validation predictions for each round of the Bayesian optimization history
##' }
##' @examples
##' library(MlBayesOpt)
##'
##' suppressWarnings(RNGversion("3.5.0"))
##' set.seed(71)
##' res0 <- xgb_cv_opt(data = iris,
##'                    label = Species,
##'                    objectfun = "multi:softmax",
##'                    evalmetric = "mlogloss",
##'                    n_folds = 3,
##'                    classes = 3,
##'                    init_points = 2,
##'                    n_iter = 1)
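##'
##' # Inspect the returned optimization summary (see the Value section)
##' res0$Best_Par
##' res0$Best_Value
##'
##' # A binary-classification sketch (not run): the two-class iris subset and the
##' # "auc" metric below are illustrative choices, not package defaults.
##' \dontrun{
##' iris2 <- droplevels(iris[iris$Species != "setosa", ])
##' res1 <- xgb_cv_opt(data = iris2,
##'                    label = Species,
##'                    objectfun = "binary:logistic",
##'                    evalmetric = "auc",
##'                    n_folds = 3,
##'                    init_points = 2,
##'                    n_iter = 1)
##' }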
##'
##' @import xgboost
##' @importFrom Matrix sparse.model.matrix
##' @import rBayesianOptimization
##' @importFrom stats predict
##' @importFrom rlang enquo !!
##' @importFrom dplyr select %>%
##' @export
xgb_cv_opt <- function(data,
                       label,
                       objectfun,
                       evalmetric,
                       n_folds,
                       eta_range = c(0.1, 1L),
                       max_depth_range = c(4L, 6L),
                       nrounds_range = c(70, 160L),
                       subsample_range = c(0.1, 1L),
                       bytree_range = c(0.4, 1L),
                       init_points = 4,
                       n_iter = 10,
                       acq = "ei",
                       kappa = 2.576,
                       eps = 0.0,
                       optkernel = list(type = "exponential", power = 2),
                       classes = NULL,
                       seed = 0
)
{
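  # Two input paths: a sparse dgCMatrix with an explicit label vector,
  # or a data.frame whose label column is selected by (unquoted) name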
  if (inherits(data, "dgCMatrix")) {
    dtrain <- xgb.DMatrix(data, label = label)
    xg_watchlist <- list(msr = dtrain)

    cv_folds <- KFold(label, nfolds = n_folds,
                      stratified = TRUE, seed = seed)
  } else {
    # Pull the label column by its (unquoted) name via tidy evaluation
    quolabel <- enquo(label)
    datalabel <- (data %>% select(!! quolabel))[[1]]

    # Build the sparse design matrix from the remaining columns so the
    # label itself does not leak into the features
    mx <- sparse.model.matrix(datalabel ~ ., data = select(data, - !! quolabel))

    # Factor labels are converted to 0-based integer codes, as xgboost expects
    if (is.factor(datalabel)) {
      dtrain <- xgb.DMatrix(mx, label = as.integer(datalabel) - 1)
    } else {
      dtrain <- xgb.DMatrix(mx, label = datalabel)
    }

    xg_watchlist <- list(msr = dtrain)

    cv_folds <- KFold(datalabel, nfolds = n_folds,
                      stratified = TRUE, seed = seed)
  }

  # Logistic-type objectives need no num_class; all other objectives pass `classes` as num_class
  if (grepl("logi", objectfun)) {
    xgb_cv <- function(object_fun,
                       eval_met,
                       num_classes,
                       eta_opt,
                       max_depth_opt,
                       nrounds_opt,
                       subsample_opt,
                       bytree_opt) {

      # objectfun and evalmetric come from the enclosing xgb_cv_opt call; only
      # the *_opt arguments are supplied by BayesianOptimization
      object_fun <- objectfun
      eval_met <- evalmetric

      cv <- xgb.cv(params = list(booster = "gbtree",
                                 nthread = 1,
                                 objective = object_fun,
                                 eval_metric = eval_met,
                                 eta = eta_opt,
                                 max_depth = max_depth_opt,
                                 subsample = subsample_opt,
                                 colsample_bytree = bytree_opt,
                                 lambda = 1, alpha = 0),
                   data = dtrain, folds = cv_folds,
                   watchlist = xg_watchlist,
                   prediction = TRUE, showsd = TRUE,
                   early_stopping_rounds = 5, verbose = 0,
                   # early stopping needs the metric direction: maximize for auc/ndcg/map, minimize otherwise
                   maximize = eval_met %in% c("auc", "ndcg", "map"),
                   nrounds = nrounds_opt)

      # BayesianOptimization always maximizes Score, so metrics where lower is
      # better are negated before taking the best value over the CV rounds
      if (eval_met %in% c("auc", "ndcg", "map")) {
        s <- max(cv$evaluation_log[, 4])
      } else {
        s <- max(-(cv$evaluation_log[, 4]))
      }
      list(Score = s,
           Pred = cv$pred)
    }
  } else{
    xgb_cv <- function(object_fun,
                       eval_met,
                       num_classes,
                       eta_opt,
                       max_depth_opt,
                       nrounds_opt,
                       subsample_opt,
                       bytree_opt) {

      # objectfun, evalmetric and classes come from the enclosing xgb_cv_opt call;
      # only the *_opt arguments are supplied by BayesianOptimization
      object_fun <- objectfun
      eval_met <- evalmetric

      num_classes <- classes

      cv <- xgb.cv(params = list(booster = "gbtree",
                                 nthread = 1,
                                 objective = object_fun,
                                 num_class = num_classes,
                                 eval_metric = eval_met,
                                 eta = eta_opt,
                                 max_depth = max_depth_opt,
                                 subsample = subsample_opt,
                                 colsample_bytree = bytree_opt,
                                 lambda = 1, alpha = 0),
                   data = dtrain, folds = cv_folds,
                   watchlist = xg_watchlist,
                   prediction = TRUE, showsd = TRUE,
                   early_stopping_rounds = 5, verbose = 0,
                   # early stopping needs the metric direction: maximize for auc/ndcg/map, minimize otherwise
                   maximize = eval_met %in% c("auc", "ndcg", "map"),
                   nrounds = nrounds_opt)

      # BayesianOptimization always maximizes Score, so minimizing metrics are negated
      if (eval_met %in% c("auc", "ndcg", "map")) {
        s <- max(cv$evaluation_log[, 4])
      } else {
        s <- max(-(cv$evaluation_log[, 4]))
      }
      list(Score = s,
           Pred = cv$pred)
    }
  }

  # Search the bounded hyperparameter space, maximizing the Score returned by xgb_cv
  opt_res <- BayesianOptimization(xgb_cv,
                                  bounds = list(eta_opt = eta_range,
                                                max_depth_opt = max_depth_range,
                                                nrounds_opt = nrounds_range,
                                                subsample_opt = subsample_range,
                                                bytree_opt = bytree_range),
                                  init_grid_dt = NULL,
                                  init_points = init_points,
                                  n_iter = n_iter,
                                  acq = acq,
                                  kappa = kappa,
                                  eps = eps,
                                  kernel = optkernel,
                                  verbose = TRUE)

  return(opt_res)

}