# XGBoost models ----------------------------------------------------------
#' Generates cross-validation fold indices for xgboost models
#'
#' @param cv_folds number of cross-validation folds
#' @param nrows number of rows in the training data
#' @return a list of row-index vectors, one per fold
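#' @examples
#' \dontrun{
#' # Illustrative sketch: 5 folds over 200 rows; fold sizes should be roughly equal.
#' folds <- generate_folds(cv_folds = 5, nrows = 200)
#' lengths(folds)
#' }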
generate_folds <- function(cv_folds, nrows) {
  # assign each row to a fold at random, then collect the row indices per fold
  folds_idx <- sample(seq_len(cv_folds), nrows, replace = TRUE)
  folds <- list()
  for (i in seq_len(cv_folds)) {
    folds[[i]] <- which(folds_idx == i)
  }
  folds
}
#' Automatically generates xgboost hyperparameter sets for optimization
#'
#' Parameter ranges follow autoxgboost: https://github.com/ja-thomas/autoxgboost/blob/master/R/autoxgbparset.R
#'
#' @param tune_rounds number of parameter sets to generate
#' @return a data frame with one row of hyperparameters per tuning round
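#' @examples
#' \dontrun{
#' # Illustrative sketch: draw 10 candidate parameter sets and inspect their ranges.
#' params <- generate_params(tune_rounds = 10)
#' summary(params)
#' }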
generate_params <- function(tune_rounds) {
  set.seed(1234)
  # gamma, lambda, and alpha are drawn on the log2 scale, matching autoxgboost's trafo
  out <- data.frame(eta = runif(tune_rounds, 0.01, 0.2),
                    gamma = 2^runif(tune_rounds, -7, 6),
                    max_depth = round(runif(tune_rounds, 3, 20)),
                    subsample = runif(tune_rounds, 0.05, 1),
                    colsample_bytree = runif(tune_rounds, 0.5, 1),
                    colsample_bylevel = runif(tune_rounds, 0.5, 1),
                    lambda = 2^runif(tune_rounds, -10, 10),
                    alpha = 2^runif(tune_rounds, -10, 10))
  out
}
#' Automatically models numeric targets with random-search hyperparameter optimization
#'
#' Tuning parameters mirror autoxgboost: https://liuyanguu.github.io/post/2018/10/03/autoxgboost-bayesian-optimization/
#' and may be converted to Bayesian hyperparameter optimization in the future.
#'
#' @param mdl a list of frames from the treat_numeric_data function
#' @param target_var target variable for prediction
#' @param cv_folds number of cross-validation folds
#' @param tune_rounds number of hyperparameter tuning rounds
#' @param verbose print model iterations (TRUE/FALSE)
#' @param max_rounds maximum number of boosting rounds to use in model fitting
#' @return the input list, augmented with the final model, tuning results, and test-set metrics
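#' @examples
#' \dontrun{
#' # Illustrative sketch: assumes `df` is a data frame with a numeric target, and
#' # that treat_numeric_data() returns a list carrying train/train_x/train_y/test_x/test_y.
#' mdl <- treat_numeric_data(df, target_var = "price")
#' res <- model_numeric(mdl, target_var = "price", cv_folds = 5, tune_rounds = 10)
#' res$test_rmse
#' res$params_tested
#' }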
model_numeric <- function(mdl,
                          target_var,
                          cv_folds = 5,
                          tune_rounds = 100,
                          verbose = TRUE,
                          max_rounds = 10^6) {
  # generate candidate parameter sets
  params <- generate_params(tune_rounds)
  params$eval <- NA
  model_list <- list()
  # specify folds
  folds <- generate_folds(cv_folds, nrow(mdl$train))
  train_mat <- as.matrix(mdl$train_x)
  # test parameter sets via cross-validation
  for (i in seq_len(nrow(params))) {
    xgb_mdl <- xgboost::xgb.cv(data = train_mat,
                               nrounds = max_rounds,
                               early_stopping_rounds = 2L,
                               label = mdl$train_y,
                               folds = folds,
                               verbose = 0,
                               params = list(
                                 tree_method = "hist",
                                 objective = "reg:squarederror",
                                 booster = "gbtree",
                                 grow_policy = "lossguide",
                                 eta = params$eta[i],
                                 gamma = params$gamma[i],
                                 max_depth = params$max_depth[i],
                                 subsample = params$subsample[i],
                                 colsample_bytree = params$colsample_bytree[i],
                                 colsample_bylevel = params$colsample_bylevel[i],
                                 lambda = params$lambda[i],
                                 alpha = params$alpha[i])
    )
    model_list[[i]] <- xgb_mdl
    params$eval[i] <- xgb_mdl$evaluation_log$test_rmse_mean[xgb_mdl$best_iteration]
    if (verbose) {
      flush.console()
      print(glue::glue("Tested param set {i} of {tune_rounds}. Eval is: {params$eval[i]}."))
    }
  }
  mdl$params_tested <- params
  # which.min() breaks ties by taking the first minimum
  mdl$best_cv_mdl_res <- model_list[[which.min(params$eval)]]
  # train final model on the full training set & evaluate on test
  # (early stopping here monitors training RMSE, since no separate watchlist is supplied)
  xgb_final <- xgboost::xgboost(params = mdl$best_cv_mdl_res$params,
                                data = train_mat,
                                label = mdl$train_y,
                                nrounds = max_rounds,
                                early_stopping_rounds = 10L,
                                verbose = 0)
  mdl$final_mdl <- xgb_final
  mdl$test_yhat <- predict(xgb_final, as.matrix(mdl$test_x))
  # calc final test-set results
  mdl$test_rmse <- yardstick::rmse_vec(mdl$test_y, mdl$test_yhat)
  mdl$test_mae <- yardstick::mae_vec(mdl$test_y, mdl$test_yhat)
  mdl$test_rsq <- yardstick::rsq_vec(mdl$test_y, mdl$test_yhat)
  mdl$test_spearman_cor <- cor(mdl$test_y, mdl$test_yhat, method = "spearman")
  # return the augmented model list
  mdl
}