#' Automatically models poisson targets with random search hyperparameter optimziation.
#' XGBoost model trains with lossguide and histogram tree method to accelerate tuning.
#'
#' @param df_train Training data.frame with column called "target" for training.
#' All columns should be numeric and prepared with a package like vtreat.
#' @param df_test Testing data.frame with column called "target" for evaluation.
#' All columns should be numeric and prepared with a package like vtreat.
#' @param tune_rounds Integer (e.g. 25L) indicating the number of hyperoptimization tuning rounds.
#' @param verbose Print model iterations (T/F).
#' @param max_rounds Maximum number of rounds to use in model fitting.
#' @param cv_folds Integer (e.g. 5L) that sets the number of cross validation folds to use in model tuning.
#' @param early_stopping_rounds Integer (e.g. 10L) that sets xgb.cv early_stopping_rounds parameter.
#' @param folds Allows users to specify their own folds (e.g. stratified folds).
#' @return model, data, and model results
xgb_pois <- function(df_train,
df_test,
tune_rounds = 25L,
verbose = T,
max_rounds = 10^6,
cv_folds = 5L,
early_stopping_rounds = 10L,
folds = NULL) {
#check input values
testit::assert("Training and testing frames are not data.frames.", is.data.frame(df_train) & is.data.frame(df_test))
testit::assert('df_train is missing "target" column.', "target" %in% colnames(df_train))
testit::assert('df_test is missing "target" columns.', "target" %in% colnames(df_test))
testit::assert("All df_test columns are not numeric.", sapply(df_test, is.numeric))
testit::assert("All df_train columns are not numeric.", sapply(df_train, is.numeric))
testit::assert("tune_rounds is not an integer.", is.integer(tune_rounds))
testit::assert("cv_folds is not an integer.", is.integer(cv_folds))
testit::assert("early_stopping_rounds is not an integer.", is.integer(early_stopping_rounds))
testit::assert("df_train has bad values.", nrow(df_train[complete.cases(df_train), ]) == nrow(df_train))
testit::assert("df_test has bad values.", nrow(df_test[complete.cases(df_test), ]) == nrow(df_test))
#break up data between y and x and convert to xgb usable format
y_train <- df_train$target
df_train$target <- NULL
x_train <- df_train %>% as.matrix()
y_test <- df_test$target
df_test$target <- NULL
x_test <- df_test %>% as.matrix()
#setup model paramaters and outputs
params <- generate_xgb_params(tune_rounds = tune_rounds)
if(is.null(folds)){
folds <- generate_xgb_folds(cv_folds, nrow(df_train))
}
#model list holds all of the models we are going to try.
model_list <- list()
for(i in 1:nrow(params)){
xgb_mdl <- xgboost::xgb.cv(
data = x_train,
nrounds = max_rounds,
early_stopping_rounds = early_stopping_rounds,
label = y_train,
folds = folds,
verbose = 0,
params = list(
tree_method = "hist",
objective = "count:poisson",
booster = "gbtree",
grow_policy = "lossguide",
eta = params$eta[i],
gamma = params$gamma[i],
max_depth = params$max_depth[i],
max_leaves = params$max_leaves[i],
subsample = params$subsample[i],
colsample_bytree = params$colsample_bytree[i],
min_child_weight = params$min_child_weight[i])
)
model_list[[i]] <- xgb_mdl
#need to update eval to look at deviance
params$eval[i] <- xgb_mdl$evaluation_log$test_poisson_nloglik_mean[xgb_mdl$best_iteration]
#print output
if(verbose == T) {
flush.console()
print(glue::glue("Tested param set {i} of {tune_rounds}. Eval is: {params$eval[i]}."))
}
}
mdl <- list()
mdl$params_tested <- params
#selects best model and in the case of ties takes the first best model
mdl$best_cv_mdl_res <- model_list[[c(1:nrow(params))[params$eval == min(params$eval)][1]]]
#train final model & evaluate on test
xgb_final <- xgboost::xgboost(params = mdl$best_cv_mdl_res$params,
data = x_train,
label = y_train,
nrounds = max_rounds,
early_stopping_rounds = early_stopping_rounds,
verbose = 0)
mdl$final_mdl <- xgb_final
mdl$folds <- folds
y_hat_test <- predict(xgb_final, x_test)
#calc goodness of fit
mdl$test_residual_deviance <- calc_deviance(y_test, y_hat_test)
mdl$boot_metrics <- generate_deviance_metrics(y_test, y_hat_test, n = 10000)
#input data
mdl$x_test <- x_test
mdl$x_train <- x_train
mdl$y_test <- y_test
mdl$y_train <- y_train
#prediction
mdl$y_hat_train <- predict(xgb_final, x_train, predcontrib = F)
mdl$y_hat_test <- predict(xgb_final, x_test, predcontrib = F)
mdl$contrib_train <- predict(xgb_final, x_train, predcontrib = T)
mdl$contrib_test <- predict(xgb_final, x_test, predcontrib = T)
#return results
return(mdl)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.