R/xgb_pois.R

#' Automatically models Poisson targets with random-search hyperparameter optimization.
#' The XGBoost model trains with the lossguide grow policy and the histogram tree method to accelerate tuning.
#'
#' @param df_train Training data.frame with a column called "target" for training.
#' All columns should be numeric and prepared with a package like vtreat.
#' @param df_test Testing data.frame with a column called "target" for evaluation.
#' All columns should be numeric and prepared with a package like vtreat.
#' @param tune_rounds Integer (e.g. 25L) indicating the number of hyperoptimization tuning rounds.
#' @param verbose Logical (TRUE/FALSE); print the result of each tuning round.
#' @param max_rounds Maximum number of rounds to use in model fitting.
#' @param cv_folds Integer (e.g. 5L) that sets the number of cross validation folds to use in model tuning.
#' @param early_stopping_rounds Integer (e.g. 10L) that sets xgb.cv early_stopping_rounds parameter.
#' @param folds Allows users to specify their own folds (e.g. stratified folds).
#' @return A list containing the final fitted model, the cross-validation results, the input data, predictions, and goodness-of-fit metrics.
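#' @examples
#' \dontrun{
#' # Minimal usage sketch; "my_train" and "my_test" are hypothetical
#' # vtreat-prepared numeric data.frames that each contain a "target" column.
#' fit <- xgb_pois(df_train    = my_train,
#'                 df_test     = my_test,
#'                 tune_rounds = 10L,
#'                 cv_folds    = 5L)
#' fit$test_residual_deviance
#' }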

xgb_pois <- function(df_train,
                     df_test,
                     tune_rounds           = 25L,
                     verbose               = TRUE,
                     max_rounds            = 10^6,
                     cv_folds              = 5L,
                     early_stopping_rounds = 10L,
                     folds                 = NULL) {

  #check input values
  testit::assert("Training and testing frames are not data.frames.", is.data.frame(df_train) & is.data.frame(df_test))
  testit::assert('df_train is missing "target" column.', "target" %in% colnames(df_train))
  testit::assert('df_test is missing "target" column.', "target" %in% colnames(df_test))
  testit::assert("All df_test columns are not numeric.", sapply(df_test, is.numeric))
  testit::assert("All df_train columns are not numeric.", sapply(df_train, is.numeric))
  testit::assert("tune_rounds is not an integer.", is.integer(tune_rounds))
  testit::assert("cv_folds is not an integer.", is.integer(cv_folds))
  testit::assert("early_stopping_rounds is not an integer.", is.integer(early_stopping_rounds))
  testit::assert("df_train has bad values.", nrow(df_train[complete.cases(df_train), ]) == nrow(df_train))
  testit::assert("df_test has bad values.", nrow(df_test[complete.cases(df_test), ]) == nrow(df_test))

  #split data into y and x and convert to a matrix format xgboost can use
  y_train <- df_train$target
  df_train$target <- NULL
  x_train <- df_train %>% as.matrix()

  y_test <- df_test$target
  df_test$target <- NULL
  x_test <- df_test %>% as.matrix()

  #set up model parameters and outputs
  params <- generate_xgb_params(tune_rounds = tune_rounds)
  if(is.null(folds)){
    folds <- generate_xgb_folds(cv_folds, nrow(df_train))
  }

  #model list holds all of the models we are going to try.
  model_list <- list()

  for(i in 1:nrow(params)){
    xgb_mdl <- xgboost::xgb.cv(
      data                  = x_train,
      nrounds               = max_rounds,
      early_stopping_rounds = early_stopping_rounds,
      label                 = y_train,
      folds                 = folds,
      verbose               = 0,
      params = list(
        tree_method       = "hist",
        objective         = "count:poisson",
        booster           = "gbtree",
        grow_policy       = "lossguide",
        eta               = params$eta[i],
        gamma             = params$gamma[i],
        max_depth         = params$max_depth[i],
        max_leaves        = params$max_leaves[i],
        subsample         = params$subsample[i],
        colsample_bytree  = params$colsample_bytree[i],
        min_child_weight  = params$min_child_weight[i])
    )
    model_list[[i]] <- xgb_mdl

    #need to update eval to look at deviance
    params$eval[i] <- xgb_mdl$evaluation_log$test_poisson_nloglik_mean[xgb_mdl$best_iteration]

    #print output
    if (isTRUE(verbose)) {
      flush.console()
      print(glue::glue("Tested param set {i} of {tune_rounds}. Eval is: {params$eval[i]}."))
    }

  }

  mdl <- list()
  mdl$params_tested <- params
  #select the best model; in the case of ties, which.min() takes the first best model
  mdl$best_cv_mdl_res <- model_list[[which.min(params$eval)]]

  #train final model & evaluate on test
  xgb_final <- xgboost::xgboost(params                = mdl$best_cv_mdl_res$params,
                                data                  = x_train,
                                label                 = y_train,
                                nrounds               = max_rounds,
                                early_stopping_rounds = early_stopping_rounds,
                                verbose               = 0)

  mdl$final_mdl <- xgb_final
  mdl$folds     <- folds
  y_hat_test    <- predict(xgb_final, x_test)

  #calc goodness of fit
  mdl$test_residual_deviance <- calc_deviance(y_test, y_hat_test)
  mdl$boot_metrics           <- generate_deviance_metrics(y_test, y_hat_test, n = 10000)

  #input data
  mdl$x_test        <- x_test
  mdl$x_train       <- x_train
  mdl$y_test        <- y_test
  mdl$y_train       <- y_train

  #prediction
  mdl$y_hat_train   <- predict(xgb_final, x_train, predcontrib = FALSE)
  mdl$y_hat_test    <- predict(xgb_final, x_test, predcontrib = FALSE)
  mdl$contrib_train <- predict(xgb_final, x_train, predcontrib = TRUE)
  mdl$contrib_test  <- predict(xgb_final, x_test, predcontrib = TRUE)

  #return results
  return(mdl)
}
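
# calc_deviance() and generate_deviance_metrics() are helpers defined elsewhere in
# this package. For reference, below is a minimal sketch of a Poisson residual
# deviance calculation; this is an assumption about what such a helper computes,
# not the package's actual implementation, and the function name is hypothetical.
poisson_deviance_sketch <- function(y, y_hat) {
  # Standard Poisson deviance: 2 * sum(y * log(y / mu) - (y - mu)),
  # with the convention that the log term is 0 whenever y == 0.
  log_term <- ifelse(y == 0, 0, y * log(y / y_hat))
  2 * sum(log_term - (y - y_hat))
}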
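
# The "folds" argument accepts pre-built CV folds in the format xgboost::xgb.cv
# expects: a list in which each element is an integer vector of held-out (test)
# row indices. Below is a minimal sketch of constructing such folds by hand with a
# simple random, unstratified split; the helper name is hypothetical.
make_random_folds_sketch <- function(n_rows, k = 5L) {
  # randomly assign each row to one of k folds
  fold_id <- sample(rep(seq_len(k), length.out = n_rows))
  # return a list of test-row indices, one element per fold
  lapply(seq_len(k), function(f) which(fold_id == f))
}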