R/model_xgb.R

# XGBoost models ----------------------------------------------------------

#' Specifies cross validation folds for xgboost models
#'
#' @param cv_folds number of cross validation folds
#' @param nrows training data row count
#' @return a list of row-index vectors, one per fold
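#' @examples
#' # Illustrative: assign 12 rows to 3 folds; fold sizes vary because
#' # assignment is sampled with replacement
#' folds <- generate_folds(cv_folds = 3, nrows = 12)
#' lengths(folds)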

generate_folds <- function(cv_folds, nrows){
  #randomly assign each row to a fold
  folds_idx <- sample(seq_len(cv_folds), nrows, replace = TRUE)
  folds <- list()
  for(i in seq_len(cv_folds)){
    folds[[i]] <- which(folds_idx == i)
  }

  folds
}

#' Automatically generates xgboost hyperparameter sets for optimization
#' Sample space mirrors the autoxgboost params: https://github.com/ja-thomas/autoxgboost/blob/master/R/autoxgbparset.R
#'
#' @param tune_rounds number of parameter sets to generate
#' @return a data frame with hyperparameter sets for tuning
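#' @examples
#' # Illustrative: draw five candidate hyperparameter sets
#' generate_params(5)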

generate_params <- function(tune_rounds){
  set.seed(1234)
  #gamma, lambda, and alpha are sampled on a log2 scale, matching autoxgboost
  out <- data.frame(eta               = runif(tune_rounds, 0.01, 0.2),
                    gamma             = 2^runif(tune_rounds, -7, 6),
                    max_depth         = round(runif(tune_rounds, 3, 20)),
                    subsample         = runif(tune_rounds, 0.05, 1),
                    colsample_bytree  = runif(tune_rounds, 0.5, 1),
                    colsample_bylevel = runif(tune_rounds, 0.5, 1),
                    lambda            = 2^runif(tune_rounds, -10, 10),
                    alpha             = 2^runif(tune_rounds, -10, 10))
  out
}

#' Automatically models numeric targets with random search hyperparameter optimization
#' Tuning parameters mirror autoxgboost: https://liuyanguu.github.io/post/2018/10/03/autoxgboost-bayesian-optimization/
#' and may be converted to bayesian hyperparameter optimization in the future.
#'
#' @param mdl a list of frames from the treat_numeric_data function
#' @param target_var target variable for prediction
#' @param cv_folds number of cross validation folds
#' @param tune_rounds number of hyperparameter tuning rounds
#' @param verbose print model iterations (TRUE/FALSE)
#' @param max_rounds maximum number of boosting rounds to use in model fitting
#' @return the input list augmented with the fitted model, tested parameter sets, and test-set results
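#' @examples
#' \dontrun{
#' # A minimal usage sketch, assuming `mdl` comes from treat_numeric_data()
#' # and carries train_x, train_y, test_x, and test_y elements:
#' res <- model_numeric(mdl, target_var = "target", cv_folds = 5, tune_rounds = 10)
#' res$test_rmse
#' }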

model_numeric <- function(mdl,
                          target_var,
                          cv_folds    = 5,
                          tune_rounds = 100,
                          verbose     = TRUE,
                          max_rounds  = 10 ^ 6) {
  #generate parameters
  params <- generate_params(tune_rounds)
  params$eval <- NA
  model_list <- list()

  #specify folds
  folds <- generate_folds(cv_folds, nrow(mdl$train))

  #test parameter sets
  for(i in 1:nrow(params)) {
    xgb_mdl <- xgboost::xgb.cv(data                  = mdl$train_x %>% as.matrix(),
                               nrounds               = max_rounds,
                               early_stopping_rounds = 2L,
                               label                 = mdl$train_y,
                               folds                 = folds,
                               verbose               = 0,
                               params                = list(
                                 tree_method       = "hist",
                                 objective         = "reg:squarederror",
                                 booster           = "gbtree",
                                 grow_policy       = "lossguide",
                                 eta               = params$eta[i],
                                 gamma             = params$gamma[i],
                                 max_depth         = params$max_depth[i],
                                 subsample         = params$subsample[i],
                                 colsample_bytree  = params$colsample_bytree[i],
                                 colsample_bylevel = params$colsample_bylevel[i],
                                 lambda            = params$lambda[i],
                                 alpha             = params$alpha[i])
    )
    model_list[[i]] <- xgb_mdl
    params$eval[i] <- xgb_mdl$evaluation_log$test_rmse_mean[xgb_mdl$best_iteration]

    if(verbose) {
      flush.console()
      print(glue::glue("Tested param set {i} of {tune_rounds}. Eval is: {params$eval[i]}."))
    }
  }

  mdl$params_tested <- params
  mdl$best_cv_mdl_res <- model_list[[which.min(params$eval)]]

  #train final model & evaluate on test
  xgb_final <- xgboost::xgboost(params                = mdl$best_cv_mdl_res$params,
                                data                  = mdl$train_x %>% as.matrix(),
                                label                 = mdl$train_y,
                                nrounds               = max_rounds,
                                early_stopping_rounds = 10L,
                                verbose               = 0)
  mdl$final_mdl <- xgb_final
  mdl$test_yhat <- predict(xgb_final, mdl$test_x %>% as.matrix())

  #calc final results
  mdl$test_rmse         <- yardstick::rmse_vec(mdl$test_y, mdl$test_yhat)
  mdl$test_mae          <- yardstick::mae_vec(mdl$test_y, mdl$test_yhat)
  mdl$test_rsq          <- yardstick::rsq_vec(mdl$test_y, mdl$test_yhat)
  mdl$test_spearman_cor <- cor(mdl$test_y, mdl$test_yhat, method = "spearman")

  #return results
  mdl
}