vignettes/simple_example.md

This document showcases the two basic methods tidytune offers for hyperparameter optimization: grid search and random search.

We will be using the following libraries for this example and most of the examples in the package.

library(recipes)
library(magrittr)
library(tidytune)
library(rsample)
library(ParamHelpers)
library(MLmetrics) # for LogLoss
library(dplyr)

Prepare the recipe:

data("attrition")

attrition %<>% mutate(Attrition = ifelse(Attrition == 'Yes', 1, 0))

resamples <- rsample::vfold_cv(attrition, v = 2)

rec <- 
  recipe(attrition) %>%
  add_role(Attrition, new_role = 'outcome') %>%
  add_role(-Attrition, new_role = 'predictor') %>%
  step_novel(all_nominal(), -Attrition) %>%
  step_dummy(all_nominal(), -Attrition) %>%
  step_zv(all_predictors())
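
If you want to sanity-check what the recipe will hand to the scoring function, you can prep it and bake the full data set (an optional sketch, assuming a recipes version where bake() takes new_data):

# inspect the processed predictors the scoring function will receive
baked <- rec %>% prep(training = attrition) %>% bake(new_data = attrition)
dim(baked)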

Prepare your scoring function:

library(xgboost)

xgboost_classif_score <- 
  function(train_df, 
           target_var, 
           params, 
           eval_df, 
           ...){

  X_train <- train_df %>% select(-matches(target_var)) %>% as.matrix()
  y_train <- train_df[[target_var]]
  xgb_train_data <- xgb.DMatrix(X_train, label = y_train)

  X_eval <- eval_df %>% select(-matches(target_var)) %>% as.matrix()
  y_eval <- eval_df[[target_var]]
  xgb_eval_data <- xgb.DMatrix(X_eval, label = y_eval)

  model <- xgb.train(params = params,
                     data = xgb_train_data,
                     watchlist = list(train = xgb_train_data, eval = xgb_eval_data),
                     objective = 'binary:logistic',
                     verbose = FALSE,
                     ...)

  preds <- predict(model, xgb_eval_data)

  list(logloss = LogLoss(preds, y_eval), 
       acc = Accuracy(ifelse(preds > 0.5, 1, 0), y_eval))

  # You can also return a simple vector score:
  # LogLoss(preds, y_eval)
}
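
The list returned above records several metrics per fold and parameter set. As the commented line suggests, a scoring function may also return a single number. A minimal sketch of such a function, using plain logistic regression via glm() instead of xgboost (hypothetical, not part of the package):

glm_classif_score <- 
  function(train_df, 
           target_var, 
           params, 
           eval_df, 
           ...){

  # params is ignored here: a plain glm has no hyperparameters to tune
  form  <- as.formula(paste(target_var, '~ .'))
  model <- glm(form, data = train_df, family = binomial())

  # predicted probabilities on the evaluation fold
  preds <- predict(model, newdata = eval_df, type = 'response')

  # return a single numeric score instead of a named list
  LogLoss(preds, eval_df[[target_var]])
}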

Grid search:

set.seed(123)

xgboost_param_grid <- expand.grid(eta = c(0.1, 0.05), max_depth = c(3:10))

results_grid_search <- 
  grid_search(
    resamples = resamples, 
    recipe = rec, 
    param_grid = xgboost_param_grid, 
    scoring_func = xgboost_classif_score, 
    nrounds = 100
  )
#> Fold1 
#> Paramset 1 / 16 -Accepted.
#> Paramset 2 / 16 -Accepted.
#> Paramset 3 / 16 -Accepted.
#> Paramset 4 / 16 -Accepted.
#> Paramset 5 / 16 -Accepted.
#> Paramset 6 / 16 -Accepted.
#> Paramset 7 / 16 -Accepted.
#> Paramset 8 / 16 -Accepted.
#> Paramset 9 / 16 -Accepted.
#> Paramset 10 / 16 -Accepted.
#> Paramset 11 / 16 -Accepted.
#> Paramset 12 / 16 -Accepted.
#> Paramset 13 / 16 -Accepted.
#> Paramset 14 / 16 -Accepted.
#> Paramset 15 / 16 -Accepted.
#> Paramset 16 / 16 -Accepted.
#> Fold2 
#> Paramset 1 / 16 -Accepted.
#> Paramset 2 / 16 -Accepted.
#> Paramset 3 / 16 -Accepted.
#> Paramset 4 / 16 -Accepted.
#> Paramset 5 / 16 -Accepted.
#> Paramset 6 / 16 -Accepted.
#> Paramset 7 / 16 -Accepted.
#> Paramset 8 / 16 -Accepted.
#> Paramset 9 / 16 -Accepted.
#> Paramset 10 / 16 -Accepted.
#> Paramset 11 / 16 -Accepted.
#> Paramset 12 / 16 -Accepted.
#> Paramset 13 / 16 -Accepted.
#> Paramset 14 / 16 -Accepted.
#> Paramset 15 / 16 -Accepted.
#> Paramset 16 / 16 -Accepted.

head(results_grid_search)
#> # A tibble: 6 x 7
#>   fold_id param_id      eta max_depth logloss   acc test_ids   
#>   <chr>   <chr>       <dbl>     <int>   <dbl> <dbl> <list>     
#> 1 Fold1   Paramset01 0.100          3   0.328 0.886 <int [735]>
#> 2 Fold1   Paramset02 0.0500         3   0.335 0.879 <int [735]>
#> 3 Fold1   Paramset03 0.100          4   0.342 0.882 <int [735]>
#> 4 Fold1   Paramset04 0.0500         4   0.330 0.878 <int [735]>
#> 5 Fold1   Paramset05 0.100          5   0.354 0.879 <int [735]>
#> 6 Fold1   Paramset06 0.0500         5   0.333 0.884 <int [735]>
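
Just as for random search below, you can average the per-fold scores to rank the grid combinations (a small dplyr sketch based on the columns shown above):

results_grid_search %>%
  group_by(eta, max_depth) %>%
  summarise(logloss = mean(logloss),
            acc = mean(acc)) %>%
  arrange(logloss)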

Random search:

set.seed(123)

xgboost_random_params <-
  makeParamSet(
    makeIntegerParam('max_depth', lower = 1, upper = 15),
    makeNumericParam('eta', lower = 0.01, upper = 0.1),
    makeNumericParam('gamma', lower = 0, upper = 5),
    makeIntegerParam('min_child_weight', lower = 1, upper = 100),
    makeNumericParam('subsample', lower = 0.25, upper = 0.9),
    makeNumericParam('colsample_bytree', lower = 0.25, upper = 0.9)
  )
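
To preview the kind of combinations that will be drawn from this search space, ParamHelpers can sample values directly (an optional check, not required by random_search):

# draw a few example parameter sets from the space defined above
sampleValues(xgboost_random_params, 3)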

results_random_search <- 
  random_search(
    resamples = resamples, 
    recipe = rec, 
    param_set = xgboost_random_params, 
    scoring_func = xgboost_classif_score,
    nrounds = 100,
    n = 4
  )
#> Fold1 
#> Paramset 1 / 4 -Accepted.
#> Paramset 2 / 4 -Accepted.
#> Paramset 3 / 4 -Accepted.
#> Paramset 4 / 4 -Accepted.
#> Fold2 
#> Paramset 1 / 4 -Accepted.
#> Paramset 2 / 4 -Accepted.
#> Paramset 3 / 4 -Accepted.
#> Paramset 4 / 4 -Accepted.

head(results_random_search)
#> # A tibble: 6 x 11
#>   fold_id param_id  max_depth    eta gamma min_child_weight subsample
#>   <chr>   <chr>         <int>  <dbl> <dbl>            <int>     <dbl>
#> 1 Fold1   Paramset1         5 0.0809 2.04                89     0.861
#> 2 Fold1   Paramset2         8 0.0903 2.76                46     0.872
#> 3 Fold1   Paramset3        11 0.0615 0.515               90     0.410
#> 4 Fold1   Paramset4         5 0.0959 4.45                70     0.666
#> 5 Fold2   Paramset1         5 0.0809 2.04                89     0.861
#> 6 Fold2   Paramset2         8 0.0903 2.76                46     0.872
#> # ... with 4 more variables: colsample_bytree <dbl>, logloss <dbl>,
#> #   acc <dbl>, test_ids <list>

To get the performance of parameter combinations across folds and extract your optimal parameters, simply do:

library(dplyr)

results_random_search %>%
  group_by_at(getParamIds(xgboost_random_params)) %>%
  summarise(logloss = mean(logloss),
            accuracy = mean(acc)) %>%
  arrange(logloss, accuracy)
#> # A tibble: 4 x 8
#> # Groups:   max_depth, eta, gamma, min_child_weight, subsample [4]
#>   max_depth    eta gamma min_child_weight subsample colsample_bytree
#>       <int>  <dbl> <dbl>            <int>     <dbl>            <dbl>
#> 1         8 0.0903 2.76                46     0.872            0.545
#> 2         5 0.0959 4.45                70     0.666            0.896
#> 3         5 0.0809 2.04                89     0.861            0.280
#> 4        11 0.0615 0.515               90     0.410            0.277
#> # ... with 2 more variables: logloss <dbl>, accuracy <dbl>
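
The top row of this summary is the best parameter combination by mean logloss. To pull it out as a named list that you could pass back to xgboost for a final fit, something along these lines works (a dplyr sketch):

best_params <- 
  results_random_search %>%
  group_by_at(getParamIds(xgboost_random_params)) %>%
  summarise(logloss = mean(logloss)) %>%
  ungroup() %>%
  arrange(logloss) %>%
  slice(1) %>%
  select(-logloss) %>%
  as.list()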

