This document showcases the two basic tidytune methods for hyperparameter optimization: grid search and random search.
We will be using the following libraries for this example and for most of the examples in the package.
library(recipes)
library(magrittr)
library(tidytune)
library(rsample)
library(ParamHelpers)
library(MLmetrics) # for LogLoss
library(dplyr)
data("attrition")
attrition %<>% mutate(Attrition = ifelse(Attrition == 'Yes', 1, 0))
resamples <- rsample::vfold_cv(attrition, v = 2)
rec <-
  recipe(attrition) %>%
  add_role(Attrition, new_role = 'outcome') %>%
  add_role(-Attrition, new_role = 'predictor') %>%
  step_novel(all_nominal(), -Attrition) %>%
  step_dummy(all_nominal(), -Attrition) %>%
  step_zv(all_predictors())
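tidytune preps and applies this recipe inside each fold for you, but as a quick sanity check you can prep and bake it on the full data yourself. A minimal sketch; prep() and bake() are standard recipes functions, not part of tidytune:
# Optional sanity check: prep the recipe and inspect the processed predictors
# (recent recipes versions use the new_data argument of bake()).
baked <- rec %>% prep(training = attrition) %>% bake(new_data = attrition)
dim(baked)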
library(xgboost)
xgboost_classif_score <-
  function(train_df,
           target_var,
           params,
           eval_df,
           ...){

    X_train <- train_df %>% select(-matches(target_var)) %>% as.matrix()
    y_train <- train_df[[target_var]]
    xgb_train_data <- xgb.DMatrix(X_train, label = y_train)

    X_eval <- eval_df %>% select(-matches(target_var)) %>% as.matrix()
    y_eval <- eval_df[[target_var]]
    xgb_eval_data <- xgb.DMatrix(X_eval, label = y_eval)

    # Any extra arguments (e.g. nrounds) are forwarded to xgb.train() via ...
    model <- xgb.train(params = params,
                       data = xgb_train_data,
                       watchlist = list(train = xgb_train_data, eval = xgb_eval_data),
                       objective = 'binary:logistic',
                       verbose = FALSE,
                       ...)

    preds <- predict(model, xgb_eval_data)

    # Return a named list of metrics; each becomes a column in the results.
    list(logloss = LogLoss(preds, y_eval),
         acc = Accuracy(ifelse(preds > 0.5, 1, 0), y_eval))

    # You can also return a simple vector score:
    # LogLoss(preds, y_eval)
  }
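The scoring function can also be called directly on a single train/eval split, which is handy for debugging before launching a full search. A hedged sketch reusing the baked data from the sanity check above; the 80/20 split and the parameter values are arbitrary:
set.seed(42)
# Arbitrary 80/20 split of the baked (already dummy-encoded) data.
idx <- sample(nrow(baked), size = floor(0.8 * nrow(baked)))
xgboost_classif_score(
  train_df   = baked[idx, ],
  target_var = 'Attrition',
  params     = list(eta = 0.1, max_depth = 3),
  eval_df    = baked[-idx, ],
  nrounds    = 50
)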
set.seed(123)
xgboost_param_grid <- expand.grid(eta = c(0.1, 0.05), max_depth = c(3:10))
results_grid_search <-
  grid_search(
    resamples = resamples,
    recipe = rec,
    param_grid = xgboost_param_grid,
    scoring_func = xgboost_classif_score,
    nrounds = 100
  )
#> Fold1
#> Paramset 1 / 16 -Accepted.
#> Paramset 2 / 16 -Accepted.
#> Paramset 3 / 16 -Accepted.
#> Paramset 4 / 16 -Accepted.
#> Paramset 5 / 16 -Accepted.
#> Paramset 6 / 16 -Accepted.
#> Paramset 7 / 16 -Accepted.
#> Paramset 8 / 16 -Accepted.
#> Paramset 9 / 16 -Accepted.
#> Paramset 10 / 16 -Accepted.
#> Paramset 11 / 16 -Accepted.
#> Paramset 12 / 16 -Accepted.
#> Paramset 13 / 16 -Accepted.
#> Paramset 14 / 16 -Accepted.
#> Paramset 15 / 16 -Accepted.
#> Paramset 16 / 16 -Accepted.
#> Fold2
#> Paramset 1 / 16 -Accepted.
#> Paramset 2 / 16 -Accepted.
#> Paramset 3 / 16 -Accepted.
#> Paramset 4 / 16 -Accepted.
#> Paramset 5 / 16 -Accepted.
#> Paramset 6 / 16 -Accepted.
#> Paramset 7 / 16 -Accepted.
#> Paramset 8 / 16 -Accepted.
#> Paramset 9 / 16 -Accepted.
#> Paramset 10 / 16 -Accepted.
#> Paramset 11 / 16 -Accepted.
#> Paramset 12 / 16 -Accepted.
#> Paramset 13 / 16 -Accepted.
#> Paramset 14 / 16 -Accepted.
#> Paramset 15 / 16 -Accepted.
#> Paramset 16 / 16 -Accepted.
head(results_grid_search)
#> # A tibble: 6 x 7
#> fold_id param_id eta max_depth logloss acc test_ids
#> <chr> <chr> <dbl> <int> <dbl> <dbl> <list>
#> 1 Fold1 Paramset01 0.100 3 0.328 0.886 <int [735]>
#> 2 Fold1 Paramset02 0.0500 3 0.335 0.879 <int [735]>
#> 3 Fold1 Paramset03 0.100 4 0.342 0.882 <int [735]>
#> 4 Fold1 Paramset04 0.0500 4 0.330 0.878 <int [735]>
#> 5 Fold1 Paramset05 0.100 5 0.354 0.879 <int [735]>
#> 6 Fold1 Paramset06 0.0500 5 0.333 0.884 <int [735]>
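The per-fold grid-search results can be averaged across folds to rank the candidate settings, in the same way as for the random search below. A minimal dplyr sketch using the columns returned above:
results_grid_search %>%
  group_by(eta, max_depth) %>%
  summarise(logloss = mean(logloss), acc = mean(acc)) %>%
  arrange(logloss)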
set.seed(123)
xgboost_random_params <-
  makeParamSet(
    makeIntegerParam('max_depth', lower = 1, upper = 15),
    makeNumericParam('eta', lower = 0.01, upper = 0.1),
    makeNumericParam('gamma', lower = 0, upper = 5),
    makeIntegerParam('min_child_weight', lower = 1, upper = 100),
    makeNumericParam('subsample', lower = 0.25, upper = 0.9),
    makeNumericParam('colsample_bytree', lower = 0.25, upper = 0.9)
  )
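To get a feel for what a draw from this parameter set looks like, you can sample configurations with ParamHelpers directly; this is only an illustration, not something random_search() requires you to do:
# Draw two random configurations from the set (for illustration only).
sampleValues(xgboost_random_params, n = 2)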
results_random_search <-
  random_search(
    resamples = resamples,
    recipe = rec,
    param_set = xgboost_random_params,
    scoring_func = xgboost_classif_score,
    nrounds = 100,
    n = 4
  )
#> Fold1
#> Paramset 1 / 4 -Accepted.
#> Paramset 2 / 4 -Accepted.
#> Paramset 3 / 4 -Accepted.
#> Paramset 4 / 4 -Accepted.
#> Fold2
#> Paramset 1 / 4 -Accepted.
#> Paramset 2 / 4 -Accepted.
#> Paramset 3 / 4 -Accepted.
#> Paramset 4 / 4 -Accepted.
head(results_random_search)
#> # A tibble: 6 x 11
#> fold_id param_id max_depth eta gamma min_child_weight subsample
#> <chr> <chr> <int> <dbl> <dbl> <int> <dbl>
#> 1 Fold1 Paramset1 5 0.0809 2.04 89 0.861
#> 2 Fold1 Paramset2 8 0.0903 2.76 46 0.872
#> 3 Fold1 Paramset3 11 0.0615 0.515 90 0.410
#> 4 Fold1 Paramset4 5 0.0959 4.45 70 0.666
#> 5 Fold2 Paramset1 5 0.0809 2.04 89 0.861
#> 6 Fold2 Paramset2 8 0.0903 2.76 46 0.872
#> # ... with 4 more variables: colsample_bytree <dbl>, logloss <dbl>,
#> # acc <dbl>, test_ids <list>
To get the performance of parameter combinations across folds and extract your optimal parameters, simply do:
library(dplyr)
results_random_search %>%
  group_by_at(getParamIds(xgboost_random_params)) %>%
  summarise(logloss = mean(logloss),
            accuracy = mean(acc)) %>%
  arrange(logloss, accuracy)
#> # A tibble: 4 x 8
#> # Groups: max_depth, eta, gamma, min_child_weight, subsample [4]
#> max_depth eta gamma min_child_weight subsample colsample_bytree
#> <int> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 8 0.0903 2.76 46 0.872 0.545
#> 2 5 0.0959 4.45 70 0.666 0.896
#> 3 5 0.0809 2.04 89 0.861 0.280
#> 4 11 0.0615 0.515 90 0.410 0.277
#> # ... with 2 more variables: logloss <dbl>, accuracy <dbl>
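Once the combinations are ranked, the top row can be turned back into a parameter list and used for a final fit. A hedged sketch; the best_params object below is illustrative and not a tidytune API:
# Average across folds, keep the best combination, and coerce it to a list.
param_cols  <- getParamIds(xgboost_random_params)
best_params <-
  results_random_search %>%
  group_by_at(param_cols) %>%
  summarise(logloss = mean(logloss)) %>%
  ungroup() %>%
  arrange(logloss) %>%
  slice(1) %>%
  select(one_of(param_cols)) %>%
  as.list()
best_params
These values can then be passed as the params argument to the scoring function above, or directly to xgb.train(), to fit a final model on the full training data.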