knitr::opts_chunk$set( collapse = TRUE, comment = "#>", message = FALSE, warning = FALSE, eval = TRUE )
In this article we explore the idea of hyperparameter tuning using a surrogate model, which means that we have a base model that tries to minimize the loss metric (logloss in this case) on the training data, but we also have a surrogate or meta model that will try to optimize the output of the base model.
The meta model's input space is the parameter space of the base model, so to he goal of the model is to guide the search towards parameters that translate into a good base model performance.
We do that by sampling many values for the parameter combinations, and asking the surrogate model which values it thinks are gonna result in a good performance of the base model, based on historical performance of the base model.
As usual, the data prep is the first step. Nothing new here, you can skip this part if you've seen the other articles
library(recipes) library(magrittr) library(tidytune) library(rsample) library(ParamHelpers) library(MLmetrics) library(knitr) data("attrition") attrition %<>% mutate(Attrition = ifelse(Attrition == 'Yes', 1, 0)) resamples <- rsample::vfold_cv(attrition, v = 2) rec <- recipe(attrition) %>% add_role(Attrition, new_role = 'outcome') %>% add_role(-Attrition, new_role = 'predictor') %>% step_novel(all_nominal(), -Attrition) %>% step_dummy(all_nominal(), -Attrition) %>% step_zv(all_predictors())
Now, we define our custom scoring function:
library(xgboost) xgboost_classif_score <- function(train_df, target_var, params, eval_df, ...){ X_train <- train_df %>% select(-matches(target_var)) %>% as.matrix() y_train <- train_df[[target_var]] xgb_train_data <- xgb.DMatrix(X_train, label = y_train) X_eval <- eval_df %>% select(-matches(target_var)) %>% as.matrix() y_eval <- eval_df[[target_var]] xgb_eval_data <- xgb.DMatrix(X_eval, label = y_eval) model <- xgb.train(params = params, data = xgb_train_data, watchlist = list(train = xgb_train_data, eval = xgb_eval_data), objective = 'binary:logistic', verbose = FALSE, ...) preds <- predict(model, xgb_eval_data) list(logloss = LogLoss(preds, y_eval), acc = Accuracy(ifelse(preds > 0.5, 1, 0), y_eval)) # You can also return a simple vector score: # LogLoss(preds, y_eval) }
Then we perform random search. Here we try 50 paramter combinations.
set.seed(123) # Random search example xgboost_random_params <- makeParamSet( makeIntegerParam('max_depth', lower = 1, upper = 15), makeNumericParam('eta', lower = 0.01, upper = 0.1), makeNumericParam('gamma', lower = 0, upper = 5), makeIntegerParam('min_child_weight', lower = 1, upper = 100), makeNumericParam('subsample', lower = 0.25, upper = 0.9), makeNumericParam('colsample_bytree', lower = 0.25, upper = 0.9) ) results_random_search <- random_search( resamples = resamples, recipe = rec, param_set = xgboost_random_params, scoring_func = xgboost_classif_score, nrounds = 1000, early_stopping_rounds = 20, eval_metric = 'logloss', n = 50, verbosity = FALSE )
Here's what our results look like so far:
summ_random_search <- results_random_search %>% group_by_at(getParamIds(xgboost_random_params)) %>% summarise(logloss = mean(logloss)) summ_random_search %>% arrange(logloss) %>% head(10) %>% kable()
Now we are in position to fine tune our parameters using a surrogate search. The idea is to use a meta model, mapping the parameter values to the performance of the xgboost classifier. Incidentally, the meta-model is by default a ranger model. This is because random forests are good at modeling non-linearity and interactions between variables, and require very little tuning to get a decent performance.
Here we generate 1000 random parameter set candidates for each of the 10 surrogate runs, and ask the surrogate model to pass through the top 5 candidates to the underlying xgboost classifier. Therefore the surrogate search will result in 50 calls to the classifier in total.
The goal of this approach is to try and spend more time around the most promising areas of the parameter space, while allowing some exploration thanks to the random generation of candidates.
set.seed(123) results_surrogate_search <- surrogate_search( resamples = resamples, recipe = rec, param_set = xgboost_random_params, n = 10, scoring_func = xgboost_classif_score, nrounds = 1000, early_stopping_rounds = 20, eval_metric = 'logloss', input = results_random_search, surrogate_target = 'logloss', n_candidates = 1000, top_n = 5, verbosity = FALSE )
And to get the results of the search using a surrogate model:
summ_surrogate_search <- results_surrogate_search %>% group_by_at(getParamIds(xgboost_random_params)) %>% summarise(logloss = mean(logloss)) summ_surrogate_search %>% arrange(logloss) %>% head(10) %>% kable()
Now, we can get a feel for the performance of the surrogate search. Let's choose two parameters arbitrarily and plot the logloss:
library(ggplot2) library(viridis) summ_random_search %<>% mutate(type = 'random') summ_surrogate_search %<>% mutate(type = 'surrogate') plot_data <- bind_rows( summ_random_search, summ_surrogate_search ) ggplot(data = plot_data, aes(x = max_depth, y = eta)) + geom_point(aes(col = type, size = logloss)) + theme_bw() + scale_color_viridis(discrete = TRUE, direction = -1, option = "C", end = 0.85) + ggtitle('Random Search + Surrogate Search', subtitle = '50 iterations each')
We can see that the surrogate serch results are on average better than the random search, because it has focused the search on a subspace of the parameter space.
Let's try a different pair of parameters:
ggplot(data = plot_data, aes(x = gamma, y = min_child_weight)) + geom_point(aes(col = type, size = logloss)) + theme_bw() + scale_color_viridis(discrete = TRUE, direction = -1, option = "C", end = 0.85) + ggtitle('Random Search + Surrogate Search', subtitle = '50 iterations each')
Again, the surrogate search has chosen a subspace to focus on, especially along the min_child_weight
axis.
You might find that the surrogate search above is too restrictive in that it narrows down the search space too much. In other words, there is the risk that we focus too much in exploitation, based on what has been seen so far, versus exploration of new areas.
A possible solution would be to smoothly transition from the exploration phase to the exploitation phase. The surrogate_search
function ahs an option to do just that, by passing vectors to the n
, n_candidates
and top_n
parameters, instead of scalars.
In the code below, we perform surrogate searches iteratively, increasing the n_candidates
parameter at each iteration. Higher n_candidates
translates into more focus on areas with better performance, whereas lower n_candidates
values allows more randomness (and a value of 0 means pure random search). Inversely, increasing the top_n
parameter increases the randomness. Note that because we are using top_n = 5
, each surrogate search iteration returns 5 results (5 parameter combinations).
library(purrr) set.seed(123) n <- c(10, 5, 3, 2) n_candidates <- c(0, 10, 100, 1000) results_zoomin <- surrogate_search( resamples = resamples, recipe = rec, param_set = xgboost_random_params, n = n, scoring_func = xgboost_classif_score, nrounds = 1000, early_stopping_rounds = 20, eval_metric = 'logloss', input = NULL, surrogate_target = 'logloss', n_candidates = n_candidates, top_n = 5, verbosity = FALSE )
Here are the results:
perf_zomin <- results_zoomin %>% group_by_at(getParamIds(xgboost_random_params)) %>% summarise(logloss = mean(logloss)) perf_zomin %>% arrange(logloss) %>% head(10) %>% kable()
Let's see what the results look like on a 2-dimensional subspace of the parameter space:
summ_zoomin <- results_zoomin %>% group_by_at(c('surrogate_run', 'surrogate_iteration', 'param_id', getParamIds(xgboost_random_params))) %>% summarise(logloss = mean(logloss)) %>% ungroup() %>% mutate(surrogate_run = factor(surrogate_run)) ggplot(data = summ_zoomin, aes(x = max_depth, y = eta)) + geom_point(aes(col = surrogate_run, shape = surrogate_run, size = logloss)) + theme_bw() + scale_color_viridis(discrete = TRUE, direction = -1, option = "C", end = 0.85) + ggtitle('Surrogate Search - Zoom in')
This time, the search progressively narrows down towards the final exected best zone, and the logloss seems to diminish accordingly. Here are the results
We can look at the performance at each iteration:
summ_zoomin %<>% arrange( surrogate_run, surrogate_iteration ) %>% mutate( iteration = row_number() ) ggplot(data = summ_zoomin, aes(x = iteration, y = logloss)) + geom_point(aes(col = surrogate_run)) + theme_bw() + scale_color_viridis(discrete = TRUE, direction = -1, option = "C", end = 0.85) + ggtitle('Surrogate Search - Zoom in')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.