In artichaud1/tidytune: A package for Tidy Hyperparameter Tuning

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE,
  eval = TRUE
)

In this article we explore the idea of hyperparameter tuning using a surrogate model, which means that we have a base model that tries to minimize the loss metric (logloss in this case) on the training data, but we also have a surrogate or meta model that will try to optimize the output of the base model.

The meta model's input space is the parameter space of the base model, so to he goal of the model is to guide the search towards parameters that translate into a good base model performance.

We do that by sampling many values for the parameter combinations, and asking the surrogate model which values it thinks are gonna result in a good performance of the base model, based on historical performance of the base model.

Prepare the data

As usual, the data prep is the first step. Nothing new here, you can skip this part if you've seen the other articles

library(recipes)
library(magrittr)
library(tidytune)
library(rsample)
library(ParamHelpers)
library(MLmetrics)
library(knitr)

data("attrition")

attrition %<>% mutate(Attrition = ifelse(Attrition == 'Yes', 1, 0))

resamples <- rsample::vfold_cv(attrition, v = 2)

rec <- 
  recipe(attrition) %>%
  add_role(Attrition, new_role = 'outcome') %>%
  add_role(-Attrition, new_role = 'predictor') %>%
  step_novel(all_nominal(), -Attrition) %>%
  step_dummy(all_nominal(), -Attrition) %>%
  step_zv(all_predictors())

Scoring function

Now, we define our custom scoring function:

library(xgboost)

xgboost_classif_score <- 
  function(train_df, 
           target_var, 
           params, 
           eval_df, 
           ...){

  X_train <- train_df %>% select(-matches(target_var)) %>% as.matrix()
  y_train <- train_df[[target_var]]
  xgb_train_data <- xgb.DMatrix(X_train, label = y_train)

  X_eval <- eval_df %>% select(-matches(target_var)) %>% as.matrix()
  y_eval <- eval_df[[target_var]]
  xgb_eval_data <- xgb.DMatrix(X_eval, label = y_eval)

  model <- xgb.train(params = params,
                     data = xgb_train_data,
                     watchlist = list(train = xgb_train_data, eval = xgb_eval_data),
                     objective = 'binary:logistic',
                     verbose = FALSE,
                     ...)

  preds <- predict(model, xgb_eval_data)

  list(logloss = LogLoss(preds, y_eval), 
       acc = Accuracy(ifelse(preds > 0.5, 1, 0), y_eval))

  # You can also return a simple vector score:
  # LogLoss(preds, y_eval)
}

Random search

Then we perform random search. Here we try 50 paramter combinations.

set.seed(123)

# Random search example

xgboost_random_params <-
  makeParamSet(
    makeIntegerParam('max_depth', lower = 1, upper = 15),
    makeNumericParam('eta', lower = 0.01, upper = 0.1),
    makeNumericParam('gamma', lower = 0, upper = 5),
    makeIntegerParam('min_child_weight', lower = 1, upper = 100),
    makeNumericParam('subsample', lower = 0.25, upper = 0.9),
    makeNumericParam('colsample_bytree', lower = 0.25, upper = 0.9)
  )

results_random_search <- 
  random_search(
    resamples = resamples, 
    recipe = rec, 
    param_set = xgboost_random_params, 
    scoring_func = xgboost_classif_score, 
    nrounds = 1000,
    early_stopping_rounds = 20,
    eval_metric = 'logloss',
    n = 50,
    verbosity = FALSE
  )

Here's what our results look like so far:

summ_random_search <- 
  results_random_search %>%
  group_by_at(getParamIds(xgboost_random_params)) %>%
  summarise(logloss = mean(logloss))

summ_random_search %>%
  arrange(logloss) %>%
  head(10) %>%
  kable()

Surrogate search

Now we are in position to fine tune our parameters using a surrogate search. The idea is to use a meta model, mapping the parameter values to the performance of the xgboost classifier. Incidentally, the meta-model is by default a ranger model. This is because random forests are good at modeling non-linearity and interactions between variables, and require very little tuning to get a decent performance.

Here we generate 1000 random parameter set candidates for each of the 10 surrogate runs, and ask the surrogate model to pass through the top 5 candidates to the underlying xgboost classifier. Therefore the surrogate search will result in 50 calls to the classifier in total.

The goal of this approach is to try and spend more time around the most promising areas of the parameter space, while allowing some exploration thanks to the random generation of candidates.

set.seed(123)

results_surrogate_search <- 
  surrogate_search(
    resamples = resamples,
    recipe = rec,
    param_set = xgboost_random_params,
    n = 10,
    scoring_func = xgboost_classif_score,
    nrounds = 1000,
    early_stopping_rounds = 20,
    eval_metric = 'logloss',
    input = results_random_search,
    surrogate_target = 'logloss',
    n_candidates = 1000,
    top_n = 5,
    verbosity = FALSE
  )

And to get the results of the search using a surrogate model:

summ_surrogate_search <- 
  results_surrogate_search %>%
  group_by_at(getParamIds(xgboost_random_params)) %>%
  summarise(logloss = mean(logloss))

summ_surrogate_search %>%
  arrange(logloss) %>%
  head(10) %>%
  kable()

Now, we can get a feel for the performance of the surrogate search. Let's choose two parameters arbitrarily and plot the logloss:

library(ggplot2)
library(viridis)

summ_random_search %<>% mutate(type = 'random')
summ_surrogate_search %<>% mutate(type = 'surrogate')

plot_data <- 
  bind_rows(
    summ_random_search,
    summ_surrogate_search
  )

ggplot(data = plot_data, aes(x = max_depth, y = eta)) + 
  geom_point(aes(col = type, size = logloss)) + 
  theme_bw() + 
  scale_color_viridis(discrete = TRUE, direction = -1, option = "C", end = 0.85) +
  ggtitle('Random Search + Surrogate Search',
          subtitle = '50 iterations each')

We can see that the surrogate serch results are on average better than the random search, because it has focused the search on a subspace of the parameter space.

Let's try a different pair of parameters:

ggplot(data = plot_data, aes(x = gamma, y = min_child_weight)) + 
  geom_point(aes(col = type, size = logloss)) + 
  theme_bw() + 
  scale_color_viridis(discrete = TRUE, direction = -1, option = "C", end = 0.85) +
  ggtitle('Random Search + Surrogate Search',
          subtitle = '50 iterations each')

Again, the surrogate search has chosen a subspace to focus on, especially along the min_child_weight axis.

Progressive zoom in

You might find that the surrogate search above is too restrictive in that it narrows down the search space too much. In other words, there is the risk that we focus too much in exploitation, based on what has been seen so far, versus exploration of new areas.

A possible solution would be to smoothly transition from the exploration phase to the exploitation phase. The surrogate_search function ahs an option to do just that, by passing vectors to the n, n_candidates and top_n parameters, instead of scalars.

In the code below, we perform surrogate searches iteratively, increasing the n_candidates parameter at each iteration. Higher n_candidates translates into more focus on areas with better performance, whereas lower n_candidates values allows more randomness (and a value of 0 means pure random search). Inversely, increasing the top_n parameter increases the randomness. Note that because we are using top_n = 5, each surrogate search iteration returns 5 results (5 parameter combinations).

library(purrr)

set.seed(123)

n <- c(10, 5, 3, 2)
n_candidates <- c(0, 10, 100, 1000)

results_zoomin <- 
  surrogate_search(
    resamples = resamples,
    recipe = rec,
    param_set = xgboost_random_params,
    n = n,
    scoring_func = xgboost_classif_score,
    nrounds = 1000,
    early_stopping_rounds = 20,
    eval_metric = 'logloss',
    input = NULL,
    surrogate_target = 'logloss',
    n_candidates = n_candidates,
    top_n = 5,
    verbosity = FALSE
  )

Here are the results:

perf_zomin <- 
  results_zoomin %>%
  group_by_at(getParamIds(xgboost_random_params)) %>%
  summarise(logloss = mean(logloss))

perf_zomin %>%
  arrange(logloss) %>%
  head(10) %>%
  kable()

Let's see what the results look like on a 2-dimensional subspace of the parameter space:

summ_zoomin <- 
  results_zoomin %>%
  group_by_at(c('surrogate_run', 
              'surrogate_iteration',
              'param_id',
              getParamIds(xgboost_random_params))) %>%
  summarise(logloss = mean(logloss)) %>%
  ungroup() %>%
  mutate(surrogate_run = factor(surrogate_run))

ggplot(data = summ_zoomin, aes(x = max_depth, y = eta)) + 
  geom_point(aes(col = surrogate_run, shape = surrogate_run, size = logloss)) + 
  theme_bw() + 
  scale_color_viridis(discrete = TRUE, direction = -1, option = "C", end = 0.85) +
  ggtitle('Surrogate Search - Zoom in')

This time, the search progressively narrows down towards the final exected best zone, and the logloss seems to diminish accordingly. Here are the results

We can look at the performance at each iteration:

summ_zoomin %<>%
  arrange(
    surrogate_run,
    surrogate_iteration
  ) %>%
  mutate(
    iteration = row_number()
  )

ggplot(data = summ_zoomin, aes(x = iteration, y = logloss)) + 
  geom_point(aes(col = surrogate_run)) + 
  theme_bw() + 
  scale_color_viridis(discrete = TRUE, direction = -1, option = "C", end = 0.85) +
  ggtitle('Surrogate Search - Zoom in')