# library(dplyr)
# library(purrr)
# library(foreach)
# library(magrittr)
# library(stringr)
#' Create predictions given a resample and a recipe.
#' @description This is a generic function. Plug your custom `scoring_func`.
#' @param split A single split object. See `rsample` package.
#' @param rec Recipe. *Must have been prepped using `retain = TRUE`. See `recipes` package.
#' @param params List of parameters passed to the `scoring_func` function.
#' @param target_var Name of the target variable in the resample.
#' @param scoring_func Your custom train/predict/score function.
#' @param ... Optional params passed to scoring_func.
#' @return A data.frame with columns `target` and `predicted`.
#' @details `scoring_func` cab return a single score as a numeric vector,
#' or multiple scores in a data.frame.
#' @import recipes
#' @importFrom rsample assessment
#' @importFrom dplyr bind_cols
#' @importFrom rlang is_true is_false
#' @importFrom purrr is_bare_vector
train_predict <- function(split, rec, params, target_var, scoring_func, ...){
train_df <- as.data.frame(juice(rec, all_predictors(), all_outcomes()))
eval_df <- as.data.frame(bake(rec, assessment(split), all_predictors(), all_outcomes()))
score <- scoring_func(train_df, target_var, as.list(params), eval_df, ...)
if(is_bare_numeric(score)){
list(score = score)
}else{
score
}
}
#' For a grid of params, return list of model predictions.
#' @description This is a vectorized form of train_predict, vectorized over the param_grid.
#' @param split A split A resampling split. See `rsample` package.
#' @param target_var Name of the target variable in the resample.
#' @param recipe Recipe. *Must have been prepped using `retain = TRUE`. See `recipe` package.
#' @param param_grid List of list of parameters passed to the `scoring_func` function.
#' @param scoring_func Your custom train/predict/score function.
#' @param score_var Name of the score to use, in case `scoring_func` returns
#' a list of scores. If null (the default), will use the 1st score.
#' @param ... Optional params passed to scoring_func.
#' @param verbosity If > 0, print paramset index.
#' @return A list of data.frames with columns `target` and `predicted`.
#' @importFrom purrr map imap
#' @importFrom crayon green red
grid_predict <- function(split,
recipe,
param_grid,
target_var,
scoring_func,
score_var = NULL,
...,
verbosity){
params_df <- NULL
scores <- NULL
score_length <- 1
imap(param_grid,
function(params, index){
if(verbosity > 0){
cat(green(paste('Paramset', index, '/', length(param_grid), '\n')))
}
score <- train_predict(split, recipe, params, target_var, scoring_func, ...)
# Note: when index == 1, we will always accept params
# Here we remember the length of train_predict output
if(index == 1){
score_length <<- length(score)
}
if(is.null(score_var)){
score_var <- 1
}
scores <<- c(scores, score[[score_var]])
params_df <<- bind_rows(params_df, params)
score
})
}
#' Perform grid search over a grid of parameters.
#' @param resamples A data.frame with columns `splits` and `id`, created using the `rsample` package.
#' @param recipe The recipe to use. See package `recipes`.
#' @param param_grid Data frame of parameters combinations (e.g. generated by expand.grid).
#' @param scoring_func Your custom train/predict/score function.
#' Must take as parameters:
#' \itemize{
#' \item a training dataframe
#' \item the name of the target variable in the training dataframe
#' \item a list of parameters (these are the hyperparameters we are tuning)
#' \item an evaluation dataframe
#' \item dots. These are additional non-tunable parameters that could be passed to the function.
#' }
#' @param ... Optional params passed to scoring_func.
#' @param verbosity Level of verbosity. Integer or TRUE/FALSE for max/min verbosity.
#' @return A tidy data.frame, with one column per parameter, columns to identify the
#' paramset and the fold, a column giving the row indices of the evaluation dataset,
#' and columns for the performance scores (these are taken from the scoring function if
#' it returned a data.frame, otherwise it will just be a _score_ column).
#' @details `scoring_func` can return a single score as a numeric vector,
#' or multiple scores in a data.frame.
#' @export
#' @importFrom purrr transpose
#' @importFrom dplyr mutate bind_cols as_data_frame select left_join
#' @importFrom purrr pmap map_dfr transpose map
#' @importFrom tidyr unnest
#' @importFrom magrittr "%>%" "%<>%" "%$%"
#' @importFrom stringr str_replace
#' @importFrom crayon blue
#' @importFrom rlang is_true
#' @importFrom furrr future_pmap
#' @importFrom rsample prepper
grid_search <- function(resamples,
recipe,
param_grid,
scoring_func,
...,
verbosity = TRUE){
if(is_true(verbosity)){
verbosity <- Inf
}
else if(is_false(verbosity)){
verbosity <- -Inf
}
# Add assesment_ids to resamples_df. Will be added to final result
resamples %<>%
mutate(
test_ids = map(splits, ~ assessment_ids(.$data, .$in_id))
)
# Get param grid into expected format
param_grid <- transpose(param_grid)
# Extract name of target variable
rec_summ <- summary(recipe)
target_var <- rec_summ$variable[all_outcomes(rec_summ$role)]
if(length(target_var) != 1){
stop(paste('Your recipe should have exactly 1 outcome var (found',
length(target_var), ').'))
}
# Prepare recipes
recipes <- purrr::map(resamples$splits, prepper, recipe = recipe, retain = TRUE)
# Get scores using param grid
train_data <-
data_frame(
fold_id = resamples$id,
params = list(param_grid)
)
score <- future_pmap(
list(train_data$fold_id, resamples$splits, recipes, train_data$params),
function(fold_id, split, recipe, param_grid){
if(verbosity > 0){
cat(blue(paste(fold_id, '\n')))
}
grid_predict(
split = split,
recipe = recipe,
param_grid = param_grid,
target_var = target_var,
scoring_func = scoring_func,
...,
verbosity = verbosity - 1
)
}
)
train_data$score <- score
# Create paramset ids
params_df <-
train_data %>%
select(fold_id, params) %>%
unnest(params) %>%
mutate(param_id = rep(seq_along(param_grid), length.out = nrow(.)))
scores_df <-
train_data %>%
select(score) %>%
unnest(score)
res_df <-
bind_cols(
params_df,
scores_df
)
# Final result. Split params and scores into columns and add back assesment_ids
dplyr::bind_cols(
res_df,
transpose(res_df$params) %>% purrr::map(unlist) %>% dplyr::as_data_frame(),
transpose(res_df$score) %>% purrr::map(unlist) %>% dplyr::as_data_frame()
) %>%
select(-params, -score) %>%
left_join(
resamples %>% select(id, test_ids),
by = c('fold_id' = 'id')
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.