# R/grid_search.R

# library(dplyr)
# library(purrr)
# library(foreach)
# library(magrittr)
# library(stringr)

#' Train, predict and score given a resample and a recipe.
#' @description A general-purpose helper: plug in your custom `scoring_func`.
#' @param split A single split object. See `rsample` package.
#' @param rec Recipe. Must have been prepped with `retain = TRUE`. See the `recipes` package.
#' @param params List of parameters passed to the `scoring_func` function.
#' @param target_var Name of the target variable in the resample.
#' @param scoring_func Your custom train/predict/score function.
#' @param ... Optional params passed to scoring_func.
#' @return The output of `scoring_func`: a list with a single element `score`
#' when `scoring_func` returns a bare numeric score, otherwise the data.frame
#' of scores it returned.
#' @details `scoring_func` can return a single score as a numeric vector,
#' or multiple scores in a data.frame.
#' @import recipes
#' @importFrom rsample assessment
#' @importFrom dplyr bind_cols
#' @importFrom rlang is_true is_false
#' @importFrom rlang is_bare_numeric
train_predict <- function(split, rec, params, target_var, scoring_func, ...){
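  # juice() pulls the training data retained in the prepped recipe, while
  # bake() applies the same preprocessing to the split's assessment rows.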
  
  train_df <- as.data.frame(juice(rec, all_predictors(), all_outcomes()))
  eval_df <- as.data.frame(bake(rec, assessment(split), all_predictors(), all_outcomes()))
  
  score <- scoring_func(train_df, target_var, as.list(params), eval_df, ...)
  
  if(is_bare_numeric(score)){
    list(score = score)
  }else{
    score
  }
     
}

#' For a grid of params, return a list of model scores.
#' @description A vectorized form of `train_predict`, mapped over `param_grid`.
#' @param split A resampling split. See the `rsample` package.
#' @param target_var Name of the target variable in the resample.
#' @param recipe Recipe. Must have been prepped with `retain = TRUE`. See the `recipes` package.
#' @param param_grid A list of parameter lists; each element is passed to `scoring_func`.
#' @param scoring_func Your custom train/predict/score function.
#' @param score_var Name of the score to use, in case `scoring_func` returns
#' several scores. If `NULL` (the default), the first score is used.
#' @param ... Optional params passed to scoring_func.
#' @param verbosity If > 0, print paramset index.
#' @return A list with one element per parameter set, each holding the score(s)
#' returned by `train_predict`.
#' @importFrom purrr imap
#' @importFrom dplyr bind_rows
#' @importFrom crayon green red
grid_predict <- function(split, 
                         recipe, 
                         param_grid, 
                         target_var, 
                         scoring_func, 
                         score_var = NULL,
                         ...,
                         verbosity){

  params_df <- NULL
  scores <- NULL
  score_length <- 1
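  # These accumulators are updated via `<<-` inside the loop below; the value
  # returned by grid_predict() is the list produced by imap().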
  
  imap(param_grid,
      function(params, index){
        
        if(verbosity > 0){
          cat(green(paste('Paramset', index, '/', length(param_grid), '\n')))
        }
          
        score <- train_predict(split, recipe, params, target_var, scoring_func, ...)
        
        # On the first paramset, remember the length of the train_predict output
        if(index == 1){
          score_length <<- length(score)
        }
        
        if(is.null(score_var)){
          score_var <- 1
        }
        
        scores <<- c(scores, score[[score_var]])
        params_df <<- bind_rows(params_df, params)
        score
        
      })
}


#' Perform grid search over a grid of parameters.
#' @param resamples A data.frame with columns `splits` and `id`, created using the `rsample` package.
#' @param recipe The recipe to use. See package `recipes`.
#' @param param_grid Data frame of parameter combinations (e.g. generated by `expand.grid`).
#' @param scoring_func Your custom train/predict/score function. 
#' Must take as parameters: 
#' \itemize{
#'     \item a training dataframe
#'     \item the name of the target variable in the training dataframe
#'     \item a list of parameters (these are the hyperparameters we are tuning)
#'     \item an evaluation dataframe
#'     \item dots. These are additional non-tunable parameters that could be passed to the function.
#' }
#' @param ... Optional params passed to scoring_func.
#' @param verbosity Level of verbosity. Integer or TRUE/FALSE for max/min verbosity.
#' @return A tidy data.frame, with one column per parameter, columns to identify the
#' paramset and the fold, a column giving the row indices of the evaluation dataset,
#' and columns for the performance scores (these are taken from the scoring function if
#' it returned a data.frame, otherwise it will just be a _score_ column).
#' @details `scoring_func` can return a single score as a numeric vector, 
#' or multiple scores in a data.frame. 
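#'
#' The example below is an illustrative sketch and is not run: `rmse_scorer`
#' is a hypothetical scoring function written here for illustration only, and
#' is not part of the package.
#' @examples
#' \dontrun{
#' library(rsample)
#' library(recipes)
#'
#' folds <- vfold_cv(mtcars, v = 3)
#' rec <- recipe(mpg ~ ., data = mtcars)
#'
#' # Fit on `train_df`, predict on `eval_df`, return a single numeric score.
#' # `params` is ignored because a plain linear model has nothing to tune.
#' rmse_scorer <- function(train_df, target_var, params, eval_df, ...) {
#'   fit <- lm(reformulate('.', response = target_var), data = train_df)
#'   preds <- predict(fit, newdata = eval_df)
#'   sqrt(mean((eval_df[[target_var]] - preds)^2))
#' }
#'
#' # One row per hyperparameter combination; a single dummy value keeps the
#' # sketch short.
#' param_grid <- expand.grid(dummy = 1)
#'
#' grid_search(folds, rec, param_grid, rmse_scorer, verbosity = FALSE)
#' }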
#' @export
#' @importFrom purrr transpose map
#' @importFrom dplyr mutate bind_cols data_frame as_data_frame select left_join
#' @importFrom tidyr unnest
#' @importFrom magrittr "%>%" "%<>%" "%$%"
#' @importFrom stringr str_replace
#' @importFrom crayon blue
#' @importFrom rlang is_true is_false
#' @importFrom furrr future_pmap
#' @importFrom rsample prepper
grid_search <- function(resamples, 
                        recipe, 
                        param_grid, 
                        scoring_func, 
                        ...,
                        verbosity = TRUE){
  
  if(is_true(verbosity)){
    verbosity <- Inf
  } else if(is_false(verbosity)){
    verbosity <- -Inf
  }
  
  # Add assessment row ids to the resamples data.frame; joined back into the final result
  resamples %<>%
    mutate(
      test_ids = map(splits, ~ assessment_ids(.$data, .$in_id))
    )
  
  # Convert the parameter data.frame into a list of per-combination parameter lists
  param_grid <- transpose(param_grid)
  
  # Extract name of target variable
  rec_summ <- summary(recipe)
  target_var <- rec_summ$variable[rec_summ$role == 'outcome']
  
  if(length(target_var) != 1){
    stop(paste0('Your recipe should have exactly 1 outcome variable (found ',
                length(target_var), ').'))
  }
  
  # Prep the recipe on each split's analysis set, retaining the training data for juice()
  recipes <- purrr::map(resamples$splits, prepper, recipe = recipe, retain = TRUE)
  
  # Get scores using param grid
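  # `train_data` has one row per fold; its `params` list column holds the full
  # (transposed) parameter grid for every fold.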
  train_data <- 
    data_frame(
      fold_id = resamples$id,
      params = list(param_grid)
    ) 
  
  score <- future_pmap(
        list(train_data$fold_id, resamples$splits, recipes, train_data$params),
        
        function(fold_id, split, recipe, param_grid){
          if(verbosity > 0){
            cat(blue(paste(fold_id, '\n')))
          }
          grid_predict(
            split = split,
            recipe = recipe,
            param_grid = param_grid,
            target_var = target_var,
            scoring_func = scoring_func,
            ...,
            verbosity = verbosity - 1
          )
        }
      )
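  # `score` is a nested list: one element per fold (from future_pmap), each
  # holding one score object per parameter set (from grid_predict).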
  
  train_data$score <- score
  
  # Create paramset ids
  params_df <- 
    train_data %>% 
    select(fold_id, params) %>%
    unnest(params) %>% 
    mutate(param_id = rep(seq_along(param_grid), length.out = nrow(.)))
  
  # Expand scores: one row per (fold, paramset) combination, with `score`
  # remaining a list column of raw score objects
  scores_df <- 
    train_data %>% 
    select(score) %>%
    unnest(score)
  
  res_df <- 
    bind_cols(
      params_df,
      scores_df
    )
  
  # Final result: spread params and scores into columns and join back the assessment ids
  dplyr::bind_cols(
    res_df,
    transpose(res_df$params) %>% purrr::map(unlist) %>% dplyr::as_data_frame(),
    transpose(res_df$score) %>% purrr::map(unlist) %>% dplyr::as_data_frame()
  ) %>% 
    select(-params, -score) %>%
    left_join(
      resamples %>% select(id, test_ids),
      by = c('fold_id' = 'id')
    )
  
}