# R/tidycrossval.R

#' tidycrossval: Hyperparameter tuning and cross-validation using tidymodels principles
#'
#' \pkg{tidycrossval} is an early-stage package for hyperparameter tuning and
#' cross-validation using tidymodels principles. Currently, the package is
#' mainly designed for my own analysis purposes, but it already contains some
#' handy functions that enable hyperparameter tuning and cross-validation. The
#' package is designed to integrate with the \pkg{recipes}, \pkg{parsnip} and
#' \pkg{rsample} packages.
#'
#' @examples
#' library(tidymodels)
#' library(tidyverse)
#' library(tidycrossval)
#'
#' # load the example iris dataset
#' data(iris)
#'
#' # create a preprocessing recipe and mark the step_corr threshold as varying()
#' # because it will be tuned in the same way as a model hyperparameter
#' rec <- iris %>%
#'  recipe(Species ~ .) %>%
#'  step_scale(all_predictors(), id = "scale") %>%
#'  step_center(all_predictors(), id = "center") %>%
#'  step_corr(all_predictors(), threshold = varying(), id = "correlation_filter")
#'
#' # create a model specification
#' clf <- nearest_neighbor(mode = "classification", neighbors = varying()) %>%
#'     set_engine("kknn")
#'
#' # create a nested_cv rsample object
#' folds <- iris %>%
#'     nested_cv(outside = vfold_cv(v = 2), inside = mc_cv(times = 1))
#'
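#' # define a threshold tuning parameter
#' # the name of the label ties the parameter to the recipe step: here it is the
#' # step id and the argument name joined by a double underscore
#' # (correlation_filter__threshold)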
#' threshold <- new_quant_param(
#'     type = "double",
#'     range = c(0.7, 1.0),
#'     inclusive = c(TRUE, TRUE),
#'     trans = NULL,
#'     label = c(correlation_filter__threshold = "threshold"))
#'
#' params <- grid_regular(
#'     neighbors(c(2, 7)),
#'     threshold %>% range_set(c(0.8, 1.0)),
#'     levels = 3L
#'    )
#'
#' # perform hyperparameter tuning on the inner folds
#' scores <- folds %>%
#'    tune(object = clf, recipe = rec, param_grid = params, scoring = accuracy, maximize = TRUE)
#'
#' # fit and score the outer folds
#' scores <- scores %>%
#'     cross_validate(object = clf, recipe = rec, scoring = metric_set(accuracy, f_meas))
#'
#' # create a new model and recipe using the best overall scoring hyperparameters
#' clf_tuned <- clf %>%
#'     update(neighbors = select_best(scores)$neighbors)
#' rec_tuned <- rec %>%
#'     update(correlation_filter__threshold = select_best(scores)$correlation_filter__threshold)
#'
#' # fit new model after tuning
#' rec_prepped <- prep(rec_tuned)
#' clf_tuned <- clf_tuned %>% fit(formula(rec_prepped), juice(rec_prepped))
#' predict(clf_tuned, juice(rec_prepped))
#'
#' # a pipeline object can also be used to bundle the recipe and the model
#' # into a single object
#' clf <- nearest_neighbor(mode = "classification", neighbors = varying()) %>%
#'     set_engine("kknn")
#'
#' clf <- pipeline(rec, clf)
#'
#' scores <- folds %>%
#'    tune(object = clf, param_grid = params, scoring = accuracy, maximize = TRUE)
#'
#' scores <- scores %>%
#'     cross_validate(object = clf, recipe = rec, scoring = metric_set(accuracy, f_meas))
#'
#' clf_tuned <- clf %>%
#'     update(!!!select_best(scores)) %>%
#'     fit(data = iris)
#'
#' predict(clf_tuned, iris)

#' @author Steven Pawley, \email{dr.stevenpawley@@gmail.com}

#' @docType package
#' @name tidycrossval
NULL
# stevenpawley/tidycrossval documentation built on Oct. 3, 2019, 3:32 p.m.