tidycrossval: Hyperparameter tuning and cross validation...

Description Author(s) Examples

Description

Tidycrossval is the beginnings of a package for hyperparameter tuning and cross-validation that follows tidymodels principles. At present the package is designed mainly for my own analysis purposes, but it already contains some handy functions for hyperparameter tuning and cross-validation. It is designed to integrate with the recipes, parsnip and rsample packages.

Author(s)

Steven Pawley, dr.stevenpawley@gmail.com

Examples

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
library(tidymodels)
library(tidyverse)
library(tidycrossval)

# Load the example iris dataset.
data(iris)

# Fix the RNG seed: the resampling functions used below assign rows to folds
# at random, so without a seed the splits (and the tuning results derived
# from them) differ on every run of this example.
set.seed(1234)

# Preprocessing recipe. The `threshold` argument of the correlation filter is
# set to varying() because it is tuned just like a model hyperparameter. Each
# step gets an explicit id so that the tuning parameter defined later can be
# tied to the "correlation_filter" step by name.
rec <- iris %>%
  recipe(Species ~ .) %>%
  step_scale(all_predictors(), id = "scale") %>%
  step_center(all_predictors(), id = "center") %>%
  step_corr(all_predictors(), threshold = varying(), id = "correlation_filter")

# Model specification: a k-nearest-neighbour classifier on the kknn engine,
# with the number of neighbours left open for tuning.
clf <- nearest_neighbor(mode = "classification", neighbors = varying()) %>%
  set_engine("kknn")

# Nested resampling object: 2-fold cross validation on the outside and a
# single Monte Carlo split inside each outer fold.
folds <- iris %>%
  nested_cv(outside = vfold_cv(v = 2), inside = mc_cv(times = 1))

# Tuning parameter for the correlation-filter threshold.
# The name given in `label` is what links the parameter to the recipe step;
# here it combines the step id ("correlation_filter") and the argument name
# ("threshold") separated by a double underscore.
threshold <- new_quant_param(
  type = "double",
  range = c(0.7, 1.0),
  inclusive = c(TRUE, TRUE),
  trans = NULL,
  label = c(correlation_filter__threshold = "threshold")
)

# Regular grid over both tuned quantities, three levels each: the number of
# neighbours between 2 and 7, and the threshold narrowed to 0.8-1.0.
params <- grid_regular(
  neighbors(c(2, 7)),
  threshold %>% range_set(c(0.8, 1.0)),
  levels = 3L
)

# Tune the hyperparameters on the inner folds, scoring each grid candidate by
# accuracy (higher is better).
scores <- folds %>%
  tune(
    object = clf,
    recipe = rec,
    param_grid = params,
    scoring = accuracy,
    maximize = TRUE
  )

# Fit and score on the outer folds, reporting accuracy and F-measure.
scores <- scores %>%
  cross_validate(
    object = clf,
    recipe = rec,
    scoring = metric_set(accuracy, f_meas)
  )

# Look up the best overall hyperparameters once and reuse the result, so the
# model and the recipe are updated from the same selection and select_best()
# is not evaluated twice.
best_params <- select_best(scores)

# Create a new model and recipe using the best-scoring hyperparameters.
clf_tuned <- clf %>%
  update(neighbors = best_params$neighbors)
rec_tuned <- rec %>%
  update(
    correlation_filter__threshold = best_params$correlation_filter__threshold
  )

# Fit the tuned model on the preprocessed training data and predict on it.
rec_prepped <- prep(rec_tuned)
train_processed <- juice(rec_prepped)
clf_tuned <- clf_tuned %>% fit(formula(rec_prepped), train_processed)
predict(clf_tuned, train_processed)

# Alternative workflow: group the recipe and the model into a single pipeline
# object instead of passing them around separately.
clf <- nearest_neighbor(mode = "classification", neighbors = varying()) %>%
  set_engine("kknn")

clf <- pipeline(rec, clf)

# Tune on the inner folds; only the pipeline object is passed here.
scores <- folds %>%
  tune(
    object = clf,
    param_grid = params,
    scoring = accuracy,
    maximize = TRUE
  )

# Fit and score on the outer folds.
# NOTE(review): `recipe = rec` looks redundant here because the pipeline
# already contains the recipe (tune() above omits it) -- confirm against the
# cross_validate() signature before removing.
scores <- scores %>%
  cross_validate(
    object = clf,
    recipe = rec,
    scoring = metric_set(accuracy, f_meas)
  )

# Splice the best hyperparameters into the pipeline and fit it on the raw
# data in one go.
clf_tuned <- clf %>%
  update(!!!select_best(scores)) %>%
  fit(data = iris)

predict(clf_tuned, iris)

stevenpawley/tidycrossval documentation built on Oct. 3, 2019, 3:32 p.m.