Nothing
#' @title R6 Class to perform nested cross-validation experiments
#'
#' @description
#' The `MLNestedCV` class is used to construct a nested cross validation object
#' and to perform a nested cross validation for a specified machine learning
#' algorithm by performing a hyperparameter optimization with the in-sample
#' observations of each of the k outer folds and validate them directly on the
#' out-of-sample observations of the respective fold.
#'
#' @details
#' The `MLNestedCV` class requires to provide a named list of predefined
#' row indices for the outer cross validation folds, e.g., created with the
#' function [splitTools::create_folds()]. This list also defines the `k` of
#' the k-fold cross-validation. Furthermore, a strategy needs to be chosen
#' ("grid" or "bayesian") for the hyperparameter optimization as well as the
#' parameter `k_tuning` to define the number of inner cross validation folds.
#'
#' @seealso [splitTools::create_folds()]
#'
#' @examples
#' dataset <- do.call(
#' cbind,
#' c(sapply(paste0("col", 1:6), function(x) {
#' rnorm(n = 500)
#' },
#' USE.NAMES = TRUE,
#' simplify = FALSE
#' ),
#' list(target = sample(0:1, 500, TRUE))
#' ))
#'
#' fold_list <- splitTools::create_folds(
#' y = dataset[, 7],
#' k = 3,
#' type = "stratified",
#' seed = 123
#' )
#'
#' cv <- MLNestedCV$new(
#' learner = LearnerKnn$new(),
#' strategy = "grid",
#' fold_list = fold_list,
#' k_tuning = 3L,
#' seed = 123,
#' ncores = 2
#' )
#'
#' # learner args (not optimized)
#' cv$learner_args <- list(
#' l = 0,
#' test = parse(text = "fold_test$x")
#' )
#'
#' # parameters for hyperparameter tuning
#' cv$parameter_grid <- expand.grid(
#' k = seq(4, 16, 8)
#' )
#' cv$split_type <- "stratified"
#'
#' # performance parameters
#' cv$predict_args <- list(type = "response")
#' cv$performance_metric <- metric("bacc")
#'
#' # set data
#' cv$set_data(
#' x = data.matrix(dataset[, -7]),
#' y = dataset[, 7]
#' )
#'
#' cv$execute()
#'
#' @export
#'
MLNestedCV <- R6::R6Class( # nolint
classname = "MLNestedCV",
inherit = MLCrossValidation,
public = list(
#' @field strategy A character. The strategy to optimize the hyperparameters
#' (either `"grid"` or `"bayesian"`).
strategy = NULL,
#' @field parameter_bounds A named list of tuples to define the parameter
#' bounds of the Bayesian hyperparameter optimization. For further details
#' please see the documentation of the `ParBayesianOptimization` package.
parameter_bounds = NULL,
#' @field parameter_grid A matrix with named columns in which each column
#' represents a parameter that should be optimized and each row represents
#' a specific hyperparameter setting that should be tested throughout the
#' procedure. For `strategy = "grid"`, each row of the `parameter_grid` is
#' considered as a setting that is evaluated. For `strategy = "bayesian"`,
#' the `parameter_grid` is passed further on to the `initGrid` argument of
#' the function [ParBayesianOptimization::bayesOpt()] in order to
#' initialize the Bayesian process. The maximum rows considered for
#' initializing the Bayesian process can be specified with the R option
#' `option("mlexperiments.bayesian.max_init")`, which is set to `50L` by
#' default.
parameter_grid = NULL,
#' @field optim_args A named list of tuples to define the parameter
#' bounds of the Bayesian hyperparameter optimization. For further details
#' please see the documentation of the `ParBayesianOptimization` package.
optim_args = NULL,
#' @field split_type A character. The splitting strategy to construct the
#' k cross-validation folds. This parameter is passed further on to the
#' function [splitTools::create_folds()] and defaults to `"stratified"`.
split_type = NULL,
#' @field split_vector A vector If another criteria than the provided `y`
#' should be considered for generating the cross-validation folds, it can
#' be defined here. It is important, that a vector of the same length as
#' `x` is provided here.
split_vector = NULL,
#' @field k_tuning An integer to define the number of cross-validation folds
#' used to tune the hyperparameters.
k_tuning = NULL,
#' @description
#' Create a new `MLNestedCV` object.
#'
#' @param fold_list A named list of predefined row indices for the cross
#' validation folds, e.g., created with the function
#' [splitTools::create_folds()].
#' @param strategy A character. The strategy to optimize the hyperparameters
#' (either `"grid"` or `"bayesian"`).
#' @param learner An initialized learner object that inherits from class
#' `"MLLearnerBase"`.
#' @param k_tuning An integer to define the number of cross-validation folds
#' used to tune the hyperparameters.
#' @param seed An integer. Needs to be set for reproducibility purposes.
#' @param ncores An integer to specify the number of cores used for
#' parallelization (default: `-1L`).
#' @param return_models A logical. If the fitted models should be returned
#' with the results (default: `FALSE`).
#'
#' @details
#' The `MLNestedCV` class requires to provide a named list of predefined
#' row indices for the outer cross validation folds, e.g., created with
#' the function [splitTools::create_folds()]. This list also defines the
#' `k` of the k-fold cross-validation. Furthermore, a strategy needs to
#' be chosen ("grid" or "bayesian") for the hyperparameter optimization
#' as well as the parameter `k_tuning` to define the number of inner
#' cross validation folds.
#'
#' @seealso [splitTools::create_folds()]
#'
#' @examples
#' dataset <- do.call(
#' cbind,
#' c(sapply(paste0("col", 1:6), function(x) {
#' rnorm(n = 500)
#' },
#' USE.NAMES = TRUE,
#' simplify = FALSE
#' ),
#' list(target = sample(0:1, 500, TRUE))
#' ))
#'
#' fold_list <- splitTools::create_folds(
#' y = dataset[, 7],
#' k = 3,
#' type = "stratified",
#' seed = 123
#' )
#'
#' cv <- MLNestedCV$new(
#' learner = LearnerKnn$new(),
#' strategy = "grid",
#' fold_list = fold_list,
#' k_tuning = 3L,
#' seed = 123,
#' ncores = 2
#' )
#'
initialize = function(
learner,
strategy = c("grid", "bayesian"),
k_tuning,
fold_list,
seed,
ncores = -1L,
return_models = FALSE
) {
super$initialize(
learner = learner,
fold_list = fold_list,
seed = seed,
ncores = ncores,
return_models = return_models
)
stopifnot(
"`fun_optim_cv` must not be `NULL`" =
!is.null(self$learner$.__enclos_env__$private$fun_optim_cv),
"`k_tuning` must be an integer >= 3" = as.integer(k_tuning) >= 3L,
"`k_tuning` must be an integer" = is.integer(as.integer(k_tuning))
)
self$k_tuning <- as.integer(k_tuning)
strategy <- match.arg(strategy)
self$strategy <- strategy
},
#' @description
#' Execute the nested cross validation.
#'
#' @return The function returns a data.table with the results of the nested
#' cross validation. More results are accessible from the field `$results`
#' of the `MLNestedCV` class.
#'
#' @details
#' All results of the cross validation are saved in the field `$results` of
#' the `MLNestedCV` class. After successful execution of the nested cross
#' validation, `$results` contains a list with the items:
#'
#' * "results.optimization" A list with the results of the hyperparameter
#' optimization.
#' * "fold" A list of folds containing the following items for each
#' cross validation fold:
#' + "fold_ids" A vector with the utilized in-sample row indices.
#' + "ground_truth" A vector with the ground truth.
#' + "predictions" A vector with the predictions.
#' + "learner.args" A list with the arguments provided to the learner.
#' + "model" If `return_models = TRUE`, the fitted model.
#' * "summary" A data.table with the summarized results (same as
#' the returned value of the `execute` method).
#' * "performance" A list with the value of the performance metric
#' calculated for each of the cross validation folds.
#'
#' @examples
#' dataset <- do.call(
#' cbind,
#' c(sapply(paste0("col", 1:6), function(x) {
#' rnorm(n = 500)
#' },
#' USE.NAMES = TRUE,
#' simplify = FALSE
#' ),
#' list(target = sample(0:1, 500, TRUE))
#' ))
#'
#' fold_list <- splitTools::create_folds(
#' y = dataset[, 7],
#' k = 3,
#' type = "stratified",
#' seed = 123
#' )
#'
#' cv <- MLNestedCV$new(
#' learner = LearnerKnn$new(),
#' strategy = "grid",
#' fold_list = fold_list,
#' k_tuning = 3L,
#' seed = 123,
#' ncores = 2
#' )
#'
#' # learner args (not optimized)
#' cv$learner_args <- list(
#' l = 0,
#' test = parse(text = "fold_test$x")
#' )
#'
#' # parameters for hyperparameter tuning
#' cv$parameter_grid <- expand.grid(
#' k = seq(4, 68, 8)
#' )
#' cv$split_type <- "stratified"
#'
#' # performance parameters
#' cv$predict_args <- list(type = "response")
#' cv$performance_metric <- metric("bacc")
#'
#' # set data
#' cv$set_data(
#' x = data.matrix(dataset[, -7]),
#' y = dataset[, 7]
#' )
#'
#' cv$execute()
#'
execute = function() {
private$prepare()
stopifnot(
"`strategy` must not be `NULL`" = !is.null(self$strategy),
"For `strategy = 'bayesian', `parameter_bounds` must be provided.\
For `strategy = 'grid', `parameter_grid` must be provided" = ifelse(
test = self$strategy == "bayesian",
yes = !is.null(self$parameter_bounds),
no = !is.null(self$parameter_grid)
),
"`execute_params` must not be empty" =
!is.null(private$method_helper$execute_params)
)
return(.run_cv(self = self, private = private))
}
),
private = list(
cv_run_model = function(...) {
kwargs <- list(...)
args <- kdry::list.append(
list(
self = self,
private = private
),
kwargs
)
do.call(.cv_run_nested_model, args)
}
)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.