cvms: Cross-Validation for Model Selection

Documented in cross_validate_fn

#' @title Cross-validate custom model functions for model selection
#' @description
#'  \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
#'
#'  Cross-validate your model function with one or multiple model formulas at once.
#'  Perform repeated cross-validation. Preprocess the train/test split
#'  within the cross-validation. Perform hyperparameter tuning with grid search.
#'  Returns results in a \code{tibble} for easy comparison,
#'  reporting and further analysis.
#'
#'  Compared to \code{\link[cvms:cross_validate]{cross_validate()}},
#'  this function allows you supply a custom model function, a predict function,
#'  a preprocess function and the hyperparameter values to cross-validate.
#'
#'  Supports regression and classification (binary and multiclass).
#'  See \code{`type`}.
#'
#'  Note that some metrics may not be computable for some types
#'  of model objects.
#' @author Ludvig Renbo Olsen, \email{r-pkgs@@ludvigolsen.dk}
#' @export
#' @family validation functions
#' @inheritParams cross_validate
#' @inheritParams evaluate
#' @param formulas Model formulas as strings. (Character)
#'
#'  Will be converted to \code{\link[stats:formula]{formula}} objects
#'  before being passed to \code{`model_fn`}.
#'
#'  E.g. \code{c("y~x", "y~z")}.
#'
#'  Can contain random effects.
#'
#'  E.g. \code{c("y~x+(1|r)", "y~z+(1|r)")}.
#' @param model_fn Model function that returns a fitted model object.
#'  Will usually wrap an existing model function like \code{\link[e1071:svm]{e1071::svm}}
#'  or \code{\link[nnet:multinom]{nnet::multinom}}.
#'
#'  Must have the following function arguments:
#'
#'  \code{function(train_data, formula,}
#'
#'  \verb{         }\code{hyperparameters)}
#' @param predict_fn Function for predicting the targets in the test folds/sets using the fitted model object.
#'  Will usually wrap \code{\link[stats:predict]{stats::predict()}}, but doesn't have to.
#'
#'  Must have the following function arguments:
#'
#'  \code{function(test_data, model, formula,}
#'
#'  \verb{         }\code{hyperparameters, train_data)}
#'
#'  Must return predictions in the following formats, depending on \code{`type`}:
#'
#'  \subsection{Binomial}{
#'  \code{vector} or one-column \code{matrix} / \code{data.frame} with probabilities (0-1)
#'  \strong{of the second class, alphabetically}.
#'  E.g.:
#'
#'  \code{c(0.3, 0.5, 0.1, 0.5)}
#'
#'  N.B. When unsure whether a model type produces probabilities based off
#'  the alphabetic order of your classes, using 0 and 1 as classes in the
#'  dependent variable instead of the class names should increase the chance of
#'  getting probabilities of the right class.
#'  }
#'
#'  \subsection{Gaussian}{
#'  \code{vector} or one-column \code{matrix} / \code{data.frame} with the predicted value.
#'  E.g.:
#'
#'  \code{c(3.7, 0.9, 1.2, 7.3)}
#'  }
#'
#'  \subsection{Multinomial}{
#'  \code{data.frame} with one column per class containing probabilities of the class.
#'  Column names should be identical to how the class names are written in the target column.
#'  E.g.:
#'
#'  \tabular{rrr}{
#'   \strong{class_1} \tab \strong{class_2} \tab
#'   \strong{class_3} \cr
#'   0.269 \tab 0.528 \tab 0.203\cr
#'   0.368 \tab 0.322 \tab 0.310\cr
#'   0.375 \tab 0.371 \tab 0.254\cr
#'   ... \tab ... \tab ...}
#'  }
#' @param preprocess_fn Function for preprocessing the training and test sets.
#'
#'  Can, for instance, be used to standardize both the training and test sets
#'  with the scaling and centering parameters from the training set.
#'
#'  Must have the following function arguments:
#'
#'  \code{function(train_data, test_data,}
#'
#'  \verb{         }\code{formula, hyperparameters)}
#'
#'  Must return a \code{list} with the preprocessed \code{`train_data`} and \code{`test_data`}. It may also contain
#'  a \code{tibble} with the \code{parameters} used in preprocessing:
#'
#'  \code{list("train" = train_data,}
#'
#'  \verb{     }\code{"test" = test_data,}
#'
#'  \verb{     }\code{"parameters" = preprocess_parameters)}
#'
#'  Additional elements in the returned \code{list} will be ignored.
#'
#'  The optional parameters \code{tibble} will be included in the output.
#'  It could have the following format:
#'
#'  \tabular{rrr}{
#'   \strong{Measure} \tab \strong{var_1} \tab \strong{var_2} \cr
#'   Mean \tab 37.921 \tab 88.231\cr
#'   SD \tab 12.4 \tab 5.986\cr
#'   ... \tab ... \tab ...}
#'
#'  N.B. When \code{`preprocess_once`} is \code{FALSE}, the current formula and
#'  hyperparameters will be provided. Otherwise,
#'  these arguments will be \code{NULL}.
#' @param preprocess_once Whether to apply the preprocessing once
#'  (\strong{ignoring} the formula and hyperparameters arguments in \code{`preprocess_fn`})
#'  or for every model separately. (Logical)
#'
#'  When preprocessing does not depend on the current formula or hyperparameters,
#'  we can do the preprocessing of each train/test split once, to save time.
#'  This \strong{may require holding a lot more data in memory} though,
#'  why it is not the default setting.
#' @param hyperparameters Either a \code{named list} with hyperparameter values to combine in a grid
#'  or a \code{data.frame} with one row per hyperparameter combination.
#'
#'  \subsection{Named list for grid search}{
#'  Add \code{".n"} to sample the combinations. Can be the number of combinations to use,
#'  or a percentage between \code{0} and \code{1}.
#'
#'  E.g.
#'
#'  \code{list(".n" = 10,  # sample 10 combinations}
#'
#'  \verb{     }\code{"lrn_rate" = c(0.1, 0.01, 0.001),}
#'
#'  \verb{     }\code{"h_layers" = c(10, 100, 1000),}
#'
#'  \verb{     }\code{"drop_out" = runif(5, 0.3, 0.7))}
#'  }
#'
#'  \subsection{\code{data.frame} with specific hyperparameter combinations}{
#'  One row per combination to test.
#'
#'  E.g.
#'
#'  \tabular{rrr}{
#'   \strong{lrn_rate} \tab \strong{h_layers} \tab \strong{drop_out} \cr
#'   0.1 \tab 10 \tab 0.65\cr
#'   0.1 \tab 1000 \tab 0.65\cr
#'   0.01 \tab 1000 \tab 0.63\cr
#'   ... \tab ... \tab ...}
#'  }
#' @param metrics \code{list} for enabling/disabling metrics.
#'
#'   E.g. \code{list("RMSE" = FALSE)} would remove \code{RMSE} from the regression results,
#'   and \code{list("Accuracy" = TRUE)} would add the regular \code{Accuracy} metric
#'   to the classification results.
#'   Default values (\code{TRUE}/\code{FALSE}) will be used for the remaining available metrics.
#'
#'   You can enable/disable all metrics at once by including
#'   \code{"all" = TRUE/FALSE} in the \code{list}. This is done prior to enabling/disabling
#'   individual metrics, why f.i. \code{list("all" = FALSE, "RMSE" = TRUE)} would return only the \code{RMSE} metric.
#'
#'   The \code{list} can be created with
#'   \code{\link[cvms:gaussian_metrics]{gaussian_metrics()}},
#'   \code{\link[cvms:binomial_metrics]{binomial_metrics()}}, or
#'   \code{\link[cvms:multinomial_metrics]{multinomial_metrics()}}.
#'
#'   Also accepts the string \code{"all"}.
#' @param verbose Whether to message process information
#'  like the number of model instances to fit. (Logical)
#' @details
#'
#'  Packages used:
#'
#'  \subsection{Results}{
#'  \subsection{Shared}{
#'
#'  AIC : \code{\link[stats:AIC]{stats::AIC}}
#'
#'  AICc : \code{\link[MuMIn:AICc]{MuMIn::AICc}}
#'
#'  BIC : \code{\link[stats:BIC]{stats::BIC}}
#'
#'  }
#'  \subsection{Gaussian}{
#'
#'  r2m : \code{\link[MuMIn:r.squaredGLMM]{MuMIn::r.squaredGLMM}}
#'
#'  r2c : \code{\link[MuMIn:r.squaredGLMM]{MuMIn::r.squaredGLMM}}
#'
#'  }
#'  \subsection{Binomial and Multinomial}{
#'
#'  ROC and related metrics:
#'
#'  Binomial: \code{\link[pROC:roc]{pROC::roc}}
#'
#'  Multinomial: \code{\link[pROC:multiclass.roc]{pROC::multiclass.roc}}
#'  }
#'  }
#' @return
#'  \code{tibble} with results for each model.
#'
#'  N.B. The \strong{Fold} column in the nested \code{tibble}s contains the test fold in that train/test split.
#'
#'  \subsection{Shared across families}{
#'
#'  A nested \code{tibble} with \strong{coefficients} of the models from all iterations. The coefficients
#'  are extracted from the model object with \code{\link[parameters:model_parameters]{parameters::model_parameters()}} or
#'  \code{\link[stats:coef]{coef()}} (with some restrictions on the output).
#'  If these attempts fail, a default coefficients \code{tibble} filled with \code{NA}s is returned.
#'
#'  Nested \code{tibble} with the used \strong{preprocessing parameters},
#'  if a passed \code{preprocess_fn} returns the parameters in a \code{tibble}.
#'
#'  Number of \emph{total} \strong{folds}.
#'
#'  Number of \strong{fold columns}.
#'
#'  Count of \strong{convergence warnings}, using a limited set of keywords (e.g. "convergence"). If a
#'  convergence warning does not contain one of these keywords, it will be counted with \strong{other warnings}.
#'  Consider discarding models that did not converge on all iterations.
#'  Note: you might still see results, but these should be taken with a grain of salt!
#'
#'  Nested \code{tibble} with the \strong{warnings and messages} caught for each model.
#'
#'  A nested \strong{Process} information object with information
#'  about the evaluation.
#'
#'  Name of \strong{dependent} variable.
#'
#'  Names of \strong{fixed} effects.
#'
#'  Names of \strong{random} effects, if any.
#'
#'  }
#'
#'  ----------------------------------------------------------------
#'
#'  \subsection{Gaussian Results}{
#'
#'  ----------------------------------------------------------------
#'
#'  Average \strong{\code{RMSE}}, \strong{\code{MAE}}, \strong{\code{NRMSE(IQR)}},
#'  \strong{\code{RRSE}}, \strong{\code{RAE}}, \strong{\code{RMSLE}} of all the iterations*,
#'  \emph{\strong{omitting potential NAs} from non-converged iterations}.
#'
#'  See the additional metrics (disabled by default) at \code{\link[cvms:gaussian_metrics]{?gaussian_metrics}}.
#'
#'  A nested \code{tibble} with the \strong{predictions} and targets.
#'
#'  A nested \code{tibble} with the non-averaged \strong{results} from all iterations.
#'
#'  * In \emph{repeated cross-validation},
#'  the metrics are first averaged for each fold column (repetition) and then averaged again.
#'
#'  }
#'
#'  ----------------------------------------------------------------
#'
#'  \subsection{Binomial Results}{
#'
#'  ----------------------------------------------------------------
#'
#'  Based on the \strong{collected} predictions from the test folds*,
#'  a confusion matrix and a \code{ROC} curve are created to get the following:
#'
#'  \code{ROC}:
#'
#'  \strong{\code{AUC}}, \strong{\code{Lower CI}}, and \strong{\code{Upper CI}}
#'
#'  \code{Confusion Matrix}:
#'
#'  \strong{\code{Balanced Accuracy}},
#'  \strong{\code{F1}},
#'  \strong{\code{Sensitivity}},
#'  \strong{\code{Specificity}},
#'  \strong{\code{Positive Predictive Value}},
#'  \strong{\code{Negative Predictive Value}},
#'  \strong{\code{Kappa}},
#'  \strong{\code{Detection Rate}},
#'  \strong{\code{Detection Prevalence}},
#'  \strong{\code{Prevalence}}, and
#'  \strong{\code{MCC}} (Matthews correlation coefficient).
#'
#'  See the additional metrics (disabled by default) at
#'  \code{\link[cvms:binomial_metrics]{?binomial_metrics}}.
#'
#'  Also includes:
#'
#'  A nested \code{tibble} with \strong{predictions}, predicted classes (depends on \code{cutoff}), and the targets.
#'  Note, that the predictions are \emph{not necessarily} of the \emph{specified} \code{positive} class, but of
#'  the \emph{model's} positive class (second level of dependent variable, alphabetically).
#'
#'  The \code{\link[pROC:roc]{pROC::roc}} \strong{\code{ROC}} curve object(s).
#'
#'  A nested \code{tibble} with the \strong{confusion matrix}/matrices.
#'  The \code{Pos_} columns tells you whether a row is a
#'  True Positive (\code{TP}), True Negative (\code{TN}),
#'  False Positive (\code{FP}), or False Negative (\code{FN}),
#'  depending on which level is the "positive" class. I.e. the level you wish to predict.
#'
#'  A nested \code{tibble} with the \strong{results} from all fold columns.
#'
#'  The name of the \strong{Positive Class}.
#'
#'  * In \emph{repeated cross-validation}, an evaluation is made per fold column (repetition) and averaged.
#'
#'  }
#'
#'  ----------------------------------------------------------------
#'
#'  \subsection{Multinomial Results}{
#'
#'  ----------------------------------------------------------------
#'
#'  For each class, a \emph{one-vs-all} binomial evaluation is performed. This creates
#'  a \strong{Class Level Results} \code{tibble} containing the same metrics as the binomial results
#'  described above (excluding \code{MCC}, \code{AUC}, \code{Lower CI} and \code{Upper CI}),
#'  along with a count of the class in the target column (\strong{\code{Support}}).
#'  These metrics are used to calculate the \strong{macro-averaged} metrics. The nested class level results
#'  \code{tibble} is also included in the output \code{tibble},
#'  and could be reported along with the macro and overall metrics.
#'
#'  The output \code{tibble} contains the macro and overall metrics.
#'  The metrics that share their name with the metrics in the nested
#'  class level results \code{tibble} are averages of those metrics
#'  (note: does not remove \code{NA}s before averaging).
#'  In addition to these, it also includes the \strong{\code{Overall Accuracy}} and
#'  the multiclass \strong{\code{MCC}}.
#'
#'  \strong{Note:} \strong{\code{Balanced Accuracy}} is the macro-averaged metric,
#'  \emph{not} the macro sensitivity as sometimes used!
#'
#'  Other available metrics (disabled by default, see \code{metrics}):
#'  \strong{\code{Accuracy}},
#'  \emph{multiclass} \strong{\code{AUC}},
#'  \strong{\code{Weighted Balanced Accuracy}},
#'  \strong{\code{Weighted Accuracy}},
#'  \strong{\code{Weighted F1}},
#'  \strong{\code{Weighted Sensitivity}},
#'  \strong{\code{Weighted Sensitivity}},
#'  \strong{\code{Weighted Specificity}},
#'  \strong{\code{Weighted Pos Pred Value}},
#'  \strong{\code{Weighted Neg Pred Value}},
#'  \strong{\code{Weighted Kappa}},
#'  \strong{\code{Weighted Detection Rate}},
#'  \strong{\code{Weighted Detection Prevalence}}, and
#'  \strong{\code{Weighted Prevalence}}.
#'
#'  Note that the "Weighted" average metrics are weighted by the \code{Support}.
#'
#'  Also includes:
#'
#'  A nested \code{tibble} with the \strong{predictions}, predicted classes, and targets.
#'
#'  A \code{list} of \strong{ROC} curve objects when \code{AUC} is enabled.
#'
#'  A nested \code{tibble} with the multiclass \strong{Confusion Matrix}.
#'
#'  \strong{Class Level Results}
#'
#'  Besides the binomial evaluation metrics and the \code{Support},
#'  the nested class level results \code{tibble} also contains a
#'  nested \code{tibble} with the \strong{Confusion Matrix} from the one-vs-all evaluation.
#'  The \code{Pos_} columns tells you whether a row is a
#'  True Positive (\code{TP}), True Negative (\code{TN}),
#'  False Positive (\code{FP}), or False Negative (\code{FN}),
#'  depending on which level is the "positive" class. In our case, \code{1} is the current class
#'  and \code{0} represents all the other classes together.
#'  }
#' @examples
#' \donttest{
#' # Attach packages
#' library(cvms)
#' library(groupdata2) # fold()
#' library(dplyr) # %>% arrange() mutate()
#'
#' # Note: More examples of custom functions can be found at:
#' # model_fn: model_functions()
#' # predict_fn: predict_functions()
#' # preprocess_fn: preprocess_functions()
#'
#' # Data is part of cvms
#' data <- participant.scores
#'
#' # Set seed for reproducibility
#' set.seed(7)
#'
#' # Fold data
#' data <- fold(
#'   data,
#'   k = 4,
#'   cat_col = "diagnosis",
#'   id_col = "participant"
#' ) %>%
#'   mutate(diagnosis = as.factor(diagnosis)) %>%
#'   arrange(.folds)
#'
#' # Cross-validate multiple formulas
#'
#' formulas_gaussian <- c(
#'   "score ~ diagnosis",
#'   "score ~ age"
#' )
#' formulas_binomial <- c(
#'   "diagnosis ~ score",
#'   "diagnosis ~ age"
#' )
#'
#' #
#' # Gaussian
#' #
#'
#' # Create model function that returns a fitted model object
#' lm_model_fn <- function(train_data, formula, hyperparameters) {
#'   lm(formula = formula, data = train_data)
#' }
#'
#' # Create predict function that returns the predictions
#' lm_predict_fn <- function(test_data, model, formula,
#'                           hyperparameters, train_data) {
#'   stats::predict(
#'     object = model,
#'     newdata = test_data,
#'     type = "response",
#'     allow.new.levels = TRUE
#'   )
#' }
#'
#' # Cross-validate the model function
#' cross_validate_fn(
#'   data,
#'   formulas = formulas_gaussian,
#'   type = "gaussian",
#'   model_fn = lm_model_fn,
#'   predict_fn = lm_predict_fn,
#'   fold_cols = ".folds"
#' )
#'
#' #
#' # Binomial
#' #
#'
#' # Create model function that returns a fitted model object
#' glm_model_fn <- function(train_data, formula, hyperparameters) {
#'   glm(formula = formula, data = train_data, family = "binomial")
#' }
#'
#' # Create predict function that returns the predictions
#' glm_predict_fn <- function(test_data, model, formula,
#'                            hyperparameters, train_data) {
#'   stats::predict(
#'     object = model,
#'     newdata = test_data,
#'     type = "response",
#'     allow.new.levels = TRUE
#'   )
#' }
#'
#' # Cross-validate the model function
#' cross_validate_fn(
#'   data,
#'   formulas = formulas_binomial,
#'   type = "binomial",
#'   model_fn = glm_model_fn,
#'   predict_fn = glm_predict_fn,
#'   fold_cols = ".folds"
#' )
#'
#' #
#' # Support Vector Machine (svm)
#' # with hyperparameter tuning
#' #
#'
#' # Only run if the `e1071` package is installed
#' if (requireNamespace("e1071", quietly = TRUE)){
#'
#' # Create model function that returns a fitted model object
#' # We use the hyperparameters arg to pass in the kernel and cost values
#' svm_model_fn <- function(train_data, formula, hyperparameters) {
#'
#'   # Expected hyperparameters:
#'   #  - kernel
#'   #  - cost
#'   if (!"kernel" %in% names(hyperparameters))
#'     stop("'hyperparameters' must include 'kernel'")
#'   if (!"cost" %in% names(hyperparameters))
#'     stop("'hyperparameters' must include 'cost'")
#'
#'   e1071::svm(
#'     formula = formula,
#'     data = train_data,
#'     kernel = hyperparameters[["kernel"]],
#'     cost = hyperparameters[["cost"]],
#'     scale = FALSE,
#'     type = "C-classification",
#'     probability = TRUE
#'   )
#' }
#'
#' # Create predict function that returns the predictions
#' svm_predict_fn <- function(test_data, model, formula,
#'                            hyperparameters, train_data) {
#'   predictions <- stats::predict(
#'     object = model,
#'     newdata = test_data,
#'     allow.new.levels = TRUE,
#'     probability = TRUE
#'   )
#'
#'   # Extract probabilities
#'   probabilities <- dplyr::as_tibble(
#'     attr(predictions, "probabilities")
#'   )
#'
#'   # Return second column
#'   probabilities[[2]]
#' }
#'
#' # Specify hyperparameters to try
#' # The optional ".n" samples 4 combinations
#' svm_hparams <- list(
#'   ".n" = 4,
#'   "kernel" = c("linear", "radial"),
#'   "cost" = c(1, 5, 10)
#' )
#'
#' # Cross-validate the model function
#' cv <- cross_validate_fn(
#'   data,
#'   formulas = formulas_binomial,
#'   type = "binomial",
#'   model_fn = svm_model_fn,
#'   predict_fn = svm_predict_fn,
#'   hyperparameters = svm_hparams,
#'   fold_cols = ".folds"
#' )
#'
#' cv
#'
#' # The `HParams` column has the nested hyperparameter values
#' cv %>%
#'   select(Dependent, Fixed, HParams, `Balanced Accuracy`, F1, AUC, MCC) %>%
#'   tidyr::unnest(cols = "HParams") %>%
#'   arrange(desc(`Balanced Accuracy`), desc(F1))
#'
#' #
#' # Use parallelization
#' # The below examples show the speed gains when running in parallel
#' #
#'
#' # Attach doParallel and register four cores
#' # Uncomment:
#' # library(doParallel)
#' # registerDoParallel(4)
#'
#' # Specify hyperparameters such that we will
#' # cross-validate 20 models
#' hparams <- list(
#'   "kernel" = c("linear", "radial"),
#'   "cost" = 1:5
#' )
#'
#' # Cross-validate a list of 20 models in parallel
#' # Make sure to uncomment the parallel argument
#' system.time({
#'   cross_validate_fn(
#'     data,
#'     formulas = formulas_gaussian,
#'     type = "gaussian",
#'     model_fn = svm_model_fn,
#'     predict_fn = svm_predict_fn,
#'     hyperparameters = hparams,
#'     fold_cols = ".folds"
#'     #, parallel = TRUE  # Uncomment
#'   )
#' })
#'
#' # Cross-validate a list of 20 models sequentially
#' system.time({
#'   cross_validate_fn(
#'     data,
#'     formulas = formulas_gaussian,
#'     type = "gaussian",
#'     model_fn = svm_model_fn,
#'     predict_fn = svm_predict_fn,
#'     hyperparameters = hparams,
#'     fold_cols = ".folds"
#'     #, parallel = TRUE  # Uncomment
#'   )
#' })
#'
#' } # closes `e1071` package check
#' }
cross_validate_fn <- function(data,
                              formulas,
                              type,
                              model_fn,
                              predict_fn,
                              preprocess_fn = NULL,
                              preprocess_once = FALSE,
                              hyperparameters = NULL,
                              fold_cols = ".folds",
                              cutoff = 0.5,
                              positive = 2,
                              metrics = list(),
                              rm_nc = FALSE,
                              parallel = FALSE,
                              verbose = TRUE) {
  cross_validate_list(
    data = data,
    formulas = formulas,
    model_fn = model_fn,
    predict_fn = predict_fn,
    preprocess_fn = preprocess_fn,
    preprocess_once = preprocess_once,
    hyperparameters = hyperparameters,
    fold_cols = fold_cols,
    family = type,
    cutoff = cutoff,
    positive = positive,
    metrics = metrics,
    rm_nc = rm_nc,
    parallel_ = parallel,
    verbose = verbose
  )
}