# __________________ #< ccfcdb8c94c692518bdaf6f1509380e1 ># __________________
# Cross-validate lm lmer glm glmer ####
#' @title Cross-validate regression models for model selection
#' @description
#' \Sexpr[results=rd, stage=render]{lifecycle::badge("stable")}
#'
#' Cross-validate one or multiple linear or logistic regression
#' models at once. Perform repeated cross-validation.
#' Returns results in a \code{tibble} for easy comparison,
#' reporting and further analysis.
#'
#' See \code{\link[cvms:cross_validate_fn]{cross_validate_fn()}} for use
#' with custom model functions.
#' @author Ludvig Renbo Olsen, \email{r-pkgs@@ludvigolsen.dk}
#' @author Benjamin Hugh Zachariae
#' @export
#' @family validation functions
#' @param data \code{data.frame}.
#'
#' Must include one or more grouping factors for identifying folds
#' - as made with \code{\link[groupdata2:fold]{groupdata2::fold()}}.
#' @param formulas Model formulas as strings. (Character)
#'
#' E.g. \code{c("y~x", "y~z")}.
#'
#' Can contain random effects.
#'
#' E.g. \code{c("y~x+(1|r)", "y~z+(1|r)")}.
#' @param fold_cols Name(s) of grouping factor(s) for identifying folds. (Character)
#'
#' Include names of multiple grouping factors for repeated cross-validation.
#' @param family Name of the family. (Character)
#'
#' Currently supports \strong{\code{"gaussian"}} for linear regression
#' with \code{\link[stats:lm]{lm()}} / \code{\link[lme4:lmer]{lme4::lmer()}}
#' and \strong{\code{"binomial"}} for binary classification
#' with \code{\link[stats:glm]{glm()}} / \code{\link[lme4:glmer]{lme4::glmer()}}.
#'
#' See \code{\link[cvms:cross_validate_fn]{cross_validate_fn()}} for use with other model functions.
#' @param control Control structure for mixed model fitting
#' (with \code{\link[lme4:lmer]{lme4::lmer()}} or \code{\link[lme4:glmer]{lme4::glmer()}}),
#' as created with \code{\link[lme4:lmerControl]{lme4::lmerControl()}} or
#' \code{\link[lme4:glmerControl]{lme4::glmerControl()}}.
#'
#' N.B. Ignored if fitting \code{\link[stats:lm]{lm()}} or \code{\link[stats:glm]{glm()}} models.
#' @param REML Whether to use Restricted Maximum Likelihood. (Logical)
#' @param cutoff Threshold for predicted classes. (Numeric)
#'
#' N.B. \strong{Binomial models only}.
#' @param positive Level from dependent variable to predict.
#' Either as character (\emph{preferable}) or level index (\code{1} or \code{2} - alphabetically).
#'
#' E.g. if we have the levels \code{"cat"} and \code{"dog"} and we want \code{"dog"} to be the positive class,
#' we can either provide \code{"dog"} or \code{2}, as alphabetically, \code{"dog"} comes after \code{"cat"}.
#'
#' \strong{Note:} For \emph{reproducibility}, it's preferable to \strong{specify the name directly}, as
#' different \code{\link[base:locales]{locales}} may sort the levels differently.
#'
#' Used when calculating confusion matrix metrics and creating \code{ROC} curves.
#'
#' The \code{Process} column in the output can be used to verify this setting.
#'
#' N.B. Only affects evaluation metrics, not the model training or returned predictions.
#'
#' N.B. \strong{Binomial models only}.
#' @param metrics \code{list} for enabling/disabling metrics.
#'
#' E.g. \code{list("RMSE" = FALSE)} would remove \code{RMSE} from the results,
#' and \code{list("Accuracy" = TRUE)} would add the regular \code{Accuracy} metric
#' to the classification results.
#' Default values (\code{TRUE}/\code{FALSE}) will be used for the remaining available metrics.
#'
#' You can enable/disable all metrics at once by including
#' \code{"all" = TRUE/FALSE} in the \code{list}. This is applied prior to
#' enabling/disabling the individual metrics, which is why
#' \code{list("all" = FALSE, "RMSE" = TRUE)}
#' would return only the \code{RMSE} metric.
#'
#' The \code{list} can be created with
#' \code{\link[cvms:gaussian_metrics]{gaussian_metrics()}} or
#' \code{\link[cvms:binomial_metrics]{binomial_metrics()}}.
#'
#' Also accepts the string \code{"all"}.
#' @param preprocessing Name of preprocessing to apply.
#'
#' Available preprocessings are:
#'
#' \tabular{ll}{
#' \strong{Name} \tab \strong{Description} \cr
#' "standardize" \tab Centers and scales the numeric predictors.\cr
#' "range" \tab Normalizes the numeric predictors to the \code{0}-\code{1} range.
#' Values outside the min/max range in the test fold are truncated to \code{0}/\code{1}.\cr
#' "scale" \tab Scales the numeric predictors to have a standard deviation of one.\cr
#' "center" \tab Centers the numeric predictors to have a mean of zero.\cr
#' }
#'
#' The preprocessing parameters (\code{mean}, \code{SD}, etc.) are extracted from the training folds and
#' applied to both the training folds and the test fold.
#' They are returned in the \strong{Preprocess} column for inspection.
#'
#' N.B. The preprocessings should not affect the results
#' to a noticeable degree, although \code{"range"} might due to the truncation.
#' @param rm_nc Remove non-converged models from output. (Logical)
#' @param verbose Whether to message process information
#' like the number of model instances to fit and which model function was applied. (Logical)
#' @param parallel Whether to cross-validate the \code{list} of models in parallel. (Logical)
#'
#' Remember to register a parallel backend first.
#' E.g. with \code{doParallel::registerDoParallel}.
#' @param link,models,model_verbose Deprecated.
#' @details
#'
#' Packages used:
#'
#' \subsection{Models}{
#'
#' Gaussian: \code{\link[stats:lm]{stats::lm}}, \code{\link[lme4:lmer]{lme4::lmer}}
#'
#' Binomial: \code{\link[stats:glm]{stats::glm}}, \code{\link[lme4:glmer]{lme4::glmer}}
#' }
#' \subsection{Results}{
#' \subsection{Shared}{
#'
#' \code{AIC} : \code{\link[stats:AIC]{stats::AIC}}
#'
#' \code{AICc} : \code{\link[MuMIn:AICc]{MuMIn::AICc}}
#'
#' \code{BIC} : \code{\link[stats:BIC]{stats::BIC}}
#'
#' }
#' \subsection{Gaussian}{
#'
#' \code{r2m} : \code{\link[MuMIn:r.squaredGLMM]{MuMIn::r.squaredGLMM}}
#'
#' \code{r2c} : \code{\link[MuMIn:r.squaredGLMM]{MuMIn::r.squaredGLMM}}
#'
#' }
#' \subsection{Binomial}{
#'
#' \code{ROC and AUC}: \code{\link[pROC:roc]{pROC::roc}}
#'
#' }
#' }
#' @return
#' \code{tibble} with results for each model.
#'
#' \subsection{Shared across families}{
#' A nested \code{tibble} with \strong{coefficients} of the models from all iterations.
#'
#' Number of \emph{total} \strong{folds}.
#'
#' Number of \strong{fold columns}.
#'
#' Count of \strong{convergence warnings}. Consider discarding models that did not converge on all
#' iterations. Note: you might still see results, but these should be taken with a grain of salt!
#'
#' Count of \strong{other warnings}. These are warnings without keywords such as "convergence".
#'
#' Count of \strong{Singular Fit messages}.
#' See \code{\link[lme4:isSingular]{lme4::isSingular}} for more information.
#'
#' Nested \code{tibble} with the \strong{warnings and messages} caught for each model.
#'
#' A nested \strong{Process} information object describing the evaluation.
#'
#' Name of \strong{dependent} variable.
#'
#' Names of \strong{fixed} effects.
#'
#' Names of \strong{random} effects, if any.
#'
#' Nested \code{tibble} with \strong{preprocess}ing parameters, if any.
#' }
#'
#' ----------------------------------------------------------------
#'
#' \subsection{Gaussian Results}{
#'
#' ----------------------------------------------------------------
#'
#' Average \strong{\code{RMSE}}, \strong{\code{MAE}}, \strong{\code{NRMSE(IQR)}},
#' \strong{\code{RRSE}}, \strong{\code{RAE}}, \strong{\code{RMSLE}},
#' \strong{\code{AIC}}, \strong{\code{AICc}},
#' and \strong{\code{BIC}} of all the iterations*,
#' \emph{\strong{omitting potential NAs} from non-converged iterations}.
#' Note that the Information Criterion metrics (\code{AIC}, \code{AICc}, and \code{BIC}) are also averages.
#'
#' See the additional metrics (disabled by default) at \code{\link[cvms:gaussian_metrics]{?gaussian_metrics}}.
#'
#' A nested \code{tibble} with the \strong{predictions} and targets.
#'
#' A nested \code{tibble} with the non-averaged \strong{results} from all iterations.
#'
#' * In \emph{repeated cross-validation},
#' the metrics are first averaged for each fold column (repetition) and then averaged again.
#'
#' }
#'
#' ----------------------------------------------------------------
#'
#' \subsection{Binomial Results}{
#'
#' ----------------------------------------------------------------
#'
#' Based on the \strong{collected} predictions from the test folds*,
#' a confusion matrix and a \code{ROC} curve are created to get the following:
#'
#' \code{ROC}:
#'
#' \strong{\code{AUC}}, \strong{\code{Lower CI}}, and \strong{\code{Upper CI}}
#'
#' \code{Confusion Matrix}:
#'
#' \strong{\code{Balanced Accuracy}},
#' \strong{\code{F1}},
#' \strong{\code{Sensitivity}},
#' \strong{\code{Specificity}},
#' \strong{\code{Positive Predictive Value}},
#' \strong{\code{Negative Predictive Value}},
#' \strong{\code{Kappa}},
#' \strong{\code{Detection Rate}},
#' \strong{\code{Detection Prevalence}},
#' \strong{\code{Prevalence}}, and
#' \strong{\code{MCC}} (Matthews correlation coefficient).
#'
#' See the additional metrics (disabled by default) at
#' \code{\link[cvms:binomial_metrics]{?binomial_metrics}}.
#'
#' Also includes:
#'
#' A nested \code{tibble} with \strong{predictions}, predicted classes (depends on \code{cutoff}), and the targets.
#' Note that the predictions are \emph{not necessarily} of the \emph{specified} \code{positive} class, but of
#' the \emph{model's} positive class (the second level of the dependent variable, alphabetically).
#'
#' The \code{\link[pROC:roc]{pROC::roc}} \strong{\code{ROC}} curve object(s).
#'
#' A nested \code{tibble} with the \strong{confusion matrix}/matrices.
#' The \code{Pos_} columns tell you whether a row is a
#' True Positive (\code{TP}), True Negative (\code{TN}),
#' False Positive (\code{FP}), or False Negative (\code{FN}),
#' depending on which level is the "positive" class, i.e. the level you wish to predict.
#'
#' A nested \code{tibble} with the \strong{results} from all fold columns.
#'
#' The name of the \strong{Positive Class}.
#'
#' * In \emph{repeated cross-validation}, an evaluation is made per fold column (repetition) and averaged.
#'
#' }
#' @examples
#' \donttest{
#' # Attach packages
#' library(cvms)
#' library(groupdata2) # fold()
#' library(dplyr) # %>% arrange()
#'
#' # Data is part of cvms
#' data <- participant.scores
#'
#' # Set seed for reproducibility
#' set.seed(7)
#'
#' # Fold data
#' data <- fold(
#'   data,
#'   k = 4,
#'   cat_col = "diagnosis",
#'   id_col = "participant"
#' ) %>%
#'   arrange(.folds)
#'
#' #
#' # Cross-validate a single model
#' #
#'
#' # Gaussian
#' cross_validate(
#'   data,
#'   formulas = "score~diagnosis",
#'   family = "gaussian",
#'   REML = FALSE
#' )
#'
#' # Binomial
#' cross_validate(
#'   data,
#'   formulas = "diagnosis~score",
#'   family = "binomial"
#' )
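#'
#' # Binomial with a specific positive class, a lower classification
#' # threshold and the regular Accuracy metric added to the output
#' # (illustrative settings - the values for `positive` and `cutoff`
#' # are just examples, not recommendations)
#' cross_validate(
#'   data,
#'   formulas = "diagnosis~score",
#'   family = "binomial",
#'   positive = 2,
#'   cutoff = 0.3,
#'   metrics = list("Accuracy" = TRUE)
#' )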
#'
#' #
#' # Cross-validate multiple models
#' #
#'
#' formulas <- c(
#'   "score~diagnosis+(1|session)",
#'   "score~age+(1|session)"
#' )
#'
#' cross_validate(
#'   data,
#'   formulas = formulas,
#'   family = "gaussian",
#'   REML = FALSE
#' )
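#'
#' # Cross-validate with standardization of the numeric predictors
#' # (an illustrative sketch of the `preprocessing` argument; the
#' # parameters are estimated on the training folds and applied to
#' # both training and test folds)
#' cross_validate(
#'   data,
#'   formulas = "score~age",
#'   family = "gaussian",
#'   preprocessing = "standardize",
#'   REML = FALSE
#' )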
#'
#' #
#' # Use parallelization
#' #
#'
#' # Attach doParallel and register four cores
#' # Uncomment:
#' # library(doParallel)
#' # registerDoParallel(4)
#'
#' # Cross-validate a list of model formulas in parallel
#' # Make sure to uncomment the parallel argument
#' cross_validate(
#'   data,
#'   formulas = formulas,
#'   family = "gaussian"
#'   #, parallel = TRUE # Uncomment
#' )
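#'
#' #
#' # Repeated cross-validation
#' #
#'
#' # A sketch: create multiple fold columns with groupdata2::fold()
#' # and pass all their names to `fold_cols`
#' # (assumes the fold columns are named ".folds_1", ".folds_2", ...;
#' # check the output of fold() if in doubt)
#' data_rep <- fold(
#'   participant.scores,
#'   k = 4,
#'   cat_col = "diagnosis",
#'   id_col = "participant",
#'   num_fold_cols = 3
#' )
#'
#' cross_validate(
#'   data_rep,
#'   formulas = "score~diagnosis",
#'   family = "gaussian",
#'   fold_cols = paste0(".folds_", 1:3)
#' )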
#' }
#' @importFrom stats binomial gaussian glm lm
#' @importFrom rlang .data
#' @importFrom lifecycle deprecated deprecate_warn deprecate_stop
cross_validate <- function(
    data,
    formulas,
    family,
    fold_cols = ".folds",
    control = NULL,
    REML = FALSE,
    cutoff = 0.5,
    positive = 2,
    metrics = list(),
    preprocessing = NULL,
    rm_nc = FALSE,
    parallel = FALSE,
    verbose = FALSE,
    link = deprecated(),
    models = deprecated(),
    model_verbose = deprecated()) {
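  # Handle deprecated arguments:
  # `link` is defunct; `models` and `model_verbose` map to
  # `formulas` and `verbose`, respectively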
  if (!rlang::is_missing(link)) {
    deprecate_stop("1.0.0", "cvms::cross_validate(link = )")
  }
  if (!rlang::is_missing(models)) {
    deprecate_warn(
      "1.0.0", "cvms::cross_validate(models = )",
      "cvms::cross_validate(formulas = )"
    )
    formulas <- models
  }
  if (!rlang::is_missing(model_verbose)) {
    deprecate_warn(
      "1.0.0", "cvms::cross_validate(model_verbose = )",
      "cvms::cross_validate(verbose = )"
    )
    verbose <- model_verbose
  }

  # This function does not accept 'multinomial'
  checkmate::assert_choice(
    x = family,
    choices = c(
      "gaussian",
      "binomial"
    )
  )

  call_cross_validate(
    data = data,
    formulas = formulas,
    fold_cols = fold_cols,
    family = family,
    preprocessing = preprocessing,
    control = control,
    REML = REML,
    cutoff = cutoff,
    positive = positive,
    metrics = metrics,
    rm_nc = rm_nc,
    parallel = parallel,
    verbose = verbose
  )
}