Nothing
#' Nested cross-validation for caret
#'
#' This function applies nested cross-validation (CV) to training of models
#' using the `caret` package. The function also allows the option of embedded
#' filtering of predictors for feature selection nested within the outer loop of
#' CV. Predictions on the outer test folds are brought back together and error
#' estimation/ accuracy determined. The default is 10x10 nested CV.
#'
#' @param y Response vector. For classification this should be a factor.
#' @param x Matrix or dataframe of predictors
#' @param method String specifying which model to use. See [caret::train()] for
#' details.
#' @param filterFUN Filter function, e.g. [ttest_filter()] or [relieff_filter()].
#' Any function can be provided and is passed `y` and `x`. Must return a
#' character vector with names of filtered predictors.
#' @param filter_options List of additional arguments passed to the filter
#' function specified by `filterFUN`.
#' @param weights Weights applied to each sample for models which can use
#' weights. Note `weights` and `balance` cannot be used at the same time.
#' Weights are not applied in filters.
#' @param balance Specifies method for dealing with imbalanced class data.
#' Current options are `"randomsample"` or `"smote"`. See [randomsample()] and
#' [smote()]
#' @param balance_options List of additional arguments passed to the balancing
#' function
#' @param outer_method String of either `"cv"` or `"LOOCV"` specifying whether
#' to do k-fold CV or leave one out CV (LOOCV) for the outer folds
#' @param n_outer_folds Number of outer CV folds
#' @param n_inner_folds Sets number of inner CV folds. Note if `trControl` or
#' `inner_folds` is specified then these supersede `n_inner_folds`.
#' @param outer_folds Optional list containing indices of test folds for outer
#' CV. If supplied, `n_outer_folds` is ignored.
#' @param inner_folds Optional list of test fold indices for inner CV. This must
#' be structured as a list of the outer folds each containing a list of inner
#' folds. Can only be supplied if balancing is not applied. If supplied,
#' `n_inner_folds` is ignored.
#' @param pass_outer_folds Logical indicating whether the same outer folds are
#' used for fitting of the final model when final CV is applied. Note this can
#' only be applied when `n_outer_folds` and the number of inner CV folds
#' specified in `n_inner_folds` or `trControl` are the same and that no
#' balancing is applied.
#' @param metric A string that specifies what summary metric will be used to
#' select the optimal model. By default, "logLoss" is used for classification
#' and "RMSE" is used for regression. Note this differs from the default
#' setting in caret which uses "Accuracy" for classification. See details.
#' @param trControl A list of values generated by the `caret` function
#' [caret::trainControl()]. This defines how inner CV training through `caret`
#' is performed. Default for the inner loop is 10-fold CV. Setting this
#' argument overrules `n_inner_folds`. See
#' http://topepo.github.io/caret/using-your-own-model-in-train.html.
#' @param tuneGrid Data frame of tuning values, see [caret::train()].
#' @param savePredictions Indicates whether hold-out predictions for each inner
#' CV fold should be saved for ROC curves, accuracy etc see
#' [caret::trainControl]. Default is `"final"` to capture predictions for
#' inner CV ROC.
#' @param outer_train_predict Logical whether to save predictions on outer
#' training folds to calculate performance on outer training folds.
#' @param cv.cores Number of cores for parallel processing of the outer loops.
#' @param multicore_fork Logical whether to use forked multicore parallel
#' processing. Forked multicore processing uses `parallel::mclapply`. It is
#' only available on unix/mac as windows does not allow forking. It is set to
#' `FALSE` by default in windows and `TRUE` in unix/mac. Non-forked parallel
#' processing is executed using `parallel::parLapply` or `pbapply::pblapply`
#' if `verbose` is `TRUE`.
#' @param finalCV Logical whether to perform one last round of CV on the whole
#' dataset to determine the final model parameters. If set to `FALSE`, the
#' median of the best hyperparameters from outer CV folds for continuous/
#' ordinal hyperparameters, or highest voted for categorical hyperparameters,
#' are used to fit the final model. Performance metrics are independent of
#' this last step. If set to `NA`, final model fitting is skipped altogether,
#' which gives a useful speed boost if performance metrics are all that is
#' needed.
#' @param na.option Character value specifying how `NA`s are dealt with.
#' `"omit"` is equivalent to `na.action = na.omit`. `"omitcol"` removes cases
#' if there are `NA` in 'y', but columns (predictors) containing `NA` are
#' removed from 'x' to preserve cases. Any other value means that `NA` are
#' ignored (a message is given).
#' @param verbose Logical whether to print messages and show progress
#' @param ... Arguments passed to [caret::train()]
#' @return An object with S3 class "nestcv.train"
#' \item{call}{the matched call}
#' \item{output}{Predictions on the left-out outer folds}
#' \item{outer_result}{List object of results from each outer fold containing
#' predictions on left-out outer folds, caret result and number of filtered
#' predictors at each fold.}
#' \item{outer_folds}{List of indices of outer test folds}
#' \item{dimx}{dimensions of `x`}
#' \item{xsub}{subset of `x` containing all predictors used in both outer CV
#' folds and the final model}
#' \item{y}{original response vector}
#' \item{yfinal}{final response vector (post-balancing)}
#' \item{final_fit}{Final fitted caret model using best tune parameters}
#' \item{final_vars}{Column names of filtered predictors entering final model}
#' \item{summary_vars}{Summary statistics of filtered predictors}
#' \item{roc}{ROC AUC for binary classification where available.}
#' \item{trControl}{`caret::trainControl` object used for inner CV}
#' \item{bestTunes}{best tuned parameters from each outer fold}
#' \item{finalTune}{final parameters used for final model}
#' \item{summary}{Overall performance summary. Accuracy and balanced accuracy
#' for classification. ROC AUC for binary classification. RMSE for
#' regression.}
#' @details
#' When `finalCV = TRUE`, the final fit on the whole data using is performed
#' first. This helps flag errors generated by `caret` such as missing packages.
#' Parallelisation of the final fit when `finalCV = TRUE` is performed in
#' `caret` using `registerDoParallel`. `caret` itself uses `foreach`.
#'
#' Parallelisation is performed on the outer CV folds using `parallel::mclapply`
#' by default on unix/mac and `parallel::parLapply` on windows. `mclapply` uses
#' forking which is faster. But some models (eg. xgbTree) use multi-threading
#' which may cause issues in some circumstances with forked multicore
#' processing. Setting `multicore_fork` to `FALSE` is slower but can alleviate
#' some caret errors.
#'
#' If the outer folds are run using parallelisation, then parallelisation in
#' caret must be off, otherwise an error will be generated. Alternatively if you
#' wish to use parallelisation in caret, then parallelisation in `nestcv.train`
#' can be fully disabled by leaving `cv.cores = 1`.
#'
#' For classification, `metric` defaults to using 'logLoss' with the `trControl`
#' arguments `classProbs = TRUE, summaryFunction = mnLogLoss`, rather than
#' 'Accuracy' which is the default classification metric in `caret`. See
#' [caret::trainControl()]. LogLoss is arguably more consistent than Accuracy
#' for tuning parameters in datasets with small sample size.
#'
#' Models can be fitted with a single set of fixed parameters, in which case
#' `trControl` defaults to `trainControl(method = "none")` which disables inner
#' CV as it is unnecessary. See
#' https://topepo.github.io/caret/model-training-and-tuning.html#fitting-models-without-parameter-tuning
#'
#' @author Myles Lewis
#' @examples
#' \donttest{
#' ## sigmoid function
#' sigmoid <- function(x) {1 / (1 + exp(-x))}
#'
#' ## load iris dataset and simulate a binary outcome
#' data(iris)
#' x <- iris[, 1:4]
#' colnames(x) <- c("marker1", "marker2", "marker3", "marker4")
#' x <- as.data.frame(apply(x, 2, scale))
#' y2 <- sigmoid(0.5 * x$marker1 + 2 * x$marker2) > runif(nrow(x))
#' y2 <- factor(y2, labels = c("class1", "class2"))
#'
#' ## Example using random forest with caret
#' cvrf <- nestcv.train(y2, x, method = "rf",
#' n_outer_folds = 3,
#' cv.cores = 2)
#' summary(cvrf)
#'
#' ## Example of glmnet tuned using caret
#' ## set up small tuning grid for quick execution
#' ## length.out of 20-100 is usually recommended for lambda
#' ## and more alpha values ranging from 0-1
#' tg <- expand.grid(lambda = exp(seq(log(2e-3), log(1e0), length.out = 5)),
#' alpha = 1)
#'
#' ncv <- nestcv.train(y = y2, x = x,
#' method = "glmnet",
#' n_outer_folds = 3,
#' tuneGrid = tg, cv.cores = 2)
#' summary(ncv)
#'
#' ## plot tuning for outer fold #1
#' plot(ncv$outer_result[[1]]$fit, xTrans = log)
#'
#' ## plot final ROC curve
#' plot(ncv$roc)
#'
#' ## plot ROC for left-out inner folds
#' inroc <- innercv_roc(ncv)
#' plot(inroc)
#'
#' ## example to show use of custom fold indices for 5 x 5-fold nested CV
#' library(caret)
#' y <- iris$Species
#' out_folds <- createFolds(y, k = 5)
#' in_folds <- lapply(out_folds, function(i) {
#' ytrain <- y[-i]
#' createFolds(ytrain, k = 5)
#' })
#'
#' res <- nestcv.train(y, x, method="rf", cv.cores = 2,
#' pass_outer_folds = TRUE,
#' inner_folds = in_folds,
#' outer_folds = out_folds)
#' summary(res)
#' res$outer_folds
#' res$final_fit$control$indexOut # same as outer_folds
#' }
#' @importFrom caret createFolds train trainControl mnLogLoss confusionMatrix
#' defaultSummary
#' @importFrom data.table rbindlist
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach registerDoSEQ
#' @importFrom parallel mclapply
#' @importFrom pROC roc
#' @importFrom stats predict setNames
#' @importFrom utils capture.output
#' @export
#'
nestcv.train <- function(y, x,
method = "rf",
filterFUN = NULL,
filter_options = NULL,
weights = NULL,
balance = NULL,
balance_options = NULL,
outer_method = c("cv", "LOOCV"),
n_outer_folds = 10,
n_inner_folds = 10,
outer_folds = NULL,
inner_folds = NULL,
pass_outer_folds = FALSE,
cv.cores = 1,
multicore_fork = (Sys.info()["sysname"] != "Windows"),
metric = ifelse(is.factor(y), "logLoss", "RMSE"),
trControl = NULL,
tuneGrid = NULL,
savePredictions = "final",
outer_train_predict = FALSE,
finalCV = TRUE,
na.option = "pass",
verbose = TRUE,
...) {
start <- Sys.time()
nestcv.call <- match.call(expand.dots = TRUE)
outer_method <- match.arg(outer_method)
if (is.character(y)) y <- factor(y)
if (!is.null(balance) & is.numeric(y)) {
stop("`balance` can only be used for classification")}
ok <- checkxy(y, x, na.option, weights)
y <- y[ok$r]
x <- x[ok$r, ok$c]
weights <- weights[ok$r]
if (!is.null(balance) & !is.null(weights)) {
stop("`balance` and `weights` cannot be used at the same time")}
if (is.null(outer_folds)) {
outer_folds <- switch(outer_method,
cv = createFolds(y, k = n_outer_folds),
LOOCV = 1:length(y))
} else {
if ("n_outer_folds" %in% names(nestcv.call)) {
if (n_outer_folds != length(outer_folds)) stop("Mismatch between n_outer_folds and length(outer_folds)")
}
n_outer_folds <- length(outer_folds)
}
if (!is.null(inner_folds)) {
if (length(inner_folds) != length(outer_folds)) stop("Mismatch in length(outer_folds) and length(inner_folds)")
if ("n_inner_folds" %in% names(nestcv.call)) {
if (n_inner_folds != length(inner_folds)) stop("Mismatch between n_inner_folds and length(inner_folds)")
}
n_inner_folds <- length(inner_folds[[1]])
outer_train_size <- sapply(swapFoldIndex(outer_folds), length)
chk <- vapply(seq_along(outer_train_size), function(i) {
max(unlist(inner_folds[[i]])) > outer_train_size[i]
}, logical(1))
if (any(chk)) stop("inner_folds contains index out of range")
if (!is.null(balance)) stop("`balance` cannot be used if `inner_folds` is specified")
inner_train_folds <- lapply(inner_folds, swapFoldIndex)
} else inner_train_folds <- NULL
if (is.null(trControl)) {
trControl <- if (is.factor(y)) {
trainControl(method = "cv",
number = n_inner_folds,
classProbs = TRUE,
savePredictions = savePredictions,
summaryFunction = mnLogLoss)
} else trainControl(method = "cv",
number = n_inner_folds,
savePredictions = savePredictions)
}
# switch off inner CV if tuneGrid is single row
if (!is.null(tuneGrid)) {
if (nrow(tuneGrid) == 1) {
trControl <- trainControl(method = "none", classProbs = TRUE)
inner_train_folds <- NULL
}
}
if (is.na(finalCV)) {
final_fit <- finalTune <- filtx <- yfinal <- xsub <- NA
} else {
# fit final model with CV on whole dataset first
if (verbose) message("Fitting final model using CV on whole data")
dat <- nest_filt_bal(NULL, y, x, filterFUN, filter_options,
balance, balance_options)
yfinal <- dat$ytrain
filtx <- dat$filt_xtrain
if (finalCV) {
# use CV on whole data to finalise parameters
trControlFinal <- trControl
if (pass_outer_folds) {
if (n_outer_folds == trControl$number && trControl$method == "cv" &&
is.null(balance)) {
train_folds <- swapFoldIndex(outer_folds, length(y))
trControlFinal$index <- train_folds
trControlFinal$indexOut <- outer_folds
} else message("Cannot pass `outer_folds` to final CV")
}
if (cv.cores >= 2) {
if (Sys.info()["sysname"] == "Windows") {
cl <- makeCluster(cv.cores)
registerDoParallel(cl)
} else {
# unix
registerDoParallel(cores = cv.cores)
}
}
printlog <- capture.output({
final_fit <- caret::train(x = filtx, y = yfinal,
method = method,
weights = weights,
metric = metric,
trControl = trControlFinal,
tuneGrid = tuneGrid, ...)
})
finalTune <- final_fit$bestTune
if (cv.cores >= 2) {
if (Sys.info()["sysname"] == "Windows") stopCluster(cl)
foreach::registerDoSEQ()
}
}
}
if (verbose && (!multicore_fork || Sys.getenv("RSTUDIO") == "1")) {
message("Performing ", n_outer_folds, "-fold outer CV, using ",
plural(cv.cores, "core(s)"))}
if (!multicore_fork && cv.cores >= 2) {
cl <- makeCluster(cv.cores)
dots <- list(...)
varlist <- c("outer_folds", "inner_train_folds", "y", "x", "method", "filterFUN",
"filter_options", "weights", "balance", "balance_options",
"metric", "trControl", "tuneGrid", "outer_train_predict",
"nestcv.trainCore", "dots")
clusterExport(cl, varlist = varlist, envir = environment())
if (verbose) {
if (!requireNamespace("pbapply", quietly = TRUE)) {
stop("Package 'pbapply' must be installed", call. = FALSE)}
outer_res <- pbapply::pblapply(seq_along(outer_folds), function(i) {
args <- c(list(i=i, y=y, x=x, outer_folds = outer_folds,
inner_train_folds = inner_train_folds, method=method,
filterFUN=filterFUN, filter_options=filter_options,
weights=weights, balance=balance,
balance_options=balance_options, metric=metric,
trControl=trControl, tuneGrid=tuneGrid,
outer_train_predict=outer_train_predict), dots)
do.call(nestcv.trainCore, args)
}, cl = cl)
} else {
outer_res <- parLapply(cl = cl, seq_along(outer_folds), function(i) {
args <- c(list(i=i, y=y, x=x, outer_folds = outer_folds,
inner_train_folds = inner_train_folds, method=method,
filterFUN=filterFUN, filter_options=filter_options,
weights=weights, balance=balance,
balance_options=balance_options, metric=metric,
trControl=trControl, tuneGrid=tuneGrid,
outer_train_predict=outer_train_predict), dots)
do.call(nestcv.trainCore, args)
})
}
stopCluster(cl)
} else {
# linux/mac, using forked parallel processing
outer_res <- mclapply(seq_along(outer_folds), function(i) {
nestcv.trainCore(i, y, x, outer_folds, inner_train_folds,
method, filterFUN, filter_options,
weights, balance, balance_options,
metric, trControl, tuneGrid, outer_train_predict,
verbose, ...)
}, mc.cores = cv.cores, mc.allow.recursive = FALSE)
}
predslist <- lapply(outer_res, '[[', 'preds')
output <- data.table::rbindlist(predslist)
output <- as.data.frame(output)
if (!is.null(rownames(x))) {
rownames(output) <- unlist(lapply(predslist, rownames))}
summary <- predSummary(output)
caret.roc <- NULL
if (is.factor(y) & nlevels(y) == 2) {
caret.roc <- pROC::roc(output$testy, output$predyp, direction = "<",
quiet = TRUE)
}
bestTunes <- lapply(outer_res, function(i) i$fit$bestTune)
bestTunes <- as.data.frame(data.table::rbindlist(bestTunes))
rownames(bestTunes) <- paste('Fold', seq_len(nrow(bestTunes)))
if (!is.na(finalCV) && !finalCV) {
# use outer folds for final parameters, fit single final model
if (verbose) message("Fitting single final model")
finalTune <- finaliseTune(bestTunes)
fitControl <- trainControl(method = "none", classProbs = is.factor(y))
final_fit <- caret::train(x = filtx, y = yfinal, method = method,
weights = weights,
trControl = fitControl,
tuneGrid = finalTune, ...)
}
if (!is.na(finalCV)) {
all_vars <- unlist(lapply(outer_res, function(i) {
colnames(i$fit$trainingData)
}))
all_vars <- unique(c(all_vars, colnames(filtx)))
all_vars <- all_vars[all_vars %in% colnames(x)]
xsub <- x[, all_vars]
}
end <- Sys.time()
if (verbose) message("Duration: ", format(end - start))
out <- list(call = nestcv.call,
output = output,
outer_result = outer_res,
outer_method = outer_method,
outer_folds = outer_folds,
dimx = dim(x),
xsub = xsub,
y = y,
yfinal = yfinal,
final_fit = final_fit,
final_vars = colnames(filtx),
roc = caret.roc,
trControl = trControl,
bestTunes = bestTunes,
finalTune = finalTune,
summary = summary)
class(out) <- "nestcv.train"
out
}
nestcv.trainCore <- function(i, y, x, outer_folds, inner_train_folds,
method, filterFUN, filter_options,
weights, balance, balance_options,
metric, trControl, tuneGrid,
outer_train_predict, verbose = FALSE, ...) {
start <- Sys.time()
if (verbose) message_parallel("Starting Fold ", i, " ...")
test <- outer_folds[[i]]
dat <- nest_filt_bal(test, y, x, filterFUN, filter_options,
balance, balance_options)
ytrain <- dat$ytrain
ytest <- dat$ytest
filt_xtrain <- dat$filt_xtrain
filt_xtest <- dat$filt_xtest
trControl$index <- inner_train_folds[[i]]
printlog <- capture.output({
fit <- caret::train(x = filt_xtrain, y = ytrain,
method = method,
weights = weights[-test],
metric = metric,
trControl = trControl,
tuneGrid = tuneGrid, ...)
})
predy <- predict(fit, newdata = filt_xtest)
preds <- data.frame(predy=predy, testy=ytest)
if (is.factor(y)) {
predyp <- predict(fit, newdata = filt_xtest, type = "prob")
if (nlevels(y) == 2) predyp <- predyp[,2] # binomial
preds <- cbind(preds, predyp)
}
rownames(preds) <- rownames(filt_xtest)
if (outer_train_predict) {
predy <- predict(fit, newdata = filt_xtrain)
train_preds <- data.frame(ytrain=ytrain, predy=predy)
if (is.factor(y)) {
predyp <- predict(fit, newdata = filt_xtrain, type = "prob")
if (nlevels(y) == 2) predyp <- predyp[,2] # binomial
train_preds <- cbind(train_preds, predyp)
}
} else train_preds <- NULL
if (verbose) {
end <- Sys.time()
message_parallel(" Fold ", i, " done (",
format(end - start, digits = 3), ")")
}
ret <- list(preds = preds,
train_preds = train_preds,
fit = fit,
nfilter = ncol(filt_xtrain))
ret
}
# finalise the caret model tuning from bestTune dataframe
#' @importFrom stats median
finaliseTune <- function(x) {
fintune <- lapply(colnames(x), function(i) {
if (is.numeric(x[, i])) return(median(x[, i]))
tab <- table(x[, i])
names(tab)[which.max(tab)] # majority vote for factors
})
names(fintune) <- colnames(x)
data.frame(fintune, check.names = FALSE)
}
#' @export
summary.nestcv.train <- function(object,
digits = max(3L, getOption("digits") - 3L),
...) {
cat("Nested cross-validation with caret\n")
if (!is.null(object$call$method)) {
cat("Method: ", object$call$method, "\n")
} else cat("Method: rf\n")
if (!is.null(object$call$filterFUN)) {
cat("Filter: ", object$call$filterFUN, "\n")
} else cat("No filter\n")
cat("Outer loop: ", switch(object$outer_method,
cv = paste0(length(object$outer_folds), "-fold cv\n"),
LOOCV = "leave-one-out CV\n"))
cat("Inner loop: ", paste0(object$trControl$number, "-fold ",
object$trControl$method, "\n"))
balance <- object$call$balance
if (!is.null(balance)) {
cat("Balancing: ", balance, "\n")
}
cat(object$dimx[1], "observations,", object$dimx[2], "predictors\n")
if (!is.numeric(object$y)) print(c(table(object$y)))
cat("\n")
nfilter <- unlist(lapply(object$outer_result, '[[', 'nfilter'))
foldres <- object$bestTunes
foldres$n.filter <- nfilter
print(foldres, digits = digits, print.gap = 2L)
cat("\nFinal parameters:\n")
if (length(object$finalTune)==1 && is.na(object$finalTune)) {
cat("NA\n")
} else {
print(object$finalTune, digits = digits, print.gap = 2L, row.names = FALSE)
}
cat("\nResult:\n")
print(object$summary, digits = digits, print.gap = 2L)
out <- list(dimx = object$dimx, folds = foldres,
final_param = object$finalTune, result = object$summary)
invisible(out)
}
#' @method predict nestcv.train
#' @export
predict.nestcv.train <- function(object, newdata, ...) {
if (any(!object$final_vars %in% colnames(newdata)))
stop("newdata is missing some predictors", call. = FALSE)
predict(object$final_fit, newdata = newdata[, object$final_vars], ...)
}
#' Summarise variables
#'
#' @param x Matrix or dataframe with variables in columns
#' @return A matrix with variables in rows and mean, median and SD for each
#' variable or number of levels if the variable is a factor. If `NA` are
#' detected, an extra column `n.NA` is added with the numbers of `NA` for each
#' variable.
#' @export
#'
summary_vars <- function(x) {
if (is.matrix(x)) {
if (!is.numeric(x)) return("Not numeric")
mat <- x
selCols <- TRUE
} else {
selCols <- unlist(lapply(x, is.numeric))
mat <- as.matrix(x[, selCols])
}
summary_mat <- cbind(colMeans(mat, na.rm = TRUE),
matrixStats::colMedians(mat, na.rm = TRUE),
matrixStats::colSds(mat, na.rm = TRUE))
colnames(summary_mat) <- c("mean", "median", "sd")
if (any(!selCols)) {
nlevels_mat <- unlist(lapply(x[, !selCols], function(i) nlevels(as.factor(i))))
out <- matrix(NA, nrow = ncol(x), ncol = 4,
dimnames = list(colnames(x), c("mean", "median", "sd", "nlevels")))
out[selCols, 1:3] <- summary_mat
out[!selCols, 4] <- nlevels_mat
} else out <- summary_mat
n.NA <- apply(x, 2, function(i) sum(is.na(i)))
if (any(n.NA > 0)) out <- cbind(out, n.NA)
out
}
swapFoldIndex <- function(folds, len = max(unlist(folds))) {
lapply(folds, function(i) setdiff(seq_len(len), i))
}
# Function which prints a message using shell echo; useful for printing
# messages from inside mclapply when running in Rstudio
message_parallel <- function(...) {
if (Sys.getenv("RSTUDIO") != "1") return()
system(sprintf('echo "%s"', paste0(..., collapse = "")))
}
plural <- function(n, text) {
text <- if (n == 1) gsub("\\(s\\)", "", text) else gsub("\\(|\\)", "", text)
paste(n, text)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.