R/nestcv.train.R
In nestedcv: Nested Cross-Validation with 'glmnet' and 'caret'

Documented in nestcv.train summary_vars

#' Nested cross-validation for caret
#'
#' This function applies nested cross-validation (CV) to training of models
#' using the `caret` package. The function also allows the option of embedded
#' filtering of predictors for feature selection nested within the outer loop of
#' CV. Predictions on the outer test folds are brought back together and error
#' estimation/ accuracy determined. The default is 10x10 nested CV.
#'
#' @param y Response vector. For classification this should be a factor.
#' @param x Matrix or dataframe of predictors
#' @param method String specifying which model to use. See [caret::train()] for
#'   details.
#' @param filterFUN Filter function, e.g. [ttest_filter()] or [relieff_filter()].
#'   Any function can be provided and is passed `y` and `x`. Ideally returns a
#'   numeric vector with indices of filtered predictors. The custom function can
#'   return a character vector of names of the filtered predictors, but this
#'   will not work with the `penalty.factor` argument in [nestcv.glmnet()].
#' @param filter_options List of additional arguments passed to the filter
#'   function specified by `filterFUN`.
#' @param weights Weights applied to each sample for models which can use
#'   weights. Note `weights` and `balance` cannot be used at the same time.
#'   Weights are not applied in filters.
#' @param balance Specifies method for dealing with imbalanced class data.
#'   Current options are `"randomsample"` or `"smote"`. See [randomsample()] and
#'   [smote()]
#' @param balance_options List of additional arguments passed to the balancing
#'   function
#' @param modifyX Character string specifying the name of a function to modify
#'   `x`. This can be an imputation function for replacing missing values, or a
#'   more complex function which alters or even adds columns to `x`. The
#'   required return value of this function depends on the `modifyX_useY`
#'   setting.
#' @param modifyX_useY Logical value whether the `x` modifying function makes
#'   use of response training data from `y`. If `FALSE` then the `modifyX`
#'   function simply needs to return a modified `x` object. If `TRUE` then the
#'   `modifyX` function must return a model type object on which `predict()` can
#'   be called, so that train and test partitions of `x` can be modified
#'   independently.
#' @param modifyX_options List of additional arguments passed to the `x`
#'   modifying function
#' @param outer_method String of either `"cv"` or `"LOOCV"` specifying whether
#'   to do k-fold CV or leave one out CV (LOOCV) for the outer folds
#' @param n_outer_folds Number of outer CV folds
#' @param n_inner_folds Sets number of inner CV folds. Note if `trControl` or
#'   `inner_folds` is specified then these supersede `n_inner_folds`.
#' @param outer_folds Optional list containing indices of test folds for outer
#'   CV. If supplied, `n_outer_folds` is ignored.
#' @param inner_folds Optional list of test fold indices for inner CV. This must
#'   be structured as a list of the outer folds each containing a list of inner
#'   folds. Can only be supplied if balancing is not applied. If supplied,
#'   `n_inner_folds` is ignored.
#' @param pass_outer_folds Logical indicating whether the same outer folds are
#'   used for fitting of the final model when final CV is applied. Note this can
#'   only be applied when `n_outer_folds` and the number of inner CV folds
#'   specified in `n_inner_folds` or `trControl` are the same and that no
#'   balancing is applied.
#' @param metric A string that specifies what summary metric will be used to
#'   select the optimal model. By default, "logLoss" is used for classification
#'   and "RMSE" is used for regression. Note this differs from the default
#'   setting in caret which uses "Accuracy" for classification. See details.
#' @param trControl A list of values generated by the `caret` function
#'   [caret::trainControl()]. This defines how inner CV training through `caret`
#'   is performed. Default for the inner loop is 10-fold CV. Setting this
#'   argument overrules `n_inner_folds`. See
#'   http://topepo.github.io/caret/using-your-own-model-in-train.html.
#' @param tuneGrid Data frame of tuning values, see [caret::train()].
#' @param savePredictions Indicates whether hold-out predictions for each inner
#'   CV fold should be saved for ROC curves, accuracy etc see
#'   [caret::trainControl]. Default is `"final"` to capture predictions for
#'   inner CV ROC.
#' @param outer_train_predict Logical whether to save predictions on outer
#'   training folds to calculate performance on outer training folds.
#' @param parallel_mode Either "mclapply", "parLapply" or "future". This
#'   determines which parallel backend to use. The default is
#'   `parallel::mclapply` on unix/mac and `parallel::parLapply` on windows.
#' @param cv.cores Number of cores for parallel processing of the outer loops.
#'   Ignored if `parallel_mode = "future"`.
#' @param finalCV Logical whether to perform one last round of CV on the whole
#'   dataset to determine the final model parameters. If set to `FALSE`, the
#'   median of the best hyperparameters from outer CV folds for continuous/
#'   ordinal hyperparameters, or highest voted for categorical hyperparameters,
#'   are used to fit the final model. Performance metrics are independent of
#'   this last step. If set to `NA`, final model fitting is skipped altogether,
#'   which gives a useful speed boost if performance metrics are all that is
#'   needed.
#' @param na.option Character value specifying how `NA`s are dealt with.
#'   `"omit"` is equivalent to `na.action = na.omit`. `"omitcol"` removes cases
#'   if there are `NA` in 'y', but columns (predictors) containing `NA` are
#'   removed from 'x' to preserve cases. Any other value means that `NA` are
#'   ignored (a message is given).
#' @param verbose Logical whether to print messages and show progress
#' @param ... Arguments passed to [caret::train()]
#' @return An object with S3 class "nestcv.train"
#'   \item{call}{the matched call}
#'   \item{output}{Predictions on the left-out outer folds}
#'   \item{outer_result}{List object of results from each outer fold containing
#'   predictions on left-out outer folds, caret result and number of filtered
#'   predictors at each fold.}
#'   \item{outer_folds}{List of indices of outer test folds}
#'   \item{dimx}{dimensions of `x`}
#'   \item{xsub}{subset of `x` containing all predictors used in both outer CV
#'   folds and the final model}
#'   \item{y}{original response vector}
#'   \item{yfinal}{final response vector (post-balancing)}
#'   \item{final_fit}{Final fitted caret model using best tune parameters}
#'   \item{final_vars}{Column names of filtered predictors entering final model}
#'   \item{summary_vars}{Summary statistics of filtered predictors}
#'   \item{roc}{ROC AUC for binary classification where available.}
#'   \item{trControl}{`caret::trainControl` object used for inner CV}
#'   \item{bestTunes}{best tuned parameters from each outer fold}
#'   \item{finalTune}{final parameters used for final model}
#'   \item{summary}{Overall performance summary. Accuracy and balanced accuracy
#'   for classification. ROC AUC for binary classification. RMSE for
#'   regression.}
#' @details
#' When `finalCV = TRUE`, the final fit on the whole data using is performed
#' first. This helps flag errors generated by `caret` such as missing packages.
#' Parallelisation of the final fit when `finalCV = TRUE` is performed in
#' `caret` using `registerDoParallel`. `caret` itself uses `foreach`.
#' 
#' Parallelisation is performed on the outer CV folds using `parallel::mclapply`
#' by default on unix/mac and `parallel::parLapply` on windows. `mclapply` uses
#' forking which is faster. But some models use multi-threading which may cause
#' issues in some circumstances with forked multicore processing. `future` is
#' another option as a parallel system.
#'
#' If the outer folds are run using parallelisation, then parallelisation in
#' caret must be off, otherwise an error will be generated. Alternatively if you
#' wish to use parallelisation in caret, then parallelisation in `nestcv.train`
#' can be fully disabled by leaving `cv.cores = 1`.
#'   
#' xgboost models fitted via caret using `method = "xgbTree"` or `"xgbLinear"`
#' invoke openMP multithreading on linux/windows by default which causes
#' `nestcv.train` to fail when `cv.cores` >1 (nested parallelisation). Mac OS is
#' unaffected. In order to prevent this, `nestcv.train()` sets openMP threads to
#' 1 if `cv.cores` >1.
#'
#' For classification, `metric` defaults to using 'logLoss' with the `trControl`
#' arguments `classProbs = TRUE, summaryFunction = mnLogLoss`, rather than
#' 'Accuracy' which is the default classification metric in `caret`. See
#' [caret::trainControl()]. LogLoss is arguably more consistent than Accuracy
#' for tuning parameters in datasets with small sample size.
#'
#' Models can be fitted with a single set of fixed parameters, in which case
#' `trControl` defaults to `trainControl(method = "none")` which disables inner
#' CV as it is unnecessary. See
#' https://topepo.github.io/caret/model-training-and-tuning.html#fitting-models-without-parameter-tuning
#'
#' @author Myles Lewis
#' @examples
#' \donttest{
#' ## sigmoid function
#' sigmoid <- function(x) {1 / (1 + exp(-x))}
#' 
#' ## load iris dataset and simulate a binary outcome
#' data(iris)
#' x <- iris[, 1:4]
#' colnames(x) <- c("marker1", "marker2", "marker3", "marker4")
#' x <- as.data.frame(apply(x, 2, scale))
#' y2 <- sigmoid(0.5 * x$marker1 + 2 * x$marker2) > runif(nrow(x))
#' y2 <- factor(y2, labels = c("class1", "class2"))
#' 
#' ## Example using random forest with caret
#' cvrf <- nestcv.train(y2, x, method = "rf",
#'                      n_outer_folds = 3,
#'                      cv.cores = 2)
#' summary(cvrf)
#' 
#' ## Example of glmnet tuned using caret
#' ## set up small tuning grid for quick execution
#' ## length.out of 20-100 is usually recommended for lambda
#' ## and more alpha values ranging from 0-1
#' tg <- expand.grid(lambda = exp(seq(log(2e-3), log(1e0), length.out = 5)),
#'                   alpha = 1)
#' 
#' ncv <- nestcv.train(y = y2, x = x,
#'                     method = "glmnet",
#'                     n_outer_folds = 3,
#'                     tuneGrid = tg, cv.cores = 2)
#' summary(ncv)
#' 
#' ## plot tuning for outer fold #1
#' plot(ncv$outer_result[[1]]$fit, xTrans = log)
#' 
#' ## plot final ROC curve
#' plot(ncv$roc)
#' 
#' ## plot ROC for left-out inner folds
#' inroc <- innercv_roc(ncv)
#' plot(inroc)
#' 
#' ## example to show use of custom fold indices for 5 x 5-fold nested CV
#' library(caret)
#' y <- iris$Species
#' out_folds <- createFolds(y, k = 5)
#' in_folds <- lapply(out_folds, function(i) {
#'   ytrain <- y[-i]
#'   createFolds(ytrain, k = 5)
#' })
#' 
#' res <- nestcv.train(y, x, method="rf", cv.cores = 2,
#'                     pass_outer_folds = TRUE,
#'                     inner_folds = in_folds,
#'                     outer_folds = out_folds)
#' summary(res)
#' res$outer_folds
#' res$final_fit$control$indexOut  # same as outer_folds
#' }
#' @importFrom caret createFolds train trainControl mnLogLoss confusionMatrix
#'   defaultSummary
#' @importFrom data.table rbindlist
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach registerDoSEQ
#' @importFrom parallel mclapply
#' @importFrom pROC roc
#' @importFrom stats predict setNames
#' @importFrom utils capture.output
#' @export
#' 
nestcv.train <- function(y, x,
                         method = "rf",
                         filterFUN = NULL,
                         filter_options = NULL,
                         weights = NULL,
                         balance = NULL,
                         balance_options = NULL,
                         modifyX = NULL,
                         modifyX_useY = FALSE,
                         modifyX_options = NULL,
                         outer_method = c("cv", "LOOCV"),
                         n_outer_folds = 10,
                         n_inner_folds = 10,
                         outer_folds = NULL,
                         inner_folds = NULL,
                         pass_outer_folds = FALSE,
                         parallel_mode = NULL,
                         cv.cores = 1,
                         metric = ifelse(is.factor(y), "logLoss", "RMSE"),
                         trControl = NULL,
                         tuneGrid = NULL,
                         savePredictions = "final",
                         outer_train_predict = FALSE,
                         finalCV = TRUE,
                         na.option = "pass",
                         verbose = TRUE,
                         ...) {
  start <- Sys.time()
  nestcv.call <- match.call(expand.dots = TRUE)
  outer_method <- match.arg(outer_method)
  if (is.character(y)) y <- factor(y)
  if (!is.null(balance) & is.numeric(y)) {
    stop("`balance` can only be used for classification")}
  ok <- checkxy(y, x, na.option, weights)
  y <- y[ok$r]
  x <- x[ok$r, ok$c, drop = FALSE]
  weights <- weights[ok$r]
  if (!is.null(balance) & !is.null(weights)) {
    stop("`balance` and `weights` cannot be used at the same time")}
  
  if (is.null(outer_folds)) {
    outer_folds <- switch(outer_method,
                          cv = createFolds(y, k = n_outer_folds),
                          LOOCV = 1:length(y))
  } else {
    if ("n_outer_folds" %in% names(nestcv.call)) {
      if (n_outer_folds != length(outer_folds)) stop("Mismatch between n_outer_folds and length(outer_folds)")
    }
    n_outer_folds <- length(outer_folds)
  }
  
  if (!is.null(inner_folds)) {
    if (length(inner_folds) != length(outer_folds)) stop("Mismatch in length(outer_folds) and length(inner_folds)")
    if ("n_inner_folds" %in% names(nestcv.call)) {
      if (n_inner_folds != length(inner_folds[[1]])) stop("Mismatch between n_inner_folds and length(inner_folds)")
    }
    n_inner_folds <- length(inner_folds[[1]])
    outer_train_size <- sapply(swapFoldIndex(outer_folds), length)
    chk <- vapply(seq_along(outer_train_size), function(i) {
      max(unlist(inner_folds[[i]])) > outer_train_size[i]
    }, logical(1))
    if (any(chk)) stop("inner_folds contains index out of range")
    if (!is.null(balance)) stop("`balance` cannot be used if `inner_folds` is specified")
    inner_train_folds <- lapply(inner_folds, swapFoldIndex)
  } else inner_train_folds <- NULL
  
  if (is.null(trControl)) {
    trControl <- if (is.factor(y)) {
      trainControl(method = "cv", 
                   number = n_inner_folds,
                   classProbs = TRUE,
                   savePredictions = savePredictions,
                   summaryFunction = mnLogLoss)
    } else trainControl(method = "cv", 
                        number = n_inner_folds,
                        savePredictions = savePredictions)
  }
  # switch off inner CV if tuneGrid is single row
  if (!is.null(tuneGrid)) {
    if (nrow(tuneGrid) == 1) {
      trControl <- trainControl(method = "none", classProbs = TRUE)
      inner_train_folds <- NULL
    }
  }

  # disable openMP multithreading (fix for xgboost)
  if (cv.cores >= 2) {
    threads <- RhpcBLASctl::omp_get_max_threads()
    if (!is.na(threads) && threads > 1) {
      RhpcBLASctl::omp_set_num_threads(1L)
      on.exit(RhpcBLASctl::omp_set_num_threads(threads))
    }
  }
  
  if (is.null(parallel_mode)) {
    parallel_mode <- if (Sys.info()["sysname"] == "Windows" & cv.cores >= 2) {
      "parLapply"
    } else "mclapply"
  } else {
    parallel_mode <- match.arg(parallel_mode,
                                 c("mclapply", "parLapply", "future"))
  }
  
  if (cv.cores >= 2 & parallel_mode == "mclapply" & 
      Sys.info()["sysname"] == "Darwin" & Sys.getenv("RSTUDIO") != "1") {
    verbose <- FALSE  # prevent all messages in mac gui
  }
  
  verbose <- as.numeric(verbose)
  if (is.na(finalCV)) {
    final_fit <- finalTune <- filtx <- yfinal <- xsub <- NA
  } else {
    # fit final model with CV on whole dataset first
    if (verbose == 1) message("Fitting final model using CV on whole data")
    dat <- nest_filt_bal(NULL, y, x, filterFUN, filter_options,
                         balance, balance_options,
                         modifyX, modifyX_useY, modifyX_options)
    yfinal <- dat$ytrain
    filtx <- dat$filt_xtrain
    if (finalCV) {
      # use CV on whole data to finalise parameters
      trControlFinal <- trControl
      if (pass_outer_folds) {
        if (n_outer_folds == trControl$number && trControl$method == "cv" &&
            is.null(balance)) {
          train_folds <- swapFoldIndex(outer_folds, length(y))
          trControlFinal$index <- train_folds
          trControlFinal$indexOut <- outer_folds
        } else message("Cannot pass `outer_folds` to final CV")
      }

      if (parallel_mode != "future" & cv.cores >= 2) {
        # start parallel backend
        if (Sys.info()["sysname"] == "Windows") {
          cl <- makeCluster(cv.cores)
          registerDoParallel(cl)
        } else {
          # unix
          registerDoParallel(cores = cv.cores)
        }
      }
      printlog <- capture.output({
        final_fit <- caret::train(x = filtx, y = yfinal,
                                  method = method,
                                  weights = weights,
                                  metric = metric,
                                  trControl = trControlFinal,
                                  tuneGrid = tuneGrid, ...)
      })
      finalTune <- final_fit$bestTune
      if (parallel_mode != "future" & cv.cores >= 2) {
        # stop parallel backend
        if (Sys.info()["sysname"] == "Windows") stopCluster(cl)
        foreach::registerDoSEQ()
      }
    }
  }

  if (verbose == 1 && (parallel_mode != "mclapply" || Sys.getenv("RSTUDIO") == "1")) {
    message("Performing ", n_outer_folds, "-fold outer CV, using ",
            plural(cv.cores, "core(s)"))}
  if (parallel_mode == "parLapply") {
    cl <- makeCluster(cv.cores)
    dots <- list(...)
    varlist <- c("outer_folds", "inner_train_folds", "y", "x", "method", "filterFUN",
                 "filter_options", "weights", "balance", "balance_options",
                 "modifyX", "modifyX_useY", "modifyX_options",
                 "metric", "trControl", "tuneGrid", "outer_train_predict",
                 "nestcv.trainCore", "dots")
    clusterExport(cl, varlist = varlist, envir = environment())
    if (verbose) {
      if (!requireNamespace("pbapply", quietly = TRUE)) {
        stop("Package 'pbapply' must be installed", call. = FALSE)}
      outer_res <- pbapply::pblapply(seq_along(outer_folds), function(i) {
        args <- c(list(i=i, y=y, x=x, outer_folds = outer_folds,
                       inner_train_folds = inner_train_folds, method=method,
                       filterFUN=filterFUN, filter_options=filter_options,
                       weights=weights, balance=balance,
                       balance_options=balance_options,
                       modifyX=modifyX, modifyX_useY=modifyX_useY,
                       modifyX_options=modifyX_options,
                       metric=metric, trControl=trControl, tuneGrid=tuneGrid,
                       outer_train_predict=outer_train_predict), dots)
        do.call(nestcv.trainCore, args)
      }, cl = cl)
    } else {
      outer_res <- parLapply(cl = cl, seq_along(outer_folds), function(i) {
        args <- c(list(i=i, y=y, x=x, outer_folds = outer_folds,
                       inner_train_folds = inner_train_folds, method=method,
                       filterFUN=filterFUN, filter_options=filter_options,
                       weights=weights, balance=balance,
                       balance_options=balance_options,
                       modifyX=modifyX, modifyX_useY=modifyX_useY,
                       modifyX_options=modifyX_options,
                       metric=metric, trControl=trControl, tuneGrid=tuneGrid,
                       outer_train_predict=outer_train_predict), dots)
        do.call(nestcv.trainCore, args)
      })
    }
    stopCluster(cl)
  } else if (parallel_mode == "future") {
    # future
    outer_res <- future_lapply(seq_along(outer_folds), function(i) {
      nestcv.trainCore(i, y, x, outer_folds, inner_train_folds,
                       method, filterFUN, filter_options,
                       weights, balance, balance_options,
                       modifyX, modifyX_useY, modifyX_options,
                       metric, trControl, tuneGrid, outer_train_predict,
                       verbose, ...)
    }, future.seed = TRUE)
  } else {
    # linux/mac, using forked parallel processing
    outer_res <- mclapply(seq_along(outer_folds), function(i) {
      nestcv.trainCore(i, y, x, outer_folds, inner_train_folds,
                       method, filterFUN, filter_options,
                       weights, balance, balance_options,
                       modifyX, modifyX_useY, modifyX_options,
                       metric, trControl, tuneGrid, outer_train_predict,
                       verbose, ...)
    }, mc.cores = cv.cores, mc.allow.recursive = FALSE)
  }

  predslist <- lapply(outer_res, '[[', 'preds')
  output <- data.table::rbindlist(predslist)
  output <- as.data.frame(output)
  if (!is.null(rownames(x))) {
    rownames(output) <- unlist(lapply(predslist, rownames))}
  summary <- predSummary(output)
  caret.roc <- NULL
  if (is.factor(y) & nlevels(y) == 2) {
    caret.roc <- pROC::roc(output$testy, output$predyp, direction = "<", 
                           quiet = TRUE)
  }
  bestTunes <- lapply(outer_res, function(i) i$fit$bestTune)
  bestTunes <- as.data.frame(data.table::rbindlist(bestTunes))
  rownames(bestTunes) <- paste('Fold', seq_len(nrow(bestTunes)))
  
  if (!is.na(finalCV) && !finalCV) {
    # use outer folds for final parameters, fit single final model
    if (verbose == 1) message("Fitting single final model")
    finalTune <- finaliseTune(bestTunes)
    fitControl <- trainControl(method = "none", classProbs = is.factor(y))
    final_fit <- caret::train(x = filtx, y = yfinal, method = method,
                              weights = weights,
                              trControl = fitControl,
                              tuneGrid = finalTune, ...)
  }
  
  if (!is.na(finalCV)) {
    all_vars <- unlist(lapply(outer_res, function(i) {
      colnames(i$fit$trainingData)
    }))
    all_vars <- unique(c(all_vars, colnames(filtx)))
    all_vars <- all_vars[all_vars %in% colnames(x)]
    xsub <- x[, all_vars]
  }
  
  end <- Sys.time()
  if (verbose == 1) message("Duration: ", format(end - start))
  out <- list(call = nestcv.call,
              output = output,
              outer_result = outer_res,
              outer_method = outer_method,
              outer_folds = outer_folds,
              dimx = dim(x),
              xsub = xsub,
              y = y,
              yfinal = yfinal,
              final_fit = final_fit,
              final_vars = colnames(filtx),
              roc = caret.roc,
              trControl = trControl,
              bestTunes = bestTunes,
              finalTune = finalTune,
              summary = summary)
  if (!is.na(finalCV) & !is.null(modifyX)) {
    out$xfinal <- filtx
    if (modifyX_useY) out$modify_fit <- dat$modify_fit
  }
  class(out) <- "nestcv.train"
  out
}


nestcv.trainCore <- function(i, y, x, outer_folds, inner_train_folds,
                             method, filterFUN, filter_options,
                             weights, balance, balance_options,
                             modifyX, modifyX_useY, modifyX_options,
                             metric, trControl, tuneGrid,
                             outer_train_predict, verbose = FALSE, ...) {
  start <- Sys.time()
  if (verbose == 1) message_parallel("Starting Fold ", i, " ...")
  test <- outer_folds[[i]]
  dat <- nest_filt_bal(test, y, x, filterFUN, filter_options,
                       balance, balance_options,
                       modifyX, modifyX_useY, modifyX_options)
  ytrain <- dat$ytrain
  ytest <- dat$ytest
  filt_xtrain <- dat$filt_xtrain
  filt_xtest <- dat$filt_xtest
  
  trControl$index <- inner_train_folds[[i]]
  printlog <- capture.output({
    fit <- caret::train(x = filt_xtrain, y = ytrain,
                        method = method,
                        weights = weights[-test],
                        metric = metric,
                        trControl = trControl,
                        tuneGrid = tuneGrid, ...)
  })
  
  predy <- predict(fit, newdata = filt_xtest)
  preds <- data.frame(predy=predy, testy=ytest)
  if (is.factor(y)) {
    predyp <- predict(fit, newdata = filt_xtest, type = "prob")
    if (nlevels(y) == 2) predyp <- predyp[,2]  # binomial
    preds <- cbind(preds, predyp)
  }
  rownames(preds) <- rownames(filt_xtest)
  if (outer_train_predict) {
    predy <- predict(fit, newdata = filt_xtrain)
    train_preds <- data.frame(ytrain=ytrain, predy=predy)
    if (is.factor(y)) {
      predyp <- predict(fit, newdata = filt_xtrain, type = "prob")
      if (nlevels(y) == 2) predyp <- predyp[,2]  # binomial
      train_preds <- cbind(train_preds, predyp)
    }
  } else train_preds <- NULL
  if (verbose == 1) {
    end <- Sys.time()
    message_parallel("                     Fold ", i, " done (",
                     format(end - start, digits = 3), ")")
  } else if (verbose == 2) cat_parallel("=")
  ret <- list(preds = preds,
              train_preds = train_preds,
              fit = fit,
              nfilter = ncol(filt_xtrain))
  ret
}


# finalise the caret model tuning from bestTune dataframe
#' @importFrom stats median
finaliseTune <- function(x) {
  fintune <- lapply(colnames(x), function(i) {
    if (is.numeric(x[, i])) return(median(x[, i]))
    tab <- table(x[, i])
    names(tab)[which.max(tab)]  # majority vote for factors
  })
  names(fintune) <- colnames(x)
  data.frame(fintune, check.names = FALSE)
}


#' @export
summary.nestcv.train <- function(object, 
                                 digits = max(3L, getOption("digits") - 3L), 
                                 ...) {
  cat("Nested cross-validation with caret\n")
  if (!is.null(object$call$method)) {
    cat("Method: ", object$call$method, "\n")
  } else cat("Method:  rf\n")
  if (!is.null(object$call$filterFUN)) {
    cat("Filter: ", object$call$filterFUN, "\n")
  } else cat("No filter\n")
  if (!is.null(object$call$modifyX))
    cat("Modifier: ", object$call$modifyX, "\n")
  cat("Outer loop: ", switch(object$outer_method,
                             cv = paste0(length(object$outer_folds), "-fold cv\n"),
                             LOOCV = "leave-one-out CV\n"))
  cat("Inner loop: ", paste0(object$trControl$number, "-fold ",
                             object$trControl$method, "\n"))
  balance <- object$call$balance
  if (!is.null(balance)) {
    cat("Balancing: ", balance, "\n")
  }
  cat(object$dimx[1], "observations,", object$dimx[2], "predictors\n")
  if (!is.numeric(object$y)) print(c(table(object$y)))
  cat("\n")
  nfilter <- unlist(lapply(object$outer_result, '[[', 'nfilter'))
  foldres <- object$bestTunes
  foldres$n.filter <- nfilter
  print(foldres, digits = digits, print.gap = 2L)
  cat("\nFinal parameters:\n")
  if (length(object$finalTune)==1 && is.na(object$finalTune)) {
    cat("NA\n")
  } else {
    print(object$finalTune, digits = digits, print.gap = 2L, row.names = FALSE)
  }
  cat("\nResult:\n")
  print(object$summary, digits = digits, print.gap = 2L)
  out <- list(dimx = object$dimx, folds = foldres,
              final_param = object$finalTune, result = object$summary)
  invisible(out)
}


#' @method predict nestcv.train
#' @export
predict.nestcv.train <- function(object, newdata,
                                 modify = FALSE, ...) {
  if (modify) {
    if (is.null(object$modify_fit)) stop("`modify_fit` is missing")
    newdata <- predict(object$modify_fit, newdata)
  }
  if (any(!object$final_vars %in% colnames(newdata))) 
    stop("newdata is missing some predictors", call. = FALSE)
  predict(object$final_fit,
          newdata = newdata[, object$final_vars, drop = FALSE], ...)
}


#' Summarise variables
#' 
#' @param x Matrix or dataframe with variables in columns
#' @return A matrix with variables in rows and mean, median and SD for each
#'   variable or number of levels if the variable is a factor. If `NA` are
#'   detected, an extra column `n.NA` is added with the numbers of `NA` for each
#'   variable.
#' @export
#' 
summary_vars <- function(x) {
  if (is.matrix(x)) {
    if (!is.numeric(x)) return("Not numeric")
    mat <- x
    selCols <- TRUE
  } else {
    selCols <- unlist(lapply(x, is.numeric))
    mat <- as.matrix(x[, selCols])
  }
  summary_mat <- cbind(colMeans(mat, na.rm = TRUE),
                       matrixStats::colMedians(mat, na.rm = TRUE),
                       matrixStats::colSds(mat, na.rm = TRUE))
  colnames(summary_mat) <- c("mean", "median", "sd")
  if (any(!selCols)) {
    nlevels_mat <- unlist(lapply(x[, !selCols], function(i) nlevels(as.factor(i))))
    out <- matrix(NA, nrow = ncol(x), ncol = 4,
                  dimnames = list(colnames(x), c("mean", "median", "sd", "nlevels")))
    out[selCols, 1:3] <- summary_mat
    out[!selCols, 4] <- nlevels_mat
  } else out <- summary_mat
  n.NA <- apply(x, 2, function(i) sum(is.na(i)))
  if (any(n.NA > 0)) out <- cbind(out, n.NA)
  out
}


swapFoldIndex <- function(folds, len = max(unlist(folds))) {
  lapply(folds, function(i) setdiff(seq_len(len), i))
}


# Prints a message using shell echo from inside mclapply when run in Rstudio
message_parallel <- function(...) {
  if (Sys.getenv("RSTUDIO") != "1") return()
  system(sprintf('echo "%s"', paste0(..., collapse = "")))
}


plural <- function(n, text) {
  text <- if (n == 1) gsub("\\(s\\)", "", text) else gsub("\\(|\\)", "", text)
  paste(n, text)
}