# ----------------------
# Author: Andreas Alfons
# KU Leuven
# ----------------------
#' Model selection based on cross-validation
#'
#' Combine cross-validation results for various models into one object and
#' select the model with the best prediction performance.
#'
#' Keep in mind that objects inheriting from class \code{"cv"} or
#' \code{"cvSelect"} may contain multiple columns of cross-validation
#' results. This is the case if the response is univariate but the
#' \code{\link[stats]{predict}} method of the fitted model returns a
#' matrix.
#'
#' The \code{.reshape} argument determines how to handle such objects. If
#' \code{.reshape} is \code{FALSE}, all objects are required to have the same
#' number of columns and the best model for each column is selected. A typical
#' use case for this behavior would be if the investigated models contain
#' cross-validation results for a raw and a reweighted fit. It might then be
#' of interest to researchers to compare the best model for the raw estimators
#' with the best model for the reweighted estimators.
#'
#' If \code{.reshape} is \code{TRUE}, objects with more than one column of
#' results are first transformed with \code{\link{cvReshape}} to have only one
#' column. Then the best overall model is selected.
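#'
#' For illustration, a minimal sketch (assuming \code{cvFitLts50} and
#' \code{cvFitLts75} are objects of class \code{"cv"} returned by
#' \code{\link{cvFit}}, each containing one column of results for the raw and
#' one for the reweighted fit):
#' \preformatted{
#' ## select the best model separately for the raw and the reweighted fit
#' cvSelect(LTS50 = cvFitLts50, LTS75 = cvFitLts75, .reshape = FALSE)
#' ## reshape to one column per object and select the best model overall
#' cvSelect(LTS50 = cvFitLts50, LTS75 = cvFitLts75, .reshape = TRUE)
#' }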
#'
#' It should also be noted that the argument names of \code{.reshape},
#' \code{.selectBest} and \code{.seFactor} start with a dot to avoid conflicts
#' with the argument names used for the objects containing cross-validation
#' results.
#'
#' @param \dots objects inheriting from class \code{"cv"} or \code{"cvSelect"}
#' that contain cross-validation results.
#' @param .reshape a logical indicating whether objects with more than one
#' column of cross-validation results should be reshaped to have only one
#' column (see \dQuote{Details}).
#' @param .selectBest a character string specifying a criterion for selecting
#' the best model. Possible values are \code{"min"} (the default) or
#' \code{"hastie"}. The former selects the model with the smallest prediction
#' error. The latter is useful for nested models or for models with a tuning
#' parameter controlling the complexity of the model (e.g., penalized
#' regression). It selects the most parsimonious model whose prediction error
#' is no larger than \code{.seFactor} standard errors above the prediction error
#' of the best overall model. Note that the models are thereby assumed to be
#' ordered from the most parsimonious to the most complex. In particular, a
#' one-standard-error rule (\code{.seFactor = 1}) is frequently applied in
#' practice, as illustrated in the sketch below.
#' @param .seFactor a numeric value giving a multiplication factor of the
#' standard error for the selection of the best model. This is ignored if
#' \code{.selectBest} is \code{"min"}.
#'
#' @aliases print.cvSelect
#'
#' @returnClass cvSelect
#' @returnItem n an integer giving the number of observations.
#' @returnItem K an integer vector giving the number of folds used in
#' cross-validation for the respective model.
#' @returnItem R an integer vector giving the number of replications used in
#' cross-validation for the respective model.
#' @returnItem best an integer vector giving the indices of the models with
#' the best prediction performance.
#' @returnItem cv a data frame containing the estimated prediction errors for
#' the models. For models for which repeated cross-validation was performed,
#' those are average values over all replications.
#' @returnItem se a data frame containing the estimated standard errors of the
#' prediction loss for the models.
#' @returnItem selectBest a character string specifying the criterion used for
#' selecting the best model.
#' @returnItem seFactor a numeric value giving the multiplication factor of
#' the standard error used for the selection of the best model.
#' @returnItem reps a data frame containing the estimated prediction errors
#' from all replications for those models for which repeated cross-validation
#' was performed. This is only returned if repeated cross-validation was
#' performed for at least one of the models.
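#'
#' As a quick sketch of how the returned object can be inspected (assuming
#' \code{cvFits} is the value returned by \code{cvSelect}):
#' \preformatted{
#' cvFits$cv    # data frame of estimated prediction errors
#' cvFits$se    # data frame of estimated standard errors
#' cvFits$best  # indices of the models with the best prediction performance
#' }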
#'
#' @note Even though the function allows comparison of cross-validation results
#' obtained with a different number of folds or a different number of
#' replications, such comparisons should be made with care, hence warnings
#' are issued in those cases. For maximum comparability, the same data folds
#' should be used in cross-validation for all models to be compared.
#'
#' @author Andreas Alfons
#'
#' @references
#' Hastie, T., Tibshirani, R. and Friedman, J. (2009) \emph{The Elements of
#' Statistical Learning: Data Mining, Inference, and Prediction}. Springer,
#' 2nd edition.
#'
#' @seealso \code{\link{cvFit}}, \code{\link{cvTuning}}
#'
#' @example inst/doc/examples/example-cvSelect.R
#'
#' @keywords utilities
#'
#' @export
cvSelect <- function(..., .reshape = FALSE, .selectBest = c("min", "hastie"),
                     .seFactor = 1) {
  ## initializations
  objects <- list(...)
  m <- length(objects)
  if(m == 0) stop("empty list of objects")
  # check class of objects
  isCvSelect <- sapply(objects, inherits, "cvSelect")
  if(!all(sapply(objects, inherits, "cv") | isCvSelect)) {
    stop("all objects must inherit from class \"cv\" or \"cvSelect\"")
  }
  .selectBest <- match.arg(.selectBest)
  # remove empty objects
  keep <- sapply(objects, function(x) ncv(x) > 0 && !isTRUE(nfits(x) == 0))
  objects <- objects[keep]
  m <- length(objects)
  if(m == 0) stop("all objects are empty")
  isCvSelect <- isCvSelect[keep]
  # check names for the supplied objects
  fits <- names(objects)
  if(is.null(fits)) {
    fits <- defaultFitNames(m)
  } else if(any(i <- fits == "")) fits[i] <- defaultFitNames(m)[i]
  names(objects) <- fits
  # check dimensions or reshape objects with more than one column
  d <- sapply(objects, ncv)
  if(isTRUE(.reshape)) {
    .reshape <- which(d > 1)
    objects[.reshape] <- lapply(objects[.reshape], cvReshape)
    isCvSelect[.reshape] <- TRUE
  } else if(length(unique(d)) > 1) {
    stop("all objects must have the same dimension")
  }
  ## prepare objects of class "cvSelect"
  if(any(isCvSelect)) {
    # prepare names
    fits <- as.list(fits)
    fits[isCvSelect] <- mapply(function(f, x) paste(f, x$cv$Fit, sep="."),
                               fits[isCvSelect], objects[isCvSelect],
                               SIMPLIFY=FALSE)
    fits <- unlist(fits)
    # prepare basic information
    objects[isCvSelect] <- lapply(objects[isCvSelect],
      function(x) {
        m <- nrow(x$cv)  # number of fits in current object
        x$n <- rep(x$n, length.out=m)
        x$K <- rep(x$K, length.out=m)
        x$R <- rep(x$R, length.out=m)
        # remove column specifying fit from results
        x$cv <- x$cv[, -1, drop=FALSE]
        x$se <- x$se[, -1, drop=FALSE]
        if(!is.null(x$reps)) x$reps <- x$reps[, -1, drop=FALSE]
        x
      })
  }
  ## combine basic information
  n <- unique(unlist(lapply(objects, function(x) x$n), use.names=FALSE))
  if(length(n) > 1) stop("different numbers of observations")
  K <- unlist(lapply(objects, function(x) x$K), use.names=FALSE)
  if(length(unique(K)) > 1) warning("different number of folds")
  R <- unlist(lapply(objects, function(x) x$R), use.names=FALSE)
  if(length(unique(R)) > 1) warning("different number of replications")
  names(K) <- names(R) <- fits
  ## combine CV results and standard errors
  cv <- lapply(objects,
    function(x) {
      cv <- x$cv  # extract CV results
      if(is.null(dim(cv))) t(cv) else as.matrix(cv)  # return matrix
    })
  se <- lapply(objects,
    function(x) {
      se <- x$se  # extract standard errors
      if(is.null(dim(se))) t(se) else as.matrix(se)  # return matrix
    })
  if(m > 1) {
    # check if names are the same for all objects
    cvNames <- colnames(cv[[1]])
    otherNames <- lapply(cv[-1], colnames)
    adjustNames <- !all(sapply(otherNames, identical, cvNames))
    if(adjustNames) cvNames <- defaultCvNames(length(cvNames))
  }
  cv <- do.call("rbind", cv)
  se <- do.call("rbind", se)
  if(m > 1 && adjustNames) {
    colnames(cv) <- colnames(se) <- cvNames
  }
  cv <- data.frame(Fit=factor(fits, levels=fits), cv, row.names=NULL)
  se <- data.frame(Fit=factor(fits, levels=fits), se, row.names=NULL)
  ## combine repeated CV results
  haveReps <- any(i <- sapply(objects, function(x) !is.null(x$reps)))
  if(haveReps) {
    # FIXME: safer solution that does not require the correct order of fits
    reps <- lapply(objects[i], function(x) as.matrix(x$reps))
    reps <- do.call("rbind", reps)
    if(m > 1 && adjustNames) colnames(reps) <- cvNames
    i <- which(R > 1)
    reps <- data.frame(Fit=factor(rep(fits[i], R[i]), levels=fits),
                       reps, row.names=NULL)
  }
  ## find best model
  if(.selectBest == "min") {
    .seFactor <- NA
    best <- sapply(cv[, -1, drop=FALSE], selectMin)
  } else {
    .seFactor <- rep(.seFactor, length.out=1)
    best <- sapply(names(cv)[-1],
      function(j) selectHastie(cv[, j], se[, j], seFactor=.seFactor))
  }
  ## construct return object
  out <- list(n=n, K=K, R=R, best=best, cv=cv, se=se,
              selectBest=.selectBest, seFactor=.seFactor)
  if(haveReps) out$reps <- reps
  class(out) <- "cvSelect"
  out
}