Nothing
# --------------------------------------
# Author: Andreas Alfons
# Erasmus Universiteit Rotterdam
# --------------------------------------
#' Cross-validation for linear models
#'
#' Estimate the prediction error of a linear model via (repeated) \eqn{K}-fold
#' cross-validation. Cross-validation functions are available for least
#' squares fits computed with \code{\link[stats]{lm}} as well as for the
#' following robust alternatives: MM-type models computed with
#' \code{\link[robustbase]{lmrob}} and least trimmed squares fits computed with
#' \code{\link[robustbase]{ltsReg}}.
#'
#' (Repeated) \eqn{K}-fold cross-validation is performed in the following
#' way. The data are first split into \eqn{K} previously obtained blocks of
#' approximately equal size. Each of the \eqn{K} data blocks is left out once
#' to fit the model, and predictions are computed for the observations in the
#' left-out block with the \code{\link[stats]{predict}} method of the fitted
#' model. Thus a prediction is obtained for each observation.
#'
#' The response variable and the obtained predictions for all observations are
#' then passed to the prediction loss function \code{cost} to estimate the
#' prediction error. For repeated cross-validation, this process is replicated
#' and the estimated prediction errors from all replications as well as their
#' average are included in the returned object.
#'
#' @aliases cvExamples
#'
#' @param object an object returned from a model fitting function. Methods
#' are implemented for objects of class \code{"lm"} computed with
#' \code{\link[stats]{lm}}, objects of class \code{"lmrob"} computed with
#' \code{\link[robustbase]{lmrob}}, and object of class \code{"lts"} computed
#' with \code{\link[robustbase]{ltsReg}}.
#' @param cost a cost function measuring prediction loss. It should expect
#' the observed values of the response to be passed as the first argument and
#' the predicted values as the second argument, and must return either a
#' non-negative scalar value, or a list with the first component containing
#' the prediction error and the second component containing the standard
#' error. The default is to use the root mean squared prediction error
#' for the \code{"lm"} method and the root trimmed mean squared prediction
#' error for the \code{"lmrob"} and \code{"lts"} methods (see
#' \code{\link{cost}}).
#' @param K an integer giving the number of folds into which the data should
#' be split (the default is five). Keep in mind that this should be chosen
#' such that all folds are of approximately equal size. Setting \code{K}
#' equal to the number of observations or groups yields leave-one-out
#' cross-validation.
#' @param R an integer giving the number of replications for repeated
#' \eqn{K}-fold cross-validation. This is ignored for for leave-one-out
#' cross-validation and other non-random splits of the data.
#' @param foldType a character string specifying the type of folds to be
#' generated. Possible values are \code{"random"} (the default),
#' \code{"consecutive"} or \code{"interleaved"}.
#' @param grouping a factor specifying groups of observations. If supplied,
#' the data are split according to the groups rather than individual
#' observations such that all observations within a group belong to the same
#' fold.
#' @param folds an object of class \code{"cvFolds"} giving the folds of the
#' data for cross-validation (as returned by \code{\link{cvFolds}}). If
#' supplied, this is preferred over the arguments for generating
#' cross-validation folds.
#' @param fit a character string specifying for which fit to estimate the
#' prediction error. Possible values are \code{"reweighted"} (the default) for
#' the prediction error of the reweighted fit, \code{"raw"} for the prediction
#' error of the raw fit, or \code{"both"} for the prediction error of both
#' fits.
#' @param seed optional initial seed for the random number generator (see
#' \code{\link{.Random.seed}}).
#' @param \dots additional arguments to be passed to the prediction loss
#' function \code{cost}.
#'
#' @return An object of class \code{"cv"} with the following components:
#' \item{n}{an integer giving the number of observations or groups.}
#' \item{K}{an integer giving the number of folds.}
#' \item{R}{an integer giving the number of replications.}
#' \item{cv}{a numeric vector containing the estimated prediction
#' errors. For the \code{"lm"} and \code{"lmrob"} methods, this is a single
#' numeric value. For the \code{"lts"} method, this contains one value for
#' each of the requested fits. In the case of repeated cross-validation, those
#' are average values over all replications.}
#' \item{se}{a numeric vector containing the estimated standard
#' errors of the prediction loss. For the \code{"lm"} and \code{"lmrob"}
#' methods, this is a single numeric value. For the \code{"lts"} method, this
#' contains one value for each of the requested fits.}
#' \item{reps}{a numeric matrix containing the estimated prediction
#' errors from all replications. For the \code{"lm"} and \code{"lmrob"}
#' methods, this is a matrix with one column. For the \code{"lts"} method,
#' this contains one column for each of the requested fits. However, this is
#' only returned for repeated cross-validation.}
#' \item{seed}{the seed of the random number generator before
#' cross-validation was performed.}
#' \item{call}{the matched function call.}
#'
#' @note The \code{repCV} methods are simple wrapper functions that extract the
#' data from the fitted model and call \code{\link{cvFit}} to perform
#' cross-validation. In addition, \code{cvLm}, \code{cvLmrob} and \code{cvLts}
#' are aliases for the respective methods.
#'
#' @author Andreas Alfons
#'
#' @seealso \code{\link{cvFit}}, \code{\link{cvFolds}}, \code{\link{cost}},
#' \code{\link[stats]{lm}}, \code{\link[robustbase]{lmrob}},
#' \code{\link[robustbase]{ltsReg}}
#'
#' @example inst/doc/examples/example-repCV.R
#'
#' @keywords utilities
#'
#' @export
repCV <- function(object, ...) UseMethod("repCV")
## LS regression
#' @rdname repCV
#' @method repCV lm
#' @export
repCV.lm <- function(object, cost = rmspe, K = 5, R = 1,
foldType = c("random", "consecutive", "interleaved"),
grouping = NULL, folds = NULL, seed = NULL, ...) {
## initializations
matchedCall <- match.call()
# retrieve data from model fit
if(is.null(data <- object$model)) {
haveDataArgument <- !is.null(object$call$data)
if(haveDataArgument) {
# try to retrieve data from 'x' and 'y' components
# this only works if the data argument was used to fit the model
if(!is.null(x <- object[["x"]]) && !is.null(y <- object$y)) {
x <- removeIntercept(x)
data <- data.frame(y, x)
}
}
if(!haveDataArgument || is.null(data)) {
# try to retrieve data from terms component
data <- try(model.frame(object$terms), silent=TRUE)
if(inherits(data, "try-error")) stop("model data not available")
}
}
if(is.null(y <- object$y)) y <- model.response(data)
## call function cvFit() to perform cross-validation
out <- cvFit(object, data=data, y=y, cost=cost, K=K, R=R,
foldType=foldType, grouping=grouping, folds=folds,
costArgs=list(...), envir=parent.frame(), seed=seed)
out$call <- matchedCall
out
}
## MM and SDMD regression
#' @rdname repCV
#' @method repCV lmrob
#' @export
#' @import robustbase
repCV.lmrob <- function(object, cost = rtmspe, K = 5, R = 1,
foldType = c("random", "consecutive", "interleaved"),
grouping = NULL, folds = NULL, seed = NULL, ...) {
## initializations
matchedCall <- match.call()
# retrieve data from model fit
if(is.null(data <- object$model)) {
haveDataArgument <- !is.null(object$call$data)
if(haveDataArgument) {
# try to retrieve data from 'x' and 'y' components
# this only works if the data argument was used to fit the model
if(!is.null(x <- object[["x"]]) && !is.null(y <- object$y)) {
x <- removeIntercept(x)
data <- data.frame(y, x)
}
}
if(!haveDataArgument || is.null(data)) {
# try to retrieve data from terms component
data <- try(model.frame(object$terms), silent=TRUE)
if(inherits(data, "try-error")) stop("model data not available")
}
}
if(is.null(y <- object$y)) y <- model.response(data)
## call function cvFit() to perform cross-validation
out <- cvFit(object, data=data, y=y, cost=cost, K=K, R=R,
foldType=foldType, grouping=grouping, folds=folds,
costArgs=list(...), envir=parent.frame(), seed=seed)
out$call <- matchedCall
out
}
## LTS regression
#' @rdname repCV
#' @method repCV lts
#' @export
#' @import robustbase
repCV.lts <- function(object, cost = rtmspe, K = 5, R = 1,
foldType = c("random", "consecutive", "interleaved"), grouping = NULL,
folds = NULL, fit = c("reweighted", "raw", "both"), seed = NULL, ...) {
## initializations
matchedCall <- match.call()
object <- object
if(is.null(x <- object$X) || is.null(y <- object$Y)) {
if(is.null(data <- object$model)) {
if(is.null(x)) x <- try(model.matrix(object$terms), silent=TRUE)
if(is.null(y)) y <- try(model.response(object$terms), silent=TRUE)
if(inherits(x, "try-error") || inherits(y, "try-error")) {
stop("model data not available")
}
} else {
x <- model.matrix(object$terms, data)
y <- model.response(data)
}
}
# predictor matrix is stored with column for intercept (if any)
x <- removeIntercept(x)
## prepare cross-validation
# extract function call for model fit
call <- object$call
call[[1]] <- as.name("ltsReg")
# if the model was fitted with formula method, 'formula' and 'data'
# arguments are removed from call and 'x' and 'y' are used instead
call$formula <- NULL
call$data <- NULL
call$intercept <- object$intercept
## call function cvFit() to perform cross-validation
out <- cvFit(call, x=x, y=y, cost=cost, K=K, R=R, foldType=foldType,
grouping=grouping, folds=folds, predictArgs=list(fit=fit),
costArgs=list(...), envir=parent.frame(), seed=seed)
out$call <- matchedCall
out
}
# ----------------------
## LS regression
#' @rdname repCV
#' @export
cvLm <- repCV.lm
## MM and SDMD regression
#' @rdname repCV
#' @export
#' @import robustbase
cvLmrob <- repCV.lmrob
## LTS regression
#' @rdname repCV
#' @export
#' @import robustbase
cvLts <- repCV.lts
# ----------------------
#' @method predict lts
#' @export
#' @import robustbase
# there is no predict() method for "lts" objects in package 'robustbase'
predict.lts <- function(object, newdata,
fit = c("reweighted", "raw", "both"), ...) {
## initializations
fit <- match.arg(fit)
coef <- switch(fit,
reweighted=coef(object),
raw=object$raw.coefficients,
both=cbind(reweighted=coef(object), raw=object$raw.coefficients))
terms <- delete.response(object$terms) # extract terms for model matrix
if(missing(newdata) || is.null(newdata)) {
if(is.null(newdata <- object$X)) {
if(is.null(data <- object$model)) {
newdata <- try(model.matrix(terms), silent=TRUE)
if(inherits(newdata, "try-error")) {
stop("model data not available")
}
} else newdata <- model.matrix(terms, data)
}
} else {
# interpret vector as row
if(is.null(dim(newdata))) newdata <- t(newdata)
# check dimensions if model was not specified with a formula,
# otherwise use the terms object to extract model matrix
if(is.null(terms)) {
newdata <- as.matrix(newdata)
if(object$intercept) {
# if model has an intercept, add a column of ones to the new
# data matrix (unless it already contains intercept column)
newdata <- addIntercept(newdata, check=TRUE)
}
# check dimensions of new data
p <- if(is.null(dim(coef))) length(coef) else nrow(coef)
if(ncol(newdata) != p) {
stop(sprintf("new data must have %d columns", p))
}
} else newdata <- model.matrix(terms, as.data.frame(newdata))
}
## compute predictions
# ensure that a vector is returned if only one fit is requested
drop(newdata %*% coef)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.