R/repCV.R
In cvTools: Cross-Validation Tools for Regression Models

Documented in repCV repCV repCV.lm repCV.lm repCV.lmrob repCV.lmrob repCV.lts repCV.lts

# --------------------------------------
# Author: Andreas Alfons
#         Erasmus Universiteit Rotterdam
# --------------------------------------

#' Cross-validation for linear models
#'
#' Estimate the prediction error of a linear model via (repeated) \eqn{K}-fold
#' cross-validation.  Cross-validation functions are available for least
#' squares fits computed with \code{\link[stats]{lm}} as well as for the
#' following robust alternatives: MM-type models computed with
#' \code{\link[robustbase]{lmrob}} and least trimmed squares fits computed with
#' \code{\link[robustbase]{ltsReg}}.
#'
#' (Repeated) \eqn{K}-fold cross-validation is performed in the following
#' way.  The data are first split into \eqn{K} previously obtained blocks of
#' approximately equal size.  Each of the \eqn{K} data blocks is left out once
#' to fit the model, and predictions are computed for the observations in the
#' left-out block with the \code{\link[stats]{predict}} method of the fitted
#' model.  Thus a prediction is obtained for each observation.
#'
#' The response variable and the obtained predictions for all observations are
#' then passed to the prediction loss function \code{cost} to estimate the
#' prediction error.  For repeated cross-validation, this process is replicated
#' and the estimated prediction errors from all replications as well as their
#' average are included in the returned object.
#'
#' @aliases cvExamples
#'
#' @param object  an object returned from a model fitting function.  Methods
#' are implemented for objects of class \code{"lm"} computed with
#' \code{\link[stats]{lm}}, objects of class \code{"lmrob"} computed with
#' \code{\link[robustbase]{lmrob}}, and object of class \code{"lts"} computed
#' with \code{\link[robustbase]{ltsReg}}.
#' @param cost  a cost function measuring prediction loss.  It should expect
#' the observed values of the response to be passed as the first argument and
#' the predicted values as the second argument, and must return either a
#' non-negative scalar value, or a list with the first component containing
#' the prediction error and the second component containing the standard
#' error.  The default is to use the root mean squared prediction error
#' for the \code{"lm"} method and the root trimmed mean squared prediction
#' error for the \code{"lmrob"} and \code{"lts"} methods (see
#' \code{\link{cost}}).
#' @param K  an integer giving the number of folds into which the data should
#' be split (the default is five).  Keep in mind that this should be chosen
#' such that all folds are of approximately equal size.  Setting \code{K}
#' equal to the number of observations or groups yields leave-one-out
#' cross-validation.
#' @param R  an integer giving the number of replications for repeated
#' \eqn{K}-fold cross-validation.  This is ignored for for leave-one-out
#' cross-validation and other non-random splits of the data.
#' @param foldType  a character string specifying the type of folds to be
#' generated.  Possible values are \code{"random"} (the default),
#' \code{"consecutive"} or \code{"interleaved"}.
#' @param grouping  a factor specifying groups of observations.  If supplied,
#' the data are split according to the groups rather than individual
#' observations such that all observations within a group belong to the same
#' fold.
#' @param folds  an object of class \code{"cvFolds"} giving the folds of the
#' data for cross-validation (as returned by \code{\link{cvFolds}}).  If
#' supplied, this is preferred over the arguments for generating
#' cross-validation folds.
#' @param fit  a character string specifying for which fit to estimate the
#' prediction error.  Possible values are \code{"reweighted"} (the default) for
#' the prediction error of the reweighted fit, \code{"raw"} for the prediction
#' error of the raw fit, or \code{"both"} for the prediction error of both
#' fits.
#' @param seed  optional initial seed for the random number generator (see
#' \code{\link{.Random.seed}}).
#' @param \dots  additional arguments to be passed to the prediction loss
#' function \code{cost}.
#'
#' @return An object of class \code{"cv"} with the following components:
#' \item{n}{an integer giving the number of observations or groups.}
#' \item{K}{an integer giving the number of folds.}
#' \item{R}{an integer giving the number of replications.}
#' \item{cv}{a numeric vector containing the estimated prediction
#' errors.  For the \code{"lm"} and \code{"lmrob"} methods, this is a single
#' numeric value.  For the \code{"lts"} method, this contains one value for
#' each of the requested fits.  In the case of repeated cross-validation, those
#' are average values over all replications.}
#' \item{se}{a numeric vector containing the estimated standard
#' errors of the prediction loss.  For the \code{"lm"} and \code{"lmrob"}
#' methods, this is a single numeric value.  For the \code{"lts"} method, this
#' contains one value for each of the requested fits.}
#' \item{reps}{a numeric matrix containing the estimated prediction
#' errors from all replications.  For the \code{"lm"} and \code{"lmrob"}
#' methods, this is a matrix with one column.  For the \code{"lts"} method,
#' this contains one column for each of the requested fits.  However, this is
#' only returned for repeated cross-validation.}
#' \item{seed}{the seed of the random number generator before
#' cross-validation was performed.}
#' \item{call}{the matched function call.}
#'
#' @note The \code{repCV} methods are simple wrapper functions that extract the
#' data from the fitted model and call \code{\link{cvFit}} to perform
#' cross-validation.  In addition, \code{cvLm}, \code{cvLmrob} and \code{cvLts}
#' are aliases for the respective methods.
#'
#' @author Andreas Alfons
#'
#' @seealso \code{\link{cvFit}}, \code{\link{cvFolds}}, \code{\link{cost}},
#' \code{\link[stats]{lm}}, \code{\link[robustbase]{lmrob}},
#' \code{\link[robustbase]{ltsReg}}
#'
#' @example inst/doc/examples/example-repCV.R
#'
#' @keywords utilities
#'
#' @export

repCV <- function(object, ...) UseMethod("repCV")


## LS regression
#' @rdname repCV
#' @method repCV lm
#' @export

repCV.lm <- function(object, cost = rmspe, K = 5, R = 1,
        foldType = c("random", "consecutive", "interleaved"),
        grouping = NULL, folds = NULL, seed = NULL, ...) {
    ## initializations
    matchedCall <- match.call()
    # retrieve data from model fit
    if(is.null(data <- object$model)) {
        haveDataArgument <- !is.null(object$call$data)
        if(haveDataArgument) {
            # try to retrieve data from 'x' and 'y' components
            # this only works if the data argument was used to fit the model
            if(!is.null(x <- object[["x"]]) && !is.null(y <- object$y)) {
                x <- removeIntercept(x)
                data <- data.frame(y, x)
            }
        }
        if(!haveDataArgument || is.null(data)) {
            # try to retrieve data from terms component
            data <- try(model.frame(object$terms), silent=TRUE)
            if(inherits(data, "try-error")) stop("model data not available")
        }
    }
    if(is.null(y <- object$y)) y <- model.response(data)
    ## call function cvFit() to perform cross-validation
    out <- cvFit(object, data=data, y=y, cost=cost, K=K, R=R,
        foldType=foldType, grouping=grouping, folds=folds,
        costArgs=list(...), envir=parent.frame(), seed=seed)
    out$call <- matchedCall
    out
}


## MM and SDMD regression
#' @rdname repCV
#' @method repCV lmrob
#' @export
#' @import robustbase

repCV.lmrob <- function(object, cost = rtmspe, K = 5, R = 1,
        foldType = c("random", "consecutive", "interleaved"),
        grouping = NULL, folds = NULL, seed = NULL, ...) {
    ## initializations
    matchedCall <- match.call()
    # retrieve data from model fit
    if(is.null(data <- object$model)) {
        haveDataArgument <- !is.null(object$call$data)
        if(haveDataArgument) {
            # try to retrieve data from 'x' and 'y' components
            # this only works if the data argument was used to fit the model
            if(!is.null(x <- object[["x"]]) && !is.null(y <- object$y)) {
                x <- removeIntercept(x)
                data <- data.frame(y, x)
            }
        }
        if(!haveDataArgument || is.null(data)) {
            # try to retrieve data from terms component
            data <- try(model.frame(object$terms), silent=TRUE)
            if(inherits(data, "try-error")) stop("model data not available")
        }
    }
    if(is.null(y <- object$y)) y <- model.response(data)
    ## call function cvFit() to perform cross-validation
    out <- cvFit(object, data=data, y=y, cost=cost, K=K, R=R,
        foldType=foldType, grouping=grouping, folds=folds,
        costArgs=list(...), envir=parent.frame(), seed=seed)
    out$call <- matchedCall
    out
}


## LTS regression
#' @rdname repCV
#' @method repCV lts
#' @export
#' @import robustbase

repCV.lts <- function(object, cost = rtmspe, K = 5, R = 1,
        foldType = c("random", "consecutive", "interleaved"), grouping = NULL,
        folds = NULL, fit = c("reweighted", "raw", "both"), seed = NULL, ...) {
    ## initializations
    matchedCall <- match.call()
    object <- object
    if(is.null(x <- object$X) || is.null(y <- object$Y)) {
        if(is.null(data <- object$model)) {
            if(is.null(x)) x <- try(model.matrix(object$terms), silent=TRUE)
            if(is.null(y)) y <- try(model.response(object$terms), silent=TRUE)
            if(inherits(x, "try-error") || inherits(y, "try-error")) {
                stop("model data not available")
            }
        } else {
            x <- model.matrix(object$terms, data)
            y <- model.response(data)
        }
    }
    # predictor matrix is stored with column for intercept (if any)
    x <- removeIntercept(x)
    ## prepare cross-validation
    # extract function call for model fit
    call <- object$call
    call[[1]] <- as.name("ltsReg")
    # if the model was fitted with formula method, 'formula' and 'data'
    # arguments are removed from call and 'x' and 'y' are used instead
    call$formula <- NULL
    call$data <- NULL
    call$intercept <- object$intercept
    ## call function cvFit() to perform cross-validation
    out <- cvFit(call, x=x, y=y, cost=cost, K=K, R=R, foldType=foldType,
        grouping=grouping, folds=folds, predictArgs=list(fit=fit),
        costArgs=list(...), envir=parent.frame(), seed=seed)
    out$call <- matchedCall
    out
}


# ----------------------

## LS regression
#' @rdname repCV
#' @export
cvLm <- repCV.lm

## MM and SDMD regression
#' @rdname repCV
#' @export
#' @import robustbase
cvLmrob <- repCV.lmrob

## LTS regression
#' @rdname repCV
#' @export
#' @import robustbase
cvLts <- repCV.lts

# ----------------------

#' @method predict lts
#' @export
#' @import robustbase

# there is no predict() method for "lts" objects in package 'robustbase'
predict.lts <- function(object, newdata,
        fit = c("reweighted", "raw", "both"), ...) {
    ## initializations
    fit <- match.arg(fit)
    coef <- switch(fit,
        reweighted=coef(object),
        raw=object$raw.coefficients,
        both=cbind(reweighted=coef(object), raw=object$raw.coefficients))
    terms <- delete.response(object$terms)  # extract terms for model matrix
    if(missing(newdata) || is.null(newdata)) {
        if(is.null(newdata <- object$X)) {
            if(is.null(data <- object$model)) {
                newdata <- try(model.matrix(terms), silent=TRUE)
                if(inherits(newdata, "try-error")) {
                    stop("model data not available")
                }
            } else newdata <- model.matrix(terms, data)
        }
    } else {
        # interpret vector as row
        if(is.null(dim(newdata))) newdata <- t(newdata)
        # check dimensions if model was not specified with a formula,
        # otherwise use the terms object to extract model matrix
        if(is.null(terms)) {
            newdata <- as.matrix(newdata)
            if(object$intercept) {
                # if model has an intercept, add a column of ones to the new
                # data matrix (unless it already contains intercept column)
                newdata <- addIntercept(newdata, check=TRUE)
            }
            # check dimensions of new data
            p <- if(is.null(dim(coef))) length(coef) else nrow(coef)
            if(ncol(newdata) != p) {
                stop(sprintf("new data must have %d columns", p))
            }
        } else newdata <- model.matrix(terms, as.data.frame(newdata))
    }
    ## compute predictions
    # ensure that a vector is returned if only one fit is requested
    drop(newdata %*% coef)
}