robustHD: Robust Methods for High-Dimensional Data

Documented in perry.seqModel perry.sparseLTS

# --------------------------------------
# Author: Andreas Alfons
#         Erasmus Universiteit Rotterdam
# --------------------------------------

#' Resampling-based prediction error for a sequential regression model
#'
#' Estimate the prediction error of a previously fit sequential regression
#' model such as a robust least angle regression model or a sparse least
#' trimmed squares regression model.
#'
#' The prediction error can be estimated via (repeated) \eqn{K}-fold
#' cross-validation, (repeated) random splitting (also known as random
#' subsampling or Monte Carlo cross-validation), or the bootstrap.  In
#' each iteration, the optimal model is thereby selected from the training
#' data and used to make predictions for the test data.
#'
#' @method perry seqModel
#' @aliases perry.rlars
#'
#' @param object  the model fit for which to estimate the prediction error.
#' @param splits  an object of class \code{"cvFolds"} (as returned by
#' \code{\link[perry]{cvFolds}}) or a control object of class
#' \code{"foldControl"} (see \code{\link[perry]{foldControl}}) defining the
#' folds of the data for (repeated) \eqn{K}-fold cross-validation, an object of
#' class \code{"randomSplits"} (as returned by
#' \code{\link[perry]{randomSplits}}) or a control object of class
#' \code{"splitControl"} (see \code{\link[perry]{splitControl}}) defining
#' random data splits, or an object of class \code{"bootSamples"} (as returned
#' by \code{\link[perry]{bootSamples}}) or a control object of class
#' \code{"bootControl"} (see \code{\link[perry]{bootControl}}) defining
#' bootstrap samples.
#' @param fit  a character string specifying for which fit to estimate the
#' prediction error.  Possible values are \code{"reweighted"} (the default) for
#' the prediction error of the reweighted fit, \code{"raw"} for the prediction
#' error of the raw fit, or \code{"both"} for the prediction error of both
#' fits.
#' @param cost  a cost function measuring prediction loss.  It should expect
#' vectors to be passed as its first two arguments, the first corresponding to
#' the observed values of the response and the second to the predicted values,
#' and must return a non-negative scalar value.  The default is to use the root
#' mean squared prediction error for non-robust models and the root trimmed
#' mean squared prediction error for robust models (see
#' \code{\link[perry]{cost}}).
#' @param ncores  a positive integer giving the number of processor cores to be
#' used for parallel computing (the default is 1 for no parallelization).  If
#' this is set to \code{NA}, all available processor cores are used.
#' @param cl  a \pkg{parallel} cluster for parallel computing as generated by
#' \code{\link[parallel]{makeCluster}}.  If supplied, this is preferred over
#' \code{ncores}.
#' @param seed  optional initial seed for the random number generator (see
#' \code{\link{.Random.seed}}).  Note that also in case of parallel computing,
#' resampling is performed on the manager process rather than the worker
#' processes. On the parallel worker processes, random number streams are
#' used and the seed is set via \code{\link{clusterSetRNGStream}}.
#' @param \dots  additional arguments to be passed to the prediction loss
#' function \code{cost}.
#'
#' @return An object of class \code{"perry"} with the following components:
#' \describe{
#'   \item{\code{pe}}{a numeric vector containing the estimated prediction
#'   errors for the requested model fits.  In case of more than one replication,
#'   this gives the average value over all replications.}
#'   \item{\code{se}}{a numeric vector containing the estimated standard errors
#'   of the prediction loss for the requested model fits.}
#'   \item{\code{reps}}{a numeric matrix in which each column contains the
#'   estimated prediction errors from all replications for the requested model
#'   fits.  This is only returned in case of more than one replication.}
#'   \item{\code{splits}}{an object giving the data splits used to estimate the
#'   prediction error.}
#'   \item{\code{y}}{the response.}
#'   \item{\code{yHat}}{a list containing the predicted values from all
#'   replications.}
#'   \item{\code{call}}{the matched function call.}
#' }
#'
#' @author Andreas Alfons
#'
#' @seealso \code{\link{rlars}}, \code{\link{sparseLTS}},
#' \code{\link[=predict.seqModel]{predict}}, \code{\link[perry]{perry}},
#' \code{\link[perry]{cost}}
#'
#' @example inst/doc/examples/example-perry.R
#'
#' @keywords utilities
#'
#' @export
#' @import perry

perry.seqModel <- function(object, splits = foldControl(), cost, ncores = 1,
                           cl = NULL, seed = NULL, ...) {
  ## initializations
  matchedCall <- match.call()
  matchedCall[[1]] <- as.name("perry")
  # retrieve data from model fit
  if(is.null(x <- object$x) || is.null(y <- object$y)) {
    if(is.null(x)) x <- try(model.matrix(object$terms), silent=TRUE)
    if(is.null(y)) y <- try(model.response(object$terms), silent=TRUE)
    if(inherits(x, "try-error") || inherits(y, "try-error")) {
      stop("model data not available")
    }
  }
  # predictor matrix is stored with column for intercept
  x <- removeIntercept(x)
  # get default cost function depending on whether model fit is robust
  if(missing(cost)) cost <- if(object$robust) rtmspe else rmspe
  ## construct call to fit models in prediction error estimation
  call <- object$call
  # if the model was fitted with formula method, 'formula' and 'data'
  # arguments are removed from call and 'x' and 'y' are used instead
  call$formula <- NULL
  call$data <- NULL
  ## call function perryFit() to perform prediction error estimation
  out <- perryFit(call, x=x, y=y, splits=splits, cost=cost, costArgs=list(...),
                  envir=parent.frame(), ncores=ncores, cl=cl, seed=seed)
  out$call <- matchedCall
  out
}


#' @rdname perry.seqModel
#' @method perry sparseLTS
#' @export

perry.sparseLTS <- function(object, splits = foldControl(),
                            fit = c("reweighted", "raw", "both"),
                            cost = rtmspe, ncores = 1, cl = NULL,
                            seed = NULL, ...) {
  ## initializations
  matchedCall <- match.call()
  matchedCall[[1]] <- as.name("perry")
  fit <- match.arg(fit)
  if(is.null(x <- object$x) || is.null(y <- object$y)) {
    if(is.null(x)) x <- try(model.matrix(object$terms), silent=TRUE)
    if(is.null(y)) y <- try(model.response(object$terms), silent=TRUE)
    if(inherits(x, "try-error") || inherits(y, "try-error")) {
      stop("model data not available")
    }
  }
  # predictor matrix is stored with column for intercept (if any)
  x <- removeIntercept(x)
  ## prepare prediction error estimation
  # extract function call for fitting the model
  call <- object$call
  # if the model was fitted with formula method, 'formula' and 'data'
  # arguments are removed from call and 'x' and 'y' are used instead
  call$formula <- NULL
  call$data <- NULL
  call$intercept <- object$intercept
  ## call function perryFit() to perform prediction error estimation
  out <- perryFit(call, x=x, y=y, splits=splits, predictArgs=list(fit=fit),
                  cost=cost, costArgs=list(...), envir=parent.frame(),
                  ncores=ncores, cl=cl, seed=seed)
  if(fit == "both") peNames(out) <- c("reweighted", "raw")
  out$call <- matchedCall
  out
}