Nothing
# --------------------------------------
# Author: Andreas Alfons
# Erasmus Universiteit Rotterdam
#
# based on code by Sarah Gelper
# --------------------------------------
#' (Robust) least angle regression for time series data
#'
#' (Robustly) sequence groups of candidate predictors and their respective
#' lagged values according to their predictive content and find the optimal
#' model along the sequence. Note that lagged values of the response are
#' included as a predictor group as well.
#'
#' @aliases print.tslars
#'
#' @param formula a formula describing the full model.
#' @param data an optional data frame, list or environment (or object coercible
#' to a data frame by \code{\link{as.data.frame}}) containing the variables in
#' the model. If not found in data, the variables are taken from
#' \code{environment(formula)}, typically the environment from which
#' \code{tslars} or \code{rtslars} is called.
#' @param x a numeric matrix or data frame containing the candidate predictor
#' series.
#' @param y a numeric vector containing the response series.
#' @param h an integer giving the forecast horizon (defaults to 1).
#' @param pMax an integer giving the maximum number of lags in the model
#' (defaults to 3).
#' @param sMax an integer giving the number of predictor series to be
#' sequenced. If it is \code{NA} (the default), predictor groups are sequenced
#' as long as there are twice as many observations as predictor variables.
#' @param centerFun a function to compute a robust estimate for the center
#' (defaults to \code{\link[stats]{median}}).
#' @param scaleFun a function to compute a robust estimate for the scale
#' (defaults to \code{\link[stats]{mad}}).
#' @param regFun a function to compute robust linear regressions that can be
#' interpreted as weighted least squares (defaults to
#' \code{\link[robustbase]{lmrob}}).
#' @param regArgs a list of arguments to be passed to \code{regFun}.
#' @param combine a character string specifying how to combine the data
#' cleaning weights from the robust regressions with each predictor group.
#' Possible values are \code{"min"} for taking the minimum weight for each
#' observation, \code{"euclidean"} for weights based on Euclidean distances
#' of the multivariate set of standardized residuals (i.e., multivariate
#' winsorization of the standardized residuals assuming independence), or
#' \code{"mahalanobis"} for weights based on Mahalanobis distances of the
#' multivariate set of standardized residuals (i.e., multivariate winsorization
#' of the standardized residuals).
#' @param winsorize a logical indicating whether to clean the data by
#' multivariate winsorization.
#' @param const numeric; tuning constant for multivariate winsorization to be
#' used in the initial corralation estimates based on adjusted univariate
#' winsorization (defaults to 2).
#' @param prob numeric; probability for the quantile of the
#' \eqn{\chi^{2}}{chi-squared} distribution to be used in multivariate
#' winsorization (defaults to 0.95).
#' @param fit a logical indicating whether to fit submodels along the sequence
#' (\code{TRUE}, the default) or to simply return the sequence (\code{FALSE}).
#' @param s an integer vector of length two giving the first and last
#' step along the sequence for which to compute submodels. The default
#' is to start with a model containing only an intercept (step 0) and
#' iteratively add all series along the sequence (step \code{sMax}). If
#' the second element is \code{NA}, predictor groups are added to the
#' model as long as there are twice as many observations as predictor
#' variables. If only one value is supplied, it is recycled.
#' @param crit a character string specifying the optimality criterion to be
#' used for selecting the final model. Currently, only \code{"BIC"} for the
#' Bayes information criterion is implemented.
#' @param ncores a positive integer giving the number of processor cores to be
#' used for parallel computing (the default is 1 for no parallelization). If
#' this is set to \code{NA}, all available processor cores are used. For
#' each lag length, parallel computing for obtaining the data cleaning weights
#' and for fitting models along the sequence is implemented on the \R level
#' using package \pkg{parallel}. Otherwise parallel computing for some of of
#' the more computer-intensive computations in the sequencing step is
#' implemented on the C++ level via OpenMP (\url{https://www.openmp.org/}).
#' @param cl a \pkg{parallel} cluster for parallel computing as generated by
#' \code{\link[parallel]{makeCluster}}. This is preferred over \code{ncores}
#' for tasks that are parallelized on the \R level, in which case \code{ncores}
#' is only used for tasks that are parallelized on the C++ level.
#' @param seed optional initial seed for the random number generator
#' (see \code{\link{.Random.seed}}), which is useful because many robust
#' regression functions (including \code{\link[robustbase]{lmrob}}) involve
#' randomness. On parallel \R worker processes, random number streams are
#' used and the seed is set via \code{\link{clusterSetRNGStream}}.
#' @param model a logical indicating whether the model data should be included
#' in the returned object.
#' @param \dots additional arguments to be passed down.
#'
#' @return
#' If \code{fit} is \code{FALSE}, an integer matrix in which each column
#' contains the indices of the sequenced predictor series for the corresponding
#' lag length.
#'
#' Otherwise an object of class \code{"tslars"} with the following components:
#' \describe{
#' \item{\code{pFit}}{a list containing the fits for the respective lag
#' lengths (see \code{\link{tslarsP}}).}
#' \item{\code{pOpt}}{an integer giving the optimal number of lags.}
#' \item{\code{pMax}}{the maximum number of lags considered.}
#' \item{\code{x}}{the matrix of candidate predictor series (if \code{model}
#' is \code{TRUE}).}
#' \item{\code{y}}{the response series (if \code{model} is \code{TRUE}).}
#' \item{\code{call}}{the matched function call.}
#' }
#'
#' @note The predictor group of lagged values of the response is indicated by
#' the index 0.
#'
#' @author Andreas Alfons, based on code by Sarah Gelper
#'
#' @references
#' Alfons, A., Croux, C. and Gelper, S. (2016) Robust groupwise least angle
#' regression. \emph{Computational Statistics & Data Analysis}, \bold{93},
#' 421--435. \doi{10.1016/j.csda.2015.02.007}
#'
#' @seealso \code{\link[=coef.tslars]{coef}},
#' \code{\link[=fitted.tslars]{fitted}},
#' \code{\link[=plot.tslars]{plot}},
#' \code{\link[=predict.tslars]{predict}},
#' \code{\link[=residuals.tslars]{residuals}},
#' \code{\link{tslarsP}}, \code{\link[robustbase]{lmrob}}
#'
#' @keywords regression robust ts
#'
#' @export
tslars <- function(x, ...) UseMethod("tslars")
#' @rdname tslars
#' @method tslars formula
#' @export
tslars.formula <- function(formula, data, ...) {
## initializations
call <- match.call() # get function call
call[[1]] <- as.name("tslars")
# prepare model frame
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data"), names(mf), 0)
mf <- mf[c(1, m)]
mf$drop.unused.levels <- TRUE
mf[[1]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
mt <- attr(mf, "terms")
if(is.empty.model(mt)) stop("empty model")
# extract response and candidate predictors from model frame
y <- model.response(mf, "numeric")
x <- model.matrix(mt, mf)
# remove first column for intercept, if existing
if(attr(mt, "intercept")) x <- x[, -1, drop=FALSE]
## call default method
out <- tslars.default(x, y, ...)
if(inherits(out, "tslars")) {
out$call <- call # add call to return object
out$terms <- mt # add model terms to return object
}
out
}
#' @rdname tslars
#' @method tslars default
#' @export
tslars.default <- function(x, y, h = 1, pMax = 3, sMax = NA, fit = TRUE,
s = c(0, sMax), crit = "BIC", ncores = 1,
cl = NULL, model = TRUE, ...) {
## call fit function with classical functions for center, scale,
## correlation and regression
call <- match.call() # get function call
call[[1]] <- as.name("tslars")
out <- tslarsFit(x, y, h=h, pMax=pMax, sMax=sMax, robust=FALSE,
centerFun=mean, scaleFun=sd, fit=fit, s=s, crit=crit,
ncores=ncores, cl=cl, model=model)
if(inherits(out, "tslars")) out$call <- call # add call to return object
out
}
#' @rdname tslars
#' @export
rtslars <- function(x, ...) UseMethod("rtslars")
#' @rdname tslars
#' @method rtslars formula
#' @export
rtslars.formula <- function(formula, data, ...) {
## initializations
call <- match.call() # get function call
call[[1]] <- as.name("rtslars")
# prepare model frame
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data"), names(mf), 0)
mf <- mf[c(1, m)]
mf$drop.unused.levels <- TRUE
mf[[1]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
mt <- attr(mf, "terms")
if(is.empty.model(mt)) stop("empty model")
# extract response and candidate predictors from model frame
y <- model.response(mf, "numeric")
x <- model.matrix(mt, mf)
# remove first column for intercept, if existing
if(attr(mt, "intercept")) x <- x[, -1, drop=FALSE]
## call default method and modify return object
out <- rtslars.default(x, y, ...)
if(inherits(out, "tslars")) {
out$call <- call # add call to return object
out$terms <- mt # add model terms to return object
}
out
}
#' @rdname tslars
#' @method rtslars default
#' @export
rtslars.default <- function(x, y, h = 1, pMax = 3, sMax = NA,
centerFun = median, scaleFun = mad,
regFun = lmrob, regArgs = list(),
combine = c("min", "euclidean", "mahalanobis"),
winsorize = FALSE, const = 2, prob = 0.95,
fit = TRUE, s = c(0, sMax), crit = "BIC",
ncores = 1, cl = NULL, seed = NULL,
model = TRUE, ...) {
## call fit function with classical functions for center, scale,
## correlation and regression
call <- match.call() # get function call
call[[1]] <- as.name("rtslars")
out <- tslarsFit(x, y, h=h, pMax=pMax, sMax=sMax, robust=TRUE,
centerFun=centerFun, scaleFun=scaleFun, regFun=regFun,
regArgs=regArgs, combine=combine, winsorize=winsorize,
const=const, prob=prob, fit=fit, s=s, crit=crit,
ncores=ncores, cl=cl, seed=seed, model=model)
if(inherits(out, "tslars")) out$call <- call # add call to return object
out
}
## fit function that allows to specify functions for center, scale, correlation
## and regression
tslarsFit <- function(x, y, h = 1, pMax = 3, sMax = NA,
robust = FALSE, centerFun = mean, scaleFun = sd,
regFun = lm.fit, regArgs = list(),
combine = c("min", "euclidean", "mahalanobis"),
winsorize = FALSE, const = 2, prob = 0.95, fit = TRUE,
s = c(0, sMax), crit = "BIC", ncores = 1, cl = NULL,
seed = NULL, model = TRUE) {
## initializations
n <- length(y)
x <- as.matrix(x)
if(nrow(x) != n) stop(sprintf("'x' must have %d rows", n))
robust <- isTRUE(robust)
winsorize <- robust && isTRUE(winsorize)
fit <- isTRUE(fit)
crit <- match.arg(crit)
if(!is.null(seed)) set.seed(seed)
if(is.na(ncores)) ncores <- detectCores() # use all available cores
if(!is.numeric(ncores) || is.infinite(ncores) || ncores < 1) {
ncores <- 1 # use default value
warning("invalid value of 'ncores'; using default value")
} else ncores <- as.integer(ncores)
if(fit || (robust && !winsorize)) {
# check whether parallel computing should be used
haveCl <- inherits(cl, "cluster")
haveNcores <- !haveCl && ncores > 1
useParallel <- haveNcores || haveCl
# set up multicore or snow cluster if not supplied
if(haveNcores) {
if(.Platform$OS.type == "windows") {
cl <- makePSOCKcluster(rep.int("localhost", ncores))
} else cl <- makeForkCluster(ncores)
on.exit(stopCluster(cl))
}
if(useParallel) {
# set seed of the random number stream
if(!is.null(seed)) clusterSetRNGStream(cl, iseed=seed)
else if(haveNcores) clusterSetRNGStream(cl)
}
}
## call 'tslarsPFit()' for each number of lags and choose optimal lag length
# TODO: check 'h' and 'pMax'
p <- seq_len(pMax)
out <- lapply(p,
function(i) {
select <- (pMax-i+1):n # ensure the same observations
tslarsPFit(x[select, , drop=FALSE], y[select], h=h, p=i,
sMax=sMax, robust=robust, centerFun=centerFun,
scaleFun=scaleFun, regFun=regFun, regArgs=regArgs,
combine=combine, winsorize=winsorize, const=const,
prob=prob, fit=fit, s=s, crit=crit, cl=cl,
model=FALSE)
})
names(out) <- p
## find optimal lag length
if(fit) {
if(pMax == 1) pOpt <- 1
else if(crit == "BIC") {
# ensure that BIC data is available in fits for the different lag lengths
out <- lapply(out, function(x) {
if(is.null(x$crit)) x$crit <- bicSelect(x)
x
})
# extract BIC for optimal step from fits for the different lag lengths
bicOpt <- sapply(out, function(x) {
bic <- x$crit
bic$values[getBest(bic)]
})
# find optimal lag length
whichOpt <- which.min(bicOpt)
pOpt <- p[whichOpt]
} else stop("not implemented yet")
## construct return object
out <- list(pFit=out, pOpt=pOpt, pMax=pMax)
if(isTRUE(model)) out[c("x", "y")] <- list(x=x, y=y)
class(out) <- "tslars"
} else out <- do.call(cbind, out)
out
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.