Nothing
# Time series cross-validation
# y is a time series
# forecastfunction must return an object of class forecast
# h is number of steps ahead to forecast
# ... are passed to forecastfunction
#' Time series cross-validation
#'
#' `tsCV` computes the forecast errors obtained by applying
#' `forecastfunction` to subsets of the time series `y` using a
#' rolling forecast origin.
#'
#' Let `y` contain the time series \eqn{y_1,\dots,y_T}{y[1:T]}. Then
#' `forecastfunction` is applied successively to the time series
#' \eqn{y_1,\dots,y_t}{y[1:t]}, for \eqn{t=1,\dots,T-h}, making predictions
#' \eqn{\hat{y}_{t+h|t}}{f[t+h]}. The errors are given by \eqn{e_{t+h} =
#' y_{t+h}-\hat{y}_{t+h|t}}{e[t+h] = y[t+h]-f[t+h]}. If h=1, these are returned as a
#' vector, \eqn{e_1,\dots,e_T}{e[1:T]}. For h>1, they are returned as a matrix with
#' the hth column containing errors for forecast horizon h.
#' The first few errors may be missing as
#' it may not be possible to apply `forecastfunction` to very short time
#' series.
#'
#' @inheritParams Arima
#' @inheritParams forecast.ts
#' @param forecastfunction Function to return an object of class
#' `forecast`. Its first argument must be a univariate time series, and it
#' must have an argument `h` for the forecast horizon. If exogenous predictors are used,
#' then it must also have `xreg` and `newxreg` arguments corresponding to the
#' training and test periods.
#' @param window Length of the rolling window, if NULL, a rolling window will not be used.
#' @param xreg Exogenous predictor variables passed to the forecast function if required.
#' @param initial Initial period of the time series where no cross-validation is performed.
#' @param ... Other arguments are passed to `forecastfunction`.
#' @return Numerical time series object containing the forecast errors as a vector (if h=1)
#' and a matrix otherwise. The time index corresponds to the last period of the training
#' data. The columns correspond to the forecast horizons.
#' @author Rob J Hyndman
#' @seealso [CV()], [CVar()], [residuals.Arima()], \url{https://robjhyndman.com/hyndsight/tscv/}.
#'
#' @keywords ts
#' @examples
#'
#' #Fit an AR(2) model to each rolling origin subset
#' far2 <- function(x, h) forecast(Arima(x, order = c(2, 0, 0)), h = h)
#' e <- tsCV(lynx, far2, h = 1)
#'
#' #Fit the same model with a rolling window of length 30
#' e <- tsCV(lynx, far2, h = 1, window = 30)
#'
#' #Example with exogenous predictors
#' far2_xreg <- function(x, h, xreg, newxreg) {
#' forecast(Arima(x, order = c(2, 0, 0), xreg = xreg), xreg = newxreg)
#' }
#'
#' y <- ts(rnorm(50))
#' xreg <- matrix(rnorm(100), ncol = 2)
#' e <- tsCV(y, far2_xreg, h = 3, xreg = xreg)
#'
#' @export
tsCV <- function(
y,
forecastfunction,
h = 1,
window = NULL,
xreg = NULL,
initial = 0,
...
) {
y <- as.ts(y)
n <- length(y)
e <- ts(matrix(NA_real_, nrow = n, ncol = h))
if (initial >= n) {
stop("initial period too long")
}
tsp(e) <- tsp(y)
if (!is.null(xreg)) {
# Make xreg a ts object to allow easy subsetting later
xreg <- ts(as.matrix(xreg))
if (NROW(xreg) != length(y)) {
stop("xreg must be of the same size as y")
}
# Pad xreg with NAs
xreg <- ts(
rbind(xreg, matrix(NA, nrow = h, ncol = NCOL(xreg))),
start = start(y),
frequency = frequency(y)
)
}
if (is.null(window)) {
indx <- seq(1 + initial, n - 1L)
} else {
indx <- seq(window + initial, n - 1L, by = 1L)
}
for (i in indx) {
if (is.null(window)) {
start <- 1L
} else if (i - window >= 0L) {
start <- i - window + 1L
} else {
stop("small window")
}
y_subset <- subset(y, start = start, end = i)
if (is.null(xreg)) {
fc <- try(
suppressWarnings(forecastfunction(y_subset, h = h, ...)),
silent = TRUE
)
} else {
if (is.null(window)) {
start <- 1L
} else if (i - window >= 0L) {
start <- i - window + 1L
} else {
stop("small window")
}
xreg_subset <- subset(xreg, start = start, end = i)
xreg_future <- subset(xreg, start = i + 1, end = i + h)
fc <- try(
suppressWarnings(
forecastfunction(
y_subset,
h = h,
xreg = xreg_subset,
newxreg = xreg_future,
...
)
),
silent = TRUE
)
}
if (!inherits(fc, "try-error")) {
e[i, ] <- y[i + seq(h)] - fc$mean[seq(h)]
}
}
if (h == 1) {
return(e[, 1L])
} else {
colnames(e) <- paste0("h=", 1:h)
return(e)
}
}
# Cross-validation for AR models
# By Gabriel Caceres
## Note arguments to pass must be named
#' k-fold Cross-Validation applied to an autoregressive model
#'
#' `CVar` computes the errors obtained by applying an autoregressive
#' modelling function to subsets of the time series `y` using k-fold
#' cross-validation as described in Bergmeir, Hyndman and Koo (2015). It also
#' applies a Ljung-Box test to the residuals. If this test is significant
#' (see returned pvalue), there is serial correlation in the residuals and the
#' model can be considered to be underfitting the data. In this case, the
#' cross-validated errors can underestimate the generalization error and should
#' not be used.
#'
#' @aliases print.CVar
#'
#' @param y Univariate time series
#' @param k Number of folds to use for cross-validation.
#' @param FUN Function to fit an autoregressive model. Currently, it only works
#' with the [nnetar()] function.
#' @param cvtrace Provide progress information.
#' @param blocked choose folds randomly or as blocks?
#' @param LBlags lags for the Ljung-Box test, defaults to 24, for yearly series can be set to 20
#' @param ... Other arguments are passed to `FUN`.
#' @return A list containing information about the model and accuracy for each
#' fold, plus other summary information computed across folds.
#' @author Gabriel Caceres and Rob J Hyndman
#' @seealso [CV()], [tsCV()].
#' @references Bergmeir, C., Hyndman, R.J., Koo, B. (2018) A note on the
#' validity of cross-validation for evaluating time series prediction.
#' \emph{Computational Statistics & Data Analysis}, \bold{120}, 70-83.
#' \url{https://robjhyndman.com/publications/cv-time-series/}.
#' @keywords ts
#' @examples
#'
#' modelcv <- CVar(lynx, k = 5, lambda = 0.15)
#' print(modelcv)
#' print(modelcv$fold1)
#'
#' library(ggplot2)
#' autoplot(lynx, series = "Data") +
#' autolayer(modelcv$testfit, series = "Fits") +
#' autolayer(modelcv$residuals, series = "Residuals")
#' ggAcf(modelcv$residuals)
#'
#' @export
CVar <- function(
y,
k = 10,
FUN = nnetar,
cvtrace = FALSE,
blocked = FALSE,
LBlags = 24,
...
) {
nx <- length(y)
# n-folds at most equal number of points
k <- min(as.integer(k), nx)
if (k <= 1L) {
stop("k must be at least 2")
}
# Set up folds
ind <- seq_len(nx)
fold <- if (blocked) {
sort(rep_len(1:k, nx))
} else {
sample(rep_len(1:k, nx))
}
cvacc <- matrix(NA_real_, nrow = k, ncol = 7)
out <- vector("list", k)
alltestfit <- rep_len(NA, nx)
for (i in seq_len(k)) {
out[[paste0("fold", i)]] <- list()
testset <- ind[fold == i]
trainset <- ind[fold != i]
trainmodel <- FUN(y, subset = trainset, ...)
testmodel <- FUN(y, model = trainmodel, xreg = trainmodel$xreg)
testfit <- fitted(testmodel)
acc <- accuracy(y, testfit, test = testset)
cvacc[i, ] <- acc
out[[paste0("fold", i)]]$model <- trainmodel
out[[paste0("fold", i)]]$accuracy <- acc
out[[paste0("fold", i)]]$testfit <- testfit
out[[paste0("fold", i)]]$testset <- testset
alltestfit[testset] <- testfit[testset]
if (isTRUE(cvtrace)) {
cat("Fold", i, "\n")
print(acc)
cat("\n")
}
}
out$testfit <- ts(alltestfit)
tsp(out$testfit) <- tsp(y)
out$residuals <- out$testfit - y
out$LBpvalue <- Box.test(out$residuals, type = "Ljung", lag = LBlags)$p.value
out$k <- k
# calculate mean accuracy across all folds
CVmean <- matrix(
colMeans(cvacc, na.rm = TRUE),
dimnames = list(colnames(acc), "Mean")
)
# calculate accuracy sd across all folds --- include?
CVsd <- matrix(
apply(cvacc, 2, FUN = sd, na.rm = TRUE),
dimnames = list(colnames(acc), "SD")
)
out$CVsummary <- cbind(CVmean, CVsd)
out$series <- deparse1(substitute(y))
out$call <- match.call()
structure(out, class = c("CVar", class(trainmodel)))
}
#' @export
print.CVar <- function(x, ...) {
cat("Series:", x$series, "\n")
cat("Call: ")
print(x$call)
# Add info about series, function, and parameters
# Add note about any NA/NaN in folds?
#
# Print number of folds
cat("\n", x$k, "-fold cross-validation\n", sep = "")
# Print mean & sd accuracy() results
print(x$CVsummary)
cat("\n")
cat("p-value of Ljung-Box test of residuals is ", x$LBpvalue, "\n")
cat("if this value is significant (<0.05),\n")
cat("the result of the cross-validation should not be used\n")
cat("as the model is underfitting the data.\n")
invisible(x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.