R/tscv.R

Defines functions print.CVar CVar tsCV

Documented in CVar print.CVar tsCV

# Time series cross-validation
# y is a time series
# forecastfunction must return an object of class forecast
# h is number of steps ahead to forecast
# ... are passed to forecastfunction

#' Time series cross-validation
#'
#' `tsCV` computes the forecast errors obtained by applying
#' `forecastfunction` to subsets of the time series `y` using a
#' rolling forecast origin.
#'
#' Let `y` contain the time series \eqn{y_1,\dots,y_T}{y[1:T]}. Then
#' `forecastfunction` is applied successively to the time series
#' \eqn{y_1,\dots,y_t}{y[1:t]}, for \eqn{t=1,\dots,T-h}, making predictions
#' \eqn{\hat{y}_{t+h|t}}{f[t+h]}. The errors are given by \eqn{e_{t+h} =
#' y_{t+h}-\hat{y}_{t+h|t}}{e[t+h] = y[t+h]-f[t+h]}. If h=1, these are returned as a
#' vector, \eqn{e_1,\dots,e_T}{e[1:T]}. For h>1, they are returned as a matrix with
#' the hth column containing errors for forecast horizon h.
#'  The first few errors may be missing as
#' it may not be possible to apply `forecastfunction` to very short time
#' series.
#'
#' @inheritParams Arima
#' @inheritParams forecast.ts
#' @param forecastfunction Function to return an object of class
#' `forecast`. Its first argument must be a univariate time series, and it
#' must have an argument `h` for the forecast horizon. If exogenous predictors are used,
#' then it must also have `xreg` and `newxreg` arguments corresponding to the
#' training and test periods.
#' @param window Length of the rolling window, if NULL, a rolling window will not be used.
#' @param xreg Exogenous predictor variables passed to the forecast function if required.
#' @param initial Initial period of the time series where no cross-validation is performed.
#' @param ... Other arguments are passed to `forecastfunction`.
#' @return Numerical time series object containing the forecast errors as a vector (if h=1)
#' and a matrix otherwise. The time index corresponds to the last period of the training
#' data. The columns correspond to the forecast horizons.
#' @author Rob J Hyndman
#' @seealso [CV()], [CVar()], [residuals.Arima()], \url{https://robjhyndman.com/hyndsight/tscv/}.
#'
#' @keywords ts
#' @examples
#'
#' #Fit an AR(2) model to each rolling origin subset
#' far2 <- function(x, h) forecast(Arima(x, order = c(2, 0, 0)), h = h)
#' e <- tsCV(lynx, far2, h = 1)
#'
#' #Fit the same model with a rolling window of length 30
#' e <- tsCV(lynx, far2, h = 1, window = 30)
#'
#' #Example with exogenous predictors
#' far2_xreg <- function(x, h, xreg, newxreg) {
#'   forecast(Arima(x, order = c(2, 0, 0), xreg = xreg), xreg = newxreg)
#' }
#'
#' y <- ts(rnorm(50))
#' xreg <- matrix(rnorm(100), ncol = 2)
#' e <- tsCV(y, far2_xreg, h = 3, xreg = xreg)
#'
#' @export
tsCV <- function(
  y,
  forecastfunction,
  h = 1,
  window = NULL,
  xreg = NULL,
  initial = 0,
  ...
) {
  y <- as.ts(y)
  n <- length(y)
  e <- ts(matrix(NA_real_, nrow = n, ncol = h))
  if (initial >= n) {
    stop("initial period too long")
  }
  tsp(e) <- tsp(y)
  if (!is.null(xreg)) {
    # Make xreg a ts object to allow easy subsetting later
    xreg <- ts(as.matrix(xreg))
    if (NROW(xreg) != length(y)) {
      stop("xreg must be of the same size as y")
    }
    # Pad xreg with NAs
    xreg <- ts(
      rbind(xreg, matrix(NA, nrow = h, ncol = NCOL(xreg))),
      start = start(y),
      frequency = frequency(y)
    )
  }
  if (is.null(window)) {
    indx <- seq(1 + initial, n - 1L)
  } else {
    indx <- seq(window + initial, n - 1L, by = 1L)
  }
  for (i in indx) {
    if (is.null(window)) {
      start <- 1L
    } else if (i - window >= 0L) {
      start <- i - window + 1L
    } else {
      stop("small window")
    }
    y_subset <- subset(y, start = start, end = i)
    if (is.null(xreg)) {
      fc <- try(
        suppressWarnings(forecastfunction(y_subset, h = h, ...)),
        silent = TRUE
      )
    } else {
      if (is.null(window)) {
        start <- 1L
      } else if (i - window >= 0L) {
        start <- i - window + 1L
      } else {
        stop("small window")
      }
      xreg_subset <- subset(xreg, start = start, end = i)
      xreg_future <- subset(xreg, start = i + 1, end = i + h)
      fc <- try(
        suppressWarnings(
          forecastfunction(
            y_subset,
            h = h,
            xreg = xreg_subset,
            newxreg = xreg_future,
            ...
          )
        ),
        silent = TRUE
      )
    }
    if (!inherits(fc, "try-error")) {
      e[i, ] <- y[i + seq(h)] - fc$mean[seq(h)]
    }
  }
  if (h == 1) {
    return(e[, 1L])
  } else {
    colnames(e) <- paste0("h=", 1:h)
    return(e)
  }
}

# Cross-validation for AR models
# By Gabriel Caceres
## Note arguments to pass must be named

#' k-fold Cross-Validation applied to an autoregressive model
#'
#' `CVar` computes the errors obtained by applying an autoregressive
#' modelling function to subsets of the time series `y` using k-fold
#' cross-validation as described in Bergmeir, Hyndman and Koo (2015). It also
#' applies a Ljung-Box test to the residuals. If this test is significant
#' (see returned pvalue), there is serial correlation in the residuals and the
#' model can be considered to be underfitting the data. In this case, the
#' cross-validated errors can underestimate the generalization error and should
#' not be used.
#'
#' @aliases print.CVar
#'
#' @param y Univariate time series
#' @param k Number of folds to use for cross-validation.
#' @param FUN Function to fit an autoregressive model. Currently, it only works
#' with the [nnetar()] function.
#' @param cvtrace Provide progress information.
#' @param blocked choose folds randomly or as blocks?
#' @param LBlags lags for the Ljung-Box test, defaults to 24, for yearly series can be set to 20
#' @param ... Other arguments are passed to `FUN`.
#' @return A list containing information about the model and accuracy for each
#' fold, plus other summary information computed across folds.
#' @author Gabriel Caceres and Rob J Hyndman
#' @seealso [CV()], [tsCV()].
#' @references Bergmeir, C., Hyndman, R.J., Koo, B. (2018) A note on the
#' validity of cross-validation for evaluating time series prediction.
#' \emph{Computational Statistics & Data Analysis}, \bold{120}, 70-83.
#' \url{https://robjhyndman.com/publications/cv-time-series/}.
#' @keywords ts
#' @examples
#'
#' modelcv <- CVar(lynx, k = 5, lambda = 0.15)
#' print(modelcv)
#' print(modelcv$fold1)
#'
#' library(ggplot2)
#' autoplot(lynx, series = "Data") +
#'   autolayer(modelcv$testfit, series = "Fits") +
#'   autolayer(modelcv$residuals, series = "Residuals")
#' ggAcf(modelcv$residuals)
#'
#' @export
CVar <- function(
  y,
  k = 10,
  FUN = nnetar,
  cvtrace = FALSE,
  blocked = FALSE,
  LBlags = 24,
  ...
) {
  nx <- length(y)
  # n-folds at most equal number of points
  k <- min(as.integer(k), nx)
  if (k <= 1L) {
    stop("k must be at least 2")
  }
  # Set up folds
  ind <- seq_len(nx)
  fold <- if (blocked) {
    sort(rep_len(1:k, nx))
  } else {
    sample(rep_len(1:k, nx))
  }

  cvacc <- matrix(NA_real_, nrow = k, ncol = 7)
  out <- vector("list", k)
  alltestfit <- rep_len(NA, nx)
  for (i in seq_len(k)) {
    out[[paste0("fold", i)]] <- list()
    testset <- ind[fold == i]
    trainset <- ind[fold != i]
    trainmodel <- FUN(y, subset = trainset, ...)
    testmodel <- FUN(y, model = trainmodel, xreg = trainmodel$xreg)
    testfit <- fitted(testmodel)
    acc <- accuracy(y, testfit, test = testset)
    cvacc[i, ] <- acc
    out[[paste0("fold", i)]]$model <- trainmodel
    out[[paste0("fold", i)]]$accuracy <- acc

    out[[paste0("fold", i)]]$testfit <- testfit
    out[[paste0("fold", i)]]$testset <- testset

    alltestfit[testset] <- testfit[testset]

    if (isTRUE(cvtrace)) {
      cat("Fold", i, "\n")
      print(acc)
      cat("\n")
    }
  }

  out$testfit <- ts(alltestfit)
  tsp(out$testfit) <- tsp(y)

  out$residuals <- out$testfit - y
  out$LBpvalue <- Box.test(out$residuals, type = "Ljung", lag = LBlags)$p.value

  out$k <- k
  # calculate mean accuracy across all folds
  CVmean <- matrix(
    colMeans(cvacc, na.rm = TRUE),
    dimnames = list(colnames(acc), "Mean")
  )
  # calculate accuracy sd across all folds --- include?
  CVsd <- matrix(
    apply(cvacc, 2, FUN = sd, na.rm = TRUE),
    dimnames = list(colnames(acc), "SD")
  )
  out$CVsummary <- cbind(CVmean, CVsd)
  out$series <- deparse1(substitute(y))
  out$call <- match.call()
  structure(out, class = c("CVar", class(trainmodel)))
}

#' @export
print.CVar <- function(x, ...) {
  cat("Series:", x$series, "\n")
  cat("Call:   ")
  print(x$call)
  # Add info about series, function, and parameters
  # Add note about any NA/NaN in folds?
  #
  # Print number of folds
  cat("\n", x$k, "-fold cross-validation\n", sep = "")
  # Print mean & sd accuracy() results
  print(x$CVsummary)

  cat("\n")
  cat("p-value of Ljung-Box test of residuals is ", x$LBpvalue, "\n")
  cat("if this value is significant (<0.05),\n")
  cat("the result of the cross-validation should not be used\n")
  cat("as the model is underfitting the data.\n")
  invisible(x)
}

Try the forecast package in your browser

Any scripts or data that you put into this service are public.

forecast documentation built on March 18, 2026, 9:07 a.m.