#' Out-of-sample Tests of Granger Causality
#'
#' Test for Granger causality using out-of-sample prediction errors from an
#' autoregression (AR) model, where some of the near-contemporaneous lags can be removed:
#' \deqn{Y_t = \sum_{i=1}^{p1}\alpha_iY_{t-i} + \sum_{i=lag.restrict+1}^{p2}\beta_iX_{t-i} + e_t,}
#' where \eqn{Y_t} is the dependent variable,
#' \eqn{X_t} is the cause variable,
#' \eqn{p1} and \eqn{p2} are the AR orders (if \code{p.free = FALSE}, \eqn{p1 = p2}),
#' \eqn{lag.restrict} is the number of restricted first lags (see the argument \code{lag.restrict}).
#'
#' The tests include the MSE-t approach \insertCite{McCracken_2007}{funtimes} and
#' MSE-correlation test as in Chapter 9.3 of \insertCite{Granger_Newbold_2016;textual}{funtimes}.
#' The bootstrap is used to empirically derive distributions of the statistics.
#'
#'
#' @details
#' In the implemented bootstrapping, residuals of the restricted model under the null hypothesis of no Granger
#' causality are bootstrapped to generate new data under the null hypothesis. Then, the full and restricted
#' models are re-estimated on the bootstrapped data to obtain new (bootstrapped) forecast errors.
#'
#' In the current implementation, the bootstrapped \eqn{p}-value is calculated using Equation 4.10 in
#' \insertCite{Davison_Hinkley_1997;textual}{funtimes}: \code{p.value} = (1 + \eqn{n}) / (\code{B} + 1),
#' where \eqn{n} is the number of bootstrapped statistics smaller or equal to the observed statistic.
#'
#' This function tests the Granger causation
#' of \eqn{X} to \eqn{Y} or from \eqn{Y} to \eqn{X}
#' (to test in both directions, need to run the function twice, with different argument \code{cause}).
#' To use the symmetric vector autoregression (VAR), use the function \code{\link{causality_predVAR}}.
#'
#' @param y matrix, data frame, or \code{ts} object with two columns
#' (a dependent and an explanatory time-series variable). Missing values are not allowed.
#' @param cause name of the cause variable. If not specified, the first variable in
#' \code{y} is treated as the dependent variable and the second is treated as the cause.
#' @param p a vector of one or two positive integers specifying the order \eqn{p} of
#' autoregressive dependence. The input of length one is recycled, then \code{p[1]} is used for
#' the dependent variable and \code{p[2]} is used for the cause variable.
#' The user must specify \code{p} or \code{lag.max}.
#' If \code{lag.max} is specified, the argument \code{p} is ignored.
#' @param p.free logical value indicating whether the autoregressive orders for the
#' dependent and cause variables should be selected independently.
#' The default \code{p.free = FALSE} means the same autoregressive order is
#' selected for both variables. Note that if \code{p.free = TRUE} and \code{lag.max} is specified,
#' then \code{lag.max[1] * (lag.max[2] - lag.restrict)} models are compared,
#' which might be slow depending on the maximal lags and sample size.
#' @param lag.restrict integer for the number of short-term lags in the cause variable
#' to remove from consideration (default is zero, meaning no lags are removed).
#' This setting does not affect the dependent variable lags that are always present.
#' @param lag.max a vector of one or two positive integers for the highest lag orders to explore.
#' The input of length one is recycled, then \code{lag.max[1]} used for
#' the dependent variable and \code{lag.max[2]} is used for the cause variable.
#' The order is then selected using the Akaike information criterion (AIC; default),
#' see the argument \code{k} to change the criterion.
#' \code{lag.max} of length 2 automatically sets \code{p.free = TRUE}.
#' @param ic logical indicating whether the lags should be selected
#' (\code{ic = TRUE}; default) or used as specified by \code{p}
#' (\code{ic = FALSE}; then the arguments \code{p.free}, \code{k}, and \code{lag.max} are ignored).
#' @param k numeric scalar specifying the weight of the equivalent degrees of freedom part
#' in the AIC formula. Default \code{k = 2} corresponds to the traditional AIC.
#' Use \code{k = log(n)} to use the Bayesian information criterion instead
#' (see \code{\link[stats]{extractAIC}}).
#' @param B number of bootstrap replications. Default is 500.
#' @param test a numeric value specifying the size of the testing set. If \code{test} < 1,
#' the value is treated as a proportion of the sample size to be used as the testing set.
#' Otherwise, \code{test} is treated as the number of the most recent values to be used as the testing set.
#' Default is 0.3, which means that 30% of the sample is used for calculating
#' out-of-sample errors. The testing set is always at the end of the time series.
#' @param cl parameter to specify computer cluster for bootstrapping passed to
#' the package \code{parallel} (default \code{cl = 1}, means no cluster is used).
#' Possible values are:
#' \itemize{
#' \item cluster object (list) produced by \link[parallel]{makeCluster}.
#' In this case, a new cluster is not started nor stopped;
#' \item \code{NULL}. In this case, the function will detect
#' available cores (see \link[parallel]{detectCores}) and, if there are
#' multiple cores (\eqn{>1}), a cluster will be started with
#' \link[parallel]{makeCluster}. If started, the cluster will be stopped
#' after the computations are finished;
#' \item positive integer defining the number of cores to start a cluster.
#' If \code{cl = 1} (default), no attempt to create a cluster will be made.
#' If \code{cl} > 1, a cluster will be started (using \link[parallel]{makeCluster})
#' and stopped afterward (using \link[parallel]{stopCluster}).
#' }
#'
#' @return A list containing the following elements:
#' \item{stat}{a table with the observed values of the test statistics and \eqn{p}-values.}
#' \item{cause}{the cause variable.}
#' \item{p}{the AR orders used for the dependent variable (\code{p[1]}) and for the cause variable (\code{p[2]}).}
#'
#' @references
#' \insertAllCited{}
#'
#' @keywords causality htest ts
#'
#' @author Vyacheslav Lyubchich
#'
#' @seealso \code{\link{causality_predVAR}}
#' @export
#' @examples
#' \dontrun{
#' # Example 1: Canada time series (ts object)
#' Canada <- vars::Canada
#' causality_pred(Canada[,1:2], cause = "e", lag.max = 5, p.free = TRUE)
#' causality_pred(Canada[,1:2], cause = "e", lag.restrict = 3, lag.max = 15, p.free = TRUE)
#'
#' # Example 2 (run in parallel, initiate the cluster automatically)
#' # Box & Jenkins time series
#' # of sales and a leading indicator, see ?BJsales
#'
#' D <- cbind(BJsales.lead, BJsales)
#' causality_pred(D, cause = "BJsales.lead", lag.max = 5, B = 1000, cl = NULL)
#'
#' # Example 3 (run in parallel, initiate the cluster manually)
#'
#' # Initiate a local cluster
#' cores <- parallel::detectCores()
#' cl <- parallel::makeCluster(cores)
#' parallel::clusterSetRNGStream(cl, 123) # to make parallel computations reproducible
#'
#' causality_pred(D, cause = "BJsales.lead", lag.max = 5, B = 1000, cl = cl)
#' causality_pred(D, cause = "BJsales.lead", lag.restrict = 3, p = 5, B = 1000, cl = cl)
#' parallel::stopCluster(cl)
#' }
#'
causality_pred <- function(y,
cause = NULL,
p = NULL,
p.free = FALSE,
lag.restrict = 0L,
lag.max = NULL,
ic = TRUE,
k = 2,
B = 500L,
test = 0.3,
cl = 1L)
{
if (!is.null(lag.max) && any(lag.max < 1L)) {
stop("lag.max must be positive integer(s).")
}
if (!is.null(p) && any(p < 1L)) {
stop("p must be positive integer(s).")
}
bootparallel <- FALSE
if (is.list(cl)) { #some other cluster supplied; use it but do not stop it
bootparallel <- TRUE
clStop <- FALSE
} else {
if (is.null(cl)) {
cores <- parallel::detectCores()
} else {
cores <- cl
}
if (cores > 1) { #specified or detected cores>1; start a cluster and later stop it
bootparallel <- TRUE
cl <- parallel::makeCluster(cores)
clStop <- TRUE
}
}
varnames <- colnames(y)
if (length(varnames) != 2) stop("y must have 2 columns")
if (is.null(cause)) {
cause <- varnames[2]
}
dep <- setdiff(varnames, cause)[1] # dependent variable name
# Define samples
n <- nrow(y) # sample size
if (test < 1) { # percentage split
n_train <- round(n*(1 - test))
} else { # use the last "test" observations as the testing set
n_train <- n - test
}
n_test <- n - n_train
# Part 1: Estimate p on the training set if using an information criterion ----
if (ic){
if (is.null(p) && is.null(lag.max)) {
stop("Please specify p or lag.max.")
}
if (!is.null(lag.max)) { # then select p
if (length(lag.max) == 2) {
p.free = TRUE
} else {
lag.max <- rep(lag.max[1], 2)
}
maxl <- max(lag.max)
if (lag.restrict >= lag.max[2]) {
warning("lag.restrict >= lag.max or lag.max[2]. Using lag.restrict = 0 instead.")
lag.restrict <- 0
}
# Embed both X and Y up to maxl to have the results of the same length
if (lag.restrict > 0) {
lagX <- embed(y[1:n_train, cause], maxl + 1)[, -c(1:(lag.restrict + 1)), drop = FALSE]
} else {
lagX <- embed(y[1:n_train, cause], maxl + 1)[, -1, drop = FALSE]
}
lagY <- embed(y[1:n_train, dep], maxl + 1)
# Information criterion for the model
if (p.free) {
best.ic <- Inf
for (p1 in 1:lag.max[1]) {
for (p2 in (lag.restrict + 1):lag.max[2]) {
fit <- stats::lm.fit(x = cbind(1,
lagY[, 2:(p1 + 1)],
lagX[, 1:(p2 - lag.restrict), drop = FALSE]),
y = lagY[, 1])
# see stats:::extractAIC.lm() but omit the scale option
nfit <- length(fit$residuals)
edf <- nfit - fit$df.residual
RSS <- sum(fit$residuals^2, na.rm = TRUE) # stats:::deviance.lm(fit)
dev <- nfit * log(RSS/nfit)
fit.ic <- dev + k * edf
if (fit.ic < best.ic) {
best.ic <- fit.ic
p <- c(p1, p2)
}
}
}
} else {
IC <- sapply((lag.restrict + 1):lag.max[2], function(s) {
fit <- stats::lm.fit(x = cbind(1,
lagY[, 2:(s + 1)],
lagX[, 1:(s - lag.restrict), drop = FALSE]),
y = lagY[, 1])
# see stats:::extractAIC.lm; but omit the scale option
nfit <- length(fit$residuals)
edf <- nfit - fit$df.residual
RSS <- sum(fit$residuals^2, na.rm = TRUE) # stats:::deviance.lm(fit)
dev <- nfit * log(RSS/nfit)
dev + k * edf
})
p <- which.min(IC) + lag.restrict
}
} # finish selection of p
}
if (length(p) == 2) {
p.free = TRUE
} else {
p <- rep(p[1], 2)
}
maxp <- max(p)
# Part 2: Estimate model and get predictions on the testing set ----
if (lag.restrict >= p[2]) {
warning("lag.restrict >= p or p[2]. Using lag.restrict = 0 instead.")
lag.restrict <- 0
}
if (lag.restrict > 0) {
lagX <- embed(y[,cause], maxp + 1)[, -c(1:(lag.restrict + 1)), drop = FALSE]
} else {
lagX <- embed(y[,cause], maxp + 1)[, -1, drop = FALSE]
}
lagY <- embed(y[,dep], maxp + 1)
n_actual <- nrow(lagY)
# Get 1-step ahead forecasts from the models (recursive)
FCST <- sapply(1:n_test - 1, function(i) { # i = 0
# estimate full and restricted models
m_yx <- stats::lm.fit(x = cbind(1,
lagY[1:(n_train - maxp + i), 2:(p[1] + 1), drop = FALSE],
lagX[1:(n_train - maxp + i), 1:(p[2] - lag.restrict), drop = FALSE]),
y = lagY[1:(n_train - maxp + i), 1, drop = FALSE])
m_y <- stats::lm.fit(x = cbind(1,
lagY[1:(n_train - maxp + i), 2:(p[1] + 1), drop = FALSE]),
y = lagY[1:(n_train - maxp + i), 1, drop = FALSE])
# get predictions
c(m_yx$coefficients %*% c(1, lagY[(n_train - maxp + i + 1), 2:(p[1] + 1)], lagX[(n_train - maxp + i + 1), 1:(p[2] - lag.restrict)]),
m_y$coefficients %*% c(1, lagY[(n_train - maxp + i + 1), 2:(p[1] + 1)]))
})
# Forecast errors
efull <- y[(n_train + 1):n, dep] - FCST[1,]
eres <- y[(n_train + 1):n, dep] - FCST[2,]
# Observed test statistics
OBS <- caustests(efull, eres) # caustests <- funtimes:::caustests
# Bootstrap restricted model estimated on the full sample (Attanasio et al. 2016)
m_y <- stats::lm.fit(x = cbind(1,
lagY[, 2:(p[1] + 1), drop = FALSE]),
y = lagY[, 1, drop = FALSE])
m_y_fit <- m_y$fitted.values
m_y_res <- m_y$residuals
if (bootparallel) {
BOOT0 <- parallel::parSapply(cl, X = 1:B, FUN = function(b) {
dy_boot <- m_y_fit + sample(m_y_res, replace = TRUE)
FCST <- sapply(1:n_test - 1, function(i) { # i = 0
# estimate full and restricted models
m_yx <- stats::lm.fit(x = cbind(1,
lagY[1:(n_train - maxp + i), 2:(p[1] + 1), drop = FALSE],
lagX[1:(n_train - maxp + i), 1:(p[2] - lag.restrict), drop = FALSE]),
y = dy_boot[1:(n_train - maxp + i)])
m_y <- stats::lm.fit(x = cbind(1,
lagY[1:(n_train - maxp + i), 2:(p[1] + 1), drop = FALSE]),
y = dy_boot[1:(n_train - maxp + i)])
# get predictions
c(m_yx$coefficients %*% c(1, lagY[(n_train - maxp + i + 1), 2:(p[1] + 1)], lagX[(n_train - maxp + i + 1), 1:(p[2] - lag.restrict)]),
m_y$coefficients %*% c(1, lagY[(n_train - maxp + i + 1), 2:(p[1] + 1)]))
})
# Forecast errors
efullb <- dy_boot[(n_train + 1):n - maxp] - FCST[1,]
eresb <- dy_boot[(n_train + 1):n - maxp] - FCST[2,]
# test statistics
caustests(efullb, eresb)
})
if (clStop) {
parallel::stopCluster(cl)
}
} else {
BOOT0 <- sapply(1:B, FUN = function(b) {
dy_boot <- m_y_fit + sample(m_y_res, replace = TRUE)
FCST <- sapply(1:n_test - 1, function(i) { # i = 0
# estimate full and restricted models
m_yx <- stats::lm.fit(x = cbind(1,
lagY[1:(n_train - maxp + i), 2:(p[1] + 1), drop = FALSE],
lagX[1:(n_train - maxp + i), 1:(p[2] - lag.restrict), drop = FALSE]),
y = dy_boot[1:(n_train - maxp + i)])
m_y <- stats::lm.fit(x = cbind(1,
lagY[1:(n_train - maxp + i), 2:(p[1] + 1), drop = FALSE]),
y = dy_boot[1:(n_train - maxp + i)])
# get predictions
c(m_yx$coefficients %*% c(1, lagY[(n_train - maxp + i + 1), 2:(p[1] + 1)], lagX[(n_train - maxp + i + 1), 1:(p[2] - lag.restrict)]),
m_y$coefficients %*% c(1, lagY[(n_train - maxp + i + 1), 2:(p[1] + 1)]))
})
# Forecast errors
efullb <- dy_boot[(n_train + 1):n - maxp] - FCST[1,]
eresb <- dy_boot[(n_train + 1):n - maxp] - FCST[2,]
# test statistics
caustests(efullb, eresb)
})
} # end sequential bootstrap
list(stat = data.frame(MSEt = c(OBS["MSEt"],
(sum(BOOT0["MSEt",] <= OBS["MSEt"]) + 1) / (B + 1)),
MSEcor = c(OBS["MSEcor"],
(sum(BOOT0["MSEcor",] <= OBS["MSEcor"]) + 1) / (B + 1)),
OOSF = c(OBS["OOSF"],
(sum(BOOT0["OOSF",] >= OBS["OOSF"]) + 1) / (B + 1)),
EN = c(OBS["EN"],
(sum(BOOT0["EN",] >= OBS["EN"]) + 1) / (B + 1)),
row.names = c("stat_obs", "p_boot")),
cause = cause,
p = p)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.