knitr::opts_chunk$set(message = FALSE, warning = FALSE, error = FALSE, fig.align = "center", dev.args = list(pointsize = 10))
options(digits = 4) # for more compact numerical outputs library("HIDDA.forecasting") library("ggplot2") source("setup.R", local = TRUE) # define test periods (OWA, TEST)
In this vignette, we use forecasting methods provided by:
library("forecast")
The corresponding software reference is:
cat("<blockquote>") print(citation(package = "forecast", auto = TRUE), style = "html") cat("</blockquote>\n")
ARIMA models assume a Gaussian response, so we need to work with transformed counts:
BoxCox.lambda(CHILI, method = "loglik")
=> Box-Cox procedure suggests a log-transformation (lambda = 0
).
arimafit <- auto.arima(CHILI, lambda = 0, stepwise = FALSE, approximation = FALSE)
The above standard approach cannot automatically account for seasonality because the
data have no regular frequency (not a standard "ts"
) ...
But we can manually add sine/cosine covariates and a christmas indicator
just like in the endemic part of the hhh4
model (see vignette("CHILI_hhh4")
).
sarima_cov <- t(sapply(2*pi*seq_along(CHILI)/52.1775, function (x) c(sin = sin(x), cos = cos(x)))) sarimax_cov <- cbind(sarima_cov, christmas = as.integer(strftime(index(CHILI), "%V") == "52")) sarimaxfit <- auto.arima(CHILI, lambda = 0, stepwise = FALSE, approximation = FALSE, xreg = sarimax_cov) summary(sarimaxfit)
CHILIdat <- fortify(CHILI) CHILIdat$sarimaxfitted <- fitted(sarimaxfit) CHILIdat <- cbind(CHILIdat, sapply(c(sarimaxlower=0.025, sarimaxupper=0.975), function (p) InvBoxCox(BoxCox(fitted(sarimaxfit), lambda = sarimaxfit$lambda) + qnorm(p, sd = sqrt(sarimaxfit$sigma2)), lambda = sarimaxfit$lambda)))
ggplot(CHILIdat, aes(x=Index, ymin=sarimaxlower, y=sarimaxfitted, ymax=sarimaxupper)) + geom_ribbon(fill="orange") + geom_line(col="darkred") + geom_point(aes(y=CHILI), pch=20) + scale_y_sqrt(expand = c(0,0), limits = c(0,NA))
We compute r length(OWA)
one-week-ahead forecasts
from r format_period(OWA)
(the OWA
period).
The model selected above is refitted at each time point, but
we do not repeat auto.arima()
model selection.
This is similar to so-called time-series cross-validation as implemented in
forecast::tsCV()
. However, tsCV()
only computes absolute errors of the point
forecasts, whereas we are interested in assessing probabilistic forecasts so
also need the forecast variance.
For each time point, forecasting with Arima
takes about 0.5 seconds, i.e.,
computing all one-week-ahead forecasts takes approx.
r sprintf("%.1f", length(OWA) * 0.5/60)
minutes ...
(we could parallelize via, e.g.,
future.apply::future_lapply()
)
## check update.Arima: we obtain the same fit if we don't change the subset stopifnot(all.equal( sarimaxfit[setdiff(names(sarimaxfit), c("call", "series"))], update(sarimaxfit)[setdiff(names(sarimaxfit), c("call", "series"))] ))
sarimaxowa <- t(simplify2array(lapply(X = OWA, FUN = function (t) { sarimaxfit_t <- update(sarimaxfit, subset = 1:t) unlist(predict(sarimaxfit_t, nahead=1, newxreg=sarimax_cov[t+1,,drop=FALSE])) }))) save(sarimaxowa, file = "sarimaxowa.RData")
load("sarimaxowa.RData")
ARIMA forecasts for the log-counts are normal with mean pred
and variance se^2
=> back-transformation via exp() is log-normal
par(mar = c(5,5,1,1), las = 1) .PIT <- plnorm(CHILI[OWA+1], meanlog = sarimaxowa[,"pred"], sdlog = sarimaxowa[,"se"]) hist(.PIT, breaks = seq(0, 1, 0.1), freq = FALSE, main = "", xlab = "PIT") abline(h = 1, lty = 2, col = "grey")
sarimaxowa_scores <- scores_lnorm( x = CHILI[OWA+1], meanlog = sarimaxowa[,"pred"], sdlog = sarimaxowa[,"se"], which = c("dss", "logs")) summary(sarimaxowa_scores)
Note that discretized forecast distributions yield almost identical scores (essentially due to the large counts):
sarimaxowa_scores_discretized <- scores_lnorm_discrete( x = CHILI[OWA+1], meanlog = sarimaxowa[,"pred"], sdlog = sarimaxowa[,"se"], which = c("dss", "logs")) summary(sarimaxowa_scores_discretized)
stopifnot( all.equal(sarimaxowa_scores_discretized[,"dss"], sarimaxowa_scores[,"dss"], tolerance = 1e-5), all.equal(sarimaxowa_scores_discretized[,"logs"], sarimaxowa_scores[,"logs"], tolerance = 1e-5) )
sarimaxowa_quantiles <- sapply(X = 1:99/100, FUN = qlnorm, meanlog = sarimaxowa[,"pred"], sdlog = sarimaxowa[,"se"]) par(mar = c(5,5,1,1)) osaplot( quantiles = sarimaxowa_quantiles, probs = 1:99/100, observed = CHILI[OWA+1], scores = sarimaxowa_scores, start = OWA[1]+1, xlab = "Week", ylim = c(0,60000), fan.args = list(ln = c(0.1,0.9), rlab = NULL) )
## try update.Arima() with the first test period TEST1 <- TEST[[1]] fit1 <- update(sarimaxfit, subset = 1:(TEST1[1]-1)) sarimaxfor1 <- predict(fit1, n.ahead = length(TEST1), newxreg = sarimax_cov[TEST1,,drop=FALSE]) ## check that quantiles from predict() agree with quantiles from forecast() q <- sapply(X = c(0.1,0.025,0.9,0.975), FUN = qlnorm, meanlog = sarimaxfor1$pred, sdlog = sarimaxfor1$se) fc <- forecast(fit1, xreg = sarimax_cov[TEST1,,drop=FALSE], fan = FALSE) stopifnot(all.equal(q, unclass(cbind(fc$lower, fc$upper)), check.attributes = FALSE)) ## check against quantiles from simulate.Arima() set.seed(3) sarimaxsims1 <- replicate(1000, simulate(fit1, xreg = sarimax_cov[TEST1,,drop=FALSE])) qsims <- t(apply(sarimaxsims1, 1, quantile, probs = c(0.1, 0.025, 0.9, 0.975))) matplot(q); matlines(qsims) # ok stopifnot(all.equal(q, unname(qsims), tolerance = 0.05))
sarimaxfor <- lapply(TEST, function (testperiod) { t0 <- testperiod[1] - 1 fit0 <- update(sarimaxfit, subset = 1:t0) fc <- predict(fit0, n.ahead = length(testperiod), newxreg = sarimax_cov[testperiod,,drop=FALSE]) list(testperiod = testperiod, observed = as.vector(CHILI[testperiod]), pred = fc$pred, se = fc$se) })
par(mar = c(5,5,1,1), mfrow = sort(n2mfrow(length(sarimaxfor))), las = 1) invisible(lapply(sarimaxfor, function (x) { PIT <- plnorm(x$observed, meanlog = x$pred, sdlog = x$se) hist(PIT, breaks = seq(0, 1, 0.1), freq = FALSE, main = format_period(x$testperiod, fmt = "%Y", collapse = "/")) abline(h = 1, lty = 2, col = "grey") }))
par(mar = c(5,5,1,1)) t(sapply(sarimaxfor, function (x) { quantiles <- sapply(X = 1:99/100, FUN = qlnorm, meanlog = x$pred, sdlog = x$se) scores <- scores_lnorm(x = x$observed, meanlog = x$pred, sdlog = x$se, which = c("dss", "logs")) osaplot(quantiles = quantiles, probs = 1:99/100, observed = x$observed, scores = scores, start = x$testperiod[1], xlab = "Week", ylim = c(0,60000), fan.args = list(ln = c(0.1,0.9), rlab = NULL)) colMeans(scores) }))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.