In HIDDA/forecasting: Forecasting Based on Surveillance Data

knitr::opts_chunk$set(message = FALSE, warning = FALSE, error = FALSE,
                      fig.align = "center", dev.args = list(pointsize = 10))

options(digits = 4)  # for more compact numerical outputs
library("HIDDA.forecasting")
library("ggplot2")
source("setup.R", local = TRUE)  # define test periods (OWA, TEST)

In this vignette, we use forecasting methods provided by:

library("glarma")

The corresponding software reference is:

cat("<blockquote>")
print(citation(package = "glarma", auto = TRUE), style = "html")
cat("</blockquote>\n")

Modelling

Construct the design matrix, including yearly seasonality and a Christmas effect as for the other models (see, e.g., vignette("CHILI_hhh4")):

y <- as.vector(CHILI)
X <- t(sapply(2*pi*seq_along(CHILI)/52.1775,
              function (x) c(sin = sin(x), cos = cos(x))))
X <- cbind(intercept = 1,
           X,
           christmas = as.integer(strftime(index(CHILI), "%V") == "52"))

Fitting a NegBin-GLM:

glmnbfit <- MASS::glm.nb(y ~ 0 + X)
summary(glmnbfit)
acf(residuals(glmnbfit))

Fitting a NegBin-GLARMA with orders $p = 4$ and $q = 0$:

glarmafit <- glarma(y = y, X = X, type = "NegBin", phiLags = 1:4)
## philags = 1:4 corresponds to ARMA(4,4) with theta_j = phi_j
summary(glarmafit)
par(mfrow = c(3,2))
set.seed(321)  # for strict reproducibility (randomized residuals)
plot(glarmafit)

## Investigating the AIC of various p and q combinations:
pqgrid <- expand.grid(p = 0:6, q = 0:6)
nrow(pqgrid)  # 49 models
pqgrid$AIC <- apply(pqgrid, 1, function (pq) {
    fit <- try(
        glarma(y = y, X = X, type = "NegBin",
               phiLags = seq_len(pq[1]), thetaLags = seq_len(pq[2])),
        silent = TRUE
    )
    if (inherits(fit, "try-error")) NA_real_ else extractAIC(fit)
})
table(is.na(pqgrid$AIC))  # 36 not converged
table(is.na(subset(pqgrid, p > 0 & q > 0)$AIC))  # p>0 & q>0 => non-convergence
head(pqgrid[order(pqgrid$AIC),], 10)  # p = 4, q = 0 wins

Note: Alternative GLARMA models using both phiLags and thetaLags did not converge. In an AIC comparison among the remaining models, the above model with $p=4$ (and $q=0$) had lowest AIC.

CHILIdat <- fortify(CHILI)
CHILIdat$glarmafitted <- fitted(glarmafit)
CHILIdat <- cbind(CHILIdat,
    sapply(c(glarmalower=0.025, glarmaupper=0.975), function (p)
        qnbinom(p, mu = glarmafit$mu, size = coef(glarmafit, type = "NB"))))

ggplot(CHILIdat, aes(x=Index, ymin=glarmalower, y=glarmafitted, ymax=glarmaupper)) +
    geom_ribbon(fill="orange") + geom_line(col="darkred") +
    geom_point(aes(y=CHILI), pch=20) +
    scale_y_sqrt(expand = c(0,0), limits = c(0,NA))

One-week-ahead forecasts

We compute r length(OWA) one-week-ahead forecasts from r format_period(OWA) (the OWA period). The model is refitted at each time point.

For each time point, refitting and forecasting with glarma takes about 1.5 seconds, i.e., computing all one-week-ahead forecasts takes approx. r sprintf("%.1f", length(OWA) * 1.5/60) minutes ... but we can parallelize.

glarmaowa <- t(simplify2array(surveillance::plapply(X = OWA, FUN = function (t) {
    glarmafit_t <- glarma(y = y[1:t], X = X[1:t,,drop=FALSE], type = "NegBin", phiLags = 1:4)
    c(mu = forecast(glarmafit_t, n.ahead = 1, newdata = X[t+1,,drop=FALSE])$mu,
      coef(glarmafit_t, type = "NB"))
}, .parallel = 3)))
save(glarmaowa, file = "glarmaowa.RData")

load("glarmaowa.RData")

par(mar = c(5,5,1,1), las = 1)
surveillance::pit(
    x = CHILI[OWA+1], pdistr = pnbinom,
    mu = glarmaowa[,"mu"], size = glarmaowa[,"alpha"],
    plot = list(ylab = "Density")
)

glarmaowa_scores <- surveillance::scores(
    x = CHILI[OWA+1], mu = glarmaowa[,"mu"],
    size = glarmaowa[,"alpha"], which = c("dss", "logs"))
summary(glarmaowa_scores)

glarmaowa_quantiles <- sapply(X = 1:99/100, FUN = qnbinom,
                              mu = glarmaowa[,"mu"],
                              size = glarmaowa[,"alpha"])
par(mar = c(5,5,1,1))
osaplot(
    quantiles = glarmaowa_quantiles, probs = 1:99/100,
    observed = CHILI[OWA+1], scores = glarmaowa_scores,
    start = OWA[1]+1, xlab = "Week", ylim = c(0,60000),
    fan.args = list(ln = c(0.1,0.9), rlab = NULL)
)

Long-term forecasts

glarmasims <- surveillance::plapply(TEST, function (testperiod) {
    t0 <- testperiod[1] - 1
    fit0 <- glarma(y = y[1:t0], X = X[1:t0,,drop=FALSE], type = "NegBin", phiLags = 1:4)
    set.seed(t0)
    sims <- replicate(n = 1000, {
        fc <- forecast(fit0, n.ahead = length(testperiod),
                       newdata = X[testperiod,,drop=FALSE],
                       newoffset = rep(0,length(testperiod)))
        do.call("cbind", fc[c("mu", "Y")])
    }, simplify = "array")
    list(testperiod = testperiod,
         observed = as.vector(CHILI[testperiod]),
         fit0 = fit0, means = sims[,"mu",], sims = sims[,"Y",])
}, .parallel = 2)

PIT histograms, based on the pointwise ECDF of the simulated epidemic curves:

par(mar = c(5,5,1,1), mfrow = sort(n2mfrow(length(glarmasims))), las = 1)
invisible(lapply(glarmasims, function (x) {
    surveillance::pit(x = x$observed, pdistr = apply(x$sims, 1, ecdf),
        plot = list(main = format_period(x$testperiod, fmt = "%Y", collapse = "/"),
                    ylab = "Density"))
}))

Just like for the simulations from hhh4() in vignette("CHILI_hhh4"), we can compute the log-score either using generic kernel density estimation as implemented in the scoringRules package, or via mixtures of negative binomial one-step-ahead distributions. A comparison for the first simulation period:

## using kernel density estimation
summary(with(glarmasims[[1]], scoringRules::logs_sample(observed, sims)))

## using `dnbmix()`
summary(with(glarmasims[[1]], logs_nbmix(observed, means, coef(fit0, type="NB"))))

par(mar = c(5,5,1,1))
t(sapply(glarmasims, function (x) {
    quantiles <- t(apply(x$sims, 1, quantile, probs = 1:99/100))
    scores <- cbind(
        scores_sample(x$observed, x$sims),
        logs2 = logs_nbmix(x$observed, x$means, coef(x$fit0, type="NB"))
    )
    osaplot(quantiles = quantiles, probs = 1:99/100,
            observed = x$observed, scores = scores,
            start = x$testperiod[1], xlab = "Week", ylim = c(0,60000),
            fan.args = list(ln = c(0.1,0.9), rlab = NULL))
    colMeans(scores)
}))