In HIDDA/forecasting: Forecasting Based on Surveillance Data

knitr::opts_chunk$set(message = FALSE, warning = FALSE, error = FALSE,
                      fig.align = "center", dev.args = list(pointsize = 10))

options(digits = 4)  # for more compact numerical outputs
library("HIDDA.forecasting")
library("ggplot2")
source("setup.R", local = TRUE)  # define test periods (OWA, TEST)

In this vignette, we use forecasting methods provided by:

library("surveillance")

The corresponding software reference is:

cat("<blockquote>")
print(citation(package = "surveillance", auto = TRUE), style = "html")
cat("</blockquote>\n")

Modelling

CHILI.sts <- sts(observed = CHILI,
                 epoch = as.integer(index(CHILI)), epochAsDate = TRUE)

(weeksInYear <- table(year(CHILI.sts)))
mean(weeksInYear)
## long-term average is 52.1775 weeks per year
f1 <- addSeason2formula(~ 1, period = 52.1775, timevar = "index")
## equivalent: f1 <- addSeason2formula(~ 1, period = 365.2425, timevar = "t")
hhh4fit <- hhh4(stsObj = CHILI.sts,
     control = list(
         ar = list(f = update(f1, ~. + christmas)),
         end = list(f = f1),
         family = "NegBin1",
         data = list(index = 1:nrow(CHILI.sts),
                     christmas = as.integer(epochInYear(CHILI.sts) %in% 52))
     ))
summary(hhh4fit, maxEV = TRUE)

Alternatively, use a yearly varying frequency of 52 or 53 weeks for the sinusoidal time effects:

f1_varfreq <- ~ 1 + sin(2*pi*epochInYear/weeksInYear) + cos(2*pi*epochInYear/weeksInYear)
hhh4fit_varfreq <- update(hhh4fit,
    ar = list(f = update(f1_varfreq, ~. + christmas)),
    end = list(f = f1_varfreq),
    data = list(epochInYear = epochInYear(CHILI.sts),
                weeksInYear = rep(weeksInYear, weeksInYear)))
AIC(hhh4fit, hhh4fit_varfreq)

plot(hhh4fit, hhh4fit_varfreq, type = "maxEV",
     matplot.args = list(ylab = expression(lambda[t]), xlim =c(2002,2007)))

plot(hhh4fit, pch = 20, names = "", ylab = "")

qplot(x = 1:53, y = drop(predict(hhh4fit, newSubset = 1:53, type = "ar.exppred")),
      geom = "line", ylim = c(0,1.3), xlab = "week", ylab = expression(lambda[t])) +
    geom_hline(yintercept = 1, lty = 2)

CHILIdat <- fortify(CHILI)
## add fitted mean and CI to CHILIdat
CHILIdat$hhh4upper <- CHILIdat$hhh4lower <- CHILIdat$hhh4fitted <- NA_real_
CHILIdat[hhh4fit$control$subset,"hhh4fitted"] <- fitted(hhh4fit)
CHILIdat[hhh4fit$control$subset,c("hhh4lower","hhh4upper")] <-
    sapply(c(0.025, 0.975), function (p)
        qnbinom(p, mu = fitted(hhh4fit),
                size = exp(hhh4fit$coefficients[["-log(overdisp)"]])))

ggplot(CHILIdat, aes(x=Index, ymin=hhh4lower, y=hhh4fitted, ymax=hhh4upper)) +
    geom_ribbon(fill="orange") + geom_line(col="darkred") +
    geom_point(aes(y=CHILI), pch=20) +
    scale_y_sqrt(expand = c(0,0), limits = c(0,NA))

One-week-ahead forecasts

We compute r length(OWA) one-week-ahead forecasts from r format_period(OWA) (the OWA period), which takes roughly 4 seconds (we could parallelize using the cores argument of oneStepAhead()).

hhh4owa <- oneStepAhead(hhh4fit, range(OWA), type = "rolling", verbose = FALSE)
save(hhh4owa, file = "hhh4owa.RData")

load("hhh4owa.RData")

par(mar = c(5,5,1,1), las = 1)
pit(hhh4owa, plot = list(ylab = "Density"))

calibrationTest(hhh4owa, which = "dss")
## calibrationTest(hhh4owa, which = "logs")  # skipped for CRAN

hhh4owa_scores <- scores(hhh4owa, which = c("dss", "logs"), reverse = FALSE)
summary(hhh4owa_scores)

par(mar = c(5,5,1,1))
hhh4owa_quantiles <- quantile(hhh4owa, probs = 1:99/100)
osaplot(
    quantiles = hhh4owa_quantiles, probs = 1:99/100,
    observed = hhh4owa$observed, scores = hhh4owa_scores,
    start = OWA[1]+1, xlab = "Week", ylim = c(0,60000),
    fan.args = list(ln = c(0.1,0.9), rlab = NULL)
)

Long-term forecasts

Example with the first test period

TEST1 <- TEST[[1]]
fit1 <- update(hhh4fit, subset.upper = TEST1[1]-1)
hhh4sim1 <- simulate(fit1, nsim = 1000, seed = 726, subset = TEST1,
                     y.start = observed(CHILI.sts)[TEST1[1]-1,])

We can use the plot method provided by surveillance:

par(mfrow=c(1,2))
plot(hhh4sim1, "fan", ylim = c(0,60000), xlab = "", ylab = "",
     xaxis = list(xaxis.tickFreq = list("%m"=atChange, "%Y"=atChange),
                  xaxis.labelFreq = list("%Y"=atMedian),
                  xaxis.labelFormat = "%Y"))
plot(hhh4sim1, "size", horizontal = FALSE, main = "size of the epidemic",
     ylab = "", observed = list(labels = NULL))

There is also an associated scores method:

summary(scores(hhh4sim1, which = c("dss", "logs")))

Using relative frequencies to estimate the forecast distribution from these simulations is problematic. However, we can use kernel density estimation as implemented in package scoringRules, which the function scores_sample() wraps:

summary(scores_sample(x = observed(CHILI.sts)[TEST1], sims = drop(hhh4sim1)))

An even better approximation of the log-score at each time point can be obtained by using a mixture of the one-step-ahead negative binomial distributions given the samples from the previous time point. These forecast distributions are available through the function dhhh4sims(), which is used in logs_hhh4sims():

summary(logs_hhh4sims(sims = hhh4sim1, model = fit1))

For all test periods

hhh4sims <- lapply(TEST, function (testperiod) {
    t0 <- testperiod[1] - 1
    fit0 <- update(hhh4fit, subset.upper = t0)
    sims <- simulate(fit0, nsim = 1000, seed = t0, subset = testperiod,
                     y.start = observed(CHILI.sts)[t0,])
    list(testperiod = testperiod,
         observed = observed(CHILI.sts)[testperiod],
         fit0 = fit0, sims = sims)
})

PIT histograms, based on the pointwise ECDF of the simulated epidemic curves:

par(mar = c(5,5,1,1), mfrow = sort(n2mfrow(length(hhh4sims))), las = 1)
invisible(lapply(hhh4sims, function (x) {
    pit(x = x$observed, pdistr = apply(x$sims, 1, ecdf),
        plot = list(main = format_period(x$testperiod, fmt = "%Y", collapse = "/"),
                    ylab = "Density"))
}))

par(mar = c(5,5,1,1))
t(sapply(hhh4sims, function (x) {
    quantiles <- t(apply(x$sims, 1, quantile, probs = 1:99/100))
    scores <- scores_sample(x$observed, drop(x$sims))
    ## improved estimate via mixture of one-step-ahead NegBin distributions
    ## NOTE: we skip this here for speed (for CRAN)
    ##scores <- cbind(scores, logs2 = logs_hhh4sims(sims=x$sims, model=x$fit0))
    osaplot(quantiles = quantiles, probs = 1:99/100,
            observed = x$observed, scores = scores,
            start = x$testperiod[1], xlab = "Week", ylim = c(0,60000),
            fan.args = list(ln = c(0.1,0.9), rlab = NULL))
    colMeans(scores)
}))