knitr::opts_chunk$set(message = FALSE, warning = FALSE, error = FALSE, fig.align = "center", dev.args = list(pointsize = 10))
options(digits = 4) # for more compact numerical outputs library("HIDDA.forecasting") library("ggplot2") source("setup.R", local = TRUE) # define test periods (OWA, TEST)
In this vignette, we use forecasting methods provided by:
library("surveillance")
The corresponding software reference is:
cat("<blockquote>") print(citation(package = "surveillance", auto = TRUE), style = "html") cat("</blockquote>\n")
CHILI.sts <- sts(observed = CHILI, epoch = as.integer(index(CHILI)), epochAsDate = TRUE)
(weeksInYear <- table(year(CHILI.sts))) mean(weeksInYear) ## long-term average is 52.1775 weeks per year f1 <- addSeason2formula(~ 1, period = 52.1775, timevar = "index") ## equivalent: f1 <- addSeason2formula(~ 1, period = 365.2425, timevar = "t") hhh4fit <- hhh4(stsObj = CHILI.sts, control = list( ar = list(f = update(f1, ~. + christmas)), end = list(f = f1), family = "NegBin1", data = list(index = 1:nrow(CHILI.sts), christmas = as.integer(epochInYear(CHILI.sts) %in% 52)) )) summary(hhh4fit, maxEV = TRUE)
Alternatively, use a yearly varying frequency of 52 or 53 weeks for the sinusoidal time effects:
f1_varfreq <- ~ 1 + sin(2*pi*epochInYear/weeksInYear) + cos(2*pi*epochInYear/weeksInYear) hhh4fit_varfreq <- update(hhh4fit, ar = list(f = update(f1_varfreq, ~. + christmas)), end = list(f = f1_varfreq), data = list(epochInYear = epochInYear(CHILI.sts), weeksInYear = rep(weeksInYear, weeksInYear))) AIC(hhh4fit, hhh4fit_varfreq)
plot(hhh4fit, hhh4fit_varfreq, type = "maxEV", matplot.args = list(ylab = expression(lambda[t]), xlim =c(2002,2007)))
plot(hhh4fit, pch = 20, names = "", ylab = "")
qplot(x = 1:53, y = drop(predict(hhh4fit, newSubset = 1:53, type = "ar.exppred")), geom = "line", ylim = c(0,1.3), xlab = "week", ylab = expression(lambda[t])) + geom_hline(yintercept = 1, lty = 2)
CHILIdat <- fortify(CHILI) ## add fitted mean and CI to CHILIdat CHILIdat$hhh4upper <- CHILIdat$hhh4lower <- CHILIdat$hhh4fitted <- NA_real_ CHILIdat[hhh4fit$control$subset,"hhh4fitted"] <- fitted(hhh4fit) CHILIdat[hhh4fit$control$subset,c("hhh4lower","hhh4upper")] <- sapply(c(0.025, 0.975), function (p) qnbinom(p, mu = fitted(hhh4fit), size = exp(hhh4fit$coefficients[["-log(overdisp)"]])))
ggplot(CHILIdat, aes(x=Index, ymin=hhh4lower, y=hhh4fitted, ymax=hhh4upper)) + geom_ribbon(fill="orange") + geom_line(col="darkred") + geom_point(aes(y=CHILI), pch=20) + scale_y_sqrt(expand = c(0,0), limits = c(0,NA))
We compute r length(OWA)
one-week-ahead forecasts
from r format_period(OWA)
(the OWA
period),
which takes roughly 4 seconds
(we could parallelize using the cores
argument of oneStepAhead()
).
hhh4owa <- oneStepAhead(hhh4fit, range(OWA), type = "rolling", verbose = FALSE) save(hhh4owa, file = "hhh4owa.RData")
load("hhh4owa.RData")
par(mar = c(5,5,1,1), las = 1) pit(hhh4owa, plot = list(ylab = "Density"))
calibrationTest(hhh4owa, which = "dss") ## calibrationTest(hhh4owa, which = "logs") # skipped for CRAN
hhh4owa_scores <- scores(hhh4owa, which = c("dss", "logs"), reverse = FALSE) summary(hhh4owa_scores)
par(mar = c(5,5,1,1)) hhh4owa_quantiles <- quantile(hhh4owa, probs = 1:99/100) osaplot( quantiles = hhh4owa_quantiles, probs = 1:99/100, observed = hhh4owa$observed, scores = hhh4owa_scores, start = OWA[1]+1, xlab = "Week", ylim = c(0,60000), fan.args = list(ln = c(0.1,0.9), rlab = NULL) )
TEST1 <- TEST[[1]] fit1 <- update(hhh4fit, subset.upper = TEST1[1]-1) hhh4sim1 <- simulate(fit1, nsim = 1000, seed = 726, subset = TEST1, y.start = observed(CHILI.sts)[TEST1[1]-1,])
We can use the plot method provided by surveillance:
par(mfrow=c(1,2)) plot(hhh4sim1, "fan", ylim = c(0,60000), xlab = "", ylab = "", xaxis = list(xaxis.tickFreq = list("%m"=atChange, "%Y"=atChange), xaxis.labelFreq = list("%Y"=atMedian), xaxis.labelFormat = "%Y")) plot(hhh4sim1, "size", horizontal = FALSE, main = "size of the epidemic", ylab = "", observed = list(labels = NULL))
There is also an associated scores
method:
summary(scores(hhh4sim1, which = c("dss", "logs")))
Using relative frequencies to estimate the forecast distribution from
these simulations is problematic.
However, we can use kernel density estimation as implemented in package
scoringRules, which the function scores_sample()
wraps:
summary(scores_sample(x = observed(CHILI.sts)[TEST1], sims = drop(hhh4sim1)))
An even better approximation of the log-score at each time point can be
obtained by using a mixture of the one-step-ahead negative binomial
distributions given the samples from the previous time point.
These forecast distributions are available through the function
dhhh4sims()
, which is used in logs_hhh4sims()
:
summary(logs_hhh4sims(sims = hhh4sim1, model = fit1))
hhh4sims <- lapply(TEST, function (testperiod) { t0 <- testperiod[1] - 1 fit0 <- update(hhh4fit, subset.upper = t0) sims <- simulate(fit0, nsim = 1000, seed = t0, subset = testperiod, y.start = observed(CHILI.sts)[t0,]) list(testperiod = testperiod, observed = observed(CHILI.sts)[testperiod], fit0 = fit0, sims = sims) })
PIT histograms, based on the pointwise ECDF of the simulated epidemic curves:
par(mar = c(5,5,1,1), mfrow = sort(n2mfrow(length(hhh4sims))), las = 1) invisible(lapply(hhh4sims, function (x) { pit(x = x$observed, pdistr = apply(x$sims, 1, ecdf), plot = list(main = format_period(x$testperiod, fmt = "%Y", collapse = "/"), ylab = "Density")) }))
par(mar = c(5,5,1,1)) t(sapply(hhh4sims, function (x) { quantiles <- t(apply(x$sims, 1, quantile, probs = 1:99/100)) scores <- scores_sample(x$observed, drop(x$sims)) ## improved estimate via mixture of one-step-ahead NegBin distributions ## NOTE: we skip this here for speed (for CRAN) ##scores <- cbind(scores, logs2 = logs_hhh4sims(sims=x$sims, model=x$fit0)) osaplot(quantiles = quantiles, probs = 1:99/100, observed = x$observed, scores = scores, start = x$testperiod[1], xlab = "Week", ylim = c(0,60000), fan.args = list(ln = c(0.1,0.9), rlab = NULL)) colMeans(scores) }))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.