In HIDDA/forecasting: Forecasting Based on Surveillance Data

knitr::opts_chunk$set(message = FALSE, warning = FALSE, error = FALSE,
                      fig.align = "center", dev.args = list(pointsize = 10))

options(digits = 4)  # for more compact numerical outputs
library("HIDDA.forecasting")
library("ggplot2")
source("setup.R", local = TRUE)  # define test periods (OWA, TEST)

In this vignette, we use forecasting methods provided by:

library("prophet")

The corresponding software reference is:

cat("<blockquote>")
print(citation(package = "prophet", auto = TRUE), style = "html")
cat("</blockquote>\n")

Modelling

prophet uses a Gaussian likelihood, so we will work with log-counts as in ARIMA.

## which dates correspond to ISO week 52?
index(CHILI)[strftime(index(CHILI), "%V") == "52"]

So for consistency with the other models we define holidays from 21 to 28 December each year.

if (packageVersion("prophet") < "0.4") {
    ## workaround a bug in make_holiday_features(), which needs row_number()
    library("dplyr")
}

set.seed(1411171)
christmas <- data.frame(
    holiday = "Christmas",
    ds = as.Date(paste0(2000:2016, "-12-21")),
    lower_window = 0,
    upper_window = 7
)
prophetfit_control <- prophet(
    yearly.seasonality = TRUE, weekly.seasonality = FALSE, daily.seasonality = FALSE,
    holidays = christmas,
    mcmc.samples = 0, # invokes rstan::optimizing (fast MAP estimation)
    interval.width = 0.95, fit = FALSE)
prophetfit <- fit.prophet(
    m = prophetfit_control,
    df = data.frame(ds = index(CHILI), y = log(CHILI))
)
prophetfitted <- predict(prophetfit)

prophet_plot_components(prophetfit, prophetfitted)

CHILIdat <- fortify(CHILI)
CHILIdat[c("prophetfitted","prophetlower","prophetupper")] <-
    exp(prophetfitted[c("yhat", "yhat_lower", "yhat_upper")])

##plot(prophetfit, prophetfitted)
ggplot(CHILIdat, aes(x=Index, ymin=prophetlower, y=prophetfitted, ymax=prophetupper)) +
    geom_ribbon(fill="orange") + geom_line(col="darkred") +
    geom_point(aes(y=CHILI), pch=20) +
    scale_y_sqrt(expand = c(0,0), limits = c(0,NA))

One-week-ahead forecasts

We compute r length(OWA) one-week-ahead forecasts from r format_period(OWA) (the OWA period).

For each time point, forecasting with prophet takes about 3 seconds, i.e., computing all one-week-ahead forecasts takes approx. r sprintf("%.1f", length(OWA) * 3/60) minutes ...

set.seed(1411172)
prophetowa <- cross_validation(
    model = prophetfit, horizon = 1,
    period = 1, initial = OWA[1] - 1, units = "weeks"
)
## add forecast variance (calculated from prediction interval)
prophetowa$sigma <- with(prophetowa, (yhat_upper-yhat_lower)/2/qnorm(0.975))
## save results
save(prophetowa, file = "prophetowa.RData")

load("prophetowa.RData")

## BTW, the error SD can be obtained from a prophet fit as
prophetfit$y.scale * prophetfit$params$sigma_obs

prophet forecasts for the log-counts are approximately Gaussian with mean yhat and variance sigma^2 => back-transformation via exp() is log-normal

par(mar = c(5,5,1,1), las = 1)
.PIT <- plnorm(exp(prophetowa$y), meanlog = prophetowa$yhat, sdlog = prophetowa$sigma)
hist(.PIT, breaks = seq(0, 1, 0.1), freq = FALSE, main = "", xlab = "PIT")
abline(h = 1, lty = 2, col = "grey")

prophetowa_scores <- scores_lnorm(
    x = exp(prophetowa$y),
    meanlog = prophetowa$yhat, sdlog = prophetowa$sigma,
    which = c("dss", "logs"))
summary(prophetowa_scores)

Note that discretized forecast distributions yield almost identical scores (essentially due to the large counts):

prophetowa_scores_discretized <- scores_lnorm_discrete(
    x = exp(prophetowa$y),
    meanlog = prophetowa$yhat, sdlog = prophetowa$sigma,
    which = c("dss", "logs"))
summary(prophetowa_scores_discretized)

stopifnot(
    all.equal(prophetowa_scores_discretized[,"dss"], prophetowa_scores[,"dss"], tolerance = 1e-5),
    all.equal(prophetowa_scores_discretized[,"logs"], prophetowa_scores[,"logs"], tolerance = 1e-5)
)

prophetowa_quantiles <- sapply(X = 1:99/100, FUN = qlnorm,
                               meanlog = prophetowa$yhat,
                               sdlog = prophetowa$sigma)
par(mar = c(5,5,1,1))
osaplot(
    quantiles = prophetowa_quantiles, probs = 1:99/100,
    observed = exp(prophetowa$y), scores = prophetowa_scores,
    start = OWA[1]+1, xlab = "Week", ylim = c(0,60000),
    fan.args = list(ln = c(0.1,0.9), rlab = NULL)
)

Long-term forecasts

## try with the first test period
TEST1 <- TEST[[1]]
fit1 <- fit.prophet(
    m = prophetfit_control,
    df = data.frame(ds = index(CHILI), y = log(CHILI))[1:(TEST1[1]-1),]
)
prophetfor1 <- predict(fit1, data.frame(ds = index(CHILI)[TEST1]))
## add forecast variance (calculated from prediction interval)
prophetfor1$sigma <- with(prophetfor1, (yhat_upper-yhat_lower)/2/qnorm(0.975))

set.seed(1411173)
prophetfor <- lapply(TEST, function (testperiod) {
    t0 <- testperiod[1] - 1
    fit0 <- fit.prophet(
        m = prophetfit_control,
        df = data.frame(ds = index(CHILI), y = log(CHILI))[1:t0,]
    )
    fc <- predict(fit0, data.frame(ds = index(CHILI)[testperiod]))
    ## add forecast variance (calculated from prediction interval)
    fc$sigma <- with(fc, (yhat_upper-yhat_lower)/2/qnorm(0.975))
    list(testperiod = testperiod,
         observed = as.vector(CHILI[testperiod]),
         yhat = fc$yhat, sigma = fc$sigma)
})

par(mar = c(5,5,1,1), mfrow = sort(n2mfrow(length(prophetfor))), las = 1)
invisible(lapply(prophetfor, function (x) {
    PIT <- plnorm(x$observed, meanlog = x$yhat, sdlog = x$sigma)
    hist(PIT, breaks = seq(0, 1, 0.1), freq = FALSE,
         main = format_period(x$testperiod, fmt = "%Y", collapse = "/"))
    abline(h = 1, lty = 2, col = "grey")
}))

par(mar = c(5,5,1,1))
t(sapply(prophetfor, function (x) {
    quantiles <- sapply(X = 1:99/100, FUN = qlnorm,
                        meanlog = x$yhat, sdlog = x$sigma)
    scores <- scores_lnorm(x = x$observed,
                           meanlog = x$yhat, sdlog = x$sigma,
                           which = c("dss", "logs"))
    osaplot(quantiles = quantiles, probs = 1:99/100,
            observed = x$observed, scores = scores,
            start = x$testperiod[1], xlab = "Week", ylim = c(0,60000),
            fan.args = list(ln = c(0.1,0.9), rlab = NULL))
    colMeans(scores)
}))