In HIDDA/forecasting: Forecasting Based on Surveillance Data

knitr::opts_chunk$set(message = FALSE, warning = FALSE, error = FALSE, echo = FALSE,
                      fig.align = "center", dev.args = list(pointsize = 10))

options(digits = 4)  # for more compact numerical outputs
library("HIDDA.forecasting")
source("../setup.R", local = TRUE)  # define test periods (OWA, TEST)

In this vignette, we use forecasting methods provided by:

library("kcde")

The corresponding software reference is:

local({
    ref <- citation(package = "kcde", auto = TRUE)
    ref$year <- "2017"
    ref$url <- paste(ref$url, "tree", packageDescription("kcde", fields = "RemoteSha"), sep = "/")
    cat("<blockquote>"); print(ref, style = "html"); cat("</blockquote>\n")
})

Modelling

CHILIdat <- fortify.zoo(CHILI)
TRAIN <- 1:OWA[1]

Note: we use a log-transformation of the CHILI counts in kcde.

Configuring kcde() is quiet lengthy and not shown here (see the vignette sources for details).

## the setup is derived from the "dengue_sj" example in
## https://github.com/reichlab/article-disease-pred-with-kcde/blob/master/inst/code/estimation/kcde-estimation-step.R

prediction_target_var <- "CHILI"
predictive_vars <- "CHILI"
prediction_horizon <- 1
max_lag <- 1L
max_seasonal_lag <- 0L
init_sample_size <- length(TRAIN)  # "scott's rule"
bw_parameterization <- "diagonal"
### **Full** Bandwidth KCDE seems to cost over 100h estimation time with 4 cores
### So we use the diagonal variant (which took 23 minutes for the Dengue data)

## setup list describing kernel components
kernel_components <- list()

## add periodic kernel component capturing seasonality
kernel_components <- c(kernel_components, list(list(
    vars_and_offsets = data.frame(
        var_name = "Index", offset_value = 0L, offset_type = "lag",
        combined_name = "time_index_lag0", stringsAsFactors = FALSE
    ),
    kernel_fn = kcde::periodic_kernel,
    theta_fixed = list(period = pi / 365.2425),
    theta_est = list("bw"),
    initialize_kernel_params_fn =
        kcde::initialize_params_periodic_kernel,
    initialize_kernel_params_args = list(
        sample_size = init_sample_size
    ),
    get_theta_optim_bounds_fn =
        kcde::get_theta_optim_bounds_periodic_kernel,
    get_theta_optim_bounds_args = NULL,
    vectorize_kernel_params_fn =
        kcde::vectorize_params_periodic_kernel,
    vectorize_kernel_params_args = NULL,
    update_theta_from_vectorized_theta_est_fn =
        kcde::update_theta_from_vectorized_theta_est_periodic_kernel,
    update_theta_from_vectorized_theta_est_args = NULL
    )))

## Kernel components for observed values of incidence
## First step is setup: create list of data frames specifying groups of
## variables and offsets included in each kernel component
lag_values <- NULL
for(seasonal_lag in seq(from = 0, to = max_seasonal_lag)) {
    lag_values <- c(lag_values,
        seq(from = 0, to = max_lag) + 52 * seasonal_lag)
}
print(lag_values)

if (bw_parameterization == "diagonal") {

    vars_and_offsets_groups <- list()

    ## Group of variable names and offsets for prediction target
    new_vars_and_offsets_group <- data.frame(
        var_name = prediction_target_var,
        offset_value = prediction_horizon, offset_type = "horizon",
        stringsAsFactors = FALSE
    )
    vars_and_offsets_groups <- c(vars_and_offsets_groups,
                                 list(new_vars_and_offsets_group))

    ## Groups of variable names and offsets for lagged predictive variables
    for(lag_value in lag_values) {
        for(predictive_var in predictive_vars) {
            ## No filtering: group for lagged "raw"/unfiltered observed incidence
            new_vars_and_offsets_group <- data.frame(
                var_name = predictive_var,
                offset_value = lag_value, offset_type = "lag",
                stringsAsFactors = FALSE
            )
            vars_and_offsets_groups <- c(vars_and_offsets_groups,
                                         list(new_vars_and_offsets_group))
        }
    }

    ## add combined_name column
    for (i in seq_along(vars_and_offsets_groups)) {
        vars_and_offsets_groups[[i]]$combined_name <- with(
            vars_and_offsets_groups[[i]],
            paste0(var_name, "_", offset_type, offset_value)
        )
    }

} else if (bw_parameterization == "full") {

    ## Prediction target variable
    new_vars_and_offsets_group <- data.frame(
        var_name = prediction_target_var,
        offset_value = prediction_horizon, offset_type = "horizon",
        stringsAsFactors = FALSE
    )
    ## Lagged prediction target == predictive variables
    for(lag_value in lag_values) {
        for(predictive_var in predictive_vars) {
            ## No filtering: lagged "raw"/unfiltered observed incidence
            new_vars_and_offsets_group <- rbind(
                new_vars_and_offsets_group,
                data.frame(
                    var_name = predictive_var,
                    offset_value = lag_value, offset_type = "lag",
                    stringsAsFactors = FALSE
                )
            )
        }
    }
    ## Add combined_name column and put in a list for further processing below
    new_vars_and_offsets_group$combined_name <- with(
        new_vars_and_offsets_group,
        paste0(var_name, "_", offset_type, offset_value)
    )
    vars_and_offsets_groups <- list(new_vars_and_offsets_group)

}

print(vars_and_offsets_groups)


## configure discretization
log_exp_x_minus_0.5 <- function(x) {
    temp <- exp(x) - 0.5
    temp[temp < 0] <- 0
    return(log(temp))
}
log_exp_x_plus_0.5 <- function(x) {
    return(log(exp(x) + 0.5))
}
log_round_to_integer_plus_0.5_exp <- function(x) {
    exp_x <- exp(x) + 0.5
    inds_ceil <- exp_x - floor(exp_x) >= 0.5
    exp_x[inds_ceil] <- ceiling(exp_x[inds_ceil])
    exp_x[!inds_ceil] <- floor(exp_x[!inds_ceil])
    return(log(exp_x - 0.5))
}
in_range_fn <- function(x, tolerance = 0.5 * .Machine$double.eps^0.5) {
    vapply(X = x, FUN = function(x_i) {
        isTRUE(all.equal.numeric(
            x_i,
            log_round_to_integer_plus_0.5_exp(x_i),
            tolerance = tolerance
        ))
    }, FUN.VALUE = TRUE, USE.NAMES = FALSE)
}
discrete_var_names <- unlist(lapply(predictive_vars, function (predictive_var)
    c(paste0(predictive_var, "_lag", rep(seq(from = 0, to = max_lag + 52 * max_seasonal_lag), each=2)),
      paste0(predictive_var, "_horizon", rep(1:52, each=2)))))
discrete_var_range_fns <- sapply(
    X = discrete_var_names,
    FUN = function(discrete_var_name) {
    list(a = log_exp_x_minus_0.5,
         b = log_exp_x_plus_0.5,
         in_range = in_range_fn,
         discretizer = log_round_to_integer_plus_0.5_exp)
}, simplify = FALSE, USE.NAMES = TRUE)

## Second step is to actually append the kernel component descriptions to the
## kernel_components list
kernel_components <- c(kernel_components,
    lapply(vars_and_offsets_groups, function(vars_and_offsets) {
        lower_trunc_bds <- rep(-Inf, nrow(vars_and_offsets))
        names(lower_trunc_bds) <- vars_and_offsets$combined_name
        upper_trunc_bds <- rep(Inf, nrow(vars_and_offsets))
        names(upper_trunc_bds) <- vars_and_offsets$combined_name

        return(list(
            vars_and_offsets = vars_and_offsets,
            kernel_fn = kcde::log_pdtmvn_mode_centered_kernel,
            rkernel_fn = kcde::rlog_pdtmvn_mode_centered_kernel,
            theta_fixed = list(
                parameterization = "bw-chol-decomp",
                continuous_vars = NULL,
                discrete_vars = vars_and_offsets$combined_name[
                    vars_and_offsets$combined_name %in% discrete_var_names],
                discrete_var_range_fns = discrete_var_range_fns,
                lower = lower_trunc_bds,
                upper = upper_trunc_bds,
                validate_in_support = FALSE
            ),
            theta_est = list("bw"),
            initialize_kernel_params_fn =
                kcde::initialize_params_log_pdtmvn_kernel,
            initialize_kernel_params_args = list(
                sample_size = init_sample_size
            ),
            get_theta_optim_bounds_fn =
                kcde::get_theta_optim_bounds_log_pdtmvn_kernel,
            get_theta_optim_bounds_args = NULL,
            vectorize_kernel_params_fn =
                kcde::vectorize_params_log_pdtmvn_kernel,
            vectorize_kernel_params_args = NULL,
            update_theta_from_vectorized_theta_est_fn =
                kcde::update_theta_from_vectorized_theta_est_log_pdtmvn_kernel,
            update_theta_from_vectorized_theta_est_args = NULL
        ))
    })
)

kcde_control <- create_kcde_control(
    X_names = "Index",  # seems to work, no need to supply as.integer(Index)
    y_names = "CHILI",
    time_name = "Index",
    prediction_horizons = "this parameter is actually unused",
    kernel_components = kernel_components,
    filter_control = NULL,
    crossval_buffer = 365,  # as in the original application
    prediction_inds_not_included = NULL,
    loss_fn = neg_log_score_loss,
    loss_fn_prediction_args = list(
        prediction_type = "distribution",
        log = TRUE),
    loss_args = NULL,
    variable_selection_method = "all_included",
    par_cores = 3
)
library("doMC")
registerDoMC(cores = kcde_control$par_cores)
runtime <- system.time(
    kcdefit <- kcde(data = CHILIdat[TRAIN,],
                    kcde_control = kcde_control)
)
kcdefit$runtime <- runtime
## save estimation results
save(kcdefit, file = "kcdefit.RData", compress = "xz")

load("kcdefit.RData")

We used 3 cores in parallel. Fitting the full bandwidth KCDE would take several days, so we used the diagonal bandwidth parametrization.

kcdefit$runtime / 60

Unfortunately, the estimation function kcde() just returns a list without a dedicated class. It is unclear to me how to summarize the model fit or extract fitted values ...

One-week-ahead forecasts

We compute r length(OWA) one-week-ahead forecasts from r format_period(OWA) (the OWA period).

The code is again lengthy and not shown here (see the sources for details).

## function derived from code for the "dengue_sj" application in
## https://github.com/reichlab/article-disease-pred-with-kcde/blob/master/inst/code/prediction/kcde-prediction.R
predict_kcde_owa <- function(kcdefit, data, prediction_time_inds,
                             nsamples = 10000)
{
    prediction_target_var <- kcdefit$kcde_control$y_names
    prediction_horizon <- 1  # this is actually a property of kcdefit

    ## Allocate data frame to store results
    num_rows <- length(prediction_time_inds)
    scores <- data.frame(
        prediction_time_ind = prediction_time_inds,
        observed = data[prediction_time_inds, prediction_target_var],
        pt_pred = rep(NA_real_, num_rows),
        AE = rep(NA_real_, num_rows),
        log_score = rep(NA_real_, num_rows)
    )
    samples <- matrix(NA_real_, num_rows, nsamples,
                      dimnames = list(prediction_time_inds, NULL))

    results_row_ind <- 1L
    for(prediction_time_ind in prediction_time_inds) {
        ## Get index of analysis time in data set
        ## (time from which we predict forward)
        analysis_time_ind <- prediction_time_ind - prediction_horizon

        ## Compute log score
        observed_prediction_target <- as.matrix(scores$observed[results_row_ind])
        colnames(observed_prediction_target) <-
            paste0(prediction_target_var, "_horizon", prediction_horizon)
        scores$log_score[results_row_ind] <-
            kcde::kcde_predict(
                kcde_fit = kcdefit,
                prediction_data =
                    data[seq_len(analysis_time_ind), , drop = FALSE],
                leading_rows_to_drop = 0L,
                trailing_rows_to_drop = 0L,
                prediction_type = "distribution",
                prediction_test_lead_obs = observed_prediction_target,
                log = TRUE
            )

        ## Sample from predictive distribution
        samples[results_row_ind,] <-
            kcde::kcde_predict(
                n = nsamples,
                kcde_fit = kcdefit,
                prediction_data =
                    data[seq_len(analysis_time_ind), , drop = FALSE],
                leading_rows_to_drop = 0L,
                trailing_rows_to_drop = 0L,
                prediction_type = "sample"
            )

        ## Increment results row
        results_row_ind <- results_row_ind + 1L
    }

    ## Correction by subtracting 0.5
    samples <- samples - 0.5

    ## Compute point predictions
    scores$pt_pred <- apply(samples, 1, median)

    ## Compute absolute error of point prediction
    scores$AE <- abs(scores$pt_pred - scores$observed)

    ## fix orientation of the log-score
    scores$log_score <- -scores$log_score

    ## add DSS
    scores$DSS <- surveillance:::.dss(
        meanP = apply(samples, 1, mean),
        varP = apply(samples, 1, var),
        x = scores$observed)

    return(list(scores = scores,
                pdists = apply(samples, 1, ecdf)))
}

set.seed(161017)
kcdeowa_runtime <- system.time(
    kcdeowa <- predict_kcde_owa(kcdefit = kcdefit, data = CHILIdat,
                                prediction_time_inds = OWA + 1L)
)
save(kcdeowa, kcdeowa_runtime, file = "kcdeowa.RData", compress = "xz")

load("kcdeowa.RData")

Computing the r length(OWA) forecasts took r sprintf("%.1f", kcdeowa_runtime[["elapsed"]]/60) minutes (single core).

The PIT histogram is based on the pointwise ECDF of the samples from the predictive distributions.

par(mar = c(5,5,1,1), las = 1)
surveillance::pit(x = kcdeowa$scores$observed, pdistr = kcdeowa$pdists,
                  plot = list(ylab = "Density"))

summary(kcdeowa$scores[c("DSS", "log_score", "AE")])

par(mar = c(5,5,1,1))
osaplot(
    quantiles = t(sapply(kcdeowa$pdists, quantile, probs=1:99/100)), probs = 1:99/100,
    observed = kcdeowa$scores$observed,
    scores = as.matrix(kcdeowa$scores[c("DSS", "log_score")]),
    start = OWA[1]+1, xlab = "Week", ylim = c(0,60000),
    fan.args = list(ln = c(0.1,0.9), rlab = NULL)
)

Long-term forecasts

We would need to rerun kcde() and subsequent predictions for each of the three different training periods with prediction_horizon varying from 1 to r length(TEST[[1]]). Based on the runtime of the above computations for a single training period and prediction horizon, computing all long-term forecasts is estimated to take approximately r round(3*30*(kcdefit$runtime[["elapsed"]] + kcdeowa_runtime[["elapsed"]]) / 60 / 60, 1) hours. This is far beyond the runtimes of the alternative prediction approaches, so we skip long-term forecasts with kcde.

HIDDA/forecasting documentation built on Jan. 11, 2024, 1:11 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com