knitr::opts_chunk$set(message = FALSE, warning = FALSE, error = FALSE, echo = FALSE, fig.align = "center", dev.args = list(pointsize = 10))
options(digits = 4) # for more compact numerical outputs library("HIDDA.forecasting") source("../setup.R", local = TRUE) # define test periods (OWA, TEST)
In this vignette, we use forecasting methods provided by:
library("kcde")
The corresponding software reference is:
local({ ref <- citation(package = "kcde", auto = TRUE) ref$year <- "2017" ref$url <- paste(ref$url, "tree", packageDescription("kcde", fields = "RemoteSha"), sep = "/") cat("<blockquote>"); print(ref, style = "html"); cat("</blockquote>\n") })
CHILIdat <- fortify.zoo(CHILI) TRAIN <- 1:OWA[1]
Note: we use a log-transformation of the CHILI counts in kcde.
Configuring kcde()
is quiet lengthy and not shown here
(see the vignette sources for details).
## the setup is derived from the "dengue_sj" example in ## https://github.com/reichlab/article-disease-pred-with-kcde/blob/master/inst/code/estimation/kcde-estimation-step.R prediction_target_var <- "CHILI" predictive_vars <- "CHILI" prediction_horizon <- 1 max_lag <- 1L max_seasonal_lag <- 0L init_sample_size <- length(TRAIN) # "scott's rule" bw_parameterization <- "diagonal" ### **Full** Bandwidth KCDE seems to cost over 100h estimation time with 4 cores ### So we use the diagonal variant (which took 23 minutes for the Dengue data) ## setup list describing kernel components kernel_components <- list() ## add periodic kernel component capturing seasonality kernel_components <- c(kernel_components, list(list( vars_and_offsets = data.frame( var_name = "Index", offset_value = 0L, offset_type = "lag", combined_name = "time_index_lag0", stringsAsFactors = FALSE ), kernel_fn = kcde::periodic_kernel, theta_fixed = list(period = pi / 365.2425), theta_est = list("bw"), initialize_kernel_params_fn = kcde::initialize_params_periodic_kernel, initialize_kernel_params_args = list( sample_size = init_sample_size ), get_theta_optim_bounds_fn = kcde::get_theta_optim_bounds_periodic_kernel, get_theta_optim_bounds_args = NULL, vectorize_kernel_params_fn = kcde::vectorize_params_periodic_kernel, vectorize_kernel_params_args = NULL, update_theta_from_vectorized_theta_est_fn = kcde::update_theta_from_vectorized_theta_est_periodic_kernel, update_theta_from_vectorized_theta_est_args = NULL ))) ## Kernel components for observed values of incidence ## First step is setup: create list of data frames specifying groups of ## variables and offsets included in each kernel component lag_values <- NULL for(seasonal_lag in seq(from = 0, to = max_seasonal_lag)) { lag_values <- c(lag_values, seq(from = 0, to = max_lag) + 52 * seasonal_lag) } print(lag_values) if (bw_parameterization == "diagonal") { vars_and_offsets_groups <- list() ## Group of variable names and offsets for prediction target new_vars_and_offsets_group <- data.frame( var_name = prediction_target_var, offset_value = prediction_horizon, offset_type = "horizon", stringsAsFactors = FALSE ) vars_and_offsets_groups <- c(vars_and_offsets_groups, list(new_vars_and_offsets_group)) ## Groups of variable names and offsets for lagged predictive variables for(lag_value in lag_values) { for(predictive_var in predictive_vars) { ## No filtering: group for lagged "raw"/unfiltered observed incidence new_vars_and_offsets_group <- data.frame( var_name = predictive_var, offset_value = lag_value, offset_type = "lag", stringsAsFactors = FALSE ) vars_and_offsets_groups <- c(vars_and_offsets_groups, list(new_vars_and_offsets_group)) } } ## add combined_name column for (i in seq_along(vars_and_offsets_groups)) { vars_and_offsets_groups[[i]]$combined_name <- with( vars_and_offsets_groups[[i]], paste0(var_name, "_", offset_type, offset_value) ) } } else if (bw_parameterization == "full") { ## Prediction target variable new_vars_and_offsets_group <- data.frame( var_name = prediction_target_var, offset_value = prediction_horizon, offset_type = "horizon", stringsAsFactors = FALSE ) ## Lagged prediction target == predictive variables for(lag_value in lag_values) { for(predictive_var in predictive_vars) { ## No filtering: lagged "raw"/unfiltered observed incidence new_vars_and_offsets_group <- rbind( new_vars_and_offsets_group, data.frame( var_name = predictive_var, offset_value = lag_value, offset_type = "lag", stringsAsFactors = FALSE ) ) } } ## Add combined_name column and put in a list for further processing below new_vars_and_offsets_group$combined_name <- with( new_vars_and_offsets_group, paste0(var_name, "_", offset_type, offset_value) ) vars_and_offsets_groups <- list(new_vars_and_offsets_group) } print(vars_and_offsets_groups) ## configure discretization log_exp_x_minus_0.5 <- function(x) { temp <- exp(x) - 0.5 temp[temp < 0] <- 0 return(log(temp)) } log_exp_x_plus_0.5 <- function(x) { return(log(exp(x) + 0.5)) } log_round_to_integer_plus_0.5_exp <- function(x) { exp_x <- exp(x) + 0.5 inds_ceil <- exp_x - floor(exp_x) >= 0.5 exp_x[inds_ceil] <- ceiling(exp_x[inds_ceil]) exp_x[!inds_ceil] <- floor(exp_x[!inds_ceil]) return(log(exp_x - 0.5)) } in_range_fn <- function(x, tolerance = 0.5 * .Machine$double.eps^0.5) { vapply(X = x, FUN = function(x_i) { isTRUE(all.equal.numeric( x_i, log_round_to_integer_plus_0.5_exp(x_i), tolerance = tolerance )) }, FUN.VALUE = TRUE, USE.NAMES = FALSE) } discrete_var_names <- unlist(lapply(predictive_vars, function (predictive_var) c(paste0(predictive_var, "_lag", rep(seq(from = 0, to = max_lag + 52 * max_seasonal_lag), each=2)), paste0(predictive_var, "_horizon", rep(1:52, each=2))))) discrete_var_range_fns <- sapply( X = discrete_var_names, FUN = function(discrete_var_name) { list(a = log_exp_x_minus_0.5, b = log_exp_x_plus_0.5, in_range = in_range_fn, discretizer = log_round_to_integer_plus_0.5_exp) }, simplify = FALSE, USE.NAMES = TRUE) ## Second step is to actually append the kernel component descriptions to the ## kernel_components list kernel_components <- c(kernel_components, lapply(vars_and_offsets_groups, function(vars_and_offsets) { lower_trunc_bds <- rep(-Inf, nrow(vars_and_offsets)) names(lower_trunc_bds) <- vars_and_offsets$combined_name upper_trunc_bds <- rep(Inf, nrow(vars_and_offsets)) names(upper_trunc_bds) <- vars_and_offsets$combined_name return(list( vars_and_offsets = vars_and_offsets, kernel_fn = kcde::log_pdtmvn_mode_centered_kernel, rkernel_fn = kcde::rlog_pdtmvn_mode_centered_kernel, theta_fixed = list( parameterization = "bw-chol-decomp", continuous_vars = NULL, discrete_vars = vars_and_offsets$combined_name[ vars_and_offsets$combined_name %in% discrete_var_names], discrete_var_range_fns = discrete_var_range_fns, lower = lower_trunc_bds, upper = upper_trunc_bds, validate_in_support = FALSE ), theta_est = list("bw"), initialize_kernel_params_fn = kcde::initialize_params_log_pdtmvn_kernel, initialize_kernel_params_args = list( sample_size = init_sample_size ), get_theta_optim_bounds_fn = kcde::get_theta_optim_bounds_log_pdtmvn_kernel, get_theta_optim_bounds_args = NULL, vectorize_kernel_params_fn = kcde::vectorize_params_log_pdtmvn_kernel, vectorize_kernel_params_args = NULL, update_theta_from_vectorized_theta_est_fn = kcde::update_theta_from_vectorized_theta_est_log_pdtmvn_kernel, update_theta_from_vectorized_theta_est_args = NULL )) }) )
kcde_control <- create_kcde_control( X_names = "Index", # seems to work, no need to supply as.integer(Index) y_names = "CHILI", time_name = "Index", prediction_horizons = "this parameter is actually unused", kernel_components = kernel_components, filter_control = NULL, crossval_buffer = 365, # as in the original application prediction_inds_not_included = NULL, loss_fn = neg_log_score_loss, loss_fn_prediction_args = list( prediction_type = "distribution", log = TRUE), loss_args = NULL, variable_selection_method = "all_included", par_cores = 3 ) library("doMC") registerDoMC(cores = kcde_control$par_cores) runtime <- system.time( kcdefit <- kcde(data = CHILIdat[TRAIN,], kcde_control = kcde_control) ) kcdefit$runtime <- runtime ## save estimation results save(kcdefit, file = "kcdefit.RData", compress = "xz")
load("kcdefit.RData")
We used 3 cores in parallel. Fitting the full bandwidth KCDE would take several days, so we used the diagonal bandwidth parametrization.
kcdefit$runtime / 60
Unfortunately, the estimation function kcde()
just returns a list without a dedicated class.
It is unclear to me how to summarize the model fit or extract fitted values ...
We compute r length(OWA)
one-week-ahead forecasts
from r format_period(OWA)
(the OWA
period).
The code is again lengthy and not shown here (see the sources for details).
## function derived from code for the "dengue_sj" application in ## https://github.com/reichlab/article-disease-pred-with-kcde/blob/master/inst/code/prediction/kcde-prediction.R predict_kcde_owa <- function(kcdefit, data, prediction_time_inds, nsamples = 10000) { prediction_target_var <- kcdefit$kcde_control$y_names prediction_horizon <- 1 # this is actually a property of kcdefit ## Allocate data frame to store results num_rows <- length(prediction_time_inds) scores <- data.frame( prediction_time_ind = prediction_time_inds, observed = data[prediction_time_inds, prediction_target_var], pt_pred = rep(NA_real_, num_rows), AE = rep(NA_real_, num_rows), log_score = rep(NA_real_, num_rows) ) samples <- matrix(NA_real_, num_rows, nsamples, dimnames = list(prediction_time_inds, NULL)) results_row_ind <- 1L for(prediction_time_ind in prediction_time_inds) { ## Get index of analysis time in data set ## (time from which we predict forward) analysis_time_ind <- prediction_time_ind - prediction_horizon ## Compute log score observed_prediction_target <- as.matrix(scores$observed[results_row_ind]) colnames(observed_prediction_target) <- paste0(prediction_target_var, "_horizon", prediction_horizon) scores$log_score[results_row_ind] <- kcde::kcde_predict( kcde_fit = kcdefit, prediction_data = data[seq_len(analysis_time_ind), , drop = FALSE], leading_rows_to_drop = 0L, trailing_rows_to_drop = 0L, prediction_type = "distribution", prediction_test_lead_obs = observed_prediction_target, log = TRUE ) ## Sample from predictive distribution samples[results_row_ind,] <- kcde::kcde_predict( n = nsamples, kcde_fit = kcdefit, prediction_data = data[seq_len(analysis_time_ind), , drop = FALSE], leading_rows_to_drop = 0L, trailing_rows_to_drop = 0L, prediction_type = "sample" ) ## Increment results row results_row_ind <- results_row_ind + 1L } ## Correction by subtracting 0.5 samples <- samples - 0.5 ## Compute point predictions scores$pt_pred <- apply(samples, 1, median) ## Compute absolute error of point prediction scores$AE <- abs(scores$pt_pred - scores$observed) ## fix orientation of the log-score scores$log_score <- -scores$log_score ## add DSS scores$DSS <- surveillance:::.dss( meanP = apply(samples, 1, mean), varP = apply(samples, 1, var), x = scores$observed) return(list(scores = scores, pdists = apply(samples, 1, ecdf))) } set.seed(161017) kcdeowa_runtime <- system.time( kcdeowa <- predict_kcde_owa(kcdefit = kcdefit, data = CHILIdat, prediction_time_inds = OWA + 1L) ) save(kcdeowa, kcdeowa_runtime, file = "kcdeowa.RData", compress = "xz")
load("kcdeowa.RData")
Computing the r length(OWA)
forecasts took
r sprintf("%.1f", kcdeowa_runtime[["elapsed"]]/60)
minutes (single core).
The PIT histogram is based on the pointwise ECDF of the samples from the predictive distributions.
par(mar = c(5,5,1,1), las = 1) surveillance::pit(x = kcdeowa$scores$observed, pdistr = kcdeowa$pdists, plot = list(ylab = "Density"))
summary(kcdeowa$scores[c("DSS", "log_score", "AE")])
par(mar = c(5,5,1,1)) osaplot( quantiles = t(sapply(kcdeowa$pdists, quantile, probs=1:99/100)), probs = 1:99/100, observed = kcdeowa$scores$observed, scores = as.matrix(kcdeowa$scores[c("DSS", "log_score")]), start = OWA[1]+1, xlab = "Week", ylim = c(0,60000), fan.args = list(ln = c(0.1,0.9), rlab = NULL) )
We would need to rerun kcde()
and subsequent predictions
for each of the three different training periods with
prediction_horizon
varying from 1 to r length(TEST[[1]])
.
Based on the runtime of the above computations for a
single training period and prediction horizon,
computing all long-term forecasts is estimated to take approximately
r round(3*30*(kcdefit$runtime[["elapsed"]] + kcdeowa_runtime[["elapsed"]]) / 60 / 60, 1)
hours. This is far beyond the runtimes of the alternative
prediction approaches, so we skip long-term forecasts with kcde.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.