vignettes/quantgen-forecast-cv.R

library(covidcast)
library(evalcast)
library(modeltools)
library(dplyr)

## Setup 

# What are we forecasting?
response_source <- "jhu-csse"
response_signal <- "confirmed_7dav_incidence_prop"
incidence_period <- "day"
ahead <- 1:21
geo_type <- "state" 
forecast_dates <- seq(as.Date("2020-07-01"), as.Date("2020-08-15"), by = "day")

# Some quantgen parameters 
n <- 21               # Training set size (in days) 
lags <- c(0, 7, 14)   # Lags (in days) for features
no_pen_vars <- 1      # Variables to leave unpenalized (lastest response value)
nlambda <- 20         # Number of lambda values to consider in cross-validation
lp_solver <- "gurobi"
sort <- TRUE
nonneg <- TRUE

# Important: functions to considerably speed up data fetching steps. Only pull 
# recent data for each forecast date, depending on the training set size (and 
# other parameters for quantgen)
start_day_baseline <- function(forecast_date) {
  return(as.Date(forecast_date) - n - 4 + 1)
}

start_day_quantgen <- function(forecast_date) {
  return(as.Date(forecast_date) - max(ahead) - n - max(lags) + 1)
}

## Produce forecasts

# Produce forecasts using a baseline forecaster
pred_baseline <- get_predictions(
  forecaster = baseline_forecaster, 
  name_of_forecaster = "Baseline",
  signals = tibble::tibble(
                      data_source = response_source, 
                      signal = response_signal,
                      start_day = list(start_day_baseline)),
  forecast_dates = forecast_dates, 
  incidence_period = incidence_period, 
  ahead = ahead, geo_type = geo_type, 
  signal_aggregation = "long")

# Quantile autoregression with 3 lags, or QAR3
pred_quantgen1 <- get_predictions(
  forecaster = quantgen_forecaster, 
  name_of_forecaster = "QAR3",
  signals = tibble::tibble(
                      data_source = response_source, 
                      signal = response_signal,
                      start_day = list(start_day_quantgen)),
  forecast_dates = forecast_dates, 
  incidence_period = incidence_period, 
  ahead = ahead, geo_type = geo_type, 
  signal_aggregation = "list",
  n = n, lags = lags, 
  nlambda = nlambda, no_pen_vars = no_pen_vars,
  lp_solver = lp_solver, sort = sort, nonneg = nonneg)

# Quantile autoregression with 3 lags, plus 3 lags of the CLI-in-community
# signal from Delphi's symptom survey, or QAR3 + CLI3  
pred_quantgen2 <- get_predictions(
  forecaster = quantgen_forecaster, 
  name_of_forecaster = "QAR3 + CLI3",
  signals = tibble::tibble(
                      data_source = c(response_source, "fb-survey"),
                      signal = c(response_signal, "smoothed_hh_cmnty_cli"),
                      start_day = list(start_day_quantgen)),
  forecast_dates = forecast_dates, 
  incidence_period = incidence_period, 
  ahead = ahead, geo_type = geo_type, 
  signal_aggregation = "list", 
  n = n, lags = lags, 
  nlambda = nlambda, no_pen_vars = no_pen_vars,
  lp_solver = lp_solver, sort = sort, nonneg = nonneg)

## Evaluate forecasts

# Now "evaluate" all of these predictions. In quotes because we pass a fake
# evaluation function so we can do it ourselves later. This is because I'd
# rather see results compressed down to have one row per forecast task (not one
# row per forecasted quantile value) and it's easier to use dplyr::summarize()
results <- evaluate_predictions(
  predictions_cards = rbind(pred_baseline, pred_quantgen1, pred_quantgen2),
  err_measures = list(temp = function(quantile, value, actual) NA)) %>%
  select(geo_value, ahead, quantile, forecaster, forecast_date, target_end_date,
         value, actual)

# Overwrite evalcast::absolute_error() and evalcast::weighted_interval_score(), 
# because I was having some problems with them, see issues #392 and #393 on the
# cmu-delphi/covidcast repo. (This might have been fixed now but I'm still just 
# leaving these function definitions in place to be safe)
absolute_error <- function(tau, value, actual) {
  return(abs(actual - value)[tau == 0.5])
}

weighted_interval_score <- function(tau, value, actual) {
  return((absolute_error(tau, value, actual) / 2 +
         sum(pmax(tau * (actual - value),
         (tau - 1) * (actual - value), na.rm = TRUE))) /
         (length(tau) + 1) * 2)
}

# Do the evaluations ourselves
evals <- results %>%
  filter(!is.na(quantile)) %>% # NAs are problematic for simple my functions
  group_by(geo_value, ahead, forecaster, forecast_date, target_end_date) %>% 
  summarize(ae = absolute_error(quantile, value, actual),
            wis = weighted_interval_score(quantile, value, actual)) %>%
    ungroup()
            
# Save everything to file
save(list = ls(), file = "quantgen-forecast-cv.rda", compress = "xz")
dshemetov/modeltools-mirror documentation built on Jan. 7, 2022, 12:23 a.m.