In reichlab/cdcForecastUtils: Utility functions for CDC forecasts

knitr::opts_chunk$set(message=FALSE, warning=FALSE)

An example of cdcForecastUtils for the sarimaTD model at the regional level

We will use the following packages:

library(sarimaTD)
library(cdcForecastUtils)
library(lubridate)
library(dplyr)
library(ggplot2)

Please note that the sarimaTD and cdcForecastUtils packages can be installed by running the following code:

devtools::install_github("reichlab/sarimaTD")
devtools::install_github("reichlab/cdcForecastUtils")

Model-specific helper function

We define a function that creates a matrix of forecast trajectories from a SARIMA model for a single geographic location. This will look different for different models. Here, the basic steps are:

Subset to the data for a single state (this model looks at one state at a time)
Fit a sarima model to the data for that state
Determine the forecast_horizon: how many steps ahead we need to forecast. This will depend on the targets we are interested in and where we are in the current season.
Simulate "trajectories" of incidence. At this point, we have an nsims by forecast_horizon matrix, where each row is one simulated trajectory of forecasted incidence after the most recent ILINet report.
Prepend reported data. Here we prepend the most recent values from ILINet. Note that this does not make any adjustments for possible future revisions to the ILINet data, sometimes referred to as ``backfill''.

#' Function to fit a sarimaTD model and generate predictions for a single
#' state.
#'
#' @param nsim number of simulated trajectories to generate
#' @param location name of state
#' @param flu_data dataframe of ILINet data as returned by 
#'    cdcForecastUtils::download_and_preprocess_state_flu_data or
#'    cdcForecastUtils::download_and_preprocess_flu_data
#' @param target_variable character specifying the variable in flu_data that we
#'    are forecasting: "unweighted_ili" or "weighted_ili"
#' @param season_start_ew: Epidemic week for start of season,
#'    in format "2020-ew10"
#' @param season_end_ew: Epidemic week for start of season,
#'    in format "2020-ew35"
#' @param cdc_report_ew: Epidemic week for most recent ILINet report,
#'    in format "2020-ew10"
#' @param targets: character vector of targets to forecast
#'
#' @return nsims by forercast_horizon matrix of combined reported ILINet data
#'    (for past weeks) and simulated values for times after the most recent
#'    ILINet report.
get_trajectories_one_location <- function(
  nsim,
  location,
  flu_data,
  target_variable,
  season_start_ew,
  season_end_ew,
  cdc_report_ew,
  targets) {
  # subset to location data
  location_data <- flu_data[flu_data$region == location,]

  # fit sarima model -- model-specific code
  sarima_fit <- sarimaTD::fit_sarima(tail(location_data[[target_variable]], 100),
    ts_frequency = 1,seasonal_difference = F)

  # determine forecast horizon
  # get_required_forecast_horizon is provided by the cdcForecastUtils package
  forecast_horizon <- get_required_forecast_horizon(
    targets = targets,
    h_max = 6,
    season_end_ew = season_end_ew,
    cdc_report_ew = cdc_report_ew
  )

  # predictions -- model-specific code
  preds <- simulate(
    object = sarima_fit,
    nsim = nsim,
    seed = 1,
    newdata = location_data[[target_variable]],
    h = forecast_horizon
  )

  # prepend observed data
  time_from_start_of_season <- get_time_from_start_of_season(season_start_ew, cdc_report_ew)
  trajectory_matrix <- cbind(
    matrix(
      rep(tail(location_data[[target_variable]], time_from_start_of_season), nsim),
      nrow = nsim,
      byrow = TRUE),
    preds)

  # this demonstration model isn't that great, and sometimes generates
  # predictions outside the range of valid bins.
  # Here we truncate them to be between 0 and 100
  trajectory_matrix[trajectory_matrix < 0.0] <- 0.0
  trajectory_matrix[trajectory_matrix > 100] <- 99.9
  trajectory_matrix[is.nan(trajectory_matrix)] <- 0.0
  trajectory_matrix[is.infinite(trajectory_matrix)] <- 0.0

  return(trajectory_matrix)
}

Load data and set up

First we get the ILI data for the regions

flu_data <- download_and_preprocess_flu_data() %>%
  mutate(
    region = ifelse(
      region == "National",
      "US National",
      paste0("HHS ", region)
    )
  )

Next, we define the date parameters we need for the current challenge.

# Epidemic weeks for season start, season end, and most recent ILINet report
season_start_ew <- "2020-ew10"
season_end_ew <- "2020-ew35"
cdc_report_ew <- get_current_date_from_flu_data(flu_data)

We also define a list of targets relevant to the spatial scale we are forecasting.

targets <-  c("wk ahead", "Peak height", "Peak week",
  "First week below baseline", "Below baseline for 3 weeks")

Assemble simulated incidence trajectories across all locations of interest

We now call the function defined above once for each location, and assemble the resulting matrices in a tibble. This is the required input to the multi_trajectories_to_binned_distributions function below.

trajectories_by_location <- tibble(
  location = c("HHS Region 1", "US National")
) %>%
  mutate(
    trajectories = purrr::map(
      location,
      get_trajectories_one_location,
      nsim = 1000,
      flu_data = flu_data,
      target_variable = "weighted_ili",
      season_start_ew = season_start_ew,
      season_end_ew = season_end_ew,
      cdc_report_ew = cdc_report_ew,
      targets = targets)
  )

trajectories_by_location

The trajectories_by_location object is a tibble with the trajectories column being a list of matrices. The first component of this list is a 1000 by 28 matrix of incidence trajectories for Region 1:

1000 is the number of simulated trajectories we generated
To generate predictions for the "1 wk ahead", ..., "6 wk ahead", "Peak height", "Peak week", "First week below baseline", and "Below baseline for 3 weeks" targets at this point in the season, we need simulated trajectories covering all 26 epidemic weeks between the season start and season end, as well as two extra weeks after the end of the season for the baseline targets (for example, incidence may drop below the baseline for the first time on the last week of the "season" and remain below baseline for the two following weeks.).

Convert incidence trajectories to a submission data frame and output to csv

distributional_submission_df <- multi_trajectories_to_binned_distributions(
  multi_trajectories = trajectories_by_location,
  targets = targets,
  h_max = 6,
  bins = c(seq(0, 25, by = .1), 100),
  season_start_ew = season_start_ew,
  season_end_ew = season_end_ew,
  cdc_report_ew = cdc_report_ew)

Finally, we add the point forecasts and verify

distributional_submission_df %>%
  distinct(location, target) %>%
  as.data.frame()
head(distributional_submission_df)

distributional_submission_df <- sanitize_entry(distributional_submission_df)
point_forecasts <- generate_point_forecasts(distributional_submission_df,method="Median")

submission_df <- rbind(distributional_submission_df, point_forecasts)

submission_df$location <- as.factor(submission_df$location)
verify_entry(submission_df,challenge='ilinet')

density_plots <- get_viz_from_submission_df(submission_df)
density_plots[[1]]
density_plots[[2]]
density_plots[[8]]

Plot to verify that simulated trajectories line up with intervals derived from submission df

plot_trajectories_and_intervals(
  flu_data = flu_data,
  target_variable = "weighted_ili",
  trajectories_by_location = trajectories_by_location,
  submission = submission_df,
  season_start_ew = season_start_ew,
  season_end_ew = season_end_ew,
  cdc_report_ew = cdc_report_ew
)