knitr::opts_chunk$set(message=FALSE, warning=FALSE)

An example of cdcForecastUtils for the sarimaTD model at the regional level

We will use the following packages:

library(sarimaTD)
library(cdcForecastUtils)
library(lubridate)
library(dplyr)
library(ggplot2)

Please note that the sarimaTD and cdcForecastUtils packages can be installed by running the following code:

devtools::install_github("reichlab/sarimaTD")
devtools::install_github("reichlab/cdcForecastUtils")

Model-specific helper function

We define a function that creates a matrix of forecast trajectories from a SARIMA model for a single geographic location. This will look different for different models. Here, the basic steps are:

#' Function to fit a sarimaTD model and generate predictions for a single
#' state.
#'
#' @param nsim number of simulated trajectories to generate
#' @param location name of state
#' @param flu_data dataframe of ILINet data as returned by 
#'    cdcForecastUtils::download_and_preprocess_state_flu_data or
#'    cdcForecastUtils::download_and_preprocess_flu_data
#' @param target_variable character specifying the variable in flu_data that we
#'    are forecasting: "unweighted_ili" or "weighted_ili"
#' @param season_start_ew: Epidemic week for start of season,
#'    in format "2020-ew10"
#' @param season_end_ew: Epidemic week for start of season,
#'    in format "2020-ew35"
#' @param cdc_report_ew: Epidemic week for most recent ILINet report,
#'    in format "2020-ew10"
#' @param targets: character vector of targets to forecast
#'
#' @return nsims by forercast_horizon matrix of combined reported ILINet data
#'    (for past weeks) and simulated values for times after the most recent
#'    ILINet report.
get_trajectories_one_location <- function(
  nsim,
  location,
  flu_data,
  target_variable,
  season_start_ew,
  season_end_ew,
  cdc_report_ew,
  targets) {
  # subset to location data
  location_data <- flu_data[flu_data$region == location,]

  # fit sarima model -- model-specific code
  sarima_fit <- sarimaTD::fit_sarima(tail(location_data[[target_variable]], 100),
    ts_frequency = 1,seasonal_difference = F)

  # determine forecast horizon
  # get_required_forecast_horizon is provided by the cdcForecastUtils package
  forecast_horizon <- get_required_forecast_horizon(
    targets = targets,
    h_max = 6,
    season_end_ew = season_end_ew,
    cdc_report_ew = cdc_report_ew
  )

  # predictions -- model-specific code
  preds <- simulate(
    object = sarima_fit,
    nsim = nsim,
    seed = 1,
    newdata = location_data[[target_variable]],
    h = forecast_horizon
  )

  # prepend observed data
  time_from_start_of_season <- get_time_from_start_of_season(season_start_ew, cdc_report_ew)
  trajectory_matrix <- cbind(
    matrix(
      rep(tail(location_data[[target_variable]], time_from_start_of_season), nsim),
      nrow = nsim,
      byrow = TRUE),
    preds)

  # this demonstration model isn't that great, and sometimes generates
  # predictions outside the range of valid bins.
  # Here we truncate them to be between 0 and 100
  trajectory_matrix[trajectory_matrix < 0.0] <- 0.0
  trajectory_matrix[trajectory_matrix > 100] <- 99.9
  trajectory_matrix[is.nan(trajectory_matrix)] <- 0.0
  trajectory_matrix[is.infinite(trajectory_matrix)] <- 0.0

  return(trajectory_matrix)
}

Load data and set up

First we get the ILI data for the regions

flu_data <- download_and_preprocess_flu_data() %>%
  mutate(
    region = ifelse(
      region == "National",
      "US National",
      paste0("HHS ", region)
    )
  )

Next, we define the date parameters we need for the current challenge.

# Epidemic weeks for season start, season end, and most recent ILINet report
season_start_ew <- "2020-ew10"
season_end_ew <- "2020-ew35"
cdc_report_ew <- get_current_date_from_flu_data(flu_data)

We also define a list of targets relevant to the spatial scale we are forecasting.

targets <-  c("wk ahead", "Peak height", "Peak week",
  "First week below baseline", "Below baseline for 3 weeks")

Assemble simulated incidence trajectories across all locations of interest

We now call the function defined above once for each location, and assemble the resulting matrices in a tibble. This is the required input to the multi_trajectories_to_binned_distributions function below.

trajectories_by_location <- tibble(
  location = c("HHS Region 1", "US National")
) %>%
  mutate(
    trajectories = purrr::map(
      location,
      get_trajectories_one_location,
      nsim = 1000,
      flu_data = flu_data,
      target_variable = "weighted_ili",
      season_start_ew = season_start_ew,
      season_end_ew = season_end_ew,
      cdc_report_ew = cdc_report_ew,
      targets = targets)
  )
trajectories_by_location

The trajectories_by_location object is a tibble with the trajectories column being a list of matrices. The first component of this list is a 1000 by 28 matrix of incidence trajectories for Region 1:

Convert incidence trajectories to a submission data frame and output to csv

distributional_submission_df <- multi_trajectories_to_binned_distributions(
  multi_trajectories = trajectories_by_location,
  targets = targets,
  h_max = 6,
  bins = c(seq(0, 25, by = .1), 100),
  season_start_ew = season_start_ew,
  season_end_ew = season_end_ew,
  cdc_report_ew = cdc_report_ew)

Finally, we add the point forecasts and verify

distributional_submission_df %>%
  distinct(location, target) %>%
  as.data.frame()
head(distributional_submission_df)

distributional_submission_df <- sanitize_entry(distributional_submission_df)
point_forecasts <- generate_point_forecasts(distributional_submission_df,method="Median")

submission_df <- rbind(distributional_submission_df, point_forecasts)

submission_df$location <- as.factor(submission_df$location)
verify_entry(submission_df,challenge='ilinet')

density_plots <- get_viz_from_submission_df(submission_df)
density_plots[[1]]
density_plots[[2]]
density_plots[[8]]

Plot to verify that simulated trajectories line up with intervals derived from submission df

plot_trajectories_and_intervals(
  flu_data = flu_data,
  target_variable = "weighted_ili",
  trajectories_by_location = trajectories_by_location,
  submission = submission_df,
  season_start_ew = season_start_ew,
  season_end_ew = season_end_ew,
  cdc_report_ew = cdc_report_ew
)


reichlab/cdcForecastUtils documentation built on May 6, 2020, 10:43 a.m.