knitr::opts_chunk$set(message=FALSE, warning=FALSE)
We will use the following packages:
library(sarimaTD) library(cdcForecastUtils) library(lubridate) library(dplyr) library(ggplot2)
Please note that the sarimaTD
and cdcForecastUtils
packages can be installed by running the following code:
devtools::install_github("reichlab/sarimaTD") devtools::install_github("reichlab/cdcForecastUtils")
We define a function that creates a matrix of forecast trajectories from a SARIMA model for a single geographic location. This will look different for different models. Here, the basic steps are:
forecast_horizon
: how many steps ahead we need to forecast. This will depend on the targets we are interested in and where we are in the current season.nsims
by forecast_horizon
matrix, where each row is one simulated trajectory of forecasted incidence after the most recent ILINet report.#' Function to fit a sarimaTD model and generate predictions for a single #' state. #' #' @param nsim number of simulated trajectories to generate #' @param location name of state #' @param flu_data dataframe of ILINet data as returned by #' cdcForecastUtils::download_and_preprocess_state_flu_data or #' cdcForecastUtils::download_and_preprocess_flu_data #' @param target_variable character specifying the variable in flu_data that we #' are forecasting: "unweighted_ili" or "weighted_ili" #' @param season_start_ew: Epidemic week for start of season, #' in format "2020-ew10" #' @param season_end_ew: Epidemic week for start of season, #' in format "2020-ew35" #' @param cdc_report_ew: Epidemic week for most recent ILINet report, #' in format "2020-ew10" #' @param targets: character vector of targets to forecast #' #' @return nsims by forercast_horizon matrix of combined reported ILINet data #' (for past weeks) and simulated values for times after the most recent #' ILINet report. get_trajectories_one_location <- function( nsim, location, flu_data, target_variable, season_start_ew, season_end_ew, cdc_report_ew, targets) { # subset to location data location_data <- flu_data[flu_data$region == location,] # fit sarima model -- model-specific code sarima_fit <- sarimaTD::fit_sarima(tail(location_data[[target_variable]], 100), ts_frequency = 1,seasonal_difference = F) # determine forecast horizon # get_required_forecast_horizon is provided by the cdcForecastUtils package forecast_horizon <- get_required_forecast_horizon( targets = targets, h_max = 6, season_end_ew = season_end_ew, cdc_report_ew = cdc_report_ew ) # predictions -- model-specific code preds <- simulate( object = sarima_fit, nsim = nsim, seed = 1, newdata = location_data[[target_variable]], h = forecast_horizon ) # prepend observed data time_from_start_of_season <- get_time_from_start_of_season(season_start_ew, cdc_report_ew) trajectory_matrix <- cbind( matrix( rep(tail(location_data[[target_variable]], time_from_start_of_season), nsim), nrow = nsim, byrow = TRUE), preds) # this demonstration model isn't that great, and sometimes generates # predictions outside the range of valid bins. # Here we truncate them to be between 0 and 100 trajectory_matrix[trajectory_matrix < 0.0] <- 0.0 trajectory_matrix[trajectory_matrix > 100] <- 99.9 trajectory_matrix[is.nan(trajectory_matrix)] <- 0.0 trajectory_matrix[is.infinite(trajectory_matrix)] <- 0.0 return(trajectory_matrix) }
First we get the ILI data for the regions
flu_data <- download_and_preprocess_flu_data() %>% mutate( region = ifelse( region == "National", "US National", paste0("HHS ", region) ) )
Next, we define the date parameters we need for the current challenge.
# Epidemic weeks for season start, season end, and most recent ILINet report season_start_ew <- "2020-ew10" season_end_ew <- "2020-ew35" cdc_report_ew <- get_current_date_from_flu_data(flu_data)
We also define a list of targets relevant to the spatial scale we are forecasting.
targets <- c("wk ahead", "Peak height", "Peak week", "First week below baseline", "Below baseline for 3 weeks")
We now call the function defined above once for each location, and assemble the resulting matrices in a tibble. This is the required input to the multi_trajectories_to_binned_distributions
function below.
trajectories_by_location <- tibble( location = c("HHS Region 1", "US National") ) %>% mutate( trajectories = purrr::map( location, get_trajectories_one_location, nsim = 1000, flu_data = flu_data, target_variable = "weighted_ili", season_start_ew = season_start_ew, season_end_ew = season_end_ew, cdc_report_ew = cdc_report_ew, targets = targets) )
trajectories_by_location
The trajectories_by_location
object is a tibble with the trajectories
column being a list of matrices. The first component of this list is a 1000 by 28 matrix of incidence trajectories for Region 1:
distributional_submission_df <- multi_trajectories_to_binned_distributions( multi_trajectories = trajectories_by_location, targets = targets, h_max = 6, bins = c(seq(0, 25, by = .1), 100), season_start_ew = season_start_ew, season_end_ew = season_end_ew, cdc_report_ew = cdc_report_ew)
Finally, we add the point forecasts and verify
distributional_submission_df %>% distinct(location, target) %>% as.data.frame() head(distributional_submission_df) distributional_submission_df <- sanitize_entry(distributional_submission_df) point_forecasts <- generate_point_forecasts(distributional_submission_df,method="Median") submission_df <- rbind(distributional_submission_df, point_forecasts) submission_df$location <- as.factor(submission_df$location) verify_entry(submission_df,challenge='ilinet') density_plots <- get_viz_from_submission_df(submission_df) density_plots[[1]] density_plots[[2]] density_plots[[8]]
plot_trajectories_and_intervals( flu_data = flu_data, target_variable = "weighted_ili", trajectories_by_location = trajectories_by_location, submission = submission_df, season_start_ew = season_start_ew, season_end_ew = season_end_ew, cdc_report_ew = cdc_report_ew )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.