#' Download backfill data from the Delphi Epidata API
#' and save it to the data folder.
#' Note: we may want to re-run this periodically to make sure we are up to date.
download_backfill_data <- function(){
library(plyr) # for rbind.fill
library(dplyr)
source("https://raw.githubusercontent.com/cmu-delphi/delphi-epidata/master/src/client/delphi_epidata.R")
# Fetch data
all_obs <- lapply(c("nat", paste0("hhs", 1:10)),
function(region_val) {
lapply(1:51,
function(lag_val) {
obs_one_lag <- Epidata$fluview(
regions = list(region_val),
epiweeks = list(Epidata$range(199740, 201815)),
lag = list(lag_val))
lapply(obs_one_lag$epidata,
function(x) {
x[sapply(x, function(comp) is.null(comp))] <- NA
return(as.data.frame(x))
}) %>%
rbind.fill()
}) %>%
rbind.fill()
}) %>%
rbind.fill()
saveRDS(all_obs,
file = "data/flu_data_with_backfill.rds")
}
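## Illustrative sketch of the single Epidata query that download_backfill_data()
## builds on, kept as a comment so it is not run when this file is sourced;
## it assumes the Delphi Epidata client sourced above is available.
## one_query <- Epidata$fluview(
##   regions = list("nat"),
##   epiweeks = list(Epidata$range(201740, 201815)),
##   lag = list(2))
## str(one_query$epidata[[1]])  # list with epiweek, release_date, lag, wili, ...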
investigate_backfill <- function(){
library(dplyr)
library(ggplot2)
library(forecast)
library(cowplot)
### READ DATA AND PLOT
backfill_data <- readRDS("data/flu_data_with_backfill.rds")
# SUBSET NATIONAL DATA
backfill_data <- backfill_data[backfill_data$region=="nat",]
backfill_data$release_date <- as.Date(backfill_data$release_date)
backfill_data <- backfill_data[order(backfill_data$epiweek),]
#PLOT FULL DATA
## CREATE FULLY OBSERVED: rows within an epiweek are ordered by lag, so
## last(wili) is the most mature reported value for that epiweek
fully_observed_data <- backfill_data %>%
  group_by(region, epiweek) %>%
  summarize(wili = last(wili))
plot(fully_observed_data$wili, type = 'l')
## CREATE PARTIALLY OBSERVED
partially_observed_data <- backfill_data %>%
  group_by(release_date, region, epiweek) %>%
  summarize(wili = last(wili))
#DEFINE A SEQUENCE OF TEST YEARS
years <- seq(2003, 2005, by = 1)
delta_trajectories <- matrix(NA, nrow = length(years) * 35, ncol = 35)
row_iter <-1
for (j in 1:(length(years)-1)){
plot_epi_week <- as.numeric(paste(years[j+1],"19",sep=""))
partially_observed_week_dates <- seq(as.Date(paste(years[j],"-12-01",sep="")),as.Date(paste(years[j+1],"-6-01",sep="")),by=7)
fully_observed_wili <- fully_observed_data[fully_observed_data$epiweek<=plot_epi_week & fully_observed_data$epiweek >= as.numeric(paste(years[j],"40",sep="")),]$wili
for(i in 1:length(partially_observed_week_dates)){
partially_observed_week <- partially_observed_week_dates[i]
tmp <- partially_observed_data[partially_observed_data$epiweek >= as.numeric(paste(years[j],"40",sep="")) & partially_observed_data$epiweek<=plot_epi_week & partially_observed_data$release_date <= partially_observed_week,]
tmp <- tmp %>% group_by(region,epiweek) %>% summarize(wili = wili[which.max(release_date)])
p <- ggplot(tmp, aes(x = 1:nrow(tmp), y = wili - fully_observed_wili[1:length(tmp$wili)]), col = 'red') + ylab("Difference between currently reported and fully observed wILI") + xlab(paste("Time from", years[j], "40")) + geom_line() + ylim(-1.5, 1.5) + xlim(1, 35) + ggtitle(years[j]) + theme_bw()
#p <- p + geom_vline(xintercept=7,alpha=.4,linetype="dotted",col="red")
#ggsave(paste(years[j],"-",i,".pdf",sep=""),p,scale = 1.5)
tmp_trajectory <- tmp$wili-fully_observed_wili[1:length(tmp$wili)]
while (length(tmp_trajectory) <35){
tmp_trajectory <- c(tmp_trajectory,NA)
}
delta_trajectories[row_iter,] <-tmp_trajectory
row_iter <- row_iter +1
}
}
  ## return the matrix of backfill deltas for further inspection
  delta_trajectories
}
#' Create non-parametric matrix of backfill with dimensions
#' n_seasons x n_weeks x n_lags
#'
create_e_matrix <- function(){
  backfill_data <- readRDS("data/flu_data_with_backfill.rds")
  ## TODO: construct the n_seasons x n_weeks x n_lags backfill array described above;
  ## for now, plot reported wILI by lag for epiweeks before 2003 week 52
  ggplot(backfill_data[backfill_data$epiweek < 200352,],
    aes(x = lag, y = wili, col = factor(epiweek))) +
    geom_line() + xlab("Lag in Season Week")
}
#' Download and preprocess the latest CDC flu data, both national and regional
#'
#' @param latest_year year through which data should be downloaded, defaults to current year
#'
#' @return data frame with latest flu data, preprocessed
#' @export
download_and_preprocess_flu_data <- function(latest_year = as.numeric(format(Sys.Date(), "%Y"))) {
require(cdcfluview)
require(lubridate)
require(dplyr)
require(MMWRweek)
regionflu <- ilinet(region="hhs", years= 1997:latest_year)
regionflu$region <- as.character(regionflu$region)
usflu <- ilinet(region="national", years= 1997:latest_year)
flu_data <- bind_rows(regionflu, usflu)
flu_data <- transmute(flu_data,
region_type = region_type,
region = as.factor(region),
year = year,
week = week,
time = as.POSIXct(MMWRweek2Date(year, week)),
weighted_ili = weighted_ili)
## set zeroes to NAs
flu_data[which(flu_data$weighted_ili==0),"weighted_ili"] <- NA
## Add time_index column: the number of days since some origin date
## (1970-1-1 in this case). The origin is arbitrary.
flu_data$time_index <- as.integer(lubridate::date(flu_data$time) - ymd("1970-01-01"))
## Season column: for example, weeks of 2010 up through and including week 30
## get season 2009/2010; weeks after week 30 get season 2010/2011
## Official CDC flu season for the purposes of prediction runs from week 40 of
## one year to week 20 of the next; the season start week we define here is the
## mid-point of the "off-season"
flu_data$season <- ifelse(
flu_data$week <= 30,
paste0(flu_data$year - 1, "/", flu_data$year),
paste0(flu_data$year, "/", flu_data$year + 1)
)
## Season week column: week number within season
## weeks after week 30 get season_week = week - 30
## weeks before week 30 get season_week = week + (number of weeks in previous year) - 30
## This computation relies on the start_date function in package MMWRweek,
## which is not exported from that package's namespace!!!
flu_data$season_week <- ifelse(
flu_data$week <= 30,
flu_data$week + MMWRweek(MMWRweek:::start_date(flu_data$year) - 1)$MMWRweek - 30,
flu_data$week - 30
)
flu_data <- as.data.frame(flu_data)
return(flu_data)
}
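## Illustrative sketch of the season/season_week columns computed above, kept
## as a comment so it is not run when this file is sourced; it assumes the CDC
## FluView API is reachable via cdcfluview.
## flu_data <- download_and_preprocess_flu_data(latest_year = 2018)
## ## MMWR week 52 of 2010 maps to season "2010/2011", season_week 22;
## ## MMWR week 5 of 2011 maps to season "2010/2011", season_week 27 (2010 had 52 MMWR weeks)
## head(flu_data[, c("region", "year", "week", "season", "season_week", "weighted_ili")])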
#' Download and preprocess the latest CDC flu data, state-level
#'
#' @param latest_year year through which data should be downloaded, defaults to current year
#'
#' @return data frame with latest state-level flu data, preprocessed
#' @export
download_and_preprocess_state_flu_data <- function(latest_year = as.numeric(format(Sys.Date(), "%Y"))) {
require(cdcfluview)
require(MMWRweek)
require(dplyr)
require(lubridate)
flu_data_raw <- ilinet(region="state", years=1997:latest_year)
flu_data <- mutate(flu_data_raw, time = as.POSIXct(MMWRweek2Date(year, week)))
## set rows with denominator zeroes to NAs
flu_data[which(flu_data$total_patients==0),"weighted_ili"] <- NA
## Add time_index column: the number of days since some origin date
## (1970-1-1 in this case). The origin is arbitrary.
flu_data$time_index <- as.integer(lubridate::date(flu_data$time) - ymd("1970-01-01"))
## Season column: for example, weeks of 2010 up through and including week 30
## get season 2009/2010; weeks after week 30 get season 2010/2011
## Official CDC flu season for the purposes of prediction runs from week 40 of
## one year to week 20 of the next; the season start week we define here is the
## mid-point of the "off-season"
flu_data$season <- ifelse(
flu_data$week <= 30,
paste0(flu_data$year - 1, "/", flu_data$year),
paste0(flu_data$year, "/", flu_data$year + 1)
)
## Season week column: week number within season
## weeks after week 30 get season_week = week - 30
## weeks before week 30 get season_week = week + (number of weeks in previous year) - 30
## This computation relies on the start_date function in package MMWRweek,
## which is not exported from that package's namespace!!!
flu_data$season_week <- ifelse(
flu_data$week <= 30,
flu_data$week + MMWRweek(MMWRweek:::start_date(flu_data$year) - 1)$MMWRweek - 30,
flu_data$week - 30
)
state_flu <- as.data.frame(flu_data)
return(state_flu)
}
#' Download and preprocess the latest CDC hospitalization data
#'
#' @param latest_year year through which data should be downloaded, defaults to current year
#'
#' @return data frame with latest hospitalization data, preprocessed
#' @export
download_and_preprocess_hosp_data <- function(latest_year = as.numeric(format(Sys.Date(), "%Y"))) {
require(cdcfluview)
require(MMWRweek)
require(dplyr)
require(lubridate)
flu_data_raw_hosp <-cdcfluview::hospitalizations(years=2009:latest_year)
flu_data_raw_hosp$week <- flu_data_raw_hosp$year_wk_num
flu_data <- mutate(flu_data_raw_hosp, time = as.POSIXct(MMWRweek2Date(year, week)))
## set rows with denominator zeroes to NAs
#flu_data[which(flu_data$total_patients==0),"weighted_ili"] <- NA
## Add time_index column: the number of days since some origin date
## (1970-1-1 in this case). The origin is arbitrary.
flu_data$time_index <- as.integer(lubridate::date(flu_data$time) - ymd("1970-01-01"))
## Season column: for example, weeks of 2010 up through and including week 30
## get season 2009/2010; weeks after week 30 get season 2010/2011
## Official CDC flu season for the purposes of prediction runs from week 40 of
## one year to week 20 of the next; the season start week we define here is the
## mid-point of the "off-season"
flu_data$season <- ifelse(
flu_data$week <= 30,
paste0(flu_data$year - 1, "/", flu_data$year),
paste0(flu_data$year, "/", flu_data$year + 1)
)
## Season week column: week number within season
## weeks after week 30 get season_week = week - 30
## weeks before week 30 get season_week = week + (number of weeks in previous year) - 30
## This computation relies on the start_date function in package MMWRweek,
## which is not exported from that package's namespace!!!
flu_data$season_week <- ifelse(
flu_data$week <= 30,
flu_data$week + MMWRweek(MMWRweek:::start_date(flu_data$year) - 1)$MMWRweek - 30,
flu_data$week - 30
)
hosp_flu <- as.data.frame(flu_data)
return(hosp_flu)
}
#' Utility function to compute onset week based on a trajectory of incidence values
#'
#' @param incidence_trajectory a numeric vector with incidence for each time
#' point in a season
#' @param baseline the threshold that incidence must cross to count as an onset
#' @param onset_length number of consecutive time points that incidence must
#' exceed the baseline threshold in order to count as the season onset
#' @param first_season_week week of the year corresponding to the first
#' week in the season. For example, our code takes this value to be 31:
#' a new influenza season starts on the 31st week of each year.
#' @param weeks_in_first_season_year How many MMWR weeks are in the first year
#' of the season? For example, in the 2000/2001 season, the first year is
#' 2000. There were 52 MMWR weeks in 2000.
#'
#' @return the smallest index i such that every entry of
#' incidence_trajectory[seq(from = i, length = onset_length)]
#' is >= baseline, if such an index exists.
#' Otherwise, the character vector "none"
#'
#' @export
get_onset_week <- function(incidence_trajectory,
baseline,
onset_length,
first_season_week = 31,
weeks_in_first_season_year) {
exceeded_threshold <- sapply(
seq_len(length(incidence_trajectory) - onset_length),
function(start_ind) {
above_baseline <- incidence_trajectory[seq(from = start_ind, length = onset_length)] >= baseline
length(above_baseline)>0 &&
all(above_baseline) &&
!all(is.na(incidence_trajectory))
}
)
if(any(exceeded_threshold, na.rm = TRUE)) {
season_week <- min(which(exceeded_threshold))
return(season_week)
} else {
return("none")
}
}
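## Illustrative sketch (commented out, values assumed): with baseline 2.2 and
## onset_length 3, the first run of three consecutive values at or above the
## baseline starts at index 3, so the function returns 3.
## get_onset_week(
##   incidence_trajectory = c(1.0, 1.4, 2.3, 2.6, 3.1, 2.8, 1.9),
##   baseline = 2.2,
##   onset_length = 3L,
##   first_season_week = 31,
##   weeks_in_first_season_year = 52)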
#' Convert season week to year week
#'
#' @param season_week vector of indices of weeks in season
#' @param first_season_week week of the year corresponding to the first
#' week in the season. For example, our code takes this value to be 31:
#' a new influenza season starts on the 31st week of each year.
#' @param weeks_in_first_season_year How many MMWR weeks are in the first year
#' of the season? For example, in the 2000/2001 season, the first year is
#' 2000. There were 52 MMWR weeks in 2000.
#'
#' @return vector of the same length as season_week with the week of the year
#' that each observation falls on
#'
#' @export
season_week_to_year_week <- function(
season_week,
first_season_week = 31,
weeks_in_first_season_year) {
year_week <- season_week
## For competition first bin is week 40
year_week[season_week < 10] <- 40
year_week[season_week >= 10] <- season_week[season_week >= 10] + first_season_week - 1
year_week[year_week > weeks_in_first_season_year] <-
year_week[year_week > weeks_in_first_season_year] -
weeks_in_first_season_year
return(year_week)
}
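## Illustrative sketch (commented out, values assumed): in a 52-week first
## season year, season weeks 10, 20, and 25 map to MMWR weeks 40, 50, and 3
## (week 55 wraps into the following calendar year).
## season_week_to_year_week(c(10, 20, 25),
##   first_season_week = 31,
##   weeks_in_first_season_year = 52)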
#' Convert year week to season week
#'
#' @param year_week vector of indices of weeks in year
#' @param year either a single (four digit) year or a vector of years with the
#' same length as year_week
#'
#' @return vector of the same length as year_week with the week of the season
#' that each observation falls on
#'
#' @export
year_week_to_season_week <- function(
year_week,
year) {
season_week <- ifelse(
year_week <= 30,
year_week + MMWRweek::MMWRweek(MMWRweek:::start_date(year) - 1)$MMWRweek - 30,
year_week - 30
)
return(season_week)
}
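## Illustrative sketch (commented out, values assumed): for the 1999/2000
## season (1999 had 52 MMWR weeks), MMWR week 40 of 1999 is season week 10 and
## MMWR week 3 of 2000 is season week 25.
## year_week_to_season_week(40, year = 1999)  # 10
## year_week_to_season_week(3, year = 2000)   # 25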
#' Compute season onset, peak week, and peak incidence
#'
#' @param data a data frame containing at minimum columns named season,
#' season_week and a column with some sort of incidence measure
#' @param season the season to look at
#' @param first_CDC_season_week the first week of the season to use for
#' calculating onset and peak
#' @param last_CDC_season_week the last week of the season to use for
#' calculating onset and peak
#' @param onset_baseline numeric baseline value for determining season onset
#' @param incidence_var a character string naming the variable in the data
#' argument containing a measure of incidence, or an integer index
#' @param incidence_bins a data frame with variables lower and upper defining
#' lower and upper endpoints to use in binning incidence
#' @param incidence_bin_names a character vector with a name for each incidence
#' bin
#'
#' @return a list with four entries:
#' 1) observed_onset_week, either an integer between first_CDC_season_week
#' and last_CDC_season_week (inclusive), or "none"
#' 2) observed_peak_week, an integer between first_CDC_season_week and
#' last_CDC_season_week (inclusive)
#' 3) observed_peak_inc, a numeric with the maximum value of the specified
#' incidence measure between first_CDC_season_week and last_CDC_season_week
#' 4) observed_peak_inc_bin, character name of incidence bin for peak incidence
#'
#' @export
get_observed_seasonal_quantities <- function(
data,
season,
first_CDC_season_week = 10,
last_CDC_season_week = 42,
onset_baseline,
incidence_var,
incidence_bins,
incidence_bin_names
) {
first_season_ind <- min(which(data$season == season))
last_season_ind <- max(which(data$season == season))
obs_inc_in_season_leading_trailing_nas <-
data[seq(from = first_season_ind, to = last_season_ind),
incidence_var]
## pad so that we start at season week 1
if(data$season_week[first_season_ind] != 1) {
obs_inc_in_season_leading_trailing_nas <- c(
rep(NA, data$season_week[first_season_ind] - 1),
obs_inc_in_season_leading_trailing_nas)
}
## set values before first analysis time season week or after last
## analysis time season week to NA
## these are outside of the bounds of the season the CDC wants to look at
obs_inc_in_season_leading_trailing_nas[
seq_len(first_CDC_season_week - 1)] <- NA
if(length(obs_inc_in_season_leading_trailing_nas) >
last_CDC_season_week) {
obs_inc_in_season_leading_trailing_nas[
seq(from = last_CDC_season_week + 1,
to = length(obs_inc_in_season_leading_trailing_nas))] <- NA
}
observed_peak_inc <- max(
obs_inc_in_season_leading_trailing_nas,
na.rm = TRUE)
observed_peak_inc_bin <- get_inc_bin(observed_peak_inc, return_character = TRUE)
## peak week timing is based on rounded values
round_to_.1 <- function(inc_val) {
if(is.na(inc_val)) {
return(inc_val)
} else {
floor_val <- floor(inc_val * 10) / 10
if(inc_val >= floor_val + 0.05) {
return(floor_val + 0.1)
} else {
return(floor_val)
}
}
}
rounded_observed_peak_inc <- round_to_.1(observed_peak_inc)
rounded_obs_inc_in_season <- sapply(obs_inc_in_season_leading_trailing_nas,
round_to_.1
)
observed_peak_week <-
which(rounded_obs_inc_in_season == as.numeric(rounded_observed_peak_inc))
weeks_in_first_season_year <- get_num_MMWR_weeks_in_first_season_year(season)
observed_onset_week <- get_onset_week(
incidence_trajectory = rounded_obs_inc_in_season,
# incidence_trajectory = obs_inc_in_season_leading_trailing_nas, # used in stable method
baseline = onset_baseline,
onset_length = 3L,
first_season_week = 31,
weeks_in_first_season_year = weeks_in_first_season_year
)
return(list(observed_onset_week = observed_onset_week,
observed_peak_week = observed_peak_week,
observed_peak_inc = observed_peak_inc,
observed_peak_inc_bin = observed_peak_inc_bin
))
}
#' Compute season onset, peak week, and peak incidence
#'
#' @param data a data frame containing at minimum columns named season,
#' season_week and a column with some sort of incidence measure
#' @param season the season to look at
#' @param incidence_var a character string naming the variable in the data
#' argument containing a measure of incidence, or an integer index
#'
#' @return a list with four entries:
#' 1) observed_onset_week, either an integer between first_CDC_season_week
#' and last_CDC_season_week (inclusive), or "none"
#' 2) observed_peak_week, an integer between first_CDC_season_week and
#' last_CDC_season_week (inclusive)
#' 3) observed_peak_inc, a numeric with the maximum value of the specified
#' incidence measure between first_CDC_season_week and last_CDC_season_week
#' 4) observed_peak_inc_bin, character name of incidence bin for peak incidence
#'
#' @export
get_official_observed_seasonal_quantities <- function(
data,
season,
incidence_var
) {
require(FluSight)
first_season_ind <- min(which(data$season == season))
last_season_ind <- max(which(data$season == season))
results <- FluSight::create_truth(fluview = FALSE,
year = substr(season, 1, 4),
weekILI = data,
challenge = "ilinet")
  ## TODO: extract observed_onset_week, observed_peak_week, observed_peak_inc,
  ## and observed_peak_inc_bin from the truth data frame; for now return the
  ## output of FluSight::create_truth directly
  return(results)
}
#' Calculate "log scores" for the purpose of the competition -- log[sum_i(p_i)] where p_i is the model's
#' probability of bin i and i runs over some bins adjacent to the bin where the observed quantity was.
#'
#' @param bin_log_probs named numeric vector with log probability of each bin;
#' names identify the bins
#' @param observed_bin character vector with name(s) of the observed bin(s)
#' (Note that peak can occur in multiple bins)
#' @param prediction_target character string naming the prediction target: one of
#'   "onset_week", "peak_week", "peak_inc", or "ph1_inc" through "ph4_inc"
#'
#' @return log score for the given observation
#'
#' @export
compute_competition_log_score <- function(bin_log_probs,
observed_bin,
prediction_target) {
## validate probabilities sum to 1 and if not, force them to, with warning
if(!isTRUE(all.equal(sum(exp(bin_log_probs)), 1))) {
warning(paste(prediction_target, "probabilities do not sum to 1. automatically adjusting."))
bin_probs <- exp(bin_log_probs)
bin_log_probs <- log(bin_probs/sum(bin_probs))
}
## validate bin names match expected for prediction_target and
## observed_bin has appropriate length
if(identical(prediction_target, "onset_week")) {
expected_bin_names_52 <- c(as.character(10:42), "none")
expected_bin_names_53 <- c(as.character(10:43), "none")
if(!identical(length(observed_bin), 1L) || !identical(typeof(observed_bin), "character")) {
stop("For prediction target onset_week, observed_bin must be a character vector of length 1")
}
if(identical(sort(expected_bin_names_52), sort(names(bin_log_probs)))) {
expected_bin_names <- expected_bin_names_52
} else if(identical(sort(expected_bin_names_53), sort(names(bin_log_probs)))) {
expected_bin_names <- expected_bin_names_53
} else {
stop("invalid names for the vector bin_log_probs")
}
} else if(identical(prediction_target, "peak_week")) {
expected_bin_names_52 <- as.character(10:42)
expected_bin_names_53 <- as.character(10:43)
if(length(observed_bin) == 0 || !identical(typeof(observed_bin), "character")) {
stop("For prediction target onset_week, observed_bin must be a character vector of length > 0")
}
if(identical(sort(expected_bin_names_52), sort(names(bin_log_probs)))) {
expected_bin_names <- expected_bin_names_52
} else if(identical(sort(expected_bin_names_53), sort(names(bin_log_probs)))) {
expected_bin_names <- expected_bin_names_53
} else {
stop("invalid names for the vector bin_log_probs")
}
} else if(prediction_target %in% c("peak_inc", paste0("ph", 1:4, "_inc"))) {
expected_bin_names <- as.character(seq(from = 0, to = 13, by = 0.1))
if(!identical(length(observed_bin), 1L) || !identical(typeof(observed_bin), "character")) {
stop("For given prediction target, observed_bin must be a character vector of length 1")
}
if(!identical(sort(expected_bin_names), sort(names(bin_log_probs)))) {
stop("invalid names for the vector bin_log_probs")
}
} else {
stop("Invalid value for prediction_target: must be 'onset_week', ")
}
## validate observed_bin is one of the expected_bin_names
if(!(all(observed_bin %in% expected_bin_names))) {
stop(paste0(
"observed_bin must be one of (",
paste(expected_bin_names, collapse = ", "),
")"
))
}
## get bins to sum over
obs_bin_inds <- sapply(observed_bin, function(bin_name) {
which(expected_bin_names == bin_name)
})
if(identical(prediction_target, "onset_week")) {
if(identical(observed_bin, "none")) {
bins_to_sum <- obs_bin_inds
} else {
bins_to_sum <- obs_bin_inds + seq(from = -1, to = 1, by = 1)
bins_to_sum <- bins_to_sum[
bins_to_sum >= 1 & bins_to_sum <= length(expected_bin_names) - 1]
}
} else if(identical(prediction_target, "peak_week")) {
bins_to_sum <- unique(as.vector(sapply(obs_bin_inds, function(bin_ind) {
bin_ind + seq(from = -1, to = 1, by = 1)
})))
bins_to_sum <- bins_to_sum[
bins_to_sum >= 1 & bins_to_sum <= length(expected_bin_names)]
} else if(prediction_target %in% c("peak_inc", paste0("ph", 1:4, "_inc"))) {
bins_to_sum <- obs_bin_inds + seq(from = -5, to = 5)
bins_to_sum <- bins_to_sum[
bins_to_sum >= 1 & bins_to_sum <= length(expected_bin_names)]
}
## Do summation
## Futz around with bin names because order of expected bin names may not
## match order of bin_log_probs
bin_names_to_sum <- expected_bin_names[bins_to_sum]
log_prob <- logspace_sum(bin_log_probs[bin_names_to_sum])
## They truncate at -10
log_prob <- max(-10, log_prob)
## return
return(log_prob)
}
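## Illustrative sketch (commented out, values assumed): for a peak incidence
## target with a uniform forecast over the 131 bins from 0 to 13, the
## competition score sums the probability in the observed bin and the five bins
## on either side, giving log(11/131), roughly -2.48.
## unif_probs <- rep(1 / 131, 131)
## names(unif_probs) <- as.character(seq(from = 0, to = 13, by = 0.1))
## compute_competition_log_score(log(unif_probs),
##   observed_bin = "2.5",
##   prediction_target = "peak_inc")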
#' Get the onset baseline for a combination of region and season
#'
#' @param region a string, either "National", "Region k", or "Regionk" where
#' k in {1, ..., 10}
#' @param season a string, in the format "2015/2016"
#'
#' @return baseline value for determining season onset
#'
#' @export
get_onset_baseline <- function(region, season = "2015/2016") {
## pick baseline
## assumes region is either "National" or "Region k" format
reg_string <- ifelse(region=="National", "National", gsub(" ", "", region))
idx <- which(flu_onset_baselines$region==reg_string &
flu_onset_baselines$season==season)
reg_baseline <- flu_onset_baselines[idx, "baseline"]
return(reg_baseline)
}
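## Illustrative sketch (commented out): looks up the CDC onset baseline from
## the packaged flu_onset_baselines data, assuming that object is available.
## get_onset_baseline("Region 3", season = "2015/2016")
## get_onset_baseline("National", season = "2015/2016")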
#' Helper function to standardize creation of empty prediction dataset
#'
#' @param data the data to be used
#' @param model_name the name of the model
#' @param incidence_bin_names character vector of names for incidence bins
#' @param first_analysis_time_season_week the first week during each season in
#' which we will form a prediction
#' @param last_analysis_time_season_week the last week during each season in
#' which we will form a prediction
#'
#' @return the prediction_df filled with NAs
#'
#' @export
make_predictions_dataframe <- function(data,
model_name,
incidence_bin_names,
first_analysis_time_season_week = 10,
last_analysis_time_season_week = 41) {
## allocate more than enough space up front,
## delete extra later
na_vec <- rep(NA,
(last_analysis_time_season_week - first_analysis_time_season_week + 1)
)
onset_week_bins <- c(as.character(10:42), "none")
peak_week_bins <- as.character(10:42)
predictions_df <- cbind(
data.frame(
model = model_name,
analysis_time_season = na_vec,
analysis_time_season_week = na_vec,
prediction_week_ph_1 = na_vec,
prediction_week_ph_2 = na_vec,
prediction_week_ph_3 = na_vec,
prediction_week_ph_4 = na_vec,
onset_log_score = na_vec,
peak_week_log_score = na_vec,
peak_inc_log_score = na_vec,
ph_1_inc_log_score = na_vec,
ph_2_inc_log_score = na_vec,
ph_3_inc_log_score = na_vec,
ph_4_inc_log_score = na_vec,
onset_competition_log_score = na_vec,
peak_week_competition_log_score = na_vec,
peak_inc_competition_log_score = na_vec,
ph_1_inc_competition_log_score = na_vec,
ph_2_inc_competition_log_score = na_vec,
ph_3_inc_competition_log_score = na_vec,
ph_4_inc_competition_log_score = na_vec,
stringsAsFactors = FALSE), # don't want model to be a factor with 1 level
as.data.frame(matrix(NA, nrow = length(na_vec), ncol = length(onset_week_bins))) %>%
`colnames<-`(paste0("onset_bin_", onset_week_bins, "_log_prob")),
as.data.frame(matrix(NA, nrow = length(na_vec), ncol = length(peak_week_bins))) %>%
`colnames<-`(paste0("peak_week_bin_", peak_week_bins, "_log_prob")),
as.data.frame(matrix(NA, nrow = length(na_vec), ncol = length(incidence_bin_names))) %>%
`colnames<-`(paste0("peak_inc_bin_", incidence_bin_names, "_log_prob")),
as.data.frame(matrix(NA, nrow = length(na_vec), ncol = length(incidence_bin_names))) %>%
`colnames<-`(paste0("ph_1_inc_bin_", incidence_bin_names, "_log_prob")),
as.data.frame(matrix(NA, nrow = length(na_vec), ncol = length(incidence_bin_names))) %>%
`colnames<-`(paste0("ph_2_inc_bin_", incidence_bin_names, "_log_prob")),
as.data.frame(matrix(NA, nrow = length(na_vec), ncol = length(incidence_bin_names))) %>%
`colnames<-`(paste0("ph_3_inc_bin_", incidence_bin_names, "_log_prob")),
as.data.frame(matrix(NA, nrow = length(na_vec), ncol = length(incidence_bin_names))) %>%
`colnames<-`(paste0("ph_4_inc_bin_", incidence_bin_names, "_log_prob"))
)
return(predictions_df)
}
#' Get log scores and full predictive distributions for each prediction target
#'
#' Computes predictions for a given season using a predictive method that works by simulating
#' trajectories of incidence in each remaining week of the season. Results are
#' stored in a data frame, saved in a .rds file with a name like
#' "model_name-region-season-loso-predictions.rds"
#' Results have columns indicating the analysis time season and season week,
#' model name, log scores for each prediction target, the "log score" used
#' in the competition (adding probabilities from adjacent bins) for each
#' prediction target, as well as the log of the probability assigned to each
#' bin.
#'
#' @param all_seasons_left_out character vector of seasons that were in the
#' training period, in the format "2000/2001"
#' @param analysis_time_season character vector of length 1 specifying the
#' season to obtain predictions for, in the format "2000/2001"
#' @param first_analysis_time_season_week integer specifying the first week of
#' the season in which to make predictions, using all data up to and
#' including that week to make predictions for each following week in the
#' season
#' @param last_analysis_time_season_week integer specifying the last week of
#' the season in which to make predictions, using all data up to and including
#' that week to make predictions for each following week in the season
#' @param region string specifying the region to use, in the format "X" or
#' "Region k" where k in {1, ..., 10}
#' @param prediction_target_var string specifying the name of the variable in
#' data for which we want to make predictions
#' @param incidence_bins a data frame with variables lower and upper defining
#' lower and upper endpoints to use in binning incidence
#' @param incidence_bin_names a character vector with a name for each incidence
#' bin
#' @param n_trajectory_sims integer number of trajectories to simulate
#' @param simulate_trajectories_function a function to call to simulate
#' incidence trajectories. It will be called with the following arguments:
#' * n_sims = number of trajectories to simulate
#' * max_prediction_horizon = number of following weeks to simulate
#' * data = all available data to use in doing simulation, up to and
#' including the analysis time
#' * region = region
#' * analysis_time_season = analysis_time_season
#' * analysis_time_season_week = week of the season at which we are making
#' the predictions
#' * params = simulate_trajectories_params; additional user-provided
#' parameters
#' @param simulate_trajectories_params optional additional parameters to pass
#' to simulate_trajectories_function
#' @param model_name name of model, stored in the results data frame
#' @param prediction_save_path path to directory where results will be saved
#'
#' @return none
#'
#' @export
get_log_scores_via_trajectory_simulation <- function(
all_seasons_left_out,
analysis_time_season,
first_analysis_time_season_week = 10, # == week 40 of year
last_analysis_time_season_week = 41, # analysis for 33-week season, consistent with flu competition -- at week 41, we do prediction for a horizon of one week ahead
region,
prediction_target_var,
incidence_bins,
incidence_bin_names,
n_trajectory_sims,
simulate_trajectories_function,
simulate_trajectories_params,
model_name,
prediction_save_path
) {
## Load data. The only reason to do this here is to know what the dimensions
## of the results data frame should be.
data <- read.csv("data-raw/allflu-cleaned.csv", stringsAsFactors = FALSE)
data$time <- as.POSIXct(data$time)
## subset data to be only the region of interest
data <- data[data$region == region,]
## data frame to describe predictions
## allocate more than enough space up front,
## delete extra later
predictions_df <- make_predictions_dataframe(
data = data,
model_name = model_name,
incidence_bin_names = incidence_bin_names,
    first_analysis_time_season_week = first_analysis_time_season_week,
    last_analysis_time_season_week = last_analysis_time_season_week)
results_save_row <- 1L
weeks_in_first_season_year <-
get_num_MMWR_weeks_in_first_season_year(analysis_time_season)
## make predictions for each prediction target in the left-out season
## for each possible "last observed" week, starting with the last week of the previous season
## get observed quantities related to overall season, for computing log scores
observed_seasonal_quantities <- get_observed_seasonal_quantities(
data = data,
season = analysis_time_season,
first_CDC_season_week = first_analysis_time_season_week,
last_CDC_season_week = last_analysis_time_season_week + 1,
onset_baseline =
get_onset_baseline(region = region, season = analysis_time_season),
incidence_var = prediction_target_var,
incidence_bins = incidence_bins,
incidence_bin_names = incidence_bin_names
)
last_analysis_time_season_week_in_data <- max(data$season_week[data$season == analysis_time_season])
for(analysis_time_season_week in seq(from = first_analysis_time_season_week, to = min(last_analysis_time_season_week, last_analysis_time_season_week_in_data - 1))) {
cat(paste0("analysis season week = ", analysis_time_season_week, "\n"))
analysis_time_ind <- which(data$season == analysis_time_season &
data$season_week == analysis_time_season_week)
## keep track of if we made any predictions with this combination of season and week
made_predictions <- FALSE
## simulate n_trajectory_sims trajectories
max_prediction_horizon <- max(4L,
last_analysis_time_season_week + 1 - analysis_time_season_week)
trajectory_samples <- simulate_trajectories_function(
n_sims = n_trajectory_sims,
max_prediction_horizon = max_prediction_horizon,
data = data[seq_len(analysis_time_ind), , drop = FALSE],
region = region,
analysis_time_season = analysis_time_season,
analysis_time_season_week = analysis_time_season_week,
params = simulate_trajectories_params
)
## Round to nearest 0.1 -- they do this in competition
## I'm not using R's round() function because I want
## 0.05 -> 0.1
## (...? We should see how they round? Maybe this never comes up?
## I have a feeling I'm fretting over a probability 0 event.)
trajectory_samples <- apply(
trajectory_samples,
c(1, 2),
function(inc_val) {
if(is.na(inc_val)) {
return(inc_val)
} else {
floor_val <- floor(inc_val * 10) / 10
if(inc_val >= floor_val + 0.05) {
return(floor_val + 0.1)
} else {
return(floor_val)
}
}
}
)
## get indices in trajectory samples with NA values that affect
## estimation of seasonal quantities
sample_inds_with_na <- apply(
trajectory_samples[,
seq_len(last_analysis_time_season_week + 1 - analysis_time_season_week),
drop = FALSE],
1,
function(x) any(is.na(x)))
## Predictions for things about the whole season
if(!all(sample_inds_with_na)) {
made_predictions <- TRUE
## subset to sampled trajectories that are usable/do not have NAs
subset_trajectory_samples <- trajectory_samples[!sample_inds_with_na, ]
## Augment trajectory samples with previous observed incidence values
## This is where we should be adding in something to account for backfill.
first_season_obs_ind <- min(which(data$season == analysis_time_season))
subset_trajectory_samples <- cbind(
matrix(
rep(data[seq(from = first_season_obs_ind, to = analysis_time_ind), prediction_target_var],
each = nrow(subset_trajectory_samples)),
nrow = nrow(subset_trajectory_samples)
),
subset_trajectory_samples
)
## If first observation for the season was not at season week 1,
## augment with leading NAs
first_season_obs_week <- data$season_week[first_season_obs_ind]
if(first_season_obs_week != 1) {
subset_trajectory_samples <- cbind(
matrix(NA, nrow = nrow(subset_trajectory_samples), ncol = first_season_obs_week - 1),
subset_trajectory_samples
)
}
## values before the first analysis time week are NA so that
## onset and peak calculations only look at data within the CDC's definition
## of the flu season for purposes of the competition
subset_trajectory_samples[,
seq(from = 1, to = first_analysis_time_season_week - 1)] <- NA
      ## Convert to binned values (wILI incidence bins are capped at 13; the
      ## age-specific cap for hospitalization forecasts is handled in the
      ## submission functions below)
      binned_subset_trajectory_samples <-
        get_inc_bin(subset_trajectory_samples, max = 13,
          return_character = FALSE)
## Get onset week for each simulated trajectory
onset_week_by_sim_ind <-
apply(binned_subset_trajectory_samples, 1, function(trajectory) {
get_onset_week(
incidence_trajectory = trajectory,
baseline =
get_onset_baseline(region = region, season = analysis_time_season),
onset_length = 3L,
first_season_week = 31,
weeks_in_first_season_year = weeks_in_first_season_year
)
})
## Get peak incidence for each simulated trajectory
peak_inc_bin_by_sim_ind <-
apply(binned_subset_trajectory_samples, 1, function(trajectory) {
max(trajectory, na.rm = TRUE)
})
## get peak week by sim ind
## note that some sim inds may have more than 1 peak week...
peak_weeks_by_sim_ind <- unlist(lapply(
seq_len(nrow(binned_subset_trajectory_samples)),
function(sim_ind) {
bin_val <- peak_inc_bin_by_sim_ind[sim_ind]
peak_season_weeks <- which(
binned_subset_trajectory_samples[sim_ind, ] == bin_val)
return(peak_season_weeks)
}
))
## Get log scores
onset_week_bins <- c(as.character(10:42), "none")
onset_bin_log_probs <- log(sapply(
onset_week_bins,
function(bin_name) {
sum(onset_week_by_sim_ind == bin_name)
}))
onset_bin_log_probs <- onset_bin_log_probs - logspace_sum(onset_bin_log_probs)
predictions_df[results_save_row, paste0("onset_bin_", onset_week_bins, "_log_prob")] <-
onset_bin_log_probs
predictions_df[results_save_row, "onset_log_score"] <-
onset_bin_log_probs[
as.character(observed_seasonal_quantities$observed_onset_week)]
predictions_df[results_save_row, "onset_competition_log_score"] <-
compute_competition_log_score(onset_bin_log_probs,
as.character(observed_seasonal_quantities$observed_onset_week),
"onset_week")
peak_week_bins <- as.character(10:42)
peak_week_bin_log_probs <- log(sapply(
peak_week_bins,
function(bin_name) {
sum(peak_weeks_by_sim_ind == bin_name)
}))
peak_week_bin_log_probs <- peak_week_bin_log_probs - logspace_sum(peak_week_bin_log_probs)
names(peak_week_bin_log_probs) <- as.character(peak_week_bins)
predictions_df[results_save_row, paste0("peak_week_bin_", peak_week_bins, "_log_prob")] <-
peak_week_bin_log_probs
predictions_df[results_save_row, "peak_week_log_score"] <-
logspace_sum(peak_week_bin_log_probs[
as.character(observed_seasonal_quantities$observed_peak_week)])
predictions_df[results_save_row, "peak_week_competition_log_score"] <-
compute_competition_log_score(peak_week_bin_log_probs,
as.character(observed_seasonal_quantities$observed_peak_week),
"peak_week")
peak_inc_bin_log_probs <- log(sapply(
incidence_bin_names,
function(bin_name) {
sum(peak_inc_bin_by_sim_ind == as.numeric(bin_name))
}))
peak_inc_bin_log_probs <- peak_inc_bin_log_probs - logspace_sum(peak_inc_bin_log_probs)
predictions_df[results_save_row, paste0("peak_inc_bin_", incidence_bin_names, "_log_prob")] <-
peak_inc_bin_log_probs
predictions_df[results_save_row, "peak_inc_log_score"] <-
peak_inc_bin_log_probs[
as.character(observed_seasonal_quantities$observed_peak_inc_bin)]
predictions_df[results_save_row, "peak_inc_competition_log_score"] <-
compute_competition_log_score(peak_inc_bin_log_probs,
observed_seasonal_quantities$observed_peak_inc_bin,
"peak_inc")
}
## Predictions for incidence in an individual week at prediction horizon ph = 1, ..., 4
for(ph in 1:4) {
sample_inds_with_na <- is.na(trajectory_samples[, ph])
## get observed value/bin
observed_ph_inc <-
data[analysis_time_ind + ph, prediction_target_var]
observed_ph_inc_bin <- get_inc_bin(observed_ph_inc, return_character = TRUE)
if(!all(sample_inds_with_na) && !is.na(observed_ph_inc)) {
made_predictions <- TRUE
## get sampled incidence values at prediction horizon that are usable/not NAs
ph_inc_by_sim_ind <- trajectory_samples[!sample_inds_with_na, ph]
ph_inc_bin_by_sim_ind <- get_inc_bin(ph_inc_by_sim_ind, return_character = TRUE)
## get log score
ph_inc_bin_log_probs <- log(sapply(
incidence_bin_names,
function(bin_name) {
sum(ph_inc_bin_by_sim_ind == bin_name)
})) -
log(length(ph_inc_bin_by_sim_ind))
predictions_df[results_save_row, paste0("ph_", ph, "_inc_bin_", incidence_bin_names, "_log_prob")] <-
ph_inc_bin_log_probs
predictions_df[results_save_row, paste0("ph_", ph, "_inc_log_score")] <-
ph_inc_bin_log_probs[observed_ph_inc_bin]
predictions_df[results_save_row,
paste0("ph_", ph, "_inc_competition_log_score")] <-
compute_competition_log_score(ph_inc_bin_log_probs,
observed_ph_inc_bin,
paste0("ph", ph, "_inc"))
}
} # ph loop
if(made_predictions) {
predictions_df[results_save_row, "analysis_time_season"] <- analysis_time_season
predictions_df[results_save_row, "analysis_time_season_week"] <- analysis_time_season_week
predictions_df[results_save_row, "prediction_week_ph_1"] <- analysis_time_season_week + 1
predictions_df[results_save_row, "prediction_week_ph_2"] <- analysis_time_season_week + 2
predictions_df[results_save_row, "prediction_week_ph_3"] <- analysis_time_season_week + 3
predictions_df[results_save_row, "prediction_week_ph_4"] <- analysis_time_season_week + 4
results_save_row <- results_save_row + 1
}
} # analysis_time_season_week
## if there are extra rows in the predictions_df, delete them
if(results_save_row <= nrow(predictions_df)) {
predictions_df <- predictions_df[
-seq(from = results_save_row, to = nrow(predictions_df)),
,
drop = FALSE
]
}
region_str <- ifelse(identical(region, "X"), "National", gsub(" ", "", region))
season_str <- gsub("/", "-", analysis_time_season)
saveRDS(predictions_df,
file = paste0(prediction_save_path,
"/",
model_name, "-", region_str, "-", season_str, "-loso-predictions.rds"))
}
#' Get a forecast submission via trajectory simulation
#'
#' Generates predictions for each prediction target for a given season using a
#' predictive method that works by simulating trajectories of incidence in each
#' remaining week of the season. Results are returned as a data frame in the
#' CDC's standardized submission format, covering all requested regions or age
#' groups.
#'
#' @param data data frame with observed disease data available to use for predictions
#' @param analysis_time_season character vector of length 1 specifying the
#' season to obtain predictions for, in the format "2000/2001"
#' @param first_analysis_time_season_week integer specifying the first week of
#' the season in which to make predictions, using all data up to and
#' including that week to make predictions for each following week in the
#' season
#' @param last_analysis_time_season_week integer specifying the last week of
#' the season in which to make predictions, using all data up to and including
#' that week to make predictions for each following week in the season
#' @param prediction_target_var string specifying the name of the variable in
#' data for which we want to make predictions
#' @param incidence_bins a data frame with variables lower and upper defining
#' lower and upper endpoints to use in binning incidence
#' @param incidence_bin_names a character vector with a name for each incidence
#' bin
#' @param n_trajectory_sims integer number of trajectories to simulate
#' @param simulate_trajectories_function a function to call to simulate
#' incidence trajectories. It will be called with the following arguments:
#' * n_sims = number of trajectories to simulate
#' * max_prediction_horizon = number of following weeks to simulate
#' * data = all available data to use in doing simulation, up to and
#' including the analysis time
#' * region = region
#' * analysis_time_season = analysis_time_season
#' * analysis_time_season_week = week of the season at which we are making
#' the predictions
#' * params = simulate_trajectories_params; additional user-provided
#' parameters
#' @param simulate_trajectories_params optional additional parameters to pass
#' to simulate_trajectories_function
#' @param all_regions a character vector with all of the regions over which to make forecasts,
#'    defaults to HHS region names plus "National"
#' @param regional_switch character string indicating the type of forecast:
#'    "Country" for national/HHS regional forecasts, "State" for state-level
#'    forecasts, or "Hosp" for hospitalization forecasts by age group
#'
#' @return data frame with contents suitable for writing out as a csv file in
#' the CDC's standardized format
#'
#' @export
get_submission_via_trajectory_simulation <- function(
data,
analysis_time_season,
first_analysis_time_season_week = 10, # == week 40 of year
last_analysis_time_season_week = 41, # analysis for 33-week season, consistent with flu competition -- at week 41, we do prediction for a horizon of one week ahead
prediction_target_var,
incidence_bins,
incidence_bin_names,
n_trajectory_sims,
simulate_trajectories_function,
simulate_trajectories_params,
all_regions=c( paste0("Region ", 1:10),"National"),
regional_switch) {
require(plyr)
if (!regional_switch=="Hosp"){
return(
rbind.fill(lapply(all_regions, function(region) {
get_submission_one_region_via_trajectory_simulation(
data = data,
analysis_time_season = analysis_time_season,
first_analysis_time_season_week = first_analysis_time_season_week,
last_analysis_time_season_week = last_analysis_time_season_week,
region = region,
prediction_target_var = prediction_target_var,
incidence_bins = incidence_bins,
incidence_bin_names = incidence_bin_names,
n_trajectory_sims = n_trajectory_sims,
simulate_trajectories_function = simulate_trajectories_function,
simulate_trajectories_params = simulate_trajectories_params,
regional_switch=regional_switch
)
}))
)
}
else {
return(
rbind.fill(lapply(simulate_trajectories_params$age_groups, function(age) {
get_submission_one_region_via_trajectory_simulation(
data = data,
analysis_time_season = analysis_time_season,
first_analysis_time_season_week = first_analysis_time_season_week,
last_analysis_time_season_week = last_analysis_time_season_week,
region = "Entire Network",
prediction_target_var = prediction_target_var,
incidence_bins = incidence_bins,
incidence_bin_names = incidence_bin_names,
n_trajectory_sims = n_trajectory_sims,
simulate_trajectories_function = simulate_trajectories_function,
simulate_trajectories_params = simulate_trajectories_params,
regional_switch=regional_switch,
age=age
)
}))
)
}
}
#' Get a forecast submission for one region via trajectory simulation
#'
#' Generates predictions for each prediction target for a given season and a
#' single region (or age group) using a predictive method that works by
#' simulating trajectories of incidence in each remaining week of the season.
#' Results are returned as a data frame in the CDC's standardized submission
#' format, for just one region.
#'
#' @param data data frame with observed disease data available to use for predictions
#' @param analysis_time_season character vector of length 1 specifying the
#' season to obtain predictions for, in the format "2000/2001"
#' @param first_analysis_time_season_week integer specifying the first week of
#' the season in which to make predictions, using all data up to and
#' including that week to make predictions for each following week in the
#' season
#' @param last_analysis_time_season_week integer specifying the last week of
#' the season in which to make predictions, using all data up to and including
#' that week to make predictions for each following week in the season
#' @param region string with name of region to make predictions for
#' @param prediction_target_var string specifying the name of the variable in
#' data for which we want to make predictions
#' @param incidence_bins a data frame with variables lower and upper defining
#' lower and upper endpoints to use in binning incidence
#' @param incidence_bin_names a character vector with a name for each incidence
#' bin
#' @param n_trajectory_sims integer number of trajectories to simulate
#' @param simulate_trajectories_function a function to call to simulate
#' incidence trajectories. It will be called with the following arguments:
#' * n_sims = number of trajectories to simulate
#' * max_prediction_horizon = number of following weeks to simulate
#' * data = all available data to use in doing simulation, up to and
#' including the analysis time
#' * region = region
#' * analysis_time_season = analysis_time_season
#' * analysis_time_season_week = week of the season at which we are making
#' the predictions
#' * params = simulate_trajectories_params; additional user-provided
#' parameters
#' @param simulate_trajectories_params optional additional parameters to pass
#' to simulate_trajectories_function
#' @param regional_switch character string indicating the type of forecast:
#'    "Country" for national/HHS regional forecasts, "State" for state-level
#'    forecasts, or "Hosp" for hospitalization forecasts by age group
#' @param age age group to forecast when regional_switch is "Hosp"; ignored otherwise
#'
#' @return data frame with contents suitable for writing out as a csv file in
#' the CDC's standardized format, but for just one region
#' @export
get_submission_one_region_via_trajectory_simulation <- function(
data,
analysis_time_season = "2016/2017",
first_analysis_time_season_week = 10, # == week 40 of year
last_analysis_time_season_week = 41, # analysis for 33-week season, consistent with flu competition -- at week 41, we do prediction for a horizon of one week ahead
region,
prediction_target_var,
incidence_bins,
incidence_bin_names,
n_trajectory_sims,
simulate_trajectories_function,
simulate_trajectories_params,
regional_switch,
age) {
weeks_in_first_season_year <-
get_num_MMWR_weeks_in_first_season_year(analysis_time_season)
if(regional_switch == "Country"){
age <- NA
## find region ID for CDC submission
region_str <- ifelse(identical(region, "National"),
"US National",
gsub("Region ", "HHS Region ", region))
## load region-specific submission file template
if(identical(as.integer(weeks_in_first_season_year), 52L)) {
region_results <- read.csv(file.path(
find.package("cdcFlu20182019"),
"templates",
"region-prediction-template.csv"))
} else {
region_results <- read.csv(file.path(
find.package("cdcFlu20182019"),
"guidance",
"region-prediction-template-EW53.csv"))
}
} else if (regional_switch == "Hosp") {
## load region-specific submission file template
if (age != "65+ yr"){
region_results <- read.csv(file.path(
find.package("cdcFlu20182019"),
"templates",
"hosp-prediction-template.csv"))
}else{
region_results <- read.csv(file.path(
find.package("cdcFlu20182019"),
"templates",
"65_template.csv"))
}
}
else if (regional_switch == "State") {
age <- NA
## find state ID for CDC submission
region_str <- region
## load region-specific submission file template
region_results <- read.csv(file.path(
find.package("cdcFlu20182019"),
"templates",
"state-prediction-template.csv"))
}
if (regional_switch !="Hosp"){
region_results$Location <- region_str
} else {
region_results$Location <- age
}
## subset data to be only the region-specific data
data <- data[data$region == region,]
analysis_time_season_week <- data$season_week[nrow(data)]
analysis_time_ind <- nrow(data)
max_prediction_horizon <- max(4L,
last_analysis_time_season_week + 1 - analysis_time_season_week)
first_season_obs_ind <- min(which(data$season == analysis_time_season))
trajectory_samples <- simulate_trajectories_function(
n_sims = n_trajectory_sims,
max_prediction_horizon = max_prediction_horizon,
data = data[seq_len(analysis_time_ind), , drop = FALSE],
region = region,
analysis_time_season = analysis_time_season,
analysis_time_season_week = analysis_time_season_week,
params = simulate_trajectories_params,
age=age,
regional_switch=regional_switch
)
## simulate n_trajectory_sims trajectories
## Round to nearest 0.1 -- they do this in competition
## I'm not using R's round() function because I want
## 0.05 -> 0.1
## (...? We should see how they round? Maybe this never comes up?
## I have a feeling I'm fretting over a probability 0 event.)
trajectory_samples <- apply(
trajectory_samples,
c(1, 2),
function(inc_val) {
if(is.na(inc_val)) {
return(inc_val)
} else {
floor_val <- floor(inc_val * 10) / 10
if(inc_val >= floor_val + 0.05) {
return(floor_val + 0.1)
} else {
return(floor_val)
}
}
}
)
## get indices in trajectory samples with NA values that affect
## estimation of seasonal quantities
sample_inds_with_na <- apply(
trajectory_samples[,
seq_len(last_analysis_time_season_week + 1 - analysis_time_season_week),
drop = FALSE],
1,
function(x) any(is.na(x)))
## Predictions for things about the whole season
if(all(sample_inds_with_na)) {
stop("Error: NAs in all simulated trajectories, unable to predict seasonal quantities")
} else if(region != "Entire Network") {
## subset to sampled trajectories that are usable/do not have NAs
subset_trajectory_samples <- trajectory_samples[!sample_inds_with_na, ]
## Augment trajectory samples with previous observed incidence values
      ## This is where we should be adding in some bootstrapping to account
## for backfill.
# e <- read.csv("data/e")
# current_season <- substr(analysis_time_season,1,4)
# current_week <- last_analysis_time_season_week+1-analysis_time_season_week
#lag <- analysis_time_season_week
# then something like
#e[e$lag ==lag & e$week==current_week,prediction_target_var]
# to get a random sample from previous seasons with the same
# week and lag value
subset_trajectory_samples <- cbind(
matrix(
rep(data[seq(from = first_season_obs_ind, to = analysis_time_ind), prediction_target_var], each = nrow(subset_trajectory_samples)),
nrow = nrow(subset_trajectory_samples)
),
subset_trajectory_samples
)
## If first observation for the season was not at season week 1,
## augment with leading NAs
first_season_obs_week <- data$season_week[first_season_obs_ind]
if(first_season_obs_week != 1) {
subset_trajectory_samples <- cbind(
matrix(NA, nrow = nrow(subset_trajectory_samples), ncol = first_season_obs_week - 1),
subset_trajectory_samples
)
}
## values before the first analysis time week are NA so that
## onset and peak calculations only look at data within the CDC's definition
## of the flu season for purposes of the competition
subset_trajectory_samples[,
seq(from = 1, to = first_analysis_time_season_week - 1)] <- NA
## Convert to binned values
if (regional_switch == "Hosp"){
binned_subset_trajectory_samples <-
get_inc_bin(subset_trajectory_samples,max=13,
return_character = FALSE)
}else{
binned_subset_trajectory_samples <- get_inc_bin(subset_trajectory_samples,max=13,
return_character = FALSE)
}
## Onset is defined relative to a baseline and is only available for regions
#TODO: SET "COUNTRY" LABEL TO REGION
if(regional_switch == "Country"){
## Get onset week for each simulated trajectory
onset_week_by_sim_ind <-
apply(binned_subset_trajectory_samples, 1, function(trajectory) {
get_onset_week(
incidence_trajectory = trajectory,
baseline =
get_onset_baseline(region = region, season = analysis_time_season),
onset_length = 3L,
first_season_week = 31,
weeks_in_first_season_year = weeks_in_first_season_year
)
})
}
## Get peak incidence for each simulated trajectory
peak_inc_bin_by_sim_ind <-
apply(binned_subset_trajectory_samples, 1, function(trajectory) {
max(trajectory, na.rm = TRUE)
})
## get peak week by sim ind
## note that some sim inds may have more than 1 peak week...
peak_weeks_by_sim_ind <- unlist(lapply(
seq_len(nrow(subset_trajectory_samples)),
function(sim_ind) {
bin_val <- peak_inc_bin_by_sim_ind[sim_ind]
peak_season_weeks <- which(
binned_subset_trajectory_samples[sim_ind, ] == bin_val)
return(peak_season_weeks)
}
))
## Get bin probabilities and add to region template
if(regional_switch == "Country") {
onset_week_bins <- c(as.character(seq(from = 10, to = weeks_in_first_season_year - 10, by = 1)), "none")
onset_bin_log_probs <- log(sapply(
onset_week_bins,
function(bin_name) {
sum(onset_week_by_sim_ind == bin_name)
})) -
log(length(onset_week_by_sim_ind))
onset_bin_log_probs <- onset_bin_log_probs - logspace_sum(onset_bin_log_probs)
region_results[
region_results$Target == "Season onset" & region_results$Type == "Bin",
"Value"] <- exp(onset_bin_log_probs)
        if(exp(onset_bin_log_probs[length(onset_bin_log_probs)]) >= 0.5) {
region_results[
region_results$Target == "Season onset" & region_results$Type == "Point",
"Value"] <- NA
} else {
region_results[
region_results$Target == "Season onset" & region_results$Type == "Point",
"Value"] <- season_week_to_year_week(
floor(median(as.numeric(onset_week_by_sim_ind), na.rm = TRUE)),
first_season_week = 31,
weeks_in_first_season_year = weeks_in_first_season_year)
}
}
peak_week_bins <- seq(from = 10, to = weeks_in_first_season_year - 10, by = 1)
peak_week_bin_log_probs <- log(sapply(
peak_week_bins,
function(bin_name) {
sum(peak_weeks_by_sim_ind == bin_name)
})) -
log(length(peak_weeks_by_sim_ind))
peak_week_bin_log_probs <- peak_week_bin_log_probs - logspace_sum(peak_week_bin_log_probs)
names(peak_week_bin_log_probs) <- as.character(peak_week_bins)
region_results[
region_results$Target == "Season peak week" & region_results$Type == "Bin",
"Value"] <- exp(peak_week_bin_log_probs)
region_results[
region_results$Target == "Season peak week" & region_results$Type == "Point",
"Value"] <- season_week_to_year_week(
floor(median(as.numeric(peak_weeks_by_sim_ind))),
first_season_week = 31,
weeks_in_first_season_year = weeks_in_first_season_year)
peak_inc_bin_log_probs <- log(sapply(
incidence_bin_names,
function(bin_name) {
sum(peak_inc_bin_by_sim_ind == as.numeric(bin_name))
})) -
log(length(peak_inc_bin_by_sim_ind))
peak_inc_bin_log_probs <- peak_inc_bin_log_probs - logspace_sum(peak_inc_bin_log_probs)
region_results[
region_results$Target == "Season peak percentage" & region_results$Type == "Bin",
"Value"] <- exp(peak_inc_bin_log_probs)
region_results[
region_results$Target == "Season peak percentage" & region_results$Type == "Point",
"Value"] <- median(peak_inc_bin_by_sim_ind)
}
## Predictions for incidence in an individual week at prediction horizon ph = 1, ..., 4
for(ph in 1:4) {
sample_inds_with_na <- is.na(trajectory_samples[, ph])
## get sampled incidence values at prediction horizon that are usable/not NAs
ph_inc_by_sim_ind <- trajectory_samples[!sample_inds_with_na, ph]
ph_inc_bin_by_sim_ind <- get_inc_bin(ph_inc_by_sim_ind, return_character = TRUE)
## get bin probabilities and store in regional template
ph_inc_bin_log_probs <- log(sapply(
incidence_bin_names,
function(bin_name) {
sum(ph_inc_bin_by_sim_ind == bin_name)
})) -
log(length(ph_inc_bin_by_sim_ind))
ph_inc_bin_log_probs <- ph_inc_bin_log_probs - logspace_sum(ph_inc_bin_log_probs)
region_results[
region_results$Target == paste0(ph, " wk ahead") & region_results$Type == "Bin",
"Value"] <- exp(ph_inc_bin_log_probs)
region_results[
region_results$Target == paste0(ph, " wk ahead") & region_results$Type == "Point",
"Value"] <- median(ph_inc_by_sim_ind)
} # ph loop
return(region_results)
}
#' return integer that's either 52 or 53: number of MMWR weeks in the given year
#'
#' @param year year in the format "2014" -- can be character or numeric
#'
#' @details requires non-exported function start_date from MMWRweek package
#'
#' @export
get_num_MMWR_weeks_in_year <- function(year) {
require(MMWRweek)
year <- as.numeric(year)
return(MMWRweek::MMWRweek(MMWRweek:::start_date(year + 1) - 1)$MMWRweek)
}
#' Return an integer, either 52 or 53: the number of MMWR weeks in the first
#' year of a given season.
#'
#' @param season season in the format "2014/2015"
#'
#' @details requires MMWRweek package
#'
#' @export
get_num_MMWR_weeks_in_first_season_year <- function(season) {
return(get_num_MMWR_weeks_in_year(substr(season, 1, 4)))
}
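## Not-run illustration of the two helpers above (assumes the MMWRweek package
## is installed); 2014 is a 53-week MMWR year, so the 2014/2015 season has 53
## weeks in its first year.
if(FALSE) {
  get_num_MMWR_weeks_in_year(2014)                      # 53
  get_num_MMWR_weeks_in_year("2015")                    # 52
  get_num_MMWR_weeks_in_first_season_year("2014/2015")  # 53
}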
#' return the bin name for a given incidence
#'
#' @param inc numeric incidence level
#' @param max numeric start of the top incidence bin; incidence at or above
#' this value is assigned to the top bin (defaults to 13)
#' @param return_character logical: if TRUE, the return type is character (bin name);
#' if FALSE, the return type is a numeric representation of the bin
#'
#' @return vector giving the bin name of the input incidence.
#'
#' @details assumes bins are 0.1 in size; values of \code{max} and above fall in the top bin.
#'
#' @export
get_inc_bin <- function(inc, max = 13,
return_character = TRUE) {
inc <- round(inc, 1)
bin_numeric <- ifelse(inc < max,
floor(inc*10)/10, ## floors to 1st decimal place
max)
if(return_character) {
return(as.character(bin_numeric))
} else {
return(bin_numeric)
}
}
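## Not-run illustration of get_inc_bin() with arbitrary values: incidence is
## rounded to one decimal place, floored to the lower bin edge, and capped at
## the top bin (13 by default).
if(FALSE) {
  get_inc_bin(c(0.04, 2.56, 13.9))                           # "0"  "2.6" "13"
  get_inc_bin(c(0.04, 2.56, 13.9), return_character = FALSE) #  0.0  2.6  13.0
}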
#' Calculate the median value from a binned probability distribution
#'
#' @param probs vector of named probabilities
#'
#' @return a numeric value
#'
#' @export
calc_median_from_binned_probs <- function(probs) {
## could handle the "none" bin for onset more carefully; currently assumes all bins are ordered
cumprob <- cumsum(probs)
median_idx <- min(which(cumprob>=0.5))
as.numeric(names(probs)[median_idx])
}
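## Not-run illustration with made-up probabilities: the median is the first bin
## at which the cumulative probability reaches 0.5.
if(FALSE) {
  calc_median_from_binned_probs(
    c("1.0" = 0.20, "1.1" = 0.25, "1.2" = 0.30, "1.3" = 0.25))  # 1.2
}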
#' Make plots of prediction submissions for the flu contest: seasonal targets and
#' 1-4 week ahead incidence
#'
#' @param preds_save_file path to a file with predictions in csv submission format
#' @param plots_save_file path to a pdf file where plots should go
#' @param data data observed so far this season
#' @export
make_predictions_plots <- function(
preds_save_file,
plots_save_file,
data
) {
require("grid")
require("ggplot2")
predictions <- read.csv(preds_save_file)
regional <- data$region_type[1] =="HHS Regions"
if(regional) {
preds_region_map <- data.frame(
internal_region = c("National", paste0("Region ", 1:10)),
preds_region = c("US National", paste0("HHS Region ", 1:10))
)
} else {
preds_region_map <- data.frame(
internal_region = unique(predictions$Location),
preds_region = unique(predictions$Location)
)
}
current_season <- tail(data$season, 1)
current_year <- tail(data$year, 1)
pdf(plots_save_file)
for(region in unique(data$region)) {
preds_region <- preds_region_map$preds_region[preds_region_map$internal_region == region]
## Observed incidence
p_obs <- ggplot(data[data$region == region & data$season == current_season, ]) +
expand_limits(x = c(0, 42), y = c(0, 13)) +
geom_line(aes(x = season_week, y = weighted_ili)) +
ggtitle("Observed incidence") +
theme_bw()
if(regional){
p_obs <- p_obs +
geom_hline(yintercept = get_onset_baseline(region, season = current_season), colour = "red")
}
## Onset
if(regional) {
reduced_preds <- predictions[predictions$Location == preds_region & predictions$Target == "Season onset" & predictions$Type == "Bin", ] %>%
mutate(
season_week = year_week_to_season_week(as.numeric(as.character(Bin_start_incl)), current_year)
)
point_pred <- predictions[predictions$Location == preds_region & predictions$Target == "Season onset" & predictions$Type == "Point", , drop = FALSE] %>%
mutate(
season_week = year_week_to_season_week(Value, current_year)
)
p_onset <- ggplot(reduced_preds) +
geom_line(aes(x = season_week, y = Value)) +
geom_vline(xintercept = point_pred$season_week, colour = "red") +
expand_limits(x = c(0, 42)) +
ylab("predicted probability of onset") +
ggtitle("Onset") +
theme_bw()
}
## Peak Timing
reduced_preds <- predictions[predictions$Location == preds_region & predictions$Target == "Season peak week" & predictions$Type == "Bin", ] %>%
mutate(
season_week = year_week_to_season_week(as.numeric(as.character(Bin_start_incl)), current_year)
)
point_pred <- predictions[predictions$Location == preds_region & predictions$Target == "Season peak week" & predictions$Type == "Point", , drop = FALSE] %>%
mutate(
season_week = year_week_to_season_week(Value, current_year)
)
p_peak_timing <- ggplot(reduced_preds) +
geom_line(aes(x = season_week, y = Value)) +
geom_vline(xintercept = point_pred$season_week, colour = "red") +
expand_limits(x = c(0, 42)) +
ylab("predicted probability of peak") +
ggtitle("Peak timing") +
theme_bw()
## Peak Incidence
reduced_preds <- predictions[predictions$Location == preds_region & predictions$Target == "Season peak percentage" & predictions$Type == "Bin", ] %>%
mutate(inc_bin = as.numeric(as.character(Bin_start_incl)))
point_pred <- predictions[predictions$Location == preds_region & predictions$Target == "Season peak percentage" & predictions$Type == "Point", , drop = FALSE] %>%
mutate(inc_bin = Value)
p_peak_inc <- ggplot(reduced_preds) +
geom_line(aes(x = inc_bin, y = Value)) +
geom_vline(xintercept = point_pred$inc_bin, colour = "red", data) +
expand_limits(x = c(0, 13)) +
ylab("predicted probability of peak incidence") +
coord_flip() +
ggtitle("Peak incidence") +
theme_bw()
grid.newpage()
pushViewport(viewport(layout =
grid.layout(nrow = 4,# ifelse(regional, 4, 3), ## adjustment for onset
ncol = 2,
heights = unit(c(2, 1, 1, 1), c("lines", "null", "null", "null")))))
grid.text(preds_region,
gp = gpar(fontsize = 20),
vp = viewport(layout.pos.col = 1:2, layout.pos.row = 1))
if(regional){
print(p_onset, vp = viewport(layout.pos.col = 1, layout.pos.row = 2))
}
print(p_obs, vp = viewport(layout.pos.col = 1, layout.pos.row = 3))
print(p_peak_timing, vp = viewport(layout.pos.col = 1, layout.pos.row = 4))
print(p_peak_inc, vp = viewport(layout.pos.col = 2, layout.pos.row = 3))
recent_obs <- data[data$region == region & data$season == current_season, "weighted_ili"]
recent_obs <- tail(recent_obs, 1)
## week-ahead panels, based on the predictions read from preds_save_file
p_1wk <- my_plot_weekahead(predictions, region = preds_region, wk = 1, ilimax = 13, years = 2018:2019) + ggtitle(paste(preds_region, ": 1 wk ahead")) + ylim(0, 1) + geom_vline(xintercept = recent_obs)
p_2wk <- my_plot_weekahead(predictions, region = preds_region, wk = 2, ilimax = 13, years = 2018:2019) + ylim(0, 1) + geom_vline(xintercept = recent_obs)
p_3wk <- my_plot_weekahead(predictions, region = preds_region, wk = 3, ilimax = 13, years = 2018:2019) + ylim(0, 1) + geom_vline(xintercept = recent_obs)
p_4wk <- my_plot_weekahead(predictions, region = preds_region, wk = 4, ilimax = 13, years = 2018:2019) + ylim(0, 1) + geom_vline(xintercept = recent_obs)
grid.arrange(p_1wk, p_2wk, p_3wk, p_4wk, ncol=1)
}
dev.off()
}
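## Not-run illustration of make_predictions_plots(); the file paths are
## hypothetical placeholders and `flu_data` stands in for a data frame of
## observed incidence with (at least) the columns region_type, region, season,
## year, season_week and weighted_ili used by the function above.
if(FALSE) {
  make_predictions_plots(
    preds_save_file = "submissions/EW45-submission.csv",
    plots_save_file = "plots/EW45-submission.pdf",
    data = flu_data)
}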
#' Get the initial rng substream for an rstream object. Should be called by all
#' prediction methods before doing any random number generation.
#'
#' This function DOES NOT set RNG to use rstream with the returned object.
#' Because of strange behavior in the rstream package, this function can be
#' called only once in a given R session.
#'
#' @param seed integer seed for rng; the default was randomly generated
#'
#' @return object of class "rstream.mrg32k3a". The object has been packed via
#' rstream.packed.
#'
#' @export
get_initial_rng_substream <- function(
seed = 9029979) {
require("rstream")
set.seed(seed)
rngstream <- new("rstream.mrg32k3a", seed = sample(1:100000, 6, rep = FALSE))
## pack rngstream object and return (invisibly) in case methods want to use
rstream.packed(rngstream) <- TRUE
return(rngstream)
}
#' Get the rng substream for an rstream object corresponding to the combination
#' of prediction method, year, and week. Should be called by all
#' prediction methods before doing any random number generation.
#'
#' Importantly, by default this function has the side effect of setting RNG to
#' use rstream with the returned object. This behavior is determined by the
#' set_rng argument. This means the caller doesn't have to worry about doing
#' anything unless (a) it wants to use more than 1 substream or (b) it is going
#' to parallelize or do any RNG in a different R session. Because of strange
#' behavior in the rstream package, this function can be called at most once
#' without the rngstream argument in a given R session.
#'
#' @param rngstream (optional) object of class "rstream.mrg32k3a" which will be
#' advanced from its current state.
#' @param seed integer seed for rng; the default was randomly generated
#' @param method character string specifying prediction method;
#' currently one of "sarima_seasonal_difference_TRUE",
#' "sarima_seasonal_difference_FALSE", "kcde", or "kde"
#' @param year character string specifying year, format "1998"
#' @param week character string specifying epidemic week, in format "02"
#' @param set_rng boolean should rng be set to use rstream with the returned
#' rngstream object?
#'
#' @return invisible object of class "rstream.mrg32k3a", but advanced to the
#' first substream reserved for the given combination of prediction method,
#' region, and season. The object has been packed via rstream.packed.
#'
#' @export
get_rng_substream <- function(
rngstream,
seed = 9029979,
method,
year,
week,
set_rng = TRUE) {
require("rstream")
## Create a data frame with combinations of method, year and week,
## number of substreams used for each such combination.
## We can add more methods later without causing any problems by appending
## new method names to the END of the "method" vector below.
## Adding new years or weeks must be done by adding a new set of rows
## to the bottom of the substreams_used data frame (e.g. via bind_rows).
year_week_combos <- expand.grid(
year = as.character(2010:2017),
week = sprintf("%02d", c(1:20, 40:52)),
stringsAsFactors = FALSE
) %>%
mutate(epiweek = as.integer(paste0(year, week))) %>%
filter(epiweek >= 201040 &
epiweek <= 201720) %>%
rbind(
data.frame(year = "2014",
week = "53",
epiweek = 201453,
stringsAsFactors = FALSE)
) %>%
arrange(epiweek)
substreams_used <- expand.grid(
epiweek = year_week_combos$epiweek,
method = c("sarima_seasonal_difference_TRUE",
"sarima_seasonal_difference_FALSE",
"kcde",
"kde"),
stringsAsFactors = FALSE
)
substreams_used$num_substreams <- 1
## if any future method uses more than 1 substream, set that here
## substream index for the specified method, region, and season
ind <- which(
substreams_used$epiweek == paste0(year, week) &
substreams_used$method == method)
if(!identical(length(ind), 1L)) {
stop("Invalid year, week, and/or method.")
}
## Create Rstream object and advance past all substreams used by previous
## methods/regions/seasons
if(missing(rngstream)) {
set.seed(seed)
rngstream <- new("rstream.mrg32k3a", seed = sample(1:100000, 6, rep = FALSE))
} else {
rstream.packed(rngstream) <- FALSE
}
advance_count <- sum(substreams_used$num_substreams[seq_len(ind - 1)])
for(i in seq_len(advance_count)) {
rstream.nextsubstream(rngstream)
}
## set to use rstream package for RNG with rngstream object
if(set_rng) {
rstream.RNG(rngstream)
}
## pack rngstream object and return (invisibly) in case methods want to use
rstream.packed(rngstream) <- TRUE
invisible(rngstream)
}
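## Not-run sketch of the intended usage: obtain the packed initial stream once
## per session, then request the substream for a particular method/epiweek
## before simulating trajectories. The method/year/week values are arbitrary
## examples drawn from the combinations enumerated above.
if(FALSE) {
  rngstream <- get_initial_rng_substream()
  rngstream <- get_rng_substream(
    rngstream = rngstream,
    method = "sarima_seasonal_difference_TRUE",
    year = "2016",
    week = "45",
    set_rng = TRUE)
  rnorm(5)  # draws now come from the substream reserved for this combination
}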
## interface to R's C API for logspace arithmetic
#' Calculate log(exp(logx) - exp(logy)) in a somewhat numerically stable way.
#'
#' @param logx,logy log-scale numeric values to subtract
#'
#' @return log(exp(logx) - exp(logy)), but more numerically stable
#'
#' @export
logspace_sub <- function(logx, logy) {
return(.Call("logspace_sub_C",
as.numeric(logx),
as.numeric(logy),
PACKAGE = "cdcFlu20182019"))
}
#' Calculate log(exp(logx) + exp(logy)) in a somewhat numerically stable way.
#'
#' @param logx,logy log-scale numeric values to add
#'
#' @return log(exp(logx) + exp(logy)), but more numerically stable
#'
#' @export
logspace_add <- function(logx, logy) {
return(.Call("logspace_add_C",
as.numeric(logx),
as.numeric(logy),
PACKAGE = "cdcFlu20182019"))
}
#' Calculate log(sum(exp(logx))) in a somewhat numerically stable way.
#'
#' @param logx log-scale numeric vector of values to sum
#'
#' @return log(sum(exp(logx))), but more numerically stable
#'
#' @export
logspace_sum <- function(logx) {
dim(logx) <- c(1, length(logx))
return(logspace_sum_matrix_rows(logx))
}
#' Calculate logspace summation of matrix rows in a somewhat numerically stable
#' way.
#'
#' @param logX log-scale numeric matrix of values to sum.
#'
#' @return log(apply(exp(logX), 1, sum)), but more numerically stable
#'
#' @export
logspace_sum_matrix_rows <- function(logX) {
return(.Call("logspace_sum_matrix_rows_C",
as.numeric(logX),
as.integer(nrow(logX)),
as.integer(ncol(logX)),
PACKAGE = "cdcFlu20182019"))
}
#' Calculate logspace difference of matrix rows in a somewhat numerically stable
#' way.
#'
#' @param logX log-scale numeric matrix of values to subtract. logX must have
#' exactly 2 columns.
#'
#' @return log(exp(logX)[, 1] - exp(logX)[, 2]), but more numerically stable
#'
#' @export
logspace_sub_matrix_rows <- function(logX) {
if(!is.matrix(logX) || !identical(ncol(logX), 2L))
stop("logX must be a matrix with 2 columns")
return(.Call("logspace_sub_matrix_rows_C",
as.numeric(logX),
as.integer(nrow(logX)),
PACKAGE = "cdcFlu20182019"))
}
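## Pure-R reference implementation of the log-sum-exp trick that the C routines
## above implement; included only to illustrate the technique (the package
## itself uses the compiled versions, and this helper name is not part of its API).
logspace_sum_R <- function(logx) {
  m <- max(logx)
  if(!is.finite(m)) return(m)  # all -Inf (or Inf/NaN present): nothing to rescale
  m + log(sum(exp(logx - m)))  # factor out the max so exp() cannot overflow
}
## e.g. logspace_sum_R(log(c(0.1, 0.2, 0.3))) agrees with log(0.6)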