### post-process ssr fits to obtain predictions
library(ggplot2)
#library(ssrFlu)
library(cdcfluview)
library(lubridate)
library(reshape)
library(plyr)
library(dplyr)
library(tidyr)
library(copula)
library(mvtnorm)
library(kcde)
## Function borrowed from copula package and modified
## Find a matrix near mat that is positive definite
makePosDef <- function(mat, delta = 0.001) {
    while(min(eigen(mat)$values) < 1e-6) {
        decomp <- eigen(mat)
        Lambda <- decomp$values
        Lambda[Lambda < 0] <- delta
        Gamma <- decomp$vectors
        newmat <- Gamma %*% diag(Lambda) %*% t(Gamma)
        D <- 1/sqrt(diag(newmat))
        mat <- diag(D) %*% newmat %*% diag(D)
    }
    return(mat)
}
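## Illustrative check of makePosDef (not part of the pipeline): the 3x3
## "correlation" matrix below is made up and has a negative eigenvalue;
## after repair, all eigenvalues are positive and the diagonal is still 1.
bad_cor <- matrix(c(1.0, 0.9, 0.1,
                    0.9, 1.0, 0.9,
                    0.1, 0.9, 1.0), nrow = 3)
min(eigen(bad_cor)$values)    # negative -- not positive definite
fixed_cor <- makePosDef(bad_cor)
min(eigen(fixed_cor)$values)  # now positive
diag(fixed_cor)               # rescaled back to a unit diagonal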
## set up -- these parameters determine the data set and which KCDE fit we will use
data_set <- "ili_national"
max_lag <- 1L
max_seasonal_lag <- 0L
filtering <- FALSE
differencing <- FALSE
seasonality <- TRUE
#bw_parameterization <- "diagonal"
bw_parameterization <- "full"
n_sims <- 10000
## these are pointers to files in article-disease-pred-with-kcde repo
copula_save_path <- file.path("/media/evan/data/Reich/infectious-disease-prediction-with-kcde/inst/results",
    data_set,
    "copula-estimation-results")
estimation_save_path <- file.path("/media/evan/data/Reich/infectious-disease-prediction-with-kcde/inst/results",
    data_set,
    "estimation-results")
prediction_save_path <- file.path("/media/evan/data/Reich/infectious-disease-prediction-with-kcde/inst/results",
    data_set,
    "prediction-results")
case_descriptor <- paste0(
    data_set,
    "-max_lag_", max_lag,
    "-max_seasonal_lag_", max_seasonal_lag,
    "-filtering_", filtering,
    "-differencing_", differencing,
    "-seasonality_", seasonality,
    "-bw_parameterization_", bw_parameterization
)
file_name <- paste0("kcde-copula-fits-",
    case_descriptor,
    ".rds")
copula_fits <- readRDS(file = file.path(copula_save_path, file_name))
analysis_time_season_week_by_copula_fit <- unlist(lapply(copula_fits,
    function(copula_fit) {copula_fit$analysis_time_season_week}))
kcde_fits_by_prediction_horizon <- lapply(seq_len(52),
    function(prediction_horizon) {
        case_descriptor <- paste0(
            data_set,
            "-prediction_horizon_", prediction_horizon,
            "-max_lag_", max_lag,
            "-max_seasonal_lag_", max_seasonal_lag,
            "-filtering_", filtering,
            "-differencing_", differencing,
            "-seasonality_", seasonality,
            "-bw_parameterization_", bw_parameterization
        )
        readRDS(file.path(estimation_save_path,
            paste0("kcde_fit-", case_descriptor, ".rds")))
    })
if(identical(data_set, "ili_national")) {
    ## Load data for nationally reported influenza-like illness
    usflu <- get_flu_data("national", "ilinet", years = 1997:2015)
    data <- transmute(usflu,
        region.type = REGION.TYPE,
        region = REGION,
        year = YEAR,
        week = WEEK,
        weighted_ili = as.numeric(X..WEIGHTED.ILI))
    ## Add time column. This is used for calculating times to drop in cross-validation.
    data$time <- ymd(paste(data$year, "01", "01", sep = "-"))
    week(data$time) <- data$week
    ## Add time_index column. This is used for calculating the periodic kernel.
    ## Here, this is calculated as the number of days since some origin date
    ## (1970-01-01 in this case). The origin is arbitrary.
    data$time_index <- as.integer(data$time - ymd(paste("1970", "01", "01", sep = "-")))
    ## Season column: for example, weeks of 2010 up through and including week 30
    ## get season 2009/2010; weeks after week 30 get season 2010/2011.
    ## (See the illustrative check just after this if block.)
    data$season <- ifelse(
        data$week <= 30,
        paste0(data$year - 1, "/", data$year),
        paste0(data$year, "/", data$year + 1)
    )
    ## Season week column: week number within season
    data$season_week <- sapply(seq_len(nrow(data)), function(row_ind) {
        sum(data$season == data$season[row_ind] & data$time_index <= data$time_index[row_ind])
    })
    if(differencing) {
        data$weighted_ili_ratio <- data$weighted_ili / lag(data$weighted_ili, 52)
        prediction_target_var <- "weighted_ili_ratio"
        train_seasons <- paste0(seq(from = 1998, to = 2009), "/", seq(from = 1999, to = 2010))
    } else {
        prediction_target_var <- "weighted_ili"
        train_seasons <- paste0(seq(from = 1997, to = 2009), "/", seq(from = 1998, to = 2010))
    }
    kernel_fn <- log_pdtmvn_kernel
    rkernel_fn <- rlog_pdtmvn_kernel
    variable_selection_method <- "all_included"
    crossval_buffer <- ymd("2010-01-01") - ymd("2009-01-01")
    season_length <- 33L
    analysis_seasons <- "2015/2016"
    first_analysis_time_season_week <- 10 # == week 40 of year
    last_analysis_time_season_week <- 41 # analysis for a 33-week season, consistent with the
        # flu competition -- at week 41, we do prediction for a horizon of one week ahead
}
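## A minimal check of the season assignment rule above (hypothetical dates,
## not pipeline input): week 30 of 2010 belongs to season 2009/2010, while
## week 31 of 2010 starts season 2010/2011.
example_weeks <- data.frame(year = c(2010, 2010), week = c(30, 31))
ifelse(example_weeks$week <= 30,
    paste0(example_weeks$year - 1, "/", example_weeks$year),
    paste0(example_weeks$year, "/", example_weeks$year + 1))
## [1] "2009/2010" "2010/2011"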
last_obs_week <- 18
last_obs_year <- 2016
analysis_time_ind <- which(data$year == last_obs_year & data$week == last_obs_week)
analysis_time_season <- data$season[analysis_time_ind]
analysis_time_season_week <- data$season_week[analysis_time_ind]
### simulate from copula that ties the marginal predictive distributions together
## get the right copula for analysis_time_season_week
predictive_copula_ind <- which(analysis_time_season_week_by_copula_fit == analysis_time_season_week)
copula_fit <- copula_fits[[predictive_copula_ind]]$copula_fit
predictive_copula <- copula_fit@copula
## simulate n_sims sequences from copula
max_prediction_horizon <-
    last_analysis_time_season_week + 1 -
    analysis_time_season_week
if(max_prediction_horizon < 4L) {
    ## We always predict at least 4 weeks ahead, so step back to the copula fit
    ## from an earlier season week whose horizon extends 4 weeks past the analysis time
    predictive_copula_ind <- which(analysis_time_season_week_by_copula_fit == analysis_time_season_week) -
        (4L - max_prediction_horizon)
    copula_fit <- copula_fits[[predictive_copula_ind]]$copula_fit
    predictive_copula <- copula_fit@copula
    max_prediction_horizon <- 4L
}
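## Sanity check (illustrative, not in the original pipeline): rCopula() below
## fills rows of length equal to the copula's dimension, so the selected
## copula's dimension slot must equal max_prediction_horizon.
stopifnot(predictive_copula@dimension == max_prediction_horizon)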
sim_sequences <- matrix(NA, nrow = n_sims, ncol = max_prediction_horizon)
na_rows <- seq_len(n_sims)
# while(length(na_rows) > 0) {
for(sim_ind in na_rows) {
    ## generate random parameters from estimation distribution
    ## Note that the variance estimate for parameters is low; at least we're
    ## accounting for some uncertainty though...
    # predictive_copula@parameters <- rmvnorm(1, copula_fit@estimate, sigma = copula_fit@var.est)[1, ]
    # if(identical(class(predictive_copula)[1], "normalCopula")) {
    #     ## randomly generated parameters above may not yield a positive definite correlation matrix; correct
    #     predictive_copula@parameters <- makePosDef(getSigma(predictive_copula))[1, 1 + seq_along(predictive_copula@parameters)]
    # }
    #
    ## simulate a sequence from the copula
    sim_sequences[sim_ind, ] <- rCopula(1, predictive_copula)[1, ]
}
## NA rows result when randomly generated parameters are problematic
# na_rows <- which(apply(sim_sequences, 1, function(ss_row) any(is.na(ss_row))))
# }
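## Quick checks on the copula draws (illustrative): every value should lie
## strictly in (0, 1), and for flu data we would typically expect positive
## dependence between adjacent horizons.
stopifnot(all(sim_sequences > 0 & sim_sequences < 1))
if(ncol(sim_sequences) >= 2) {
    print(cor(sim_sequences[, 1], sim_sequences[, 2]))
}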
### get quantiles from marginal predictive distributions corresponding to
### values simulated from copula
#analysis_time_ind <- which(data$season == analysis_time_season &
#    data$season_week == analysis_time_season_week)
trajectory_samples <- matrix(NA, nrow = n_sims, ncol = max_prediction_horizon)
for(prediction_horizon in seq_len(max_prediction_horizon)) {
    trajectory_samples[, prediction_horizon] <-
        kcde_predict(
            p = sim_sequences[, prediction_horizon],
            n = 100000,
            kcde_fit = kcde_fits_by_prediction_horizon[[prediction_horizon]],
            prediction_data =
                data[seq_len(analysis_time_ind), , drop = FALSE],
            leading_rows_to_drop = 0L,
            trailing_rows_to_drop = 0L,
            additional_training_rows_to_drop = NULL,
            prediction_type = "quantile",
            log = TRUE
        )
    if(differencing) {
        trajectory_samples[, prediction_horizon] <-
            trajectory_samples[, prediction_horizon] *
            data[analysis_time_ind + prediction_horizon - 52, prediction_target_var]
    }
}
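## The loop above is inverse-CDF sampling: pushing a Uniform(0, 1) draw u
## through a quantile function yields a draw from the corresponding
## distribution, so the copula's dependent uniforms become trajectories with
## the KCDE marginals. A self-contained illustration with a standard normal
## marginal (assumed distribution, for demonstration only):
u <- runif(1000)
x <- qnorm(u)      # x is now an (approximate) sample from N(0, 1)
c(mean(x), sd(x))  # close to 0 and 1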
#locations <- c("ili_national", paste0("ili_region", 1:10))
#pred_hzns <- 1:30
#
########################################
### collect fits for a given location ##
########################################
#
###for(location in locations) {
#location <- "ili_national"
#ssr_fits_by_prediction_horizon_limit <- lapply(pred_hzns,
#    function(phl) {
#        file_name <- paste0(
#            "inst/estimation/2015-cdc-flu-competition/fit-competition-ssr-ph",
#            phl,
#            "-",
#            location,
#            ".Rdata")
#        read_env <- new.env()
#        load(file_name, envir=read_env)
#        return(read_env$ssr_fit)
#    })
#names(ssr_fits_by_prediction_horizon_limit) <- paste0("phl", pred_hzns)
#
### assemble data set
#
#if(identical(location, "ili_national")) {
#    usflu <- get_flu_data("national", "ilinet", years=1997:2015)
#    data <- transmute(usflu,
#        region.type = REGION.TYPE,
#        region = REGION,
#        year = YEAR,
#        season_week = WEEK,
#        season = paste0(year, "/", year+1),
#        total_cases = as.numeric(X..WEIGHTED.ILI))
#
#    ## add log column
#    data$log_total_cases <- log(data$total_cases + 1)
#
#    ## add week_start_date
#    ## note: we changed the reference to data$week in the estimation file to data$season_week here
#    char_dates <- paste(data$year, data$season_week, "1")
#    data$week_start_date <- as.Date(char_dates, format="%Y %W %w")
#
#    ## remove week 53s
#    data <- data[-which(is.na(data$week_start_date)),]
#    ## subset is different from the one used for estimation: adding 1 to the index
#    ## to account for rows removed for lag?
#    data <- data[(262+1):nrow(data),]
#
#    ## add smooth log column -- or not, since it is already pretty smooth...
#    ## (but this was done in estimation, so we need to do it here as well)
#    sm <- loess(log_total_cases ~ as.numeric(week_start_date), data=data, span=26/nrow(data))
#    data$smooth_log_cases <- sm$fitted
#
#    ## add time column
#    data$time_ind <- seq_len(nrow(data))
#} else {
#    data <- NULL
#}
##########################################
## make something resembling a forecast ##
##########################################
filedate <- '20160513'
last_obs_week <- 18
last_obs_year <- 2016
nsim <- 10000
pred_horizons <- seq_len(ncol(trajectory_samples))
#pred_horizons <- 1:30
#pred_horizons <- 1:10  # this one works
preds <- matrix(nrow=nsim*length(pred_horizons), ncol=3)
colnames(preds) <- c("hzn", "sim", "ili")
preds[,"hzn"] <- rep(pred_horizons, each=nsim)
preds[,"sim"] <- rep(1:nsim, times=length(pred_horizons))
for(i in 1:length(pred_horizons)){
    # message(paste("horizon", i))
    # pred_horizon <- pred_horizons[i]
    # ssr_fit <- ssr_fits_by_prediction_horizon_limit[[pred_horizon]]
    # max_lag <- max(unlist(ssr_fit$lags_hat))
    #
    # ## what are we predicting?
    # prediction_data_inds <- (nrow(data)-max_lag) : nrow(data)
    #
    # ## update theta_est in the ssr_fit object to also contain parameter values that were
    # ## held fixed; in this case, this includes only the period of the periodic kernel function
    # if("time_ind_lag0" %in% names(ssr_fit$theta_hat)) {
    #     ssr_fit$theta_hat$time_ind_lag0 <- c(ssr_fit$theta_hat$time_ind_lag0,
    #         ssr_fit$ssr_control$theta_fixed$time_ind)
    # }
    #
    # ## this is adapted from challenge-predictions.R, around lines 124-129
    # tmp <- ssr_predict(ssr_fit,
    #     prediction_data=data[prediction_data_inds, , drop=FALSE],
    #     leading_rows_to_drop=max(unlist(ssr_fit$lags_hat)),
    #     prediction_horizon=pred_horizon,
    #     normalize_weights=TRUE)
    idx <- which(preds[,"hzn"]==i)
    preds[idx,"ili"] <- trajectory_samples[, i]
    # preds[idx,"ili"] <- simulate_from_weighted_kde(nsim, tmp)
}
#upto_current_data_idx <- which(data$season=="2015/2016" & data$season_week==40):nrow(data)
#data_sims <- data_frame(week = rep(data[upto_current_data_idx, "season_week"], each=nsim),
#    sim = rep(1:nsim, times=length(upto_current_data_idx)),
#    season_week = 201500 + 100*(week<30) + week,
#    ili = rep(data[upto_current_data_idx, "total_cases"], each=nsim))
upto_current_data_idx <- which(data$season=="2015/2016" & data$week==40):nrow(data)
data_sims <- data_frame(week = rep(data[upto_current_data_idx, "week"], each=nsim),
    sim = rep(1:nsim, times=length(upto_current_data_idx)),
    season_week = 201500 + 100*(week<30) + week,
    ili = rep(data[upto_current_data_idx, "weighted_ili"], each=nsim))
preds_df <- tbl_df(data.frame(preds)) %>%
    mutate(week = (last_obs_week + hzn - 1) %% 52 + 1,
        season_week = 201500 + 100*(week<30) + week) %>%
    full_join(data_sims) %>%
    mutate(week_date = as.Date(paste0(season_week, "0"), format="%Y%W%w")) %>%
    arrange(season_week)
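## The modular week arithmetic above wraps horizons past week 52 into the new
## year. Hypothetical example: from week 51, a 3-week-ahead horizon lands on
## week 2; from week 18 (our last observation), horizon 1 is week 19.
(51 + 3 - 1) %% 52 + 1  # 2
(18 + 1 - 1) %% 52 + 1  # 19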
########################################
## calculate peak week probabilities ##
########################################
template_table <- data_frame(week=c(40:52, 1:20), season_week=c(201540:201552, 201601:201620))
peak_week <- preds_df %>% group_by(sim) %>%
    summarize(week = week[which.max(ili)]) %>%
    ungroup() %>% group_by(week) %>%
    summarize(peak_wk_totals = n()) %>%
    ungroup() %>% right_join(template_table)
## replace NAs (weeks never observed as the peak) with zero counts
peak_week[is.na(peak_week)] <- 0
## add-one smoothing so that no week is assigned probability zero
peak_week <- peak_week %>%
    mutate(peak_wk_totals = (peak_wk_totals+1),
        peak_wk_prob = peak_wk_totals/sum(peak_wk_totals))
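## The add-one adjustment above is Laplace smoothing. Toy illustration with
## made-up peak-week counts: a week never observed as the peak still gets a
## small nonzero probability.
toy_counts <- c(9998, 2, 0)
(toy_counts + 1) / sum(toy_counts + 1)  # ~0.9996, ~0.0003, ~0.0001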
write.csv(peak_week, file=paste0('inst/submissions/', filedate, '-peak-week.csv'))
############################################
## calculate season onset probabilities ##
############################################
## need to add no onset possibility?
template_onsets <- data_frame(first_week_year = c(rep(2015, 13), rep(2016, 20)),
    first_week_num = c(40:52, 1:20))
seasonal_baseline <- 2.1
onsets <- preds_df %>% group_by(sim) %>%
    mutate(ili_lag1 = lag(ili, 1),
        ili_lag2 = lag(ili, 2),
        ili_lag3 = lag(ili, 3),
        onset = ili_lag1 > seasonal_baseline & ili_lag2 > seasonal_baseline & ili_lag3 > seasonal_baseline) %>%
    filter(onset) %>%
    summarize(first_week = first(week_date, order_by=week_date)-weeks(2)) %>%
    ungroup() %>%
    mutate(first_week_num = as.numeric(format(first_week, "%W")),
        first_week_year = as.numeric(format(first_week, "%Y"))) %>%
    count(first_week_year, first_week_num) %>%
    right_join(template_onsets) %>% ungroup()
## replace NAs with zero counts
onsets[is.na(onsets)] <- 0
## add-one smoothing, as for peak week
onsets <- onsets %>%
    mutate(n = (n+1),
        sumn = sum(n),
        onset_prob = n/sum(n))
write.csv(onsets, file=paste0('inst/submissions/', filedate, '-onsets.csv'))
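## Toy illustration of the onset rule above (made-up ILI series, baseline
## 2.1): the onset flag fires at the first week whose three preceding weeks
## all exceeded the baseline.
toy_ili <- c(1.8, 2.3, 2.5, 2.6, 2.0)
toy_onset <- dplyr::lag(toy_ili, 1) > 2.1 &
    dplyr::lag(toy_ili, 2) > 2.1 &
    dplyr::lag(toy_ili, 3) > 2.1
which(toy_onset)  # 5: weeks 2-4 were all above baseline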
############################################
## calculate next 4 week ahead bins ##
############################################
ili_breaks <- seq(.5, 13, by = .5)
pred_bins <- preds_df %>%
    filter(week_date <= as.Date(paste0(last_obs_year, sprintf("%02d", last_obs_week), "0"),
            format="%Y%W%w") + weeks(4),
        week_date > as.Date(paste0(last_obs_year, sprintf("%02d", last_obs_week), "0"),
            format="%Y%W%w")) %>%
    mutate(ili_bin=cut(ili, breaks=ili_breaks, right=FALSE)) %>%
    count(week_date, ili_bin) %>%
    spread(week_date, n)
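## cut() with right = FALSE yields half-open bins [0.5, 1), [1, 1.5), ...,
## matching the submission's binning convention. A quick illustration with
## made-up ILI values:
cut(c(0.7, 1.0, 2.49), breaks = ili_breaks, right = FALSE)
## [1] [0.5,1) [1,1.5) [2,2.5)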
## NEED TO MAKE THIS BETTER
pred_bins[is.na(pred_bins)] <- 1
pred_bins_dodge <- rbind(rep(1, 5),
    pred_bins,
    rep(1, 5), rep(1, 5), rep(1, 5), rep(1, 5), rep(1, 5), rep(1, 5),
    rep(1, 5), rep(1, 5), rep(1, 5), rep(1, 5), rep(1, 5), rep(1, 5),
    rep(1, 5), rep(1, 5), rep(1, 5), rep(1, 5), rep(1, 5), rep(1, 5))
for(i in 2:ncol(pred_bins_dodge)){
    pred_bins_dodge[,i] <- pred_bins_dodge[,i]/sum(pred_bins_dodge[,i])
}
## need to make adjustment for zero predictions
write.csv(pred_bins_dodge, file=paste0('inst/submissions/', filedate, '-pred_bins.csv'))
## for point predictions
preds_df %>%
    filter(week_date <= as.Date(paste0(last_obs_year, sprintf("%02d", last_obs_week), "0"), format="%Y%W%w") + weeks(4),
        week_date > as.Date(paste0(last_obs_year, sprintf("%02d", last_obs_week), "0"), format="%Y%W%w")) %>%
    group_by(week_date) %>%
    summarize(median(ili))
######################
## for peak height ##
######################
peak_ht <- preds_df %>%
    group_by(sim) %>%
    summarize(peak_hts = max(ili)) %>%
    mutate(peak_ht=cut(peak_hts, breaks=ili_breaks, right=FALSE)) %>%
    count(peak_ht)
peak_height_dodge <- rbind(rep(1, 2), rep(1, 2), rep(1, 2), rep(1, 2), rep(1, 2),
    peak_ht,
    rep(1, 2), rep(1, 2), rep(1, 2), rep(1, 2), rep(1, 2), rep(1, 2),
    rep(1, 2), rep(1, 2), rep(1, 2), rep(1, 2), rep(1, 2), rep(1, 2),
    rep(1, 2), rep(1, 2), rep(1, 2))
peak_height_dodge[,2] <- peak_height_dodge[,2]/sum(peak_height_dodge[,2])
write.csv(peak_height_dodge, file=paste0('inst/submissions/', filedate, '-peak-height.csv'))
preds_df %>%
    group_by(sim) %>%
    summarize(peak_hts = max(ili)) %>%
    ungroup() %>% summarize(median(peak_hts))
############################################
## plot predictions sanity check ##
############################################
preds_sum <- preds_df %>%
    group_by(week_date) %>%
    summarize(median_ili = median(ili),
        p05 = quantile(ili, .05),
        p95 = quantile(ili, .95))
ggplot(preds_sum, aes(x=week_date)) +
    geom_line(aes(y=median_ili)) +
    geom_ribbon(aes(ymin=p05, ymax=p95), alpha=.2)