library(lubridate)
library(plyr)
library(dplyr)
library(reshape)
library(kcde)
library(doMC)
library(forecast)
library(cdcFlu20182019)
### Get command line arguments
args <- commandArgs(trailingOnly=TRUE)
## Some other arguments that we're not using for now, but might want to use later
## I have left in code below that depends on these values.
max_seasonal_lag <- 0
bw_parameterization <- "diagonal"
## national or regional (which region?)
## in order to region as an argument, we inserted a - instead of a space
## between "Region" and region number, so now undo that.
data_set <- gsub("-", " ", args[1])
## Prediction horizon -- integer number of steps ahead to predict
prediction_horizon <- as.integer(args[2])
## Maximum number of non-seasonal lagged observations to explore using for prediction
max_lag <- as.integer(args[3])
## Include terms capturing seasonality?
seasonality <- as.logical(args[4])
## Initialization rule -- "cov" or "scott" for covariance or scott's rule, respectively
bw_init_rule <- "scott"
## use data up to but not including first test season to do estimation
first_test_season <- args[5]
## Path to save results in
save_path <- args[6]
## subset data to be only the region of interest
flu_data <- flu_data[flu_data$region == data_set,]
## Subset data to do estimation using only data up through first test season
first_ind_test_season <- min(which(flu_data$season == first_test_season))
data <- flu_data[seq_len(first_ind_test_season - 1), , drop = FALSE]
prediction_target_var <- "weighted_ili"
continuous_var_names <- c(
paste0(c("weighted_ili", "filtered_weighted_ili"), "_horizon", rep(1:52, each=2)),
paste0(c("weighted_ili", "filtered_weighted_ili"), "_lag", rep(seq(from = 0, to = max_lag + 52 * max_seasonal_lag), each=2))
)
discrete_var_names <- NULL
predictive_vars <- "box_cox_trans_weighted_ili"
time_var <- "time"
target_kernel_fn <- log_pdtmvn_mode_centered_kernel
target_rkernel_fn <- rlog_pdtmvn_mode_centered_kernel
target_initialize_kernel_params_fn <- initialize_params_log_pdtmvn_kernel
target_get_theta_optim_bounds_fn <- get_theta_optim_bounds_log_pdtmvn_kernel
target_vectorize_kernel_params_fn <- vectorize_params_log_pdtmvn_kernel
target_update_theta_from_vectorized_theta_est_fn <- update_theta_from_vectorized_theta_est_log_pdtmvn_kernel
target_discrete_var_range_fns <- NULL
predictive_kernel_fn <- pdtmvn_kernel
predictive_rkernel_fn <- rpdtmvn_kernel
predictive_initialize_kernel_params_fn <- initialize_params_pdtmvn_kernel
predictive_get_theta_optim_bounds_fn <- get_theta_optim_bounds_pdtmvn_kernel
predictive_vectorize_kernel_params_fn <- vectorize_params_pdtmvn_kernel
predictive_update_theta_from_vectorized_theta_est_fn <- update_theta_from_vectorized_theta_est_pdtmvn_kernel
predictive_discrete_var_range_fns <- NULL
variable_selection_method <- "all_included"
crossval_buffer <- ymd("2010-01-01") - ymd("2009-01-01")
### Initial Box Cox transformation of data
## Estimate lambda parameter of box cox transformation using data not including
## season that was left out.
temp_data <- data$weighted_ili
lambda <- BoxCox.lambda(temp_data, method = "loglik")
data$box_cox_trans_weighted_ili <- BoxCox(data$weighted_ili, lambda)
### Assemble control parameters for KCDE estimation process
## List describing kernel components -- depends on the values of
## prediction_horizon, filtering, seasonality, and bw_parameterization
kernel_components <- list()
## sample size for initialize_kernel_params_args
if(identical(bw_init_rule, "cov")) {
init_sample_size <- 1L
} else {
init_sample_size <- nrow(data)
}
## If requested, periodic kernel component capturing seasonality
if(seasonality) {
kernel_components <- c(kernel_components,
list(list(
vars_and_offsets = data.frame(var_name = "time_index",
offset_value = 0L,
offset_type = "lag",
combined_name = "time_index_lag0",
stringsAsFactors = FALSE),
kernel_fn = periodic_kernel,
theta_fixed = list(period = pi / 365.2425), # 365.2425 is the mean number of days in a year
theta_est = list("bw"),
initialize_kernel_params_fn = initialize_params_periodic_kernel,
initialize_kernel_params_args = list(
sample_size = init_sample_size
),
get_theta_optim_bounds_fn = get_theta_optim_bounds_periodic_kernel,
get_theta_optim_bounds_args = NULL,
vectorize_kernel_params_fn = vectorize_params_periodic_kernel,
vectorize_kernel_params_args = NULL,
update_theta_from_vectorized_theta_est_fn =
update_theta_from_vectorized_theta_est_periodic_kernel,
update_theta_from_vectorized_theta_est_args = NULL
)))
}
## Kernel components for observed values of incidence
## First step is setup: create list of data frames specifying groups of
## variables and offsets included in each kernel component
lag_values <- NULL
for(seasonal_lag in seq(from = 0, to = max_seasonal_lag)) {
lag_values <- c(lag_values,
seq(from = 0, to = max_lag) + 52 * seasonal_lag)
}
if(identical(bw_parameterization, "diagonal")) {
## Separate kernel components for each prediction target variable and
## predictive variable
vars_and_offsets_groups <- list()
## Group of variable names and offsets for prediction target
new_vars_and_offsets_group <- data.frame(
var_name = prediction_target_var,
offset_value = prediction_horizon,
offset_type = "horizon",
stringsAsFactors = FALSE
)
new_vars_and_offsets_group$combined_name <- paste0(
new_vars_and_offsets_group$var_name,
"_",
new_vars_and_offsets_group$offset_type,
new_vars_and_offsets_group$offset_value
)
vars_and_offsets_groups <- c(vars_and_offsets_groups,
list(new_vars_and_offsets_group))
## Groups of variable names and offsets for lagged predictive variables
for(lag_value in lag_values) {
for(predictive_var in predictive_vars) {
new_vars_and_offsets_group <- data.frame(
var_name = predictive_var,
offset_value = lag_value,
offset_type = "lag",
stringsAsFactors = FALSE
)
new_vars_and_offsets_group$combined_name <- paste0(
new_vars_and_offsets_group$var_name,
"_",
new_vars_and_offsets_group$offset_type,
new_vars_and_offsets_group$offset_value
)
vars_and_offsets_groups <- c(vars_and_offsets_groups,
list(new_vars_and_offsets_group))
}
}
} else if(identical(bw_parameterization, "full")) {
## One kernel component for prediction target variable and all predictive
## variables
## Prediction target variable
new_vars_and_offsets_group <- data.frame(
var_name = prediction_target_var,
offset_value = prediction_horizon,
offset_type = "horizon",
stringsAsFactors = FALSE
)
## Lagged prediction target == predictive variables
for(lag_value in lag_values) {
for(predictive_var in predictive_vars) {
## Else, lagged "raw"/unfiltered observed incidence
new_vars_and_offsets_group <- rbind(
new_vars_and_offsets_group,
data.frame(
var_name = predictive_var,
offset_value = lag_value,
offset_type = "lag",
stringsAsFactors = FALSE
)
)
}
}
## Add combined_name column and put in a list for further processing below
new_vars_and_offsets_group$combined_name <- paste0(
new_vars_and_offsets_group$var_name,
"_",
new_vars_and_offsets_group$offset_type,
new_vars_and_offsets_group$offset_value
)
vars_and_offsets_groups <- list(new_vars_and_offsets_group)
} else {
stop("Invalid bandwidth parameterization")
}
## Second step is to actually append the kernel component descriptions to the
## kernel_components list
kernel_components <- c(kernel_components,
lapply(vars_and_offsets_groups, function(vars_and_offsets) {
lower_trunc_bds <- rep(-Inf, nrow(vars_and_offsets))
names(lower_trunc_bds) <- vars_and_offsets$combined_name
upper_trunc_bds <- rep(Inf, nrow(vars_and_offsets))
names(upper_trunc_bds) <- vars_and_offsets$combined_name
discrete_var_range_fns <- NULL
if(prediction_target_var %in% vars_and_offsets$var_name) {
return(list(
vars_and_offsets = vars_and_offsets,
kernel_fn = target_kernel_fn,
rkernel_fn = target_rkernel_fn,
theta_fixed = list(
parameterization = "bw-chol-decomp",
continuous_vars = vars_and_offsets$combined_name[
vars_and_offsets$combined_name %in% continuous_var_names],
discrete_vars = vars_and_offsets$combined_name[
vars_and_offsets$combined_name %in% discrete_var_names],
discrete_var_range_fns = target_discrete_var_range_fns,
lower = lower_trunc_bds,
upper = upper_trunc_bds,
validate_in_support = FALSE
),
theta_est = list("bw"),
initialize_kernel_params_fn = target_initialize_kernel_params_fn,
initialize_kernel_params_args = list(
sample_size = init_sample_size
),
get_theta_optim_bounds_fn = target_get_theta_optim_bounds_fn,
get_theta_optim_bounds_args = NULL,
vectorize_kernel_params_fn = target_vectorize_kernel_params_fn,
vectorize_kernel_params_args = NULL,
update_theta_from_vectorized_theta_est_fn = target_update_theta_from_vectorized_theta_est_fn,
update_theta_from_vectorized_theta_est_args = NULL
))
} else {
return(list(
vars_and_offsets = vars_and_offsets,
kernel_fn = predictive_kernel_fn,
rkernel_fn = predictive_rkernel_fn,
theta_fixed = list(
parameterization = "bw-chol-decomp",
continuous_vars = vars_and_offsets$combined_name[
vars_and_offsets$combined_name %in% continuous_var_names],
discrete_vars = vars_and_offsets$combined_name[
vars_and_offsets$combined_name %in% discrete_var_names],
discrete_var_range_fns = predictive_discrete_var_range_fns,
lower = lower_trunc_bds,
upper = upper_trunc_bds,
validate_in_support = FALSE
),
theta_est = list("bw"),
initialize_kernel_params_fn = predictive_initialize_kernel_params_fn,
initialize_kernel_params_args = list(
sample_size = init_sample_size
),
get_theta_optim_bounds_fn = predictive_get_theta_optim_bounds_fn,
get_theta_optim_bounds_args = NULL,
vectorize_kernel_params_fn = predictive_vectorize_kernel_params_fn,
vectorize_kernel_params_args = NULL,
update_theta_from_vectorized_theta_est_fn = predictive_update_theta_from_vectorized_theta_est_fn,
update_theta_from_vectorized_theta_est_args = NULL
))
}
})
)
## Set up filter_control to do no filtering
filter_control <- NULL
## Assemble kernel_components and filter_control created above,
## along with other parameters controling KCDE definition and estimation
kcde_control <- create_kcde_control(X_names = "time_index",
y_names = prediction_target_var,
time_name = time_var,
prediction_horizons = prediction_horizon,
filter_control = filter_control,
kernel_components = kernel_components,
crossval_buffer = crossval_buffer,
prediction_inds_not_included = NULL,
loss_fn = neg_log_score_loss,
loss_fn_prediction_args = list(
prediction_type = "distribution",
log = TRUE),
loss_args = NULL,
par_cores = 4L,
variable_selection_method = variable_selection_method)
### Do estimation
case_descriptor <- paste0(
data_set,
"-prediction_horizon_", prediction_horizon,
"-max_lag_", max_lag,
"-seasonality_", seasonality,
# "-season_left_out_", gsub("/", "-", season_left_out),
"-first_test_season_", gsub("/", "-", first_test_season)
)
init_theta_vector <- NULL
init_phi_vector <- NULL
registerDoMC(cores = kcde_control$par_cores)
## Get the KCDE fit
fit_time <- system.time({
kcde_fit <- kcde(data = data,
kcde_control = kcde_control,
init_theta_vector = init_theta_vector,
init_phi_vector = init_phi_vector)
})
### Save results
saveRDS(kcde_fit,
file = file.path(save_path,
paste0("kcde_fit-",
case_descriptor,
".rds")
)
)
saveRDS(lambda,
file = file.path(save_path,
paste0("box_cox_lambda-",
case_descriptor,
".rds")
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.