## utility functions that might be useful for multiple prediction methods
#' Compute a weighted combination of prediction results
#'
#' @param component_preds data frame of predictions from component models
#' @param weights data frame of model weights
#' @param log_weights data frame of (log) model weights
#'
#' @return data frame with weighted combination of predictions
#'
#' @export
weighted_combine_predictions <- function(
component_preds,
weights,
log_weights) {
log_weights_missing <- missing(log_weights)
if(log_weights_missing) {
if(!missing(weights)) {
log_weights <- weights
} else {
stop("Must provide at least one of weights, log_weights")
}
}
## all possible prediction targets and types
prediction_targets <- c("onset", "peak_inc", "peak_week", "ph_1_inc", "ph_2_inc", "ph_3_inc", "ph_4_inc")
prediction_types <- c("log_score", "competition_log_score", "bin_log_probs")
## prediction targets for which weights were supplied
prediction_targets_used <- colnames(log_weights)[colnames(log_weights) %in% prediction_targets]
## variables on which weights depend
weighting_covars <- colnames(log_weights)[!(colnames(log_weights) %in% prediction_targets)]
## variables that, taken together, uniquely identify cases for which predictions were made
unique_case_id_vars <- c("region", "analysis_time_season", "analysis_time_season_week")
## if log weights was missing, need to take log of provided weights
if(log_weights_missing) {
log_weights[, prediction_targets_used] <- log(log_weights[, prediction_targets_used])
}
## add log_weights columns to predictions from component models
component_preds <- left_join(component_preds,
log_weights,
by = weighting_covars)
## number of models
num_models <- length(unique(log_weights$model))
## compute weighted sum of model predictions for each prediction target
for(prediction_target in prediction_targets) {
for(prediction_type in prediction_types) {
## which columns contain predictions for given target/type. use column
## names because column indices may differ in component_preds and combined_preds
prediction_cols_to_combine <- grep(
paste0(prediction_target, "_", ifelse(prediction_type == "bin_log_probs", "bin_.*_log_prob", prediction_type)),
colnames(component_preds))
prediction_cols_to_combine <- colnames(component_preds)[prediction_cols_to_combine]
## if prediction target has associated weights, weight predictions from component models
## otherwise, set predictions from component models to NA
if(prediction_target %in% prediction_targets_used) {
component_preds[, prediction_cols_to_combine] <-
component_preds[, prediction_cols_to_combine] +
component_preds[, prediction_target]
} else {
component_preds[, prediction_cols_to_combine] <- NA
}
}
}
cols_to_summarize <- colnames(component_preds)[
!(colnames(component_preds) %in% c("model", prediction_targets_used, "kcde_model_confidence", "sarima_model_confidence", "weighted_ili"))
]
cols_to_sum <- cols_to_summarize[!(cols_to_summarize %in% unique_case_id_vars)]
## split by unique_case_id_vars, summarize each, and rejoin
split_component_preds <- split(component_preds[, cols_to_summarize],
component_preds[, unique_case_id_vars])
combined_preds <- rbind.fill(lapply(split_component_preds,
function(comp) {
res <- cbind(
comp[1, cols_to_summarize[!(cols_to_summarize %in% cols_to_sum)], drop = FALSE],
apply(comp[, cols_to_sum], 2, logspace_sum) %>%
`dim<-`(c(1, length(cols_to_sum))) %>%
as.data.frame() %>%
`colnames<-`(cols_to_sum)
)
if(!identical(nrow(comp), num_models)) {
warning("predictions from all component models not available.")
res[, cols_to_sum] <- NA
}
return(res)
}
))
return(combined_preds)
}
#' Assemble leave-one-season-out or test phase predictions made by kde, kcde, and sarima
#' models on training data.
#'
#' @param preds_path path to leave-one-season-out or test phase predictions.
#' @param regions character vector specifying regions for which to get predictions,
#' "National" or "Regionk" for k in 1, ..., 10.
#' @param seasons character vector specifying seasons for which to get predictions,
#' "2011/2012" or "2011-2012"
#' @param models character vector specifying models for which to get predictions,
#' "kde", "kcde", or "sarima"
#' @param prediction_targets character vector specifying prediction targets,
#' "onset", "peak_week", "peak_inc", "ph_1_inc", ..., "ph_4_inc"
#' @param prediction_types character vector specifying prediction types,
#' "log_score", "competition_log_score", or "bin_log_probs"
#'
#' @return a data frame with predictions.
#'
#' @export
assemble_predictions <- function(
preds_path = "inst/estimation/loso-predictions",
regions = c("National", paste0("Region", 1:10)),
seasons = paste0(2010:2016, "-", 2011:2017),
models = c("kde", "kcde", "sarima_seasonal_difference_TRUE", "sarima_seasonal_difference_FALSE"),
prediction_targets = c("onset", "peak_week", "peak_inc", "ph_1_inc", "ph_2_inc", "ph_3_inc", "ph_4_inc"),
prediction_types = c("log_score", "competition_log_score", "bin_log_probs")
) {
## Seasons in format used in prediction file names
seasons <- gsub("/", "-", seasons)
## names of files with prediction results to load
file_name_patterns <- sapply(models,
function(model) {
if(identical(preds_path, "inst/prospective-predictions")) {
if(model %in% c("sarima_seasonal_difference_TRUE", "sarima_seasonal_difference_FALSE")) {
paste0(preds_path, "/sarima/prospective_predictions_", model, "/", "*")
} else if(identical(model, "kcde")) {
paste0(preds_path, "/kcde/prospective_predictions_kcde/*")
} else if(identical(model, "kde")) {
paste0(preds_path, "/kde/*")
} else {
stop("Invalid model.")
}
} else {
stop("Currently the only supported path for predictions files is inst/prospective-predictions")
}
})
file_names <- Sys.glob(file_name_patterns)
## Subset file_names to only files for the specified seasons
file_name_seasons <- sapply(file_names,
function(file_path) {
file_start_ind <- regexpr("EW[0-9]{2}-201[0-7]", file_path)
ew <- as.integer(substr(file_path, file_start_ind + 2, file_start_ind + 3))
year <- as.integer(substr(file_path, file_start_ind + 5, file_start_ind + 8))
if(ew > 30) {
season <- paste0(year, "-", year + 1)
} else {
season <- paste0(year - 1, "-", year)
}
return(season)
})
file_names <- file_names[file_name_seasons %in% seasons]
## load the prediction results
pred_res <- rbind.fill(lapply(
file_names,
function(file_path) {
file_start_ind <- regexpr("EW[0-9]{2}-201[0-7]", file_path)
ew <- as.integer(substr(file_path, file_start_ind + 2, file_start_ind + 3))
year <- as.integer(substr(file_path, file_start_ind + 5, file_start_ind + 8))
if(ew > 30) {
season <- paste0(year, "-", year + 1)
} else {
season <- paste0(year - 1, "-", year)
}
read.csv(file_path) %>%
mutate(
ew = ew,
year = year,
season = season)
}
))
## narrow down to the specified prediction targets
targets_submission_format <- prediction_targets
targets_submission_format[targets_submission_format == "onset"] <- "Season onset"
targets_submission_format[targets_submission_format == "peak_week"] <- "Season peak week"
targets_submission_format[targets_submission_format == "peak_inc"] <- "Season peak percentage"
targets_submission_format[targets_submission_format == "ph_1_inc"] <- "1 wk ahead"
targets_submission_format[targets_submission_format == "ph_2_inc"] <- "2 wk ahead"
targets_submission_format[targets_submission_format == "ph_3_inc"] <- "3 wk ahead"
targets_submission_format[targets_submission_format == "ph_4_inc"] <- "4 wk ahead"
target_pred_res <- pred_res %>%
filter(Target %in% targets_submission_format & Type == "Bin")
return(target_pred_res)
}
#' Assemble a data frame of inputs to stacking
#'
#' @param regions string with region: either "National" or in the format
#' "Regionk" for k in {1, ..., 10}
#' @param seasons character vector specifying seasons for which to get predictions,
#' "2011/2012" or "2011-2012" or "*" for all seasons
#' @param prediction_target string with either "onset", "peak_week",
#' "peak_inc", "ph1_inc", ..., "ph4_inc"
#' @param component_model_names character vector with names of component models
#' @param explanatory_variables character vector with names of explanatory variables
#' to include for weights; a non-empty subset of "analysis_time_season_week",
#' "kcde_model_confidence", "sarima_model_confidence", "weighted_ili"
#' @param include_model_performance boolean; should measures of model
#' performance be included in the return result? Generally, should be TRUE if
#' we're getting inputs for model training and FALSE if we're just getting
#' covariates to calculate model weights.
#' @param preds_path path to directory with leave-one-season-out or test phase
#' predictions from each component model. Predictions should be saved in
#' files named like "kde-National-loso-predictions.rds"
#'
#' @return a data frame with covariates that model weights depend on
#' (as specified by explanatory_variables) and possibly log scores from each
#' component model
#'
#' @export
assemble_stacking_inputs <- function(
regions,
seasons = paste0(2010:2016, "-", 2011:2017),
prediction_target,
component_model_names,
explanatory_variables,
include_model_performance = FALSE,
preds_path
) {
## Load prediction results
target_pred_res <- assemble_predictions(
preds_path = preds_path,
regions = c("National", paste0("Region", 1:10)),
seasons = seasons,
models = component_model_names,
prediction_targets = prediction_target
)
## For each season, ...
for(season in seasons) {
## get observed value of prediction target
if(identical(prediction_target, "onset")) {
## get observed onset week
flu_data_for_create_truth <- flu_data %>%
transmute(
location = gsub(
"National",
"US National",
gsub("Region ",
"HHS Region ",
region)),
week = week,
season = season,
season_week = season_week,
wILI = weighted_ili,
ILI = weighted_ili # as of 20171016, FluSight::create_truth operates on ILI column
)
obs_target_values_by_season <- map_df(seasons,
function(season_val) {
FluSight::create_truth(fluview = FALSE,
year = as.integer(substr(season_val, 1, 4)),
weekILI = filter(flu_data_for_create_truth,
season == gsub("-", "/", season_val) & season_week >= 10),
challenge = "ilinet") %>%
mutate(season = season_val)
}
)
#.... ##MORE TO DO -- other targets
}
## subset to weeks that are actually used in scoring.
## From Flu_Challenge_2017-18_v3.docx, these weeks are as follows:
##
## For the seasonal targets, the evaluation period will begin with the first
## forecast submission. The evaluation period for season onset will end with
## the forecast that is received six weeks after the observed onset week while
## the evaluation period for season peak and intensity will end with the first
## forecast received after ILINet is observed to go below baseline for the
## final time during an influenza season. For short-term forecasts, the
## evaluation period will begin with the forecast received four weeks prior to
## the onset week and will end with the forecast received three weeks after
## ILINet is observed to go below baseline for the final time during an
## influenza season.
## Get scores via FluSight::score_entry()
## https://github.com/jarad/FluSight/blob/master/R/score_entry.R
}
## drop rows where any of the (response or) explanatory variables for weights
## have NA values. this is aggressive.
if(include_model_performance) {
vars_to_check <- c(paste0(component_model_names, "_log_score"), explanatory_variables)
} else {
vars_to_check <- explanatory_variables
}
target_pred_res <- target_pred_res[
apply(target_pred_res[, vars_to_check, drop = FALSE],
1,
function(x) {!any(is.na(x))}), # row has na? drop if so
, # all columns
drop = FALSE]
return(target_pred_res)
}
#' Fit a stacking model that assigns weights to component models
#' The weights are a function of observed covariates (which?),
#' and are obtained via gradient tree boosting
#'
#' @param prediction_target string with either "onset", "peak_week",
#' "peak_inc", "ph1_inc", ..., "ph4_inc"
#' @param component_model_names character vector with names of component models
#' @param explanatory_variables character vector with names of explanatory variables
#' to include for weights; a non-empty subset of "analysis_time_season_week",
#' "kcde_model_confidence", "sarima_model_confidence", "weighted_ili"
#' @param loso_preds_path path to directory with leave-one-season-out
#' predictions from each component model. Predictions should be saved in
#' files named like "kde-National-loso-predictions.rds"
#' @param seasons_to_leave_out optional character vector of seasons to leave out
#' of stacking estimation
#' @param booster what form of boosting to use? see xgboost documentation
#' @param subsample fraction of data to use in bagging. not supported yet.
#' @param colsample_bytree fraction of explanatory variables to randomly select
#' in growing each regression tree. see xgboost documentation
#' @param colsample_bylevel fraction of explanatory variables to randomly select
#' in growing each level of the regression tree. see xgboost documentation
#' @param max_depth maximum depth of regression trees. see xgboost documentation
#' @param min_child_weight not recommended for use. see xgboost documentation
#' @param eta learning rate. see xgboost documentation
#' @param gamma Penalty on number of regression tree leafs. see xgboost documentation
#' @param lambda L2 regularization of contribution to model weights in each
#' round. see xgboost documentation
#' @param alpha L1 regularization of contribution to model weights in each round.
#' see xgboost documentation
#' @param nrounds see xgboost documentation
#' @param cv_params optional named list of parameter values to evaluate loss
#' via cross-validation. Each component is a vector of parameter values with
#' name one of "booster", "subsample", "colsample_bytree",
#' "colsample_bylevel", "max_depth", "min_child_weight", "eta", "gamma",
#' "lambda", "alpha", "nrounds"
#' @param cv_folds list specifying observation groups to use in cross-validation
#' each list component is a numeric vector of observation indices.
#' @param cv_nfolds integer specifying the number of cross-validation folds to
#' use. if cv_folds was provided, cv_nfolds is ignored. if cv_folds was not
#' provided, the data will be randomly partitioned into cv_nfolds groups
#' @param cv_refit character describing which of the models specified by the
#' values in cv_params to refit using the full data set. Either "best",
#' "ttest", or "none".
#' @param update an object of class xgbstack to update
#' @param nthread how many threads to use. see xgboost documentation
#' @param verbose how much output to generate along the way. 0 for no logging,
#' 1 for some logging
#'
#' @return a model stacking fit
#'
#' @export
fit_stacked_model <- function(
prediction_target,
component_model_names,
explanatory_variables =
c("analysis_time_season_week", "kcde_model_confidence", "sarima_model_confidence", "weighted_ili"),
loso_preds_path,
seasons_to_leave_out,
booster = "gbtree",
subsample = 1,
colsample_bytree = 1,
colsample_bylevel = 1,
max_depth = 10,
min_child_weight = -10^10,
eta = 0.3,
gamma = 0,
lambda = 0,
alpha = 0,
nrounds = 10,
cv_params = NULL,
cv_folds = NULL,
cv_nfolds = 10L,
cv_refit = "ttest",
update = NULL,
nthread = NULL,
verbose = 0) {
require(xgboost)
require(xgbstack)
### Assemble training data
target_loso_pred_res <- assemble_stacking_inputs(
regions = regions,
prediction_target = prediction_target,
component_model_names = component_model_names,
explanatory_variables = explanatory_variables,
include_model_performance = TRUE,
preds_path = loso_preds_path
)
## if requested, drop season(s)
if(!missing(seasons_to_leave_out)) {
target_loso_pred_res <-
target_loso_pred_res[!(target_loso_pred_res$analysis_time_season %in% seasons_to_leave_out), ]
}
## get cross-validation groups if requested
if(identical(cv_folds, "leave-one-season-out")) {
cv_folds <- lapply(unique(target_loso_pred_res$analysis_time_season),
function(season_val) {
which(target_loso_pred_res$analysis_time_season == season_val)
})
}
## fit stacking model for weights
fit_formula <- as.formula(paste0(
paste0(component_model_names, "_log_score", collapse = " + "),
" ~ ",
paste(explanatory_variables, collapse = " + ")))
xgbstack_fit <- xgbstack(fit_formula,
data = target_loso_pred_res,
booster = booster,
subsample = subsample,
colsample_bytree = colsample_bytree,
colsample_bylevel = colsample_bylevel,
max_depth = max_depth,
min_child_weight = min_child_weight,
eta = eta,
gamma = gamma,
lambda = lambda,
alpha = alpha,
nrounds = nrounds,
cv_params = cv_params,
cv_folds = cv_folds,
cv_nfolds = cv_nfolds,
cv_refit = cv_refit,
update = update,
nthread = nthread,
verbose = verbose)
return(xgbstack_fit)
}
#' Estimate "optimal" unregularized weights for a linear combination of
#' predictive models.
#'
#' This function uses either a grid search or the "degenerate EM" algorithm
#' outlined at http://www.cs.cmu.edu/~roni/11761-f16/Presentations/degenerateEM.pdf
#'
#' @param component_model_log_scores a data frame or matrix of log scores.
#' Each column gives log scores for a particular predictive model.
#' Each row corresponds to one observation
#' @param method character, either "em" for degenerate EM or "grid-search" for
#' grid search
#' @param tol numeric, if method was "em", stopping tolerance
#' @param grid_size number of grid points for each model to evaluate in a grid
#' search
#' @param return_type character, if method was "grid-search", either "optimal"
#' to return only the best weights examined or "all" to return the full grid
#'
#' @return model weights
#'
#' @export
fit_unregularized_stacked_model <- function(
component_model_log_scores,
method = "em",
tol = .Machine$double.eps,
grid_size = 101,
return_type = "optimal") {
component_model_log_scores <- as.matrix(component_model_log_scores)
if(identical(method, "em")) {
weights <- rep(1 / ncol(component_model_log_scores), ncol(component_model_log_scores))
log_weights <- log(weights)
prev_score <- -Inf
curr_score <- mean(logspace_sum_matrix_rows(sweep(component_model_log_scores, 2, log_weights, `+`)))
while((curr_score - prev_score) > tol[1] && (curr_score - prev_score) / abs(curr_score) > tol[1]) {
log_weight_update_num <- sweep(component_model_log_scores, 2, log_weights, `+`)
log_weight_update_denom <- logspace_sum_matrix_rows(log_weight_update_num)
log_weights <-
logspace_sum_matrix_rows(t(log_weight_update_num - log_weight_update_denom)) -
log(nrow(component_model_log_scores))
prev_score <- curr_score
curr_score <- mean(logspace_sum_matrix_rows(sweep(component_model_log_scores, 2, log_weights, `+`)))
}
return(exp(log_weights))
} else if(identical(method, "em-safe")) {
weights <- rep(1 / ncol(component_model_log_scores), ncol(component_model_log_scores))
prev_score <- -Inf
curr_score <- mean(log(exp(component_model_log_scores) %*% matrix(weights)))
while((curr_score - prev_score) / abs(curr_score) > tol[1]) {
weight_update_num <- exp(component_model_log_scores) *
matrix(rep(weights, each = nrow(component_model_log_scores)), nrow = nrow(component_model_log_scores))
weight_update_denom <- apply(weight_update_num, 1, sum)
weights <- apply(weight_update_num / weight_update_denom, 2, mean)
prev_score <- curr_score
curr_score <- mean(log(exp(component_model_log_scores) %*% matrix(weights)))
}
return(weights)
} else if(identical(method, "grid-search")) {
weights_grid <- expand.grid(
lapply(
seq_len(ncol(component_model_log_scores) - 1),
function(i) seq(from = 0, to = 1, length = grid_size)
)
)
weights_grid <- weights_grid[apply(weights_grid, 1, sum) <= 1, ]
weights_grid <- cbind(weights_grid, 1 - apply(weights_grid, 1, sum)) %>%
`colnames<-`(colnames(component_model_log_scores)) %>%
mutate(combined_log_score = NA)
for(weights_ind in seq_len(nrow(weights_grid))) {
weights_grid$combined_log_score[weights_ind] <-
(
exp(component_model_log_scores) %*%
matrix(unlist(weights_grid[weights_ind, seq_len(ncol(component_model_log_scores))]))
) %>%
log() %>%
mean()
}
if(identical(return_type, "optimal")) {
inds <- which(weights_grid$combined_log_score == max(weights_grid$combined_log_score))
} else {
inds <- seq_len(nrow(weights_grid))
}
return(weights_grid[inds, seq_len(ncol(component_model_log_scores)), drop = FALSE])
} else {
stop("invalid method")
}
}
#' Estimate weights for a linear combination of predictive models.
#'
#' This function uses the "degenerate EM" algorithm outlined at
#' http://www.cs.cmu.edu/~roni/11761-f16/Presentations/degenerateEM.pdf,
#' but modified in a way that is ad hoc but I think could be justified
#' to use kernel-weighted observations.
#'
#' @param component_model_log_scores a data frame or matrix of log scores.
#' Each column gives log scores for a particular predictive model.
#' Each row corresponds to one observation
#' @param covariate a single covariate over which weights should be smoothed
#' (more than one covariate may be supported later)
#' @param bw a bandwidth for the smoothing
#' @param tol numeric, if method was "em", stopping tolerance
#'
#' @return model weights
#'
#' @export
fit_kernel_smoothed_stacked_model_fixed_bw <- function(
component_model_log_scores,
covariate,
prediction_covariate,
bw,
tol = .Machine$double.eps) {
component_model_log_scores <- as.matrix(component_model_log_scores)
covariate <- as.matrix(covariate)
prediction_covariate <- as.matrix(prediction_covariate)
method <- "em-safe"
if(identical(method, "em")) {
weights <- rep(1 / ncol(component_model_log_scores), ncol(component_model_log_scores))
log_weights <- log(weights)
prev_score <- -Inf
curr_score <- mean(logspace_sum_matrix_rows(sweep(component_model_log_scores, 2, log_weights, `+`)))
while((curr_score - prev_score) > tol[1] && (curr_score - prev_score) / abs(curr_score) > tol[1]) {
log_weight_update_num <- sweep(component_model_log_scores, 2, log_weights, `+`)
log_weight_update_denom <- logspace_sum_matrix_rows(log_weight_update_num)
log_weights <-
logspace_sum_matrix_rows(t(log_weight_update_num - log_weight_update_denom)) -
log(nrow(component_model_log_scores))
prev_score <- curr_score
curr_score <- mean(logspace_sum_matrix_rows(sweep(component_model_log_scores, 2, log_weights, `+`)))
}
return(exp(log_weights))
} else if(identical(method, "em-safe")) {
combined_weights <- matrix(NA, nrow = nrow(prediction_covariate),
ncol = ncol(component_model_log_scores))
for(i in seq_len(nrow(combined_weights))) {
cat(paste0("Getting weights for covariate ", i, " of ", nrow(combined_weights), ".\n"))
cat(Sys.time())
cat("\n")
obs_weights <- dnorm(covariate, prediction_covariate[i, ], sd = bw)
obs_weights <- obs_weights / sum(obs_weights)
weights <- rep(1 / ncol(component_model_log_scores), ncol(component_model_log_scores))
prev_score <- -Inf
curr_score <- weighted.mean(
log(exp(component_model_log_scores) %*% matrix(weights)),
w = obs_weights)
while((curr_score - prev_score) / abs(curr_score) > tol[1]) {
weight_update_num <- exp(component_model_log_scores) *
matrix(rep(weights, each = nrow(component_model_log_scores)), nrow = nrow(component_model_log_scores))
weight_update_denom <- apply(weight_update_num, 1, sum)
weights <- apply(weight_update_num / weight_update_denom, 2, weighted.mean, w = obs_weights)
prev_score <- curr_score
curr_score <- weighted.mean(
log(exp(component_model_log_scores) %*% matrix(weights)),
w = obs_weights)
# cat(curr_score)
}
combined_weights[i, ] <- weights
}
return(combined_weights)
} else {
stop("invalid method")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.