## utility functions for SARIMA fits
#' Simulate predictive trajectories from an ARIMA model
#'
#' This is directly taken from the forecast.Arima function from the forecast package,
#' but I've forced bootstrap = TRUE and return the simulated trajectories which were
#' not returned in the original function definition.
#'
#' @param object an Arima fit object (with class "Arima")
#' @param h number of time steps forwards to simulate
#' @param level not used
#' @param fan not used
#' @param xreg not used
#' @param lambda not used
#' @param npaths number of sample trajectories to simulate
#'
#' @return an npaths by h matrix with simulated values
#'
#' @export
sample_predictive_trajectories_arima <- function (object,
h = ifelse(object$arma[5] > 1, 2 * object$arma[5], 10),
level = c(80, 95),
fan = FALSE,
xreg = NULL,
lambda = object$lambda,
npaths = 5000,
...) {
sim <- matrix(NA, nrow = npaths, ncol = h)
for (i in 1:npaths) {
try(sim[i, ] <- forecast:::simulate.Arima(object, nsim = h), silent = TRUE)
}
return(sim)
}
#' A wrapper around sample_predictive_trajectories_arima suitable for use as the
#' \code{simulate_trajectories_function} argument to
#' \code{get_log_scores_via_trajectory_simulation}.
#'
#' This function does a few things worth noting. It subsets data to the
#' analysis time. It pulls in an appropriate SARIMA fit, either a
#' leave-one-season-out fit if the analysis time is before the first test season
#' or a fit based on all of the training data if the analysis time is in or
#' after the first test season. It linearly interpolates missing values.
#' It handles log transformations and possible seasonal differencing that may
#' be done outside of functionality in the forecast package.
#'
#' @param n_sims number of trajectories to simulate
#' @param max_prediction_horizon how many steps ahead to simulate
#' @param data data set
#' @param region region
#' @param analysis_time_season season in which we're predicting
#' @param analysis_time_season_week week of the season in which we're making our
#' predictions, using all data up to the analysis time to make predictions for
#' later time points
#' @param params other parameters. A list with the following entries:
#' * fits_filepath = path to a directory where SARIMA model fits are located
#' * prediction_target_var = string naming variable in data we are predicting
#' * seasonal_difference = logical specifying whether a seasonal difference
#' should be computed manually before passing to auto.arima
#' * transformation = string, either "log", "box-cox", or "none", indicating
#' type of transformation to do
#' * first_test_season = string, in format of "2011/2012", specifying first
#' test season.
#'
#' @return an n_sims by h matrix with simulated values
#'
#' @export
sample_predictive_trajectories_arima_wrapper <- function(
n_sims,
max_prediction_horizon,
data,
region,
analysis_time_season,
analysis_time_season_week,
params
) {
## load SARIMA fit
reg_str <- gsub(" ", "_", region)
fit_filepath <- file.path(
params$fits_filepath,
paste0(
"sarima-",
reg_str,
"-seasonal_difference_", params$seasonal_difference,
"-transformation_", params$transformation,
"-first_test_season_",
gsub("/", "_", params$first_test_season),
".rds"))
## If no SARIMA fit, exit early by returning a matrix of NAs
if(!file.exists(fit_filepath)) {
return(matrix(NA, nrow = n_sims, ncol = max_prediction_horizon))
}
sarima_fit <- readRDS(file = fit_filepath)
## Update SARIMA fit object with seasonally differenced data up
## through analysis_time_ind
if(identical(params$transformation, "log")) {
undifferenced_new_target_data <- log(data[, params$prediction_target_var])
} else {
undifferenced_new_target_data <- data[, params$prediction_target_var]
}
if(params$seasonal_difference) {
new_target_data <- ts(c(rep(NA, 52),
undifferenced_new_target_data[seq(from = 53, to = nrow(data))] -
undifferenced_new_target_data[seq(from = 1, to = nrow(data) - 52)]),
frequency = 52)
} else {
new_target_data <- ts(undifferenced_new_target_data, frequency = 52)
}
## deal with internal missing values in new_target_data that can result in
## simulated trajectories of all NAs if the model has a moving average component
##
## here we do this by linear interpolation
##
## another solution would be to write a version of stats::filter that
## does the "right thing" with NAs if filter coefficients are 0
## and then use that function in forecast:::myarima.sim
if(any(is.na(new_target_data) | is.infinite(new_target_data)) && !is.na(tail(new_target_data, 1))) {
## drop leading NAs
if(is.na(new_target_data[1] | is.infinite(new_target_data[1]))) {
num_leading_nas <-
rle(is.na(new_target_data) | is.infinite(new_target_data))$lengths[1]
new_target_data <- new_target_data[- seq_len(num_leading_nas)]
}
## interpolate internal NAs
while(any(is.na(new_target_data) | is.infinite(new_target_data))) {
na_rle <- rle(is.na(new_target_data) | is.infinite(new_target_data))
na_run_start_ind <- na_rle$lengths[1] + 1
na_run_end_ind <- na_run_start_ind + na_rle$lengths[2] - 1
new_target_data[na_run_start_ind:na_run_end_ind] <-
approx(
x = c(na_run_start_ind - 1, na_run_end_ind + 1),
y = new_target_data[c(na_run_start_ind - 1, na_run_end_ind + 1)],
xout = na_run_start_ind:na_run_end_ind,
method = "linear"
)$y
}
}
updated_sarima_fit <- Arima(
new_target_data,
model = sarima_fit)
## The update process via call to Arima can somehow result in 0/negative
## variance estimates. Reset to original values
updated_sarima_fit$sigma2 <- sarima_fit$sigma2
updated_sarima_fit$var.coef <- sarima_fit$var.coef
raw_trajectory_samples <- sample_predictive_trajectories_arima(
updated_sarima_fit,
h = max_prediction_horizon,
npaths = n_sims)
## Sampled trajectories are of seasonally differenced log incidence
## Get to trajectories for originally observed incidence ("inc") by
## adding seasonal lag of incidence and exponentiating
inc_trajectory_samples <- raw_trajectory_samples
if(params$seasonal_difference) {
for(prediction_horizon in seq_len(max_prediction_horizon)) {
inc_trajectory_samples[, prediction_horizon] <-
raw_trajectory_samples[, prediction_horizon] +
undifferenced_new_target_data[nrow(data) + prediction_horizon - 52]
}
}
if(identical(params$transformation, "log")) {
inc_trajectory_samples <- exp(inc_trajectory_samples)
}
return(inc_trajectory_samples)
}
#' Estimate SARIMA model using data up to but not including first_test_season
#'
#' @param data regional dataset with structure like regionflu-cleaned
#' @param reg_num region number for estimation
#' @param first_test_season string indicating first test season
#' @param d order of first differencing
#' @param D order of seasonal differencing
#' @param seasonal_difference boolean; take a seasonal difference before passing
#' to auto.arima?
#' @param transformation character specifying transformation type:
#' "box-cox", "log", or "none"
#' @param path path in which to save files
#'
#' @return NULL just saves files
#'
#' @export
fit_region_sarima <- function(
data,
region,
first_test_season,
d = NA,
D = NA,
seasonal_difference = TRUE,
transformation = "none",
path) {
require(forecast)
## subset data to be only the region of interest
data <- data[data$region == region,]
## Subset data to do estimation using only data up to (and not including)
## first_test_season. remainder are held out
first_ind_test_season <- min(which(data$season == first_test_season))
data <- data[seq_len(first_ind_test_season - 1), , drop = FALSE]
prediction_target_var <- "weighted_ili"
if(identical(transformation, "log")) {
undifferenced_target_data <- log(data[, prediction_target_var])
} else {
undifferenced_target_data <- data[, prediction_target_var]
}
if(seasonal_difference) {
pred_target <- ts(c(rep(NA, 52),
undifferenced_target_data[seq(from = 53, to = nrow(data))] -
undifferenced_target_data[seq(from = 1, to = nrow(data) - 52)]),
frequency = 52)
} else {
pred_target <- ts(undifferenced_target_data, frequency = 52)
}
if(identical(transformation, "box-cox")) {
lambda <- BoxCox.lambda(pred_target)
## get SARIMA fit
sarima_fit <- auto.arima(pred_target,
d = d,
D = D,
stationary = TRUE,
lambda = lambda)
} else {
## log transformation (already done manually) or no transformation
sarima_fit <- auto.arima(pred_target,
d = d,
D = D,
stationary = TRUE)
}
filename <- paste0(
path,
"sarima-",
gsub(" ", "_", region),
"-seasonal_difference_", seasonal_difference,
"-transformation_", transformation,
"-first_test_season_", gsub("/", "_", first_test_season),
".rds")
saveRDS(sarima_fit, file = filename)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.