#' Automatic Selection of Models and Outlier DEtection for Epidemics (ASMODEE)
#'
#' This function implements an algorithm for epidemic time series analysis
#' which aims to detect recent deviations from the trend followed by the
#' data. The data is first partitioned: the last `k` observations are treated
#' as 'recent' data and used as supplementary individuals, while the older
#' data is used to fit the trend. Trend-fitting is done by fitting a series
#' of user-specified models for the time series, with different methods for
#' selecting the best fit (see details, and the argument `method`). The
#' prediction interval is then calculated for the best model, and every data
#' point (including the training set and supplementary individuals) falling
#' outside this interval is classified as an 'outlier'.
#'
#' @details Automatic model selection is used to determine the model that
#' best fits the training data from a list of user-provided models. First,
#' all models are fitted to the data. Second, models are selected using the
#' approach specified by the `method` argument. The default is
#' [`evaluate_aic()`], which uses Akaike's Information Criterion (AIC) to
#' assess model fit penalised by model complexity. This approach is fast, but
#' measures model fit rather than predictive ability. The alternative,
#' [`evaluate_resampling()`], uses cross-validation (10-fold by default) and
#' the root mean squared error (RMSE) to assess model fit. This approach is
#' likely to select models with good predictive abilities, but is
#' computationally intensive. Also, it does not attempt to maximise the
#' explained deviance, so selected models may have good average predictions
#' but underestimate uncertainty.
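#'
#' As a quick sketch of switching the selection method (reusing the `data`,
#' `models` and date column set up as in the examples below):
#'
#' ```
#' # default: fast, AIC-based selection
#' asmodee(data, models, date_index = date, method = evaluate_aic)
#'
#' # slower, cross-validation-based selection
#' asmodee(data, models, date_index = date, method = evaluate_resampling)
#' ```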
#'
#' @author Thibaut Jombart, Dirk Schumacher and Tim Taylor, with inputs from
#' Michael Höhle, Mark Jit, John Edmunds, Andre Charlett, Stéphane Ghozzi
#'
#' @param data A `data.frame` or a `tibble` containing the response and
#' explanatory variables used in the `models`.
#' @param models A list of [`trending_model()`] objects,
#' generated by `lm_model`, `glm_model`, `glm_nb_model`, `brms_model` and
#' similar functions (see `?trending::trending_model` for details).
#' @param ... Not currently used.
#'
#' @return A `trendbreaker` object (an S3 class inheriting from `list`),
#' containing items which can be accessed by various accessors; see
#' `?trendbreaker-accessors`.
#'
#' @examples
#' if (require(cowplot) && require(tidyverse) && require(trending)) {
#'   # load data
#'   data(nhs_pathways_covid19)
#'
#'   # select last 28 days
#'   first_date <- max(nhs_pathways_covid19$date, na.rm = TRUE) - 28
#'   pathways_recent <- nhs_pathways_covid19 %>%
#'     filter(date >= first_date)
#'
#'   # define candidate models
#'   models <- list(
#'     regression = lm_model(count ~ day),
#'     poisson_constant = glm_model(count ~ 1, family = "poisson"),
#'     negbin_time = glm_nb_model(count ~ day),
#'     negbin_time_weekday = glm_nb_model(count ~ day + weekday)
#'   )
#'
#'   # analyses on all data
#'   counts_overall <- pathways_recent %>%
#'     group_by(date, day, weekday) %>%
#'     summarise(count = sum(count))
#'
#'   # results with fixed value of 'k' (7 days)
#'   res_overall_k7 <- asmodee(counts_overall, models, date, k = 7)
#'   plot(res_overall_k7, "date")
#' }
#'
#' @rdname asmodee
#' @export
#'
asmodee <- function(data, models, ...) {
UseMethod("asmodee", data)
}
#' @param date_index The name of a variable corresponding to time, quoted or
#' not.
#' @param alpha The alpha threshold to be used for the prediction interval
#' calculation; defaults to 0.05, i.e. 95% prediction intervals are
#' calculated.
#' @param k An `integer` indicating the number of recent data points to be
#' excluded from the trend fitting procedure. Defaults to 7.
#' @param method A function used to evaluate model fit. Current choices are
#' `evaluate_aic` (default) and `evaluate_resampling`. `evaluate_aic` uses
#' Akaike's Information Criterion, which is faster but possibly less good at
#' selecting models with the best predictive power. `evaluate_resampling`
#' uses cross-validation and, by default, RMSE to assess model fit.
#' @param method_args Optional named list of additional arguments to pass to
#' `method`. Defaults to an empty list.
#' @param simulate_pi A `logical` indicating if prediction intervals should be
#' derived by bootstrap using the ciTools package, or calculated
#' analytically. Defaults to `TRUE`.
#' @param uncertain A `logical` indicating if uncertainty in the fitted
#' parameters should be taken into account when deriving prediction
#' intervals. Only used for glm models and when `simulate_pi = FALSE`.
#' Defaults to `FALSE`.
#' @param include_fitting_warnings A `logical` indicating if results should
#' include models that triggered warnings (but not errors) during the
#' fitting procedure. Defaults to `FALSE`, as warnings typically indicate a
#' lack of convergence during parameter estimation.
#' @param include_prediction_warnings A `logical` indicating if results should
#' include models that triggered warnings (but not errors) during the
#' prediction stage. Defaults to `TRUE`.
#' @param force_positive A `logical` indicating if predictions should be
#' forced to be positive (or zero); this can be useful when using Gaussian
#' models for count data, to avoid negative predictions. Defaults to `FALSE`
#' for general `data.frame` inputs, and to `TRUE` for `incidence2` objects.
#' @param keep_intermediate A `logical` indicating if all output from the
#' fitting and prediction stages should be returned. If `TRUE`, a tibble will
#' be returned in the `fitted_results` position of the resulting list output.
#' If `FALSE` (default), `fitted_results` will be `NULL`.
#'
#' @rdname asmodee
#' @importFrom rlang .data
#' @export
asmodee.data.frame <- function(data, models, date_index, alpha = 0.05, k = 7,
method = evaluate_aic, method_args = list(),
simulate_pi = TRUE, uncertain = FALSE,
include_fitting_warnings = FALSE,
include_prediction_warnings = TRUE,
force_positive = FALSE,
keep_intermediate = FALSE, ...) {
# basic input checks
stopifnot("models has a length of zero" = length(models) > 0)
stopifnot("`alpha` must be a finite number" = is.numeric(alpha) && is.finite(alpha))
stopifnot("`k` must be a finite number" = is.numeric(k) && is.finite(k))
stopifnot("`simulate_pi` should be TRUE or FALSE" = is.logical(simulate_pi))
stopifnot("`uncertain` should be TRUE or FALSE" = is.logical(uncertain))
stopifnot("`include_fitting_warnings` should be TRUE or FALSE" = is.logical(include_fitting_warnings))
stopifnot("`include_prediction_warnings` should be TRUE or FALSE" = is.logical(include_prediction_warnings))
stopifnot("`force_positive` should be TRUE or FALSE" = is.logical(force_positive))
stopifnot("`keep_intermediate` should be TRUE or FALSE" = is.logical(keep_intermediate))
ellipsis::check_dots_empty()
# As the method relies on a 'time' variable for defining training/testing
# sets, we first need to retrieve this information from the 'date_index'
# argument. We borrow the same strategy as the one used in the *incidence2*
# package. A by-product of this is input checking of `date_index`!
date_index <- rlang::enquo(date_index)
idx <- tidyselect::eval_select(date_index, data)
date_index <- names(data)[idx]
dates <- data[[date_index]]
# Ensure k is a whole number of "reasonable" size
k <- int_cast(k) # this will error if `k` cannot be cast to an integer
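# (for reference, a minimal sketch of what an int_cast-style helper could
# look like; the actual internal helper is defined elsewhere in the package
# and may differ)
#   int_cast <- function(x) {
#     if (!isTRUE(all.equal(x, round(x)))) stop("`k` must be a whole number")
#     as.integer(round(x))
#   }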
unique_dates <- unique(dates)
n_dates <- length(unique_dates)
if (k > (n_dates - 4)) {
msg <- sprintf("`k` (%d) is too high for the dataset size (%d)", k, n_dates)
stop(msg)
}
# The rest of the algorithm relies on the following steps:
# 1. define the training set by removing data of the most recent 'k' time
# units in date_index;
# 2. fit all models to the training data;
# 3. remove models that error on fitting (or, optionally, that give warnings);
# 4. evaluate the models using the training set, ranking them from best to
# worst;
# 5. remove models that cannot be evaluated;
# 6. derive prediction intervals by decreasing model rank, keeping the first
# model not giving errors (or, optionally, warnings) as the 'best' model;
# 7. identify outliers.
# Step 1: identify training set
data <- set_training_data(data, date_index = date_index, k = k)
last_training_date <- max(dates[data$.training], na.rm = TRUE)
first_testing_date <- NULL
if (k > 0) {
first_testing_date <- min(dates[!data$.training], na.rm = TRUE)
}
data_train <- get_training_data(data)
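# (illustrative sketch only: set_training_data()/get_training_data() are
# internal helpers, assumed to flag the most recent `k` unique dates as
# testing data, roughly equivalent to:
#   cutoff <- sort(unique(dates), decreasing = TRUE)[k + 1]
#   data$.training <- dates <= cutoff
# the actual helpers are defined elsewhere in the package)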
# Step 2: fit all of the models and capture the warnings and errors
fitted_results <- clapply(models, trending::fit, data = data_train)
colnames(fitted_results) <- c("trending_model_fit", "fitting_warnings", "fitting_errors")
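# (clapply is an internal helper: conceptually a 'catching' lapply which,
# like make_catcher() used below for predictions, applies a function to each
# model and collects results, warnings and errors as three list-columns)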
# optionally keep all fitted results; useful for closer inspection of models
# which errored
all_fitted_results <- NULL
if (keep_intermediate) all_fitted_results <- fitted_results
# Step 3: remove models with fitting errors/warnings
keep <- vapply(fitted_results$fitting_errors, is.null, logical(1))
fitted_results <- fitted_results[keep,]
models <- models[keep]
# remove fitting warnings (optionally)
if (!include_fitting_warnings) {
keep <- vapply(fitted_results$fitting_warnings, is.null, logical(1))
fitted_results <- fitted_results[keep,]
models <- models[keep]
}
# Step 4: evaluate the models
method_args <- utils::modifyList(
method_args,
list(data = data_train, models = models, method = method),
keep.null = TRUE
)
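# note: utils::modifyList() gives precedence to its second argument, so the
# `data`, `models` and `method` entries above cannot be overridden through
# `method_args`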
model_results <- do.call(trendeval::evaluate_models, method_args)
model_results$model <- NULL # this is cleaning up from trendeval output
model_results$data <- NULL # this is cleaning up from trendeval output
model_results <- dplyr::rename(
model_results,
evaluation_warnings = .data$warning,
evaluation_errors = .data$error
)
# combine fitting, evaluation data
out <- dplyr::bind_cols(fitted_results, model_results)
# bring model_name to front (here we allow for potentially unnamed models)
if (!"model_name" %in% names(out)) {
out$model_name <- NA_character_
}
out <- dplyr::relocate(out, .data$model_name)
# Step 5: remove models which could not be evaluated
keep <- vapply(out$evaluation_errors, is.null, logical(1))
out <- out[keep,]
models <- models[keep]
# error if no possible models
if (nrow(out) == 0) {
stop("Unable to fit any model without warnings or errors.")
}
# Rank models from the best to the worst.
# TODO - this assumes only the last column is of interest. OK for AIC, but it
# would be an issue if people passed multiple metrics through to a different
# method, so we should warn in this scenario
ord <- order(out[, ncol(out), drop = TRUE])
models <- models[ord]
out <- out[ord, ]
# Step 6: loop through models from the best to the worst one, attempting to
# derive prediction intervals for each; the first one to work with no error
# (optionally, no warning) is retained as the 'best' model. This is necessary
# because the 'best fitting' model may not be able to yield predictions on the
# recent data points, e.g. when categorical predictors have new levels
# (i.e. absent from the training set).
i <- 0
success <- FALSE
tmp <- out$trending_model_fit
predict_catcher <- make_catcher(predict)
while (i < length(tmp) && !success) {
i <- i + 1
pred_result <- predict_catcher(
tmp[[i]],
new_data = data,
alpha = alpha,
simulate_pi = simulate_pi,
uncertain = uncertain
)
names(pred_result) <- c("result", "prediction_warnings", "prediction_errors")
pred_result <- lapply(pred_result, list)
pred_result <- tibble::as_tibble(pred_result)
# remove prediction errors
keep <- vapply(pred_result$prediction_errors, is.null, logical(1))
pred_result <- pred_result[keep,]
if (nrow(pred_result) == 0) next
# remove prediction warnings (optionally)
if (!include_prediction_warnings) {
keep <- vapply(pred_result$prediction_warnings, is.null, logical(1))
pred_result <- pred_result[keep,]
}
if (nrow(pred_result) == 0) next
success <- TRUE
}
# error if no possible models
if (!success) {
if (include_prediction_warnings) {
msg <- "Unable to derive predictions from any model without warnings or errors."
} else {
msg <- paste(
"Unable to derive predictions from any model without warnings or errors.",
"Consider using `include_prediction_warnings = TRUE`",
"to include models which can issue predictions with warnings.",
sep = "\n"
)
}
stop(msg)
}
# retain the best model and then combine with prediction results
out <- out[i, ]
models <- models[i]
out <- dplyr::bind_cols(out, pred_result)
# Step 7: identify outliers and shape the output
# (note the hacks needed for precision issues)
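# (specifically, dplyr::near() compares numbers with a tolerance, so
# observations lying exactly on a prediction bound are not flagged as
# outliers because of floating-point noise)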
preds <- out$result[[1]]
model <- out$trending_model_fit[[1]]$fitted_model
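# all.vars() lists formula variables in order of appearance, so the first
# element is the name of the response (i.e. the observed variable)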
observed <- all.vars(formula(model))[1]
preds <- dplyr::mutate(
preds,
outlier = .data[[observed]] < .data$lower_pi | .data[[observed]] > .data$upper_pi,
outlier = dplyr::case_when(
outlier & dplyr::near(.data[[observed]], .data$lower_pi) ~ FALSE,
outlier & dplyr::near(.data[[observed]], .data$upper_pi) ~ FALSE,
TRUE ~ outlier
),
classification = dplyr::case_when(
(.data[[observed]] < .data$lower_pi) & .data$outlier ~ "decrease",
(.data[[observed]] > .data$upper_pi) & .data$outlier ~ "increase",
TRUE ~ "normal"
),
classification = factor(
.data$classification,
levels = c("increase", "normal", "decrease")
)
)
# enforce positive predictions if required
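# (neg_to_zero is an internal helper, presumably equivalent to
# function(x) pmax(x, 0), truncating negative values to zero)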
if (force_positive) {
nms <- c("estimate", "lower_ci", "upper_ci", "lower_pi", "upper_pi")
preds[nms] <- lapply(preds[nms], neg_to_zero)
}
# final output
out <- list(
k = k,
model_name = out$model_name[[1]],
trending_model_fit = out$trending_model_fit[[1]],
alpha = alpha,
results = tibble::as_tibble(preds),
date_index = date_index,
last_training_date = last_training_date,
first_testing_date = first_testing_date,
fitted_results = all_fitted_results
)
class(out) <- c("trendbreaker", class(out))
out
}
#' @rdname asmodee
#'
#' @export
asmodee.incidence2 <- function(data, models, alpha = 0.05, k = 7,
method = evaluate_aic, method_args = list(),
simulate_pi = TRUE, uncertain = FALSE,
include_fitting_warnings = FALSE,
include_prediction_warnings = TRUE,
force_positive = TRUE,
keep_intermediate = FALSE, ...) {
# This implementation merely loops over all strata of the incidence2 object,
# removing all strata which errored (optionally, which issued warnings);
# future_clapply provides an entry point for high-level parallelisation, but
# in line with 'future' the setup is left to the user and belongs outside
# trendbreaker.
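# (e.g. users can call future::plan("multisession") before asmodee() to run
# strata in parallel; see ?future::plan)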
# check incidence2 package is present
check_suggests("incidence2")
groups <- incidence2::get_group_names(data)
if (!is.null(groups)) {
dat <- dplyr::grouped_df(data, groups)
keys <- dplyr::group_keys(dat)
split_dat <- dplyr::group_split(dat)
} else {
keys <- data.frame()
split_dat <- list(data)
}
date_index <- incidence2::get_dates_name(data)
out <- future_clapply(
split_dat,
asmodee.data.frame,
models = models,
date_index = date_index,
alpha = alpha,
k = k,
method = method,
simulate_pi = simulate_pi,
uncertain = uncertain,
include_fitting_warnings = include_fitting_warnings,
include_prediction_warnings = include_prediction_warnings,
force_positive = force_positive,
keep_intermediate = keep_intermediate,
...,
future.seed = TRUE
)
names(out) <- c("output", "warnings", "errors")
if (ncol(keys) > 0) {
out <- dplyr::bind_cols(keys, out)
}
# remove errors
keep <- vapply(out$errors, is.null, logical(1))
out <- out[keep,]
class(out) <- c("trendbreaker_incidence2", class(out))
attr(out, "groups") <- groups
out
}