# R/asmodee.R

#' Automatic Selection of Models and Outlier DEtection for Epidemics (ASMODEE)
#'
#' This function implements an algorithm for epidemic time series analysis
#' aimed at detecting recent deviations from the trend followed by the data.
#' Data are first partitioned into 'recent' data, using the last `k`
#' observations as supplementary individuals, and older data, used to fit the
#' trend. Trend-fitting is done by fitting a series of user-specified models to
#' the time series, with different methods for selecting the best fit (see
#' details, and the argument `method`). The prediction interval is then
#' calculated for the best model, and every data point (including the training
#' set and supplementary individuals) falling outside it is classified as an
#' 'outlier'.
#'
#' @details Automatic model selection is used to determine the model best
#'   fitting the training data from a list of user-provided models. First, all
#'   models are fitted to the data. Second, models are selected using the
#'   approach specified by the `method` argument. The default is
#'   [`evaluate_aic()`], which uses Akaike's Information Criterion to assess
#'   model fit penalised by model complexity. This approach is fast, but
#'   measures model fit rather than predictive ability. The alternative,
#'   [`evaluate_resampling()`], uses cross-validation (10-fold by default) and
#'   root mean squared error (RMSE) to assess model fit. This approach is
#'   likely to select models with good predictive abilities, but is
#'   computationally intensive. Also, it does not attempt to maximise the
#'   explained deviance, so selected models may have good average predictions
#'   but underestimate uncertainty.
#'
#' @author Thibaut Jombart, Dirk Schumacher and Tim Taylor, with inputs from
#'   Michael Höhle, Mark Jit, John Edmunds, Andre Charlett, Stéphane Ghozzi
#'
#' @param data A `data.frame` or a `tibble` containing the response and
#'   explanatory variables used in the `models`.
#' @param models A list of [`trending_model()`] objects, generated by
#'   `lm_model`, `glm_model`, `glm_nb_model`, `brms_model` and similar
#'   functions (see `?trending::trending_model()` for details).
#' @param ... Not currently used.
#'
#' @return A `trendbreaker` object (an S3 class inheriting from `list`),
#'   containing items which can be accessed by various accessors - see
#'   `?trendbreaker-accessors`.
#'
#' @examples
#' if (require(cowplot) && require(tidyverse) && require(trending)) {
#'   # load data
#'   data(nhs_pathways_covid19)
#'
#'   # select last 28 days
#'   first_date <- max(nhs_pathways_covid19$date, na.rm = TRUE) - 28
#'   pathways_recent <- nhs_pathways_covid19 %>%
#'     filter(date >= first_date)
#'
#'   # define candidate models
#'   models <- list(
#'     regression = lm_model(count ~ day),
#'     poisson_constant = glm_model(count ~ 1, family = "poisson"),
#'     negbin_time = glm_nb_model(count ~ day),
#'     negbin_time_weekday = glm_nb_model(count ~ day + weekday)
#'   )
#'
#'   # analyses on all data
#'   counts_overall <- pathways_recent %>%
#'     group_by(date, day, weekday) %>%
#'     summarise(count = sum(count))
#'
#'   # results with fixed value of 'k' (7 days)
#'   res_overall_k7 <- asmodee(counts_overall, models, date, k = 7)
#'   plot(res_overall_k7, "date")
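#'
#'   # a sketch (not run): the same analysis, but selecting models by
#'   # cross-validation via `evaluate_resampling` (slower, but it assesses
#'   # predictive ability rather than in-sample fit)
#'   # res_overall_cv <- asmodee(counts_overall, models, date, k = 7,
#'   #                           method = evaluate_resampling)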
#' }
#'
#' @rdname asmodee
#' @export
#'
asmodee <- function(data, models, ...) {
  UseMethod("asmodee", data)
}


#' @param date_index The name of a variable corresponding to time, quoted or
#'   not.
#' @param alpha The alpha threshold to be used for the prediction interval
#'   calculation; defaults to 0.05, i.e. 95% prediction intervals are
#'   calculated.
#' @param k An `integer` indicating the number of recent data points to be
#'   excluded from the trend fitting procedure. Defaults to 7.
#' @param method A function used to evaluate model fit. Current choices are
#'   `evaluate_aic` (default) and `evaluate_resampling`. `evaluate_aic` uses
#'   Akaike's Information Criterion, which is faster but possibly less good at
#'   selecting models with the best predictive power. `evaluate_resampling`
#'   uses cross-validation and, by default, RMSE to assess model fit.
#' @param method_args Optional named list of additional arguments to pass to
#'   `method`. Defaults to an empty list.
#' @param simulate_pi A `logical` indicating if prediction intervals should be
#'   derived by bootstrap using the ciTools package, or calculated
#'   analytically. Defaults to `TRUE`.
#' @param uncertain A `logical` indicating if uncertainty in the fitted
#'   parameters should be taken into account when deriving prediction
#'   intervals. Only used for glm models and if `simulate_pi = FALSE`. Defaults
#'   to `FALSE`.
#' @param include_fitting_warnings A `logical` indicating if results should
#'   include models that triggered warnings (but not errors) during the fitting
#'   procedure. Defaults to `FALSE`, as warnings typically indicate a lack of
#'   convergence during parameter estimation.
#' @param include_prediction_warnings A `logical` indicating if results should
#'   include models that triggered warnings (but not errors) during the
#'   prediction stage. Defaults to `TRUE`.
#' @param force_positive A `logical` indicating if predictions should be forced
#'   to be positive (or zero); this can be useful when using Gaussian models
#'   for count data, to avoid negative predictions. Defaults to `FALSE` for
#'   general `data.frame` inputs, and to `TRUE` for `incidence2` objects.
#' @param keep_intermediate A `logical` indicating if all output from the
#'   fitting and prediction stages should be returned. If `TRUE`, a tibble will
#'   be returned in the fitted_results position of the resulting list output.
#'   If `FALSE` (default) fitted_results will be `NULL`.
#'
#' @rdname asmodee
#' @importFrom rlang .data
#' @export
asmodee.data.frame <- function(data, models, date_index, alpha = 0.05, k = 7,
                               method = evaluate_aic, method_args = list(),
                               simulate_pi = TRUE, uncertain = FALSE,
                               include_fitting_warnings = FALSE,
                               include_prediction_warnings = TRUE,
                               force_positive = FALSE,
                               keep_intermediate = FALSE, ...) {

  # basic input checks
  stopifnot("models has a length of zero" = length(models) > 0)
  stopifnot("`alpha` must be a finite number" = is.numeric(alpha) && is.finite(alpha))
  stopifnot("`k` must be a finite number" = is.numeric(k) && is.finite(k))
  stopifnot("`simulate_pi` should be TRUE or FALSE" = is.logical(simulate_pi))
  stopifnot("`uncertain` should be TRUE or FALSE" = is.logical(uncertain))
  stopifnot("`include_fitting_warnings` should be TRUE or FALSE" = is.logical(include_fitting_warnings))
  stopifnot("`include_prediction_warnings` should be TRUE or FALSE" = is.logical(include_prediction_warnings))
  stopifnot("`force_positive` should be TRUE or FALSE" = is.logical(force_positive))
  stopifnot("`keep_intermediate` should be TRUE or FALSE" = is.logical(keep_intermediate))
  ellipsis::check_dots_empty()

  # As the method relies on a 'time' variable for defining training/testing
  # sets, we first need to retrieve this information from the 'date_index'
  # argument. We borrow the same strategy as the one used in the *incidence2*
  # package. A by-product of this is input checking of date_index!
  date_index <- rlang::enquo(date_index)
  idx <- tidyselect::eval_select(date_index, data)
  date_index <- names(data)[idx]
  dates <- data[[date_index]]
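  # At this point `date_index` is a character scalar (the column name) and
  # `dates` holds the corresponding values; both bare and quoted names work,
  # e.g. asmodee(df, models, date) or asmodee(df, models, "date")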

  # Ensure k is a whole number of "reasonable" size
  k <- int_cast(k) # this will error if cannot cast to integer
  unique_dates <- unique(dates)
  n_dates <- length(unique_dates)
  if (k > (n_dates - 4)) {
    msg <- sprintf("`k` (%d) is too high for the dataset size (%d)", k, n_dates)
    stop(msg)
  }

  # The rest of the algorithm relies on the following steps:
  #  1. define the training set by removing data of the most recent 'k' time
  #     units in date_index
  #  2. fit all models to the training data
  #  3. remove models that error on fitting (or, optionally, that give
  #     warnings)
  #  4. evaluate models using the training set, and rank them from best to
  #     worst
  #  5. remove models that cannot be evaluated
  #  6. derive prediction intervals by decreasing model rank, keeping the first
  #     model not giving errors (or, optionally, warnings) as the 'best' model
  #  7. identify outliers

  # Step 1: identify training set
  data <- set_training_data(data, date_index = date_index, k = k)
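  # set_training_data() adds a logical `.training` column flagging rows used
  # for trend fitting; rows from the most recent `k` dates are marked FALSE
  # (supplementary individuals)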
  last_training_date <- max(dates[data$.training], na.rm = TRUE)
  first_testing_date <- NULL
  if (k > 0) {
    first_testing_date <- min(dates[!data$.training], na.rm = TRUE)
  }
  data_train <- get_training_data(data)

  # Step 2: fit all of the models and capture the warnings and errors
  fitted_results <- clapply(models, trending::fit, data = data_train)
  colnames(fitted_results) <- c("trending_model_fit", "fitting_warnings", "fitting_errors")
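  # each row of fitted_results corresponds to one candidate model; warnings and
  # errors are stored as list-columns (NULL entries when none occurred), which
  # the filtering in step 3 relies on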

  # optionally keep fitted results; useful for closer inspection of models
  # which errored
  all_fitted_results <- NULL
  if (keep_intermediate) all_fitted_results <- fitted_results

  # Step 3: remove models with fitting errors/warnings
  keep <- vapply(fitted_results$fitting_errors, is.null, logical(1))
  fitted_results <- fitted_results[keep,]
  models <- models[keep]

  # remove fitting warnings (optionally)
  if (!include_fitting_warnings) {
    keep <- vapply(fitted_results$fitting_warnings, is.null, logical(1))
    fitted_results <- fitted_results[keep,]
    models <- models[keep]
  }

  # Step 4: evaluate the models
  method_args <- utils::modifyList(
    method_args,
    list(data = data_train, models = models, method = method),
    keep.null = TRUE
  )
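  # with the default arguments, this amounts to calling
  #   trendeval::evaluate_models(data = data_train, models = models,
  #                              method = evaluate_aic)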
  model_results <- do.call(trendeval::evaluate_models, method_args)
  model_results$model <- NULL   # this is cleaning up from trendeval output
  model_results$data <- NULL    # this is cleaning up from trendeval output
  model_results <- dplyr::rename(
    model_results,
    evaluation_warnings = .data$warning,
    evaluation_errors = .data$error
  )

  # combine fitting and evaluation data
  out <- dplyr::bind_cols(fitted_results, model_results)

  # bring model_name to front (here we allow for potentially unnamed models)
  if (!"model_name" %in% names(out)) {
    out$model_name <- NA_character_
  }
  out <- dplyr::relocate(out, .data$model_name)

  # Step 5: remove models which could not be evaluated
  keep <- vapply(out$evaluation_errors, is.null, logical(1))
  out <- out[keep,]
  models <- models[keep]

  # error if no possible models
  if (nrow(out) == 0) {
    stop("Unable to fit any model without warnings or errors.")
  }

  # Rank models from the best to the worst.
  # TODO - this assumes only last column is of interest. Ok for aic but would
  #        be an issue if people passed multiple metrics through to a different
  #        method so we should warn in this scenario
  models <- models[order(out[, ncol(out), drop = TRUE])]
  out <- out[order(out[, ncol(out), drop = TRUE]), ]

  # Step 6: loop through models from the best to the worst one, attempting to
  # derive prediction intervals for each; the first one to work with no error
  # (optionally, no warning) is retained as the 'best' model. This is necessary
  # because the 'best fitting' model may not be able to yield predictions on the
  # recent data points, e.g. when categorical predictors have new levels
  # (i.e. absent from the training set).
  i <- 0
  success <- FALSE
  tmp <- out$trending_model_fit
  predict_catcher <- make_catcher(predict)
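  # make_catcher() is an internal wrapper which, like clapply() above, captures
  # warnings and errors from predict() rather than letting them interrupt the
  # loop; the captured conditions are inspected below to decide whether to try
  # the next model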
  while(i < length(tmp) && !success) {
    i <- i + 1
    pred_result <- predict_catcher(
      tmp[[i]],
      new_data = data,
      alpha = alpha,
      simulate_pi = simulate_pi,
      uncertain = uncertain
    )
    names(pred_result) <- c("result", "prediction_warnings", "prediction_errors")
    pred_result <- lapply(pred_result, list)
    pred_result <- tibble::as_tibble(pred_result)

    # remove prediction errors
    keep <- vapply(pred_result$prediction_errors, is.null, logical(1))
    pred_result <- pred_result[keep,]
    if (nrow(pred_result) == 0) next

    # remove prediction warnings (optionally)
    if (!include_prediction_warnings) {
      keep <- vapply(pred_result$prediction_warnings, is.null, logical(1))
      pred_result <- pred_result[keep,]
    }

    if (nrow(pred_result) == 0) next
    success <- TRUE
  }

  # error if no possible models
  if (!success) {
    if (include_prediction_warnings) {
      msg <- "Unable to derive predictions from any model without warnings or errors."
    } else {
      msg <- paste(
        "Unable to derive predictions from any model without warnings or errors.",
        "Consider using `include_prediction_warnings = TRUE`",
        "to include models which can issue predictions with warnings.",
        sep = "\n"
      )
    }
    stop(msg)
  }

  # retain the best model and then combine with prediction results
  out <- out[i, ]
  models <- models[i]
  out <- dplyr::bind_cols(out, pred_result)


  # Step 7: identify outliers and shape the output
  # (note the hacks needed for precision issues)
  preds <- out$result[[1]]
  model <- out$trending_model_fit[[1]]$fitted_model
  observed <- all.vars(formula(model))[1]
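  # dplyr::near() is used below so that observations lying exactly on a
  # prediction bound (up to floating-point error) are not flagged as outliers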
  preds <- dplyr::mutate(
    preds,
    outlier = .data[[observed]] < .data$lower_pi | .data[[observed]] > .data$upper_pi,
    outlier = dplyr::case_when(
      outlier & dplyr::near(.data[[observed]], .data$lower_pi) ~ FALSE,
      outlier & dplyr::near(.data[[observed]], .data$upper_pi) ~ FALSE,
      TRUE ~ outlier
    ),
    classification = dplyr::case_when(
      (.data[[observed]] < .data$lower_pi) & .data$outlier ~ "decrease",
      (.data[[observed]] > .data$upper_pi) & .data$outlier ~ "increase",
      TRUE ~ "normal"
    ),
    classification = factor(
      .data$classification,
      levels = c("increase", "normal", "decrease")
    )
  )

  # enforce positive predictions if required
  if (force_positive) {
    nms <- c("estimate", "lower_ci", "upper_ci", "lower_pi", "upper_pi")
    preds[nms] <- lapply(preds[nms], neg_to_zero)
  }

  # final output
  out <- list(
    k = k,
    model_name = out$model_name[[1]],
    trending_model_fit = out$trending_model_fit[[1]],
    alpha = alpha,
    results = tibble::as_tibble(preds),
    date_index = date_index,
    last_training_date = last_training_date,
    first_testing_date = first_testing_date,
    fitted_results = all_fitted_results
  )
  class(out) <- c("trendbreaker", class(out))
  out
}


#' @rdname asmodee
#'
#' @export
asmodee.incidence2 <- function(data, models, alpha = 0.05, k = 7,
                               method = evaluate_aic, method_args = list(),
                               simulate_pi = TRUE, uncertain = FALSE,
                               include_fitting_warnings = FALSE,
                               include_prediction_warnings = TRUE,
                               force_positive = TRUE,
                               keep_intermediate = FALSE, ...) {

  # This implementation merely loops over all strata of the incidence2 object,
  # removing all strata which errored (optionally, which issued warnings);
  # future_clapply provides an entry point for high-level parallelisation, but
  # in line with 'future' the setup is left to the user and belongs outside
  # trendbreaker.
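  # For instance, running future::plan(future::multisession) before calling
  # asmodee() on a grouped incidence2 object should process strata in parallel
  # (a suggestion only; the choice of plan is left to the user).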

  # check incidence2 package is present
  check_suggests("incidence2")

  groups <- incidence2::get_group_names(data)
  if (!is.null(groups)) {
    dat <- dplyr::grouped_df(data, groups)
    keys <- dplyr::group_keys(dat)
    split_dat <- dplyr::group_split(dat)
  } else {
    keys <- data.frame()
    split_dat <- list(data)
  }
  date_index <- incidence2::get_dates_name(data)

  out <- future_clapply(
    split_dat,
    asmodee.data.frame,
    models = models,
    date_index = date_index,
    alpha = alpha,
    k = k,
    method = method,
    simulate_pi = simulate_pi,
    uncertain = uncertain,
    include_fitting_warnings = include_fitting_warnings,
    include_prediction_warnings = include_prediction_warnings,
    force_positive = force_positive,
    keep_intermediate = keep_intermediate,
    ...,
    future.seed = TRUE
  )
  names(out) <- c("output", "warnings", "errors")

  if (ncol(keys) > 0) {
    out <- dplyr::bind_cols(keys, out)
  }

  # remove errors
  keep <- vapply(out$errors, is.null, logical(1))
  out <- out[keep,]

  class(out) <- c("trendbreaker_incidence2", class(out))
  attr(out, "groups") <- groups
  out
}