R/outliers.R

Defines functions roll_iqr detect_outlr_stl detect_outlr_rm detect_outlr

Documented in detect_outlr detect_outlr_rm detect_outlr_stl

#' Detect outliers
#'
#' @description Applies one or more outlier detection methods to a given signal variable, and
#' optionally aggregates the outputs to create a consensus result. See the
#' [outliers
#' vignette](https://cmu-delphi.github.io/epiprocess/articles/outliers.html) for
#' examples.
#'
#' @param x Design points corresponding to the signal values `y`. Default is
#'   `seq_along(y)` (that is, equally-spaced points from 1 to the length of
#'   `y`).
#' @param y Signal values.
#' @param methods A tibble specifying the method(s) to use for outlier
#'   detection, with one row per method, and the following columns:
#'   * `method`: Either "rm" or "stl", or a custom function for outlier
#'   detection; see details for further explanation.
#'   * `args`: Named list of arguments that will be passed to the detection
#'   method.
#'   * `abbr`: Abbreviation to use in naming output columns with results from
#'   this method.
#' @param combiner String, one of "median", "mean", or "none", specifying how to
#'   combine results from different outlier detection methods for the thresholds
#'   determining whether a particular observation is classified as an outlier,
#'   as well as a replacement value for any outliers.  If "none", then no
#'   summarized results are calculated. Note that if the number of `methods`
#'   (number of rows) is odd, then "median" is equivalent to a majority vote for
#'   purposes of determining whether a given observation is an outlier.
#' @return An tibble with number of rows equal to `length(y)` and columns
#'  giving the outlier detection thresholds (`lower` and `upper`) and
#'  replacement values from each detection method (`replacement`).
#'
#' @details Each outlier detection method, one per row of the passed `methods`
#'   tibble, is a function that must take as its first two arguments `x` and
#'   `y`, and then any number of additional arguments. The function must return
#'   a tibble with the number of rows equal to `length(y)`, and with columns
#'   `lower`, `upper`, and `replacement`, representing lower and upper bounds
#'   for what would be considered an outlier, and a posited replacement value,
#'   respectively.
#'
#'  For convenience, the outlier detection method can be specified (in the
#'   `method` column of `methods`) by a string "rm", shorthand for
#'   `detect_outlr_rm()`, which detects outliers via a rolling median; or by
#'   "stl", shorthand for `detect_outlr_stl()`, which detects outliers via an
#'   STL decomposition.
#'
#' @rdname detect_outlr
#' @export
#' @importFrom dplyr select
#' @examples
#' detection_methods <- dplyr::bind_rows(
#'   dplyr::tibble(
#'     method = "rm",
#'     args = list(list(
#'       detect_negatives = TRUE,
#'       detection_multiplier = 2.5
#'     )),
#'     abbr = "rm"
#'   ),
#'   dplyr::tibble(
#'     method = "stl",
#'     args = list(list(
#'       detect_negatives = TRUE,
#'       detection_multiplier = 2.5,
#'       seasonal_period = 7
#'     )),
#'     abbr = "stl_seasonal"
#'   ),
#'   dplyr::tibble(
#'     method = "stl",
#'     args = list(list(
#'       detect_negatives = TRUE,
#'       detection_multiplier = 2.5,
#'       seasonal_period = 7,
#'       seasonal_as_residual = TRUE
#'     )),
#'     abbr = "stl_reseasonal"
#'   )
#' )
#'
#' x <- covid_incidence_outliers %>%
#'   dplyr::select(geo_value, time_value, cases) %>%
#'   as_epi_df() %>%
#'   group_by(geo_value) %>%
#'   mutate(outlier_info = detect_outlr(
#'     x = time_value, y = cases,
#'     methods = detection_methods,
#'     combiner = "median"
#'   )) %>%
#'   unnest(outlier_info)
detect_outlr <- function(x = seq_along(y), y,
                         methods = tibble::tibble(
                           method = "rm",
                           args = list(list()),
                           abbr = "rm"
                         ),
                         combiner = c("median", "mean", "none")) {
  # Validate combiner
  combiner <- rlang::arg_match(combiner)

  # Validate that x contains all distinct values
  if (any(duplicated(x))) {
    cli_abort(
      "`x` cannot contain duplicate values. (If being run on a
      column in an `epi_df`, did you group by relevant key variables?)"
    )
  }

  # Run all outlier detection methods
  results <- purrr::pmap_dfc(methods, function(method, args, abbr) {
    if (is.character(method)) method <- paste0("detect_outlr_", method)

    # Call the method
    results <- do.call(method, args = c(list("x" = x, "y" = y), args))

    # Validate the output
    assert_data_frame(results)
    if (!test_subset(c("lower", "upper", "replacement"), colnames(results))) {
      cli_abort(
        "Columns `lower`, `upper`, and `replacement` must be present in the output of the outlier detection method."
      )
    }

    # Update column names with model abbreviation
    colnames(results) <- paste(abbr, colnames(results), sep = "_")
    return(results)
  })

  # Combine information about detected outliers
  if (combiner != "none") {
    if (combiner == "mean") {
      combine_fun <- mean
    } else if (combiner == "median") {
      combine_fun <- median
    }

    for (target in c("lower", "upper", "replacement")) {
      results[[paste0("combined_", target)]] <- apply(
        results %>%
          dplyr::select(dplyr::ends_with(target)), 1, combine_fun
      )
    }
  }

  return(results)
}

#' @description `detect_outlr_rm` detects outliers based on a distance from the
#' rolling median specified in terms of multiples of the rolling interquartile
#' range (IQR).
#'
#' @param n Number of time steps to use in the rolling window. Default is 21.
#'   This value is centrally aligned. When `n` is an odd number, the rolling
#'   window extends from `(n-1)/2` time steps before each design point to `(n-1)/2`
#'   time steps after. When `n` is even, then the rolling range extends from
#'   `n/2-1` time steps before to `n/2` time steps after.
#' @template outlier-detection-options
#'
#' @rdname detect_outlr
#' @export
#' @examples
#' # Detect outliers based on a rolling median
#' covid_incidence_outliers %>%
#'   dplyr::select(geo_value, time_value, cases) %>%
#'   as_epi_df() %>%
#'   group_by(geo_value) %>%
#'   mutate(outlier_info = detect_outlr_rm(
#'     x = time_value, y = cases
#'   ))
detect_outlr_rm <- function(x = seq_along(y), y, n = 21,
                            log_transform = FALSE,
                            detect_negatives = FALSE,
                            detection_multiplier = 2,
                            min_radius = 0,
                            replacement_multiplier = 0) {
  # Transform if requested
  if (log_transform) {
    # Replace all negative values with 0
    y <- pmax(0, y)
    offset <- as.integer(any(y == 0))
    y <- log(y + offset)
  }

  # Detect negatives if requested
  if (detect_negatives && !log_transform) {
    min_lower <- 0
  } else {
    min_lower <- -Inf
  }

  # Make an epi_df for easy sliding
  z <- as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y))

  # Calculate lower and upper thresholds and replacement value
  z <- z %>%
    epi_slide(fitted = median(y, na.rm = TRUE), .window_size = n, .align = "center") %>%
    dplyr::mutate(resid = y - fitted) %>%
    roll_iqr(
      n = n,
      detection_multiplier = detection_multiplier,
      min_radius = min_radius,
      replacement_multiplier = replacement_multiplier,
      min_lower = min_lower
    )

  # Undo log transformation if necessary
  if (log_transform) {
    z$lower <- exp(z$lower) - offset
    z$upper <- exp(z$upper) - offset
    z$replacement <- exp(z$replacement) - offset
  }

  return(z)
}

#' @description `detect_outlr_stl` detects outliers based on a seasonal-trend
#' decomposition using LOESS (STL).
#'
#' @param n_trend Number of time steps to use in the rolling window for trend.
#'   Default is 21.
#' @param n_seasonal Number of time steps to use in the rolling window for
#'   seasonality. Default is 21. Can also be the string "periodic". See
#'   `s.window` in [`stats::stl`].
#' @param n_threshold Number of time steps to use in rolling window for the IQR
#'   outlier thresholds.
#' @param seasonal_period Integer specifying period of "seasonality". For
#'   example, for daily data, a period 7 means weekly seasonality. It must be
#'   strictly larger than 1. Also impacts the size of the low-pass filter
#'   window; see `l.window` in [`stats::stl`].
#' @param seasonal_as_residual Boolean specifying whether the seasonal(/weekly)
#'   component should be treated as part of the residual component instead of as
#'   part of the predictions. The default, FALSE, treats them as part of the
#'   predictions, so large seasonal(/weekly) components will not lead to
#'   flagging points as outliers. `TRUE` may instead consider the extrema of
#'   large seasonal variations to be outliers; `n_seasonal` and
#'   `seasonal_period` will still have an impact on the result, though, by
#'   impacting the estimation of the trend component.
#' @template outlier-detection-options
#'
#' @details The STL decomposition is computed using [`stats::stl()`]. Once
#'   computed, the outlier detection method is analogous to the rolling median
#'   method in [`detect_outlr_rm()`], except with the fitted values and residuals
#'   from the STL decomposition taking the place of the rolling median and
#'   residuals to the rolling median, respectively.
#'
#' The last set of arguments, `log_transform` through `replacement_multiplier`,
#'   are exactly as in `detect_outlr_rm()`.
#'
#' @rdname detect_outlr
#' @importFrom stats median
#' @importFrom tidyselect starts_with
#' @export
#' @examples
#' # Detects outliers based on a seasonal-trend decomposition using LOESS
#' covid_incidence_outliers %>%
#'   dplyr::select(geo_value, time_value, cases) %>%
#'   as_epi_df() %>%
#'   group_by(geo_value) %>%
#'   mutate(outlier_info = detect_outlr_stl(
#'     x = time_value, y = cases,
#'     seasonal_period = 7 # weekly seasonality for daily data
#'   ))
detect_outlr_stl <- function(x = seq_along(y), y,
                             n_trend = 21,
                             n_seasonal = 21,
                             n_threshold = 21,
                             seasonal_period,
                             seasonal_as_residual = FALSE,
                             log_transform = FALSE,
                             detect_negatives = FALSE,
                             detection_multiplier = 2,
                             min_radius = 0,
                             replacement_multiplier = 0) {
  if (dplyr::n_distinct(x) != length(y)) {
    cli_abort("`x` contains duplicate values. (If being run on a column in an
               `epi_df`, did you group by relevant key variables?)")
  }
  if (length(y) <= 1L) {
    cli_abort("`y` has length {length(y)}; that's definitely too little for
               STL.  (If being run in a `mutate()` or `epi_slide()`, check
               whether you grouped by too many variables; you should not be
               grouping by `time_value` in particular.)")
  }
  distinct_x_skips <- unique(diff(x))
  if (diff(range(distinct_x_skips)) > 1e-4 * mean(distinct_x_skips)) {
    cli_abort("`x` does not appear to have regular spacing; consider filling in
               gaps with imputed values (STL does not allow NAs).")
  }
  if (is.unsorted(x)) { # <- for performance in common (sorted) case
    o <- order(x)
    x <- x[o]
    y <- y[o]
  }

  # Transform if requested
  if (log_transform) {
    # Replace all negative values with 0
    y <- pmax(0, y)
    offset <- as.integer(any(y == 0))
    y <- log(y + offset)
  }

  assert_int(seasonal_period, lower = 2L)
  assert_logical(seasonal_as_residual, len = 1L, any.missing = FALSE)

  yts <- stats::ts(y, frequency = seasonal_period)
  stl_comp <- stats::stl(yts,
    t.window = n_trend, s.window = n_seasonal,
    robust = TRUE
  )$time.series %>%
    tibble::as_tibble() %>%
    dplyr::rename(resid = .data$remainder)

  # Allocate the seasonal term from STL to either fitted or resid
  if (!seasonal_as_residual) {
    stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend + .data$seasonal)
  } else {
    stl_comp <- dplyr::mutate(stl_comp, fitted = .data$trend, resid = .data$seasonal + .data$resid)
  }

  # Detect negatives if requested
  if (detect_negatives && !log_transform) {
    min_lower <- 0
  } else {
    min_lower <- -Inf
  }

  # Make an epi_df for easy sliding
  z <- as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y))

  # Calculate lower and upper thresholds and replacement value
  z <- z %>%
    dplyr::mutate(fitted = stl_comp$fitted, resid = stl_comp$resid) %>%
    roll_iqr(
      n = n_threshold,
      detection_multiplier = detection_multiplier,
      min_radius = min_radius,
      replacement_multiplier = replacement_multiplier,
      min_lower = min_lower
    )

  # Undo log transformation if necessary
  if (log_transform) {
    z$lower <- exp(z$lower) - offset
    z$upper <- exp(z$upper) - offset
    z$replacement <- exp(z$replacement) - offset
  }

  return(z)
}

# Common function for rolling IQR, using fitted and resid variables
roll_iqr <- function(z, n, detection_multiplier, min_radius,
                     replacement_multiplier, min_lower) {
  if (typeof(z$y) == "integer") {
    as_type <- as.integer # nolint: object_usage_linter
  } else {
    as_type <- as.numeric
  }

  z %>%
    epi_slide(
      roll_iqr = stats::IQR(resid, na.rm = TRUE),
      .window_size = n, .align = "center"
    ) %>%
    dplyr::mutate(
      lower = pmax(
        min_lower,
        fitted - pmax(min_radius, detection_multiplier * roll_iqr)
      ),
      upper = fitted + pmax(min_radius, detection_multiplier * roll_iqr),
      replacement = dplyr::case_when(
        (y < lower) ~ as_type(fitted - replacement_multiplier * roll_iqr),
        (y > upper) ~ as_type(fitted + replacement_multiplier * roll_iqr),
        TRUE ~ y
      )
    ) %>%
    dplyr::select(.data$lower, .data$upper, .data$replacement) %>%
    tibble::as_tibble()
}
cmu-delphi/epiprocess documentation built on Oct. 29, 2024, 5:37 p.m.