R/revision_analysis.R

Defines functions difftime_summary num_percent f_no_na get_last_run time_within_x_latest revision_summary

Documented in difftime_summary f_no_na get_last_run num_percent revision_summary time_within_x_latest

#' A function to describe revision behavior for an archive.
#'
#' @description
#' `revision_summary` removes all missing values (if requested), and then
#'   computes some basic statistics about the revision behavior of an archive,
#'   returning a tibble summarizing the revisions per time_value+epi_key
#'   features. If `print_inform` is true, it prints a concise summary. The
#'   columns returned are:
#'  1. `n_revisions`: the total number of revisions for that entry
#'  2. `min_lag`: the minimum time to any value (if `drop_nas=FALSE`, this
#'   includes `NA`'s)
#'  3. `max_lag`: the amount of time until the final (new) version (same caveat
#'   for `drop_nas=FALSE`, though it is far less likely to matter)
#'  4. `min_value`: the minimum value across revisions
#'  5. `max_value`: the maximum value across revisions
#'  6. `median_value`: the median value across revisions
#'  7. `spread`: the difference between the smallest and largest values (this
#'   always excludes `NA` values)
#'  8. `rel_spread`: `spread` divided by the largest value (so it will
#'   always be less than 1). Note that this need not be the final value. It will
#'   be `NA` whenever `spread` is 0.
#'  9. `time_near_latest`: the time taken for the revisions to settle to within
#'   `within_latest` (default 20%) of the final value and stay there. For
#'   example, consider the series (0, 20, 99, 150, 102, 100); then
#'   `time_near_latest` is 5, since even though 99 is within 20%, it is outside
#'   the window afterwards at 150.
#'
#' @param epi_arch an epi_archive to be analyzed
#' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to
#'   summarize. If empty, it chooses the first. Currently only implemented for
#'   one column at a time.
#' @param drop_nas bool, drop any `NA` values from the archive? After dropping
#'   `NA`'s compactify is run again to make sure there are no duplicate values
#'   from occasions when the signal is revised to `NA`, and then back to its
#'   immediately-preceding value.
#' @param print_inform bool, determines whether to print summary information, or
#'   only return the full summary tibble
#' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any
#'   time_values not earlier than `min_waiting_period` before `versions_end` are
#'   removed. `min_waiting_period` should characterize the typical time during
#'   which revisions occur.  The default of 60 days corresponds to a typical
#'   final value for case counts as reported in the context of insurance. To
#'   avoid this filtering, either set to `NULL` or 0.
#' @param within_latest double between 0 and 1. Determines the threshold
#'   used for the `time_to`
#' @param quick_revision difftime or integer (integer is treated as days), for
#'   the printed summary, the amount of time between the final revision and the
#'   actual time_value to consider the revision quickly resolved. Default of 3
#'   days
#' @param few_revisions integer, for the printed summary, the upper bound on the
#'   number of revisions to consider "few". Default is 3.
#' @param abs_spread_threshold numeric, for the printed summary, the maximum
#'   spread used to characterize revisions which don't actually change very
#'   much. Default is 5% of the maximum value in the dataset, but this is the
#'   most unit dependent of values, and likely needs to be chosen appropriate
#'   for the scale of the dataset.
#' @param rel_spread_threshold float between 0 and 1, for the printed summary,
#'   the relative spread fraction used to characterize revisions which don't
#'   actually change very much. Default is .1, or 10% of the final value
#' @param compactify_tol float, used if `drop_nas=TRUE`, it determines the
#'   threshold for when two floats are considered identical.
#' @param should_compactify bool. Compactify if `TRUE`.
#'
#' @examples
#' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli)
#' revision_example %>% arrange(desc(spread))
#'
#' @export
#' @importFrom cli cli_inform cli_abort cli_li
#' @importFrom rlang list2 syms
#' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across
#'   everything ungroup summarize if_else %>%
revision_summary <- function(epi_arch,
                             ...,
                             drop_nas = TRUE,
                             print_inform = TRUE,
                             min_waiting_period = as.difftime(60, units = "days"),
                             within_latest = 0.2,
                             quick_revision = as.difftime(3, units = "days"),
                             few_revisions = 3,
                             abs_spread_threshold = NULL,
                             rel_spread_threshold = 0.1,
                             compactify_tol = .Machine$double.eps^0.5,
                             should_compactify = TRUE) {
  arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT))
  if (length(arg) == 0) {
    # Choose the first column that's not a key or version
    arg <- setdiff(names(epi_arch$DT), c(key_colnames(epi_arch), "version"))[[1]]
  } else if (length(arg) > 1) {
    cli_abort("Not currently implementing more than one column at a time. Run each separately")
  }
  if (is.null(abs_spread_threshold)) {
    abs_spread_threshold <- .05 * epi_arch$DT %>%
      pull(...) %>%
      max(na.rm = TRUE)
  }
  # for each time_value, get
  #   the number of revisions
  #   the maximum spread in value (both absolute and relative)
  #   the min lag
  #   the max lag
  #
  # revision_tibble
  keys <- key_colnames(epi_arch)

  revision_behavior <- epi_arch$DT %>%
    select(all_of(unique(c("geo_value", "time_value", keys, "version", arg))))
  if (!is.null(min_waiting_period)) {
    revision_behavior <- revision_behavior %>%
      filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period)
  }

  if (drop_nas) {
    # if we're dropping NA's, we should recompactify
    revision_behavior <-
      revision_behavior %>%
      filter(!is.na(c_across(!!arg)))
  } else {
    revision_behavior <- epi_arch$DT
  }
  if (should_compactify) {
    revision_behavior <- revision_behavior %>%
      arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying
      apply_compactify(c(keys, version), compactify_tol)
  }
  revision_behavior <-
    revision_behavior %>%
    mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter
    group_by(across(all_of(keys))) %>% # group by all the keys
    summarize(
      n_revisions = dplyr::n() - 1,
      min_lag = min(lag), # nolint: object_usage_linter
      max_lag = max(lag), # nolint: object_usage_linter
      min_value = f_no_na(min, pick(!!arg)),
      max_value = f_no_na(max, pick(!!arg)),
      median_value = f_no_na(median, pick(!!arg)),
      time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter
      .groups = "drop"
    ) %>%
    mutate(
      spread = max_value - min_value, # nolint: object_usage_linter
      rel_spread = spread / max_value, # nolint: object_usage_linter
      # TODO the units here may be a problem
      min_lag = as.difftime(min_lag, units = "days"), # nolint: object_usage_linter
      max_lag = as.difftime(max_lag, units = "days"), # nolint: object_usage_linter
      time_near_latest = as.difftime(time_to, units = "days") # nolint: object_usage_linter
    ) %>%
    select(-time_to) %>%
    relocate(
      time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, # nolint: object_usage_linter
      time_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter
    )
  if (print_inform) {
    cli_inform("Min lag (time to first version):")
    difftime_summary(revision_behavior$min_lag) %>% print()
    if (!drop_nas) {
      total_na <- epi_arch$DT %>%
        filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter
        nrow()
      cli_inform("Fraction of all versions that are `NA`:")
      cli_li(num_percent(total_na, nrow(epi_arch$DT), ""))
      cli_inform("")
    }
    cli_inform("Fraction of epi_key+time_values with")
    total_num <- nrow(revision_behavior) # nolint: object_usage_linter
    total_num_unrevised <- sum(revision_behavior$n_revisions == 0) # nolint: object_usage_linter
    cli_inform("No revisions:")
    cli_li(num_percent(total_num_unrevised, total_num, ""))
    total_quickly_revised <- sum( # nolint: object_usage_linter
      revision_behavior$max_lag <=
        as.difftime(quick_revision, units = "days")
    )
    cli_inform("Quick revisions (last revision within {quick_revision}
{units(quick_revision)} of the `time_value`):")
    cli_li(num_percent(total_quickly_revised, total_num, ""))
    total_barely_revised <- sum( # nolint: object_usage_linter
      revision_behavior$n_revisions <=
        few_revisions
    )
    cli_inform("Few revisions (At most {few_revisions} revisions for that `time_value`):")
    cli_li(num_percent(total_barely_revised, total_num, ""))
    cli_inform("")
    cli_inform("Fraction of revised epi_key+time_values which have:")

    real_revisions <- revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter
    n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter
    rel_spread <- sum( # nolint: object_usage_linter
      real_revisions$rel_spread <
        rel_spread_threshold,
      na.rm = TRUE
    ) + sum(is.na(real_revisions$rel_spread))
    cli_inform("Less than {rel_spread_threshold} spread in relative value:")
    cli_li(num_percent(rel_spread, n_real_revised, ""))
    abs_spread <- sum( # nolint: object_usage_linter
      real_revisions$spread >
        abs_spread_threshold
    ) # nolint: object_usage_linter
    cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):")
    cli_li(num_percent(abs_spread, n_real_revised, ""))

    cli_inform("{units(quick_revision)} until within {within_latest*100}% of the latest value:")
    difftime_summary(revision_behavior[["time_near_latest"]]) %>% print()
  }
  return(revision_behavior)
}

#' pull the value from lags when values starts indefinitely being within prop of it's last value.
#' @param values this should be a 1 column tibble. errors may occur otherwise
#' @keywords internal
time_within_x_latest <- function(lags, values, prop = .2) {
  values <- values[[1]]
  latest_value <- values[[length(values)]]
  close_enough <- abs(values - latest_value) < prop * latest_value
  # we want to ignore any stretches where it's close, but goes farther away later
  return(get_last_run(close_enough, lags))
}

#' return the first value in values_from from the last string of trues in bool_vec
#' @description
#' the point of this operation is to get the value in values_from which occurs
#'   at the same index as the start of the last run of true values in bool_vec.
#'   for example, in c(1,1,0,1,1), we want the 4th entry, since there's a 0
#'   breaking the run
#' @keywords internal
get_last_run <- function(bool_vec, values_from) {
  runs <- rle(bool_vec)
  length(bool_vec) - tail(runs$lengths, n = 1)
  values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]]
}

#' use when the default behavior returns a warning on empty lists, which we do
#' not want, and there is no super clean way of preventing this
#' @keywords internal
f_no_na <- function(f, x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(Inf)
  } else {
    return(f(x))
  }
}


#' simple util for printing a fraction and it's percent
#' @keywords internal
num_percent <- function(a, b, b_description) {
  glue::glue("{prettyNum(a, big.mark=',')} out of {prettyNum(b, big.mark=',')} {b_description}
({round(a/b*100,digits=2)}%)")
}

#' summary doesn't work on difftimes
#' @keywords internal
difftime_summary <- function(diff_time_val) {
  if (length(diff_time_val) > 0) {
    res <- data.frame(
      min = min(diff_time_val),
      median = median(diff_time_val),
      mean = round(mean(diff_time_val), 1),
      max = max(diff_time_val),
      row.names = " ",
      check.names = FALSE
    )
    return(res)
  } else {
    return(data.frame())
  }
}
cmu-delphi/epiprocess documentation built on Oct. 29, 2024, 5:37 p.m.