R/get_ds_rt.R

Defines functions get_ds_rt_var get_ds_rt

Documented in get_ds_rt get_ds_rt_var

#' Perform direct standardization
#'
#' @description Compute a directly standardized rate given the counts of events,
#' the size of each defined population and the size of each group of the
#' standard population.
#'
#' @param counts A numeric vector containing the frequency of events.
#' @param popn A numeric vector containing the size of each defined population.
#' @param std_popn A numeric vector containing the size of each group of the
#' standard population.
#' @param scale,power Provide either `scale`, a positive integer, to specify the
#' scale, or `power`, a non-negative integer, to specify the power of 10 to use
#' when expressing the result. If neither argument is supplied, `scale = 1` will
#' be applied.
#' @param output_type If `"rate"` (default), the directly standardized rate is
#' returned. If `"counts"`, the expected number of events for each stratum is
#' returned.
#' @param clean_strata
#'
#'  * `"none"` returns a directly standardized rate if and only if the evaluated
#'  specific rates are free of `NA` and `std_popn` is free of `NA` and 0.
#'  * `"exclude"` removes strata with specific rates of `NA` or `std_popn`
#'  values of `NA` or `0` from the calculation of the directly standardized rate.
#'  `NA` is returned if all strata are excluded from the calculation.
#'  * `"zero"` sets specific rates evaluated to `NA` as 0. In addition, when
#'  `std_popn` contains 0, the respective stratum is excluded from the
#'  calculation of the directly standardized rate. However, when `std_popn`
#'  contains `NA`, the directly standardized rate evaluates to `NA`. When
#'  `output_type = "counts"`, the expected counts are calculated *after*
#'  specific rates of `NA` have been set to 0. *Confidence intervals cannot be
#'  constructed when this argument is used.*
#' @param dist A string to indicate the type of probability distribution for the
#' confidence interval to follow. Can be either `"normal"`, `"log normal"` or
#' `"gamma"`. This parameter is case insensitive.
#' @param interval A scalar, between 0 and 1, indicating the width of the
#' confidence interval. For example, for a 95% confidence interval, use
#' `interval = 0.95`.
#' @param method A string specifying the method to use when using the gamma
#' distribution. Use `"tcz06"` (default) for the method proposed by Tiwari,
#' Clegg, and Zou (2006) and `"ff97"` for the more conservative method proposed
#' by Fay and Feuer (1997). This parameter must be left empty when calculating
#' confidence intervals using other distributions. See \href{http://documentation.sas.com/doc/en/pgmsascdc/9.4_3.4/statug/statug_stdrate_details14.htm#statug_stdrate002267}{SAS Help Center: Direct Standardization}
#' for more details.
#'
#' @details This low-level function assumes that the counts in each stratum,
#' the size of each stratum in the observed population and the size of each
#' stratum in the standard population are provided in the same order.
#'
#' To construct confidence intervals for the rate estimates, arguments must be
#' explicitly supplied to both `dist` and `interval`.
#'
#' @return
#' If `output_type = "rate"` (default) and neither `dist` nor `interval` is
#' supplied, the directly standardized rate is returned as a numeric vector.
#'
#' If `output_type = "rate"` and both `dist` and `interval` are provided, then a
#' data frame is returned with the columns `dsr`, `lower`, `upper` and
#' `interval`.
#'
#' If `output_type = "counts"`, the expected counts are returned as a numeric
#' vector.
#'
#' @references \href{http://documentation.sas.com/doc/en/pgmsascdc/9.4_3.4/statug/statug_stdrate_details.htm}{The STDRATE Procedure}
#'
#' @examples
#' \dontrun{
#' # using vectors
#' counts <- c(10, 7, 6, 4, 2)
#' popn <- c(300, 595, 492, 43, 422)
#' std_popn <- c(807, 355, 379, 244, 781)
#'
#' get_ds_rt(counts, popn, std_popn)
#' get_ds_rt(counts, popn, std_popn, scale = 1000)
#' get_ds_rt(counts, popn, std_popn, power = 3)
#' get_ds_rt(counts, popn, std_popn, output_type = "counts")
#' get_ds_rt(counts, popn, std_popn, clean_strata = "exclude")
#' get_ds_rt(
#'   counts, popn, std_popn,
#'   scale = 1000,
#'   dist = "normal", interval = 0.95
#' )
#' get_ds_rt(
#'   counts, popn, std_popn,
#'   scale = 100000,
#'   dist = "gamma", interval = 0.9, method = "ff97"
#' )
#'
#' # using a data frame
#' df <- data.frame(counts, popn, std_popn)
#'
#' get_ds_rt(df$counts, df$popn, df$std_popn, scale = 1000)
#' get_ds_rt(df$counts, df$popn, df$std_popn, power = 3)
#' }
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @seealso [get_ci_norm()], [get_ci_lnorm()], [get_ci_gamma()]
#' @export
get_ds_rt <- function(counts, popn, std_popn, scale = NULL, power = NULL,
                      output_type = "rate", clean_strata = "none",
                      dist = NULL, interval = NULL, method = NULL) {

  # check validity of inputs

  if (length(unique(purrr::map_dbl(list(counts, popn, std_popn), length))) != 1) {
    stop("input vectors must be the same length")
  }

  if (!is.numeric(std_popn)) {
    stop("`std_popn` must be a numeric vector")
  }

  if (any(is.na(std_popn))) {
    warning("at least one element in `std_popn` is NA")
  }

  if (any(std_popn == 0, na.rm = TRUE)) {
    warning("at least one element in `std_popn` is 0")
  }

  if (!output_type %in% c("rate", "counts")) {
    stop("`output_type` must be either 'rate' or 'counts'")
  }

  if (!is.null(scale) & !is.null(power)) {
    stop("must supply exactly one of `scale` and `power`")
  }

  # set multiplier
  if (!is.null(scale)) { # if only `scale` is supplied
    if (!is.numeric(scale)) {
      stop("`scale` must be a positive integer of length 1")
    }

    if (length(scale) != 1) {
      stop("`scale` must be a positive integer of length 1")
    }

    if (scale %% 1 != 0 | scale < 1) {
      stop("`scale` must be a positive integer of length 1")
    }

    multiplier <- scale
  } else if (!is.null(power)) { # if only `power` is supplied
    if (!is.numeric(power)) {
      stop("`power` must be a non-negative integer of length 1")
    }

    if (length(power) != 1) {
      stop("`power` must be a non-negative integer of length 1")
    }

    if (power %% 1 != 0 | power < 0) {
      stop("`power` must be a non-negative integer of length 1")
    }

    multiplier <- 10**power
  } else { # if neither `scale` nor `power` is supplied, return the estimates as is (i.e. `scale` = 1)
    multiplier <- 1
  }

  # calculate stratum-specific rates
  spec_rt <- get_spec_rt(counts, popn) %>%
    as.numeric()

  df <- data.frame(counts, popn, std_popn, spec_rt)

  # calculate directly standardized rate based on specified method
  if (clean_strata == "none") {

    # calculate expected counts for each stratum
    df <- df %>%
      dplyr::mutate(exp_counts = .data$spec_rt * .data$std_popn)

    # check if true rate can be calculated
    n_valid_rows <- df %>%
      dplyr::filter(!is.na(spec_rt), std_popn > 0) %>%
      nrow()

    # only calculate rate if all strata are valid
    if (n_valid_rows == nrow(df)) {
      df_dsr <- df %>%
        dplyr::summarise(
          dsr_n = sum(.data$exp_counts),
          dsr_d = sum(.data$std_popn),
          dsr = .data$dsr_n / .data$dsr_d
        ) %>%
        dplyr::select(.data$dsr)
    } else {
      df_dsr <- data.frame(dsr = NA_real_)
    }
  } else if (clean_strata == "exclude") {

    # calculate expected counts for each stratum
    df <- df %>%
      dplyr::mutate(exp_counts = .data$spec_rt * .data$std_popn)

    # exclude rows where specific rate is NA or standard population is 0 or NA
    df_excl <- df %>%
      dplyr::filter(!is.na(spec_rt), std_popn > 0)

    # calculate rate
    if (!plyr::empty(df_excl)) {
      df_dsr <- df_excl %>%
        dplyr::summarise(
          dsr_n = sum(.data$exp_counts),
          dsr_d = sum(std_popn),
          dsr = .data$dsr_n / .data$dsr_d
        ) %>%
        dplyr::select(.data$dsr)
    } else {
      (df_dsr <- data.frame(dsr = NA_real_))
    }
  } else if (clean_strata == "zero") {
    # set specific rate of NA to 0 then calculate expected counts
    df <- df %>%
      dplyr::mutate(spec_rt = dplyr::if_else(is.na(spec_rt), 0, spec_rt)) %>%
      dplyr::mutate(exp_counts = .data$spec_rt * .data$std_popn)

    df_dsr <- df %>%
      dplyr::summarise(
        dsr_n = sum(.data$exp_counts),
        dsr_d = sum(.data$std_popn),
        dsr = .data$dsr_n / .data$dsr_d
      ) %>%
      dplyr::select(.data$dsr)
  } else {
    stop("`clean_strata` must be one of 'none', 'exclude' or 'zero'")
  }

  if (xor(is.null(dist), is.null(interval))) {
    stop("both `dist` and `interval` must be supplied to construct confidence intervals")
  }

  if (!is.null(dist) & !is.null(interval)) {
    if (clean_strata == "zero") {
      stop("construction of confidence interval with clean_strata = 'zero' not yet implemented")
    }

    # check validity of distribution name calculate confidence interval
    dist_name <- get_dist_name(dist)
    var <- get_ds_rt_var(df)

    if (dist_name %in% c("normal", "lognormal")) {
      if (!is.null(method)) {
        stop("`method` must be NULL when `dist` is 'normal' or 'lognormal'")
      }
    }

    if (dist_name == "normal") {
      df_dsr <- df_dsr %>%
        dplyr::mutate(
          get_ci_norm(interval = interval, estimate = .data$dsr, variance = var)
        )
    } else if (dist_name == "lognormal") {
      df_dsr <- df_dsr %>%
        dplyr::mutate(
          var_log = (1 / (.data$dsr**2)) * var, # estimation found in Direct Standardization section of STDRATE Procedure Details
          get_ci_lnorm(interval = interval, estimate = .data$dsr, variance_log = .data$var_log)
        )
    } else if (dist_name == "gamma") {
      weights <- df %>%
        dplyr::mutate(w_j = (std_popn / sum(std_popn)) * (1 / popn)) %>%
        dplyr::pull(.data$w_j) %>%
        list()

      # if user doesn't specify a method, use the default
      if (is.null(method)) {
        method <- "tcz06"
      }

      df_dsr <- df_dsr %>%
        dplyr::mutate(get_ci_gamma(
          interval = interval, estimate = .data$dsr, weights = weights,
          variance = var, method = method
        ))
    } else {
      stop("for a directly standardized rate, `dist` should be one of 'normal', 'log normal', or 'gamma'")
    }

    # clean output
    df_dsr <- df_dsr %>%
      dplyr::mutate(interval = interval) %>%
      dplyr::select(.data$dsr, .data$lower, .data$upper, .data$interval) %>%
      dplyr::mutate(dplyr::across(.cols = c(.data$lower, .data$upper), ~ .x * multiplier)) # apply multiplier to lower and upper limits
  }

  # apply multiplier to rate
  df_dsr <- df_dsr %>%
    dplyr::mutate(dsr = .data$dsr * multiplier)

  # return specified outputs
  if (output_type == "rate") {
    if ("interval" %in% colnames(df_dsr)) {
      result <- df_dsr
    } else {
      result <- df_dsr$dsr # confidence interval not calculated
    }
  } else {
    result <- df$exp_counts
  }

  return(result)
}

#' Calculate the variance for a directly standardized rate
#'
#' @param df A data frame with the columns `popn`, `std_popn`, `spec_rt`
#'
#' @return A variance as a double
#'
#' @examples
#' \dontrun{
#' df <- data.frame(counts, popn, std_popn) %>%
#'   dplyr::mutate(
#'     spec_rt = get_spec_rt(counts, popn),
#'     exp_counts = spec_rt * std_popn
#'   )
#'
#' get_ds_rt_var(df)
#' }
#'
#' @keywords internal
get_ds_rt_var <- function(df) {
  var_n <- df %>%
    dplyr::mutate(
      std_pop_sq = .data$std_popn**2,
      var_spec_rt = .data$spec_rt / .data$popn
    ) %>%
    dplyr::summarise(numerator = sum(.data$std_pop_sq * .data$var_spec_rt)) %>%
    dplyr::pull(.data$numerator)

  var_d <- sum(df$std_popn)**2

  var <- var_n / var_d

  return(var)
}
bcgov/bcEpiRate documentation built on Feb. 24, 2022, 4:05 p.m.