R/utils.R
In hcruR: Estimate, Compare, and Visualize Healthcare Resource Utilization for Real-World Evidence

Documented in preproc_hcru_fun summarize_descriptives summarize_descriptives_gt

#' @title preproc_hcru_fun
#' @description
#' This function helps to pre-process the heath care resource utilization
#' (HCRU) for a given electronic health record data for a given set of settings
#' e.g. IP, OP, ED/ER, etc.
#'
#' @param data A dataframe specifying the health care details
#' @param cohort_col A character specifying the name of the cohort column
#' @param patient_id_col A character specifying the name of the patient
#' identifier column
#' @param admit_col A character specifying the name of the date of admission
#' column
#' @param discharge_col A character specifying the name of the date of
#' discharge column
#' @param index_col A character specifying the name of the index date or
#' diagnosis column
#' @param visit_col A character specifying the name of the date of
#' visit/claim column
#' @param encounter_id_col A character specifying the name of the
#' encounter/claim column
#' @param setting_col A character specifying the name of the HCRU
#' setting column e.g. IP, ED, OP, etc.
#' @param pre_days Number of days before index (default 180 days)
#' @param post_days Number of days after index (default 365 days)
#' @param readmission_days_rule Rule for how many days can be permissible to
#' define readmission criteria in AP setting (default 30 days)
#'
#' @importFrom dplyr select left_join group_by mutate case_when n
#' filter
#' @import checkmate
#'
#' @returns dataframe with HCRU estimates.
#'
#' @export
#'
#' @examples
#' \donttest{
#' preproc_hcru_fun(data = hcru_sample_data)
#' }
preproc_hcru_fun <- function(data,
                             cohort_col = "cohort",
                             patient_id_col = "patient_id",
                             admit_col = "admission_date",
                             discharge_col = "discharge_date",
                             index_col = "index_date",
                             visit_col = "visit_date",
                             encounter_id_col = "encounter_id",
                             setting_col = "care_setting",
                             pre_days = 180,
                             post_days = 365,
                             readmission_days_rule = 30) {
  # Primary input checks
  checkmate::assert_data_frame(data, min.rows = 1)
  checkmate::assert_character(cohort_col, min.chars = 1)
  checkmate::assert_character(patient_id_col, min.chars = 1)
  checkmate::assert_character(admit_col, min.chars = 1)
  checkmate::assert_character(discharge_col, min.chars = 1)
  checkmate::assert_character(index_col, min.chars = 1)
  checkmate::assert_character(visit_col, min.chars = 1)
  checkmate::assert_character(encounter_id_col, min.chars = 1)
  checkmate::assert_character(setting_col, min.chars = 1)
  checkmate::assert_numeric(pre_days)
  checkmate::assert_numeric(post_days)
  checkmate::assert_numeric(readmission_days_rule)

  # Convert date columns into R date formats
  data <- data |>
    dplyr::mutate_at(
      dplyr::vars(admit_col, discharge_col, index_col, visit_col), as.Date
    )

  # Mutate visit_days and period columns
  data <- data |>
    dplyr::mutate(
      visit_days = dplyr::case_when(
        .data[[visit_col]] < .data[[index_col]] ~ as.numeric(
          .data[[index_col]] - .data[[visit_col]]
        ) + 1,
        .data[[visit_col]] >= .data[[index_col]] ~ as.numeric(
          .data[[visit_col]] - .data[[index_col]]
        ) + 1,
        TRUE ~ NA_real_
      ),
      period = dplyr::case_when(
        .data[[visit_col]] < .data[[index_col]] ~ "pre",
        .data[[visit_col]] >= .data[[index_col]] ~ "post",
        TRUE ~ NA_character_
      )
    )

  # Create timeline window
  data <- data |>
    dplyr::mutate(
      time_window = dplyr::case_when(
        .data[["period"]] == "pre" & .data[["visit_days"]] >= 0 &
          .data[["visit_days"]] <= pre_days ~ "Pre",
        .data[["period"]] == "post" & .data[["visit_days"]] >= 0 &
          .data[["visit_days"]] <= post_days ~ "Post",
        TRUE ~ NA_character_
      )
    ) |>
    dplyr::filter(!is.na(.data[["time_window"]]))

  # Prepare Length of stay (LOS)
  data <- data |>
    dplyr::mutate(
      length_of_stay = as.numeric(
        .data[[discharge_col]] - .data[[admit_col]]
      ) + 1
    )

  # Readmission logic (IP only)
  final_data <- data |>
    dplyr::arrange(
      .data[[cohort_col]], .data[[patient_id_col]], .data[[admit_col]]
    ) |>
    dplyr::group_by(
      .data[[cohort_col]], .data[[patient_id_col]], .data[[setting_col]]
    ) |>
    dplyr::mutate(
      next_admit = dplyr::lead(
        .data[[admit_col]]
      ),
      days_to_next = as.numeric(
        .data[["next_admit"]] - .data[[discharge_col]]
      )
    ) |>
    dplyr::ungroup() |>
    dplyr::mutate(
      readmission = dplyr::case_when(
        .data[[setting_col]] == "IP" & !is.na(.data[["days_to_next"]]) &
          .data[["days_to_next"]] <= as.numeric(readmission_days_rule) ~ 1,
        (.data[[setting_col]] == "IP" & is.na(.data[["days_to_next"]])) |
          (.data[[setting_col]] == "IP" & !is.na(.data[["days_to_next"]]) &
            .data[["days_to_next"]] > as.numeric(readmission_days_rule)) ~ 0,
        TRUE ~ NA_integer_
      )
    )

  return(final_data)
}

#' Generate Detailed Descriptive Statistics with Custom P-Value Tests
#'
#' @param data A dataframe with variables to summarize from the output of
#' the summarize_descriptives function. Kindly filter the data for timeline.
#' @param var_list Optional quoted variable list (e.g. care_setting).
#' @param test Optional named list of statistical tests
#' (e.g. age ~ "wilcox.test").
#' @param patient_id_col A character specifying the name of patient identifier
#' column.
#' @param group_var_main A character specifying the name of the main
#' grouping column.
#' @param group_var_by A character specifying the name of the secondary
#' grouping column.
#' @param timeline A character specifying the timeline window (default "Pre").
#'
#' @importFrom gtsummary tbl_summary add_overall add_p modify_header add_n
#' modify_spanning_header all_continuous all_categorical all_stat_cols
#' all_of
#' @import checkmate
#' @import glue
#' @import purrr
#'
#' @return A gtsummary table object
#'
#' @export
#'
#' @examples
#'  \donttest{
#' if (requireNamespace("gtsummary", quietly = TRUE) &&
#'     requireNamespace("dplyr", quietly = TRUE) &&
#'     requireNamespace("purrr", quietly = TRUE) &&
#'     requireNamespace("checkmate", quietly = TRUE) &&
#'     requireNamespace("glue", quietly = TRUE)) {
#'   hcru_sample_data <- data.frame(
#'     patient_id = rep(1:10, each = 2),
#'     cohort = rep(c("A", "B"), 10),
#'     care_setting = rep(c("IP", "OP"), 10),
#'     admission_date = Sys.Date() - sample(1:100, 20, TRUE),
#'     discharge_date = Sys.Date() - sample(1:90, 20, TRUE),
#'     index_date = Sys.Date() - 50,
#'     visit_date = Sys.Date() - sample(1:100, 20, TRUE),
#'     encounter_id = 1:20,
#'     cost_usd = runif(20, 100, 1000)
#'   )
#'   df <- preproc_hcru_fun(data = hcru_sample_data)
#'   summary_df <- summarize_descriptives(data = df)
#'   # Only keep required columns for demonstration
#'   summary_df$LOS <- ifelse(summary_df$care_setting == "IP",
#' sample(1:10, nrow(summary_df), TRUE), NA)
#'   summary_df$Readmission <- ifelse(summary_df$care_setting == "IP",
#' sample(0:1, nrow(summary_df), TRUE), NA)
#'   summary_df$time_window <- "Pre"
#'   # Run the function (should execute within 5 seconds)
#'   summarize_descriptives_gt(
#'     data = summary_df,
#'     patient_id_col = "patient_id",
#'     var_list = c("Visits", "Cost", "LOS", "Readmission"),
#'     group_var_main = "cohort",
#'     group_var_by = "care_setting",
#'     timeline = "Pre"
#'   )
#' }
#' }
summarize_descriptives_gt <- function(
  data,
  patient_id_col = "patient_id",
  var_list = NULL,
  group_var_main = "cohort", # e.g., "cohort"
  group_var_by = "care_setting", # e.g., "setting"
  test = NULL,
  timeline = "Pre") {
  # Basic checks
  checkmate::assert_data_frame(data, min.rows = 1)
  checkmate::assert_character(var_list, null.ok = TRUE)
  checkmate::assert_character(group_var_main, len = 1)
  checkmate::assert_character(group_var_by, len = 1)
  checkmate::assert_list(test, null.ok = TRUE)
  checkmate::assert_character(timeline, len = 1)
  checkmate::assert_character(patient_id_col, len = 1)

  # Filter the data as per the timeline
  data <- data |>
    dplyr::filter(.data[["time_window"]] == timeline)

  # Unique settings (IP, OP, ED...)
  settings <- unique(data[[group_var_by]])

  # Iterate by setting
  setting_tbls <- purrr::map(settings, function(setting) {
    df_setting <- dplyr::filter(data, .data[[group_var_by]] == setting)

    # Drop IP-only vars if not IP
    vars_this <- var_list
    if (setting != "IP") {
      vars_this <- setdiff(vars_this, c("LOS", "Readmission"))
    }

  # Cohort group (control, treatment, etc.)
  cohort_sym <- rlang::sym(group_var_main)

  # Get N per group
  n_df <- df_setting |>
    dplyr::distinct(.data[[patient_id_col]], .data[[group_var_main]]) |>
    dplyr::count(.data[[group_var_main]], name = "n") |>
    dplyr::mutate(
      colname = paste0("stat_", dplyr::row_number()),
      header = glue::glue("{.data[[group_var_main]]}, N = {n}")
    )
  header_map <- stats::setNames(n_df$header, n_df$colname)

  # Build summary
  tbl <- gtsummary::tbl_summary(
    data = df_setting,
    by = !!cohort_sym,
    include = gtsummary::all_of(vars_this),
    missing = "no",
    type = list(all_continuous() ~ "continuous2"),
    statistic = list(
    all_continuous() ~ c(
      "Total" = "{sum}",
      "Mean (SD)" = "{mean} ({sd})",
      "Median (IQR)" = "{median} ({p25}, {p75})",
      "Q1" = "{p25}",
      "Q3" = "{p75}",
      "Range" = "{min} - {max}"
    )
    )
  )

  if (length(unique(df_setting[[group_var_main]])) > 1) {
    tbl <- if (!is.null(test)) {
    tbl |> gtsummary::add_p(test = test)
    } else {
    tbl |> gtsummary::add_p()
    }
  }

  tbl <- tbl |>
    gtsummary::modify_header(update = header_map) |>
    gtsummary::modify_caption(glue::glue("**Summary Table**")) |>
    gtsummary::bold_labels()
  })

  # Merge all setting-wise cohort tables side-by-side
  gtsummary::tbl_merge(
  tbls = setting_tbls,
  tab_spanner = settings
  )
}


#' Generate Detailed Descriptive Statistics
#'
#' @param data A dataframe with variables to summarize.
#' @param patient_id_col A character specifying the name of patient
#' identifier column
#' @param setting_col A character specifying the name of HRCU setting column
#' @param cohort_col A character specifying the name of cohort column
#' @param encounter_id_col A character specifying the name of encounter/claim
#' column
#' @param cost_col A character specifying the name of cost column
#' @param los_col A character specifying the name of length of stay column
#' @param readmission_col A character specifying the name of readmission column
#' @param time_window_col A character specifying the name of time window column
#'
#' @importFrom dplyr select left_join group_by reframe mutate case_when n
#' filter ungroup n_distinct if_else distinct
#' @import checkmate
#'
#' @return A table object
#'
#' @export
#'
#' @examples
#' if (requireNamespace("dplyr", quietly = TRUE) &&
#'     requireNamespace("checkmate", quietly = TRUE)) {
#'   hcru_sample_data <- data.frame(
#'     patient_id = rep(1:10, each = 2),
#'     cohort = rep(c("A", "B"), 10),
#'     care_setting = rep(c("IP", "OP"), 10),
#'     admission_date = Sys.Date() - sample(1:100, 20, TRUE),
#'     discharge_date = Sys.Date() - sample(1:90, 20, TRUE),
#'     index_date = Sys.Date() - 50,
#'     visit_date = Sys.Date() - sample(1:100, 20, TRUE),
#'     encounter_id = 1:20,
#'     cost_usd = runif(20, 100, 1000)
#'   )
#'   df <- preproc_hcru_fun(data = hcru_sample_data)
#'   summary_df <- summarize_descriptives(data = df)
#'   # Only keep required columns for demonstration
#'   summary_df$LOS <- ifelse(summary_df$care_setting == "IP",
#'     sample(1:10, nrow(summary_df), TRUE), NA)
#'   summary_df$Readmission <- ifelse(summary_df$care_setting == "IP",
#'     sample(0:1, nrow(summary_df), TRUE), NA)
#'   summary_df$time_window <- "Pre"
#'   summary_df
#' }
summarize_descriptives <- function(data,
                   patient_id_col = "patient_id",
                   setting_col = "care_setting",
                   cohort_col = "cohort",
                   encounter_id_col = "encounter_id",
                   cost_col = "cost_usd",
                   los_col = "length_of_stay",
                   readmission_col = "readmission",
                   time_window_col = "time_window") {
  # Primary input checks
  checkmate::assert_data_frame(data, min.rows = 1)
  checkmate::assert_character(patient_id_col, min.chars = 1)
  checkmate::assert_character(cohort_col, min.chars = 1)
  checkmate::assert_character(encounter_id_col, min.chars = 1)
  checkmate::assert_character(setting_col, min.chars = 1)
  checkmate::assert_character(readmission_col, min.chars = 1)
  checkmate::assert_character(time_window_col, min.chars = 1)
  checkmate::assert_character(cost_col, min.chars = 1)
  checkmate::assert_character(los_col, min.chars = 1)

  # Generate summary
  summary_df <- data |>
    dplyr::group_by(
      .data[[patient_id_col]],
      .data[[cohort_col]],
      .data[[setting_col]],
      .data[[time_window_col]]
    ) |>
    dplyr::reframe(
      Days = as.numeric(sum(.data[["visit_days"]], na.rm = TRUE)),
      Month = as.numeric(.data[["Days"]]) / 30.417,
      Year = as.numeric(.data[["Days"]]) / 365.5,
      Visits = dplyr::n_distinct(.data[[encounter_id_col]]) |> as.numeric(),
      Cost = sum(.data[[cost_col]], na.rm = TRUE),
      LOS = dplyr::if_else(
        .data[[setting_col]] == "IP",
        sum(.data[[los_col]], na.rm = TRUE),
        NA_real_
      ),
      Readmission = dplyr::if_else(
        .data[[setting_col]] == "IP",
        sum(.data[[readmission_col]], na.rm = TRUE),
        NA_real_
      ),
      Visit_PPPM = .data[["Visits"]] / .data[["Month"]],
      Visit_PPPY = .data[["Visits"]] / .data[["Year"]],
      Cost_PPPM = .data[["Cost"]] / .data[["Month"]],
      Cost_PPPY = .data[["Cost"]] / .data[["Year"]]
    ) |>
    dplyr::ungroup() |>
    dplyr::distinct()

  return(summary_df)
}

Any scripts or data that you put into this service are public.

hcruR documentation built on Sept. 2, 2025, 1:09 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

hcruR
Estimate, Compare, and Visualize Healthcare Resource Utilization for Real-World Evidence

R/utils.R
In hcruR: Estimate, Compare, and Visualize Healthcare Resource Utilization for Real-World Evidence

Defines functions summarize_descriptives summarize_descriptives_gt preproc_hcru_fun

Documented in preproc_hcru_fun summarize_descriptives summarize_descriptives_gt

Try the hcruR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hcruR Estimate, Compare, and Visualize Healthcare Resource Utilization for Real-World Evidence

R/utils.R In hcruR: Estimate, Compare, and Visualize Healthcare Resource Utilization for Real-World Evidence

Defines functions summarize_descriptives summarize_descriptives_gt preproc_hcru_fun

Documented in preproc_hcru_fun summarize_descriptives summarize_descriptives_gt

Try the hcruR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hcruR
Estimate, Compare, and Visualize Healthcare Resource Utilization for Real-World Evidence

R/utils.R
In hcruR: Estimate, Compare, and Visualize Healthcare Resource Utilization for Real-World Evidence