#' Calculating unadjusted and adjusted differences in rates
#'
#' This function calculates the unadjusted or adjusted difference in rates with confidence interval.
#'
#' @param data A data frame
#' @param y vector of binary outcome variables. Outcome variables can be
#' numeric, character or factor, but must have two and only two non-missing levels
#' @param x string indicating the binary stratifying variable. The stratifying
#' variable can be numeric, character or factor, but must have two and only two
#' non-missing levels
#' @param formula By default, `"{y} ~ {x}"`. To include covariates for an adjusted
#' risk difference, add covariate names to the formula, e.g. `"{y} ~ {x} + age"`
#' @param label List of formulas specifying variables labels, If a variable's label is
#' not specified here, the label attribute (`attr(data$high_grade, "label")`) is used.
#' If attribute label is `NULL`, the variable name will be used.
#' @param statistic Statistics to display for each group. Default `"{n} ({p}%)"`
#' @param method The method for calculating p-values and confidence intervals around the
#' difference in rates. The options are `"chisq"`, `"exact"`, `"boot_centile"`,
#' and `"boot_sd"`. See below for details. Default method is `"chisq"`.
#' @param conf.level Confidence level of the returned confidence interval.
#' Must be a single number between 0 and 1. The default is a 95% confidence interval.
#' @param bootstrapn The number of bootstrap resamples to use. The default is 2000
#' for `"boot_centile"` and 200 for `"boot_sd"`
#' @param estimate_fun Function to round and format estimates. By default
#' `style_sigfig`, but can take any formatting function
#' @param pvalue_fun Function to round and format p-value. By default
#' `style_pvalue`, but can take any formatting function
#'
#' @return A `tbl_propdiff` object, with sub-class `"gtsummary"`
#' @export
#'
#' @section Methods:
#'
#' * The `chisq` option returns a p-value from the `prop.test` function and a
#' confidence interval for the unadjusted difference in proportions based on
#' the normal approximation.
#'
#' * The `exact` option returns a p-value from the `fisher.test` function. The
#' confidence interval returned by this option is the same as the confidence
#' interval returned by the `chisq` option and is based on the normal approximation.
#'
#' * The `boot_centile` option calculates the adjusted difference between groups
#' in all bootstrap samples (the default for this method is 2000 resamples)
#' and generates the confidence intervals from the distribution of these
#' differences. For the default, a 95% confidence interval, the 2.5 and 97.5
#' centiles are used. The p-value presented is from a logistic regression model.
#'
#' * The `boot_sd` option calculates the adjusted difference between groups
#' in all bootstrap samples (the default for this method is 200 resamples).
#' The mean and standard deviation of the adjusted difference across all
#' resamples are calculated. The standard deviation is then used as the
#' standard error to calculate the confidence interval based on the true
#' adjusted difference. The p-value presented is from a logistic regression model.
#'
#' @examples
#' tbl_propdiff(
#' data = trial,
#' y = "response",
#' x = "trt"
#' )
#'
#' tbl_propdiff(
#' data = trial,
#' y = "response",
#' x = "trt",
#' formula = "{y} ~ {x} + age + stage",
#' method = "boot_sd",
#' bootstrapn = 25
#' )
tbl_propdiff <- function(data, y, x,
formula = "{y} ~ {x}",
label = NULL,
statistic = "{n} ({p}%)",
method = c("chisq", "exact", "boot_sd", "boot_centile"),
conf.level = 0.95,
bootstrapn = ifelse(method == "boot_centile", 2000, 200),
estimate_fun = style_sigfig,
pvalue_fun = style_pvalue) {
lifecycle::deprecate_warn(
"0.3.0", "hotfun::tbl_propdiff()", "gtsummary::add_difference()")
# converting inputs to named list --------------------------------------------
y <- var_input_to_string(data = data, select_input = {{ y }}, arg_name = "y", select_single = FALSE)
x <- var_input_to_string(data = data, select_input = {{ x }}, arg_name = "x", select_single = TRUE)
label <- tidyselect_to_list(.data = data, x = {{ label }}, arg_name = "label", select_single = FALSE)
### CHECKS------------------
# Matching arguments for method
method <- match.arg(method)
# Save out list of covariates from formula
rhs <- formula %>% stringr::str_replace_all(pattern = "\\{y\\}|\\{x\\}", ".") %>% stats::as.formula() %>% all.vars() %>% setdiff(".")
covariates <- setdiff(rhs, x)
# Confirm that x and y variables and covariates exist
if (length(setdiff(c(x, y, covariates), names(data))) != 0) {
stop(glue(
"These variables do not exist in the dataset: ",
glue_collapse(setdiff(c(x, y, covariates), names(data)), sep = ", ")
),
call. = FALSE
)
}
# If specifying covariates but not specifying a multivariable method
if (length(covariates) != 0 & !(method %in% c("boot_centile", "boot_sd"))) {
stop(glue(
"When specifying a formula that includes covariates, you must specify one of the two multivariable methods ('boot_centile' or 'boot_sd')."
),
call. = FALSE)
}
# If selecting a multivariable method but no covariates
if (method %in% c("boot_sd", "boot_centile") & length(covariates) == 0) {
message(glue(
"Method '{method}' was selected but no covariates were provided. ",
"The {conf.level*100}% confidence interval around the unadjusted difference in rates will be bootstrapped."))
}
# checking the x variable has two levels
if (data[[x]] %>% stats::na.omit() %>% unique() %>% length() != 2) {
stop(glue::glue("The stratifying variable, '{x}', must have two levels."),
call. = FALSE
)
}
# checking the y variables have two levels
if (purrr::every(y, function(x) {
data[[x]] %>%
stats::na.omit() %>%
unique() %>%
length() == 2
}) == FALSE) {
stop(glue::glue("All outcome variables, '{y}', must have two levels."),
call. = FALSE
)
}
# Confirm that conf.level is not < 0 or > 1
if (conf.level < 0 | conf.level > 1) {
stop("The confidence level specified in the `conf.level=` option must be between 0 and 1.",
call. = FALSE
)
}
# Checking estimate_fun and pvalue_fun are functions
if (!purrr::every(list(estimate_fun, pvalue_fun), is.function)) {
stop("Inputs `estimate_fun` and `pvalue_fun` must be functions.",
call. = FALSE
)
}
### DATAFRAME FOR ALL MODELS--------------------------------------
df_propdiff <-
tibble::enframe(y, name = NULL, value = "y") %>%
mutate(x = x)
# If logical, convert to 0/1
# Reverse factor levels for x variable to match tbl_ancova output
# Convert y to factor
data <-
data %>%
dplyr::mutate_if(is.logical, as.numeric) %>%
mutate_at(vars(y), ~ factor(.))
### UNADJUSTED RATES---------------------------------
df_propdiff_summary <-
df_propdiff %>%
mutate(
# Before creating tables, save out outcome label
outcome_label =
map_chr(
y,
~ label[[..1]] %||% attr(data[[..1]], "label") %||% ..1
),
# Save out table of unadjusted rates
tbl_rates =
pmap(
list(.data$x, .data$y, .data$outcome_label),
function(x, y, z) {
data %>%
select(all_of(c(x, y))) %>%
mutate_at(vars(any_of(x)), ~factor(.) %>% forcats::fct_rev()) %>%
gtsummary::tbl_summary(
by = .data[[x]], missing = "no",
label = list(z) %>% rlang::set_names(y),
type = list(everything() ~ "dichotomous"),
statistic = list(everything() ~ glue("{statistic}"))
) %>%
gtsummary::add_n() %>%
gtsummary::modify_header(stat_by = "**{level}**")
}
)
)
### CALCULATE DIFFERENCES-------------------------
# For "chisq" or "exact" method
if (method %in% c("chisq", "exact")) {
df_propdiff_final <-
df_propdiff_summary %>%
mutate(
estci =
pmap(
list(.data$x, .data$y),
~ calculate_unadjusted(
data = .env$data %>%
select(all_of(c(..1, ..2))) %>%
tidyr::drop_na(),
method = .env$method,
x = ..1,
y = ..2,
conf.level = .env$conf.level
)
)
)
# For multivariable bootstrap methods
} else if (method %in% c("boot_centile", "boot_sd")) {
# Calculate central estimate in full dataset
df_propdiff_est <-
df_propdiff_summary %>%
mutate(
est =
pmap(
list(.data$x, .data$y),
~ create_model_pred(
data = .env$data %>%
select(all_of(c(..1, ..2, covariates))) %>%
tidyr::drop_na(),
x = ..1,
y = ..2,
covariates = covariates,
pvalue = TRUE
)
)
)
# Create list of indicators for each resample, separately for each outcome
df_bs_map <-
df_propdiff_est %>%
select(.data$x, .data$y) %>%
mutate(freq = bootstrapn) %>%
tidyr::uncount(.data$freq) %>%
mutate(
nrow =
pmap_int(
list(.data$x, .data$y),
~ nrow(
.env$data %>%
select(all_of(c(..1, ..2, covariates)))
)
),
bs_assignment =
map(
.data$nrow, ~ sample.int(..1, replace = TRUE)
),
# Bootstrapping adjusted difference
bs_pred =
pmap(
list(.data$x, .data$y, .data$bs_assignment),
~ create_model_pred(
data = .env$data %>%
select(all_of(c(..1, ..2, covariates))) %>%
tidyr::drop_na() %>%
slice(..3),
x = ..1,
y = ..2,
covariates = covariates,
pvalue = TRUE
)
)
) %>%
select(.data$x, .data$y, .data$bs_pred) %>%
unnest(.data$bs_pred) %>%
nest(bs_pred = -c(.data$x, .data$y))
}
#### CALCULATE 95% CI---------------------------------
# 95% CI for exact method already calculated
# Calculate which centiles to use
lower_centile <- (1 - conf.level) / 2
upper_centile <- 1 - (1 - conf.level) / 2
# For centile method
if (method == "boot_centile") {
# Calculate 95% CI using centiles
df_propdiff_ci <-
df_bs_map %>%
mutate(
ci =
map(
.data$bs_pred,
~ ..1 %>%
dplyr::summarize(
conf.low_2 = quantile(.data$estimate_2, lower_centile, na.rm = TRUE),
conf.high_2 = quantile(.data$estimate_2, upper_centile, na.rm = TRUE)
)
)
) %>%
select(-.data$bs_pred)
# Merge with main results
df_propdiff_final <-
left_join(
df_propdiff_est,
df_propdiff_ci,
by = c("x", "y")
) %>%
mutate(
estci = map2(.data$est, .data$ci, ~ bind_cols(..1, ..2))
) %>%
select(-.data$est, -.data$ci)
# For mean/SD method
} else if (method == "boot_sd") {
# Calculate standard deviation around mean differences
df_propdiff_ci <-
df_bs_map %>%
mutate(
se =
map_dbl(
.data$bs_pred,
~ ..1 %>%
dplyr::summarize(se = sd(.data$estimate_2, na.rm = TRUE)) %>%
pull(se)
)
) %>%
select(-.data$bs_pred)
# Merge with main results
df_propdiff_final <-
left_join(
df_propdiff_est,
df_propdiff_ci,
by = c("x", "y")
) %>%
mutate(
estci =
map2(
.data$est, .data$se,
~ ..1 %>%
mutate(
conf.low_2 = .data$estimate_2 + stats::qnorm(lower_centile) * ..2,
conf.high_2 = .data$estimate_2 + stats::qnorm(upper_centile) * ..2
)
)
) %>%
select(-.data$est, -.data$se)
}
# Standardize format of results
df_propdiff_fmt <-
df_propdiff_final %>%
mutate(
estci =
map(
.data$estci,
~ ..1 %>%
mutate_at(
vars(.data$estimate_2, .data$conf.low_2, .data$conf.high_2),
~ . * 100
) %>%
mutate(
ci = as.character(glue("{estimate_fun(.data$conf.low_2)}%, {estimate_fun(.data$conf.high_2)}%"))
)
)
)
# Stack tbl_summary tables
tbl_results <-
gtsummary::tbl_stack(df_propdiff_fmt$tbl_rates)
# Unnest difference and 95% CI
df_estci <-
df_propdiff_fmt %>%
select(.data$estci) %>%
unnest(cols = c(.data$estci)) %>%
select(.data$estimate_2, .data$ci, .data$conf.low_2, .data$conf.high_2, .data$p.value_2)
# Add results to table body
tbl_results$table_body <-
bind_cols(
tbl_results$table_body,
df_estci
)
# rename stat_1 to stat_1_1 and stat_2 to stat_2_1
tbl_results$table_body <-
tbl_results$table_body %>%
rename(stat_1_1 = .data$stat_1, stat_2_1 = .data$stat_2)
tbl_results$table_styling <-
tbl_results$table_styling %>%
map(
function(.x) {
if (!is.data.frame(.x)) return(.x)
if (!"column" %in% names(.x)) return(.x)
.x %>%
mutate(
column = dplyr::case_when(
.data$column %in% "stat_1" ~ "stat_1_1",
.data$column %in% "stat_2" ~ "stat_2_1",
TRUE ~ .data$column
)
)
}
)
# Update table header
if (method %in% c("chisq", "exact"))
estlabel <- "**Difference**"
else estlabel <- "**Adjusted Difference**"
# update styling -------------------------------------------------------------
tbl_results <-
tbl_results %>%
gtsummary::modify_fmt_fun(
list(
estimate_2 ~ function(x) as.character(glue("{estimate_fun(x)}%")),
p.value_2 ~ pvalue_fun
)
) %>%
modify_header(
estimate_2 = estlabel,
ci = "**95% CI**",
p.value_2 = "**p-value**"
) %>%
gtsummary::modify_footnote(ci ~ "CI = Confidence Interval", abbreviation = TRUE)
# Add class
class(tbl_results) <- c("tbl_propdiff", "gtsummary")
return(tbl_results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.