#' Tidy characteristics data
#'
#' @description
#'
#' \Sexpr[results=rd, stage=render]{lifecycle::badge("stable")}
#'
#' Table one is a tabular description of
#' characteristics, e.g., demographics of patients in a clinical trial,
#' presented overall and also stratified by a categorical variable, e.g.
#' treatment group.
#'
#' @return a [tibble][tibble::tibble-package] containing summary values
#' that describe characteristics of observations in `data` , which can
#' subsequently be sent to different modes of output
#' (see [to_word] and [to_kable]).
#'
#' @param data a data frame
#'
#' @param meta_data a meta data frame. If unspecified, a meta data frame
#' will be created using `data`.
#'
#' @param formula an optional formula object. The left hand side of the
#' formula should be blank. The right hand side of the formula should
#' contain row variables for the table. The '|' symbol can be used to
#' include stratifying variables. If this option is used, no more than
#' two stratifying variables should be used, and they must be separated
#' by a * symbol. If formula is used, the strat, by, and row_vars inputs
#' are ignored.
#'
#' @param strat a character value indicating the column name in data that
#' will be used to stratify the table
#'
#' @param by a character value indicating the column name in data that
#' will be used to split the table into groups, prior to stratification.
#'
#' @param row_vars a character vector indicating column names of row
#' variables in the table. If unspecified, all columns are used.
#'
#' @param specs_table_vals named vector of character values.
#' Names should be variables, while values should be specs.
#' Valid specs are 'mean' and 'median' (see examples).
#'
#' @param specs_table_tests named vector of character values.
#' Names should be variables, while values should be specs.
#' Valid specs are 'params' or 'noparm' (see examples).
#'
#' @param include_pval T/F, should the table include a column for p-values?
#' If p-values are included, factor variables are handled using
#' chi-square tests, continuous variables are handled using t-tests
#' or ANOVA, depending on the number of categories in the table
#' stratification.
#'
#' @param expand_binary_catgs T/F, should all categories be included for
#' binary categorical variables? (This only applies to binary variables.)
#'
#' @param include_freq T/F, should frequency values be included for
#' categorical variables?
#'
#' @param add_perc_to_cats T/F, should categorical variable labels
#' be appended with a percent sign?
#'
#' @export
#'
#' @examples
#' data("pbc_tbl1")
#' # report median albumin instead of mean
#' # use kruskal wallis test for albumin
#' tibble_one(
#' pbc_tbl1,
#' formula = ~ . | trt,
#' include_freq = FALSE,
#' include_pval = TRUE,
#' specs_table_vals = c(albumin = 'median'),
#' specs_table_tests = c(albumin = 'nopars')
#' )
#'
# data = pbc_tbl1
# formula = ~ . - age | sex
# meta_data = meta
# row_vars = NULL
# strat = NULL
# by = NULL
# row_vars = NULL
# specs_table_vals = NULL
# specs_table_tests = NULL
# expand_binary_catgs = FALSE
# include_pval = FALSE
# include_freq = FALSE
tibble_one <- function(
data,
formula = NULL,
meta_data = NULL,
row_vars = NULL,
strat = NULL,
by = NULL,
specs_table_vals = NULL,
specs_table_tests = NULL,
include_pval=FALSE,
expand_binary_catgs = FALSE,
include_freq = FALSE,
add_perc_to_cats = TRUE
){
# uniformly convert characters to factors
data %<>% mutate_if(is.character, as.factor)
# check to see if any factor variable has a blank level.
check_blanks(data)
# Identify row, stratification, and by variables
if( !is.null(formula) ){
tb1_vars <- parse_tb1_formula(formula, data)
} else {
tb1_vars <- list(
row_vars = vars_select(colnames(data), !!enquo(row_vars)),
strat = vars_select(colnames(data), !!enquo(strat)),
by = vars_select(colnames(data), !!enquo(by))
) %>%
map(
.f = function(x){
if(length(x)==0) return(NULL)
set_names(x, NULL)
}
)
if(is.null(tb1_vars$row_vars)) stop(
"formula or row_vars must be specified",
call. = FALSE
)
}
row_vars <- tb1_vars$row_vars
strat <- tb1_vars$strat
by <- tb1_vars$by
if(vec_is_empty(row_vars)){
stop("There should be at least 1 row_var")
}
for(variable in c(row_vars, strat, by)){
if(all(is.na(data[[variable]]))){
stop(
glue("All values of {variable} are missing."),
call. = FALSE
)
}
}
# make it easier to read my if-then code:
stratified_table <- !is.null(strat)
# in case a user misinterprets strat/by arguments
if( !stratified_table & !is.null(by) ){
strat = by
by = NULL
}
by_table <- !is.null(by)
# strat cannot be numeric
if(stratified_table){
if(is.numeric(data[[strat]])){
stop("stratification variable must be a factor")
}
}
# by cannot be numeric
if(by_table){
if(is.numeric(data[[by]])){
stop("by variable must be a factor")
}
}
if( !stratified_table & include_pval ){
stop(
"You have included p-values but have not",
" indicated which groups to compare.",
"\nPlease specify strat in your formula, e.g.",
" 'formula = ~ . | strat', if you want p-values",
call. = FALSE
)
}
# Ordered factors need to be re-factored as unordered.
# (This has to do with the default contrast method in R)
fctrs <- purrr::map_lgl(data, is.factor) %>%
enframe(value = 'factor') %>%
filter(factor == TRUE) %>%
mutate(ordered = map_lgl(name, ~is.ordered(data[[.x]])))
if(any(fctrs$ordered)){
for(f in fctrs$name[fctrs$ordered]){
data[[f]] %<>% factor(ordered = FALSE, levels = levels(data[[f]]))
}
}
meta <- meta_data
if(is.null(meta)){
# meta data set is compiled if needed
meta <- data %>%
mutate_if(is.character, as.factor) %>%
select_at(vars(strat, by, row_vars)) %>%
build_meta(
expand_binary_catgs = expand_binary_catgs,
add_perc_to_cats = add_perc_to_cats
) %>%
check_meta()
}
# initialize default specification for table values / tests
default_spec <- rep(
x = 'default',
times = length(row_vars)
) %>%
set_names(row_vars)
# Handle table value and test specs
specs_table_vals <- parse_specs(default_spec, specs_table_vals)
specs_table_tests <- parse_specs(default_spec, specs_table_tests)
# determine what is actually in these specs
spec_means <- any(c('default','mean') %in% specs_table_vals)
spec_medns <- any('median' %in% specs_table_vals)
# determine how to describe these specs
table_value_description <-
if ( spec_means && spec_medns ) {
paste(
"Table values for continuous variables are mean (standard deviation)",
"or median [interquartile range]. Table values for categorical",
"variables are", if(include_freq) "count (percent)." else "percent."
)
} else if (spec_means && !spec_medns) {
paste(
"Table values are mean (standard deviation) and",
if(include_freq) "count (percent)" else "percent",
"for continuous and categorical variables, respectively."
)
} else if (!spec_means && spec_medns) {
paste(
"Table values are median [interquartile range] and",
if(include_freq) "count (percent)" else "percent",
"for continuous and categorical variables, respectively."
)
}
# Create sample size values
n_obs <- c(
group = 'None',
variable = 'descr',
labels = 'No. of observations',
Overall = nrow(data)
)
# make adjustments to table parameters
# based on whether or not a stratification
# variable was specified. Initialize empty
# container for stratification data
strat_data <- NULL
select_vec <- row_vars
if(stratified_table){
# if strat is specified, we know there is at least
# one level of stratification. Therefore, we
# identify the label of the stratification variable
if(strat %in% meta$data$variable){
strat_labs <- meta$data %>%
filter(variable == strat) %>%
pull(label)
} else {
strat_labs <- capitalize(strat)
}
# We also specify that there is one by group and
# and initialize by_table, which we will modify if there
# is a second stratifying variable (i.e., by != NULL)
n_by <- 1L
by_table <- NULL
if(!is.null(by)){
# For this type of table, two headers are needed.
# To set this up, we modify the strat variable in data and
# designate the number of participants in each by category
if(by %in% meta$data$variable){
by_lab <- meta$data %>%
filter(variable == by) %>%
pull(label)
strat_labs <- c(strat_labs, by_lab)
} else {
strat_labs <- c(strat_labs, capitalize(by))
}
by_table <- table(data[[by]])
n_by <- length(by_table)
data[[strat]] %<>% interaction(data[[by]], sep='_._')
}
# count the number of participants in each strata
strat_table <- table(data[[strat]])
# add the counts of participants in each strata to the n_obs vector
n_obs %<>% c(strat_table)
# formalize information about strata with a list
strat_data <- list(
# total no. of groups
n_groups = length(strat_table) / n_by,
# total no. of by groups
n_by = n_by,
# counts in the by groups
by_table = by_table,
# label vector
label = strat_labs
)
# Create a data frame with edited strat col
# rename the strat col so that downstream
# code can be consistent for multiple table types
rename_vec <- set_names(strat, ".strat")
select_vec <- c(rename_vec, row_vars)
}
# modify n_obs vector to include p-value label if needed
if( include_pval ){ n_obs %<>% c("P-value" = '') }
# the top row of the table is initialized here (descr = descriptive)
descr_row <- vibble(n_obs)
# the original data is modified for computing table values
# .strat is the stratifying variable
tbl_data <- select(data, !!!select_vec)
# abbreviations are organized into one string
table_abbrs <- meta$data$abbr %>%
purrr::keep(~any(!is.na(.x))) %>%
purrr::flatten() %>%
purrr::map2_chr(names(.), ~ paste(.y, .x, sep = ' = ')) %>%
sort() %>%
paste(collapse = ', ')
# notes are left as a named list
table_notes <- meta$data$note %>%
set_names(meta$data$variable) %>%
purrr::keep(~any(!is.na(.x)))
table_data <- meta$data %>%
select(-c(abbr,note)) %>%
{
# NOTE(boyiguo1): The following implementation doesn't work well when meta_data contain more variables than specified in formula
# e.g.:
# meta <- pbc_tbl1 %>%
# set_variable_labels(
# status = "Status at last contact",
# trt = "Treatment group",
# age = 'Age',
# sex = 'Sex at birth',
# ascites = 'Ascites',
# bili = 'Bilirubin levels',
# edema = 'Edema',
# albumin = 'Serum Albumin'
# ) %>%
# build_meta(add_perc_to_cats = TRUE)
#
#
# tbl_one <- tibble_one(
# data = pbc_tbl1,
# meta_data = meta,
# formula = ~ . - bili | trt,
# include_pval = TRUE
# )
# if(stratified_table){
# filter(., !variable %in% c(strat, by))
# } else {
# .
# }
# Note(boyiguo1): I think it is safe just select the variables that is in the row_vars
# You might want to re-pipe
filter(., variable %in% row_vars)
} %>%
left_join(
enframe(
specs_table_vals,
name = 'variable',
value = 'fun_descr'
),
by = 'variable'
) %>%
left_join(
enframe(
specs_table_tests,
name = 'variable',
value = 'test_descr'
),
by = 'variable'
) %>%
mutate(
tbl_val = pmap(
.l = list(variable, type, fun_descr, test_descr),
.f = function(.variable, .var_type, .fun_type, .test_type){
gen_tbl_value(
data = tbl_data,
variable = .variable,
var_type = .var_type,
fun_type = .fun_type,
test_type = .test_type,
include_pval = include_pval,
include_freq = include_freq,
stratified_table = stratified_table,
expand_binary_catgs = expand_binary_catgs
)
}
)
) %>%
select(variable, unit, labels, tbl_val, group) %>%
tidyr::unnest(cols = c(labels, tbl_val))
if(nrow(table_data) > 1){
# Fix percents for factors with >=3 categories
# This only needs to be run if there are 2+ rows in the data
for(i in 2:nrow(table_data)){
if(table_data$variable[i] == table_data$variable[i-1]){
table_data$unit[i] <- NA_character_
}
}
}
table_data %<>% bind_rows(descr_row, .)
if( meta$add_perc_to_cats && include_freq ){
table_data$unit %<>% gsub(
pattern = '%',
replacement = 'n (%)',
x = .
)
}
table_data %<>%
mutate(
variable = factor(
x = variable,
levels = c(
'descr',
unique(
c(
setdiff(row_vars, meta$var_levels),
# order for variables without group assignment
meta$var_levels,
# Order for variables with group assignment
row_vars
)
)
)
),
group = factor(
x = group,
levels = unique(
c(
"None",
meta$group_levels
)
)
),
#group = fct_relevel(group, 'None'),
labels = case_when(!is.na(unit) ~ paste(labels, unit, sep = ', '),
TRUE ~ labels
)
) %>%
arrange(group, variable) %>%
select(-unit)
# Set table attributes
# what type of table is this?
# If there is no stratification at all: single_header
# If there is one stratification variable: double_header
# If there is a stratification and a by variable: triple_header
table_num <- 1 + as.numeric(!is.null(strat)) + as.numeric(!is.null(by))
table_type <- switch(
EXPR = as.character(table_num),
"1" = 'single_decker',
"2" = 'double_decker',
"3" = 'triple_decker',
"unkown table type"
)
# These are read in and used in the to_yyy functions
attr(table_data, 'type') <- table_type
attr(table_data, 'strat') <- strat_data
attr(table_data, 'byvar') <- by
attr(table_data, 'pvals') <- include_pval
attr(table_data, 'abbrs') <- table_abbrs
attr(table_data, 'notes') <- table_notes
attr(table_data, 'descr') <- table_value_description
attr(table_data, 'allcats') <- expand_binary_catgs
# set class to include tibble_one
# class(table_data) %<>% c('tibble_one')
table_data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.