#' @title
#' Tidy table one with no strata
#'
#' @description
#' Creates a tidy data frame of the results that can go into a "Table 1" of
#' summary descriptive statistics of a study sample. Inspiration for this is owed
#' to the `tableone` package by Kazuki Yoshida.
#'
#' @param data A data frame or tibble containing the variables to be summarized.
#' @param strata NULL
#' @param vars Character vector of the variable names to be summarized. If
#' empty, then all variables in the given data frame are used.
#' @param na_level Character string of the text to replace `NA` in the strata
#' variable, if any exist.
#' @param b_replicates an integer specifying the number of replicates used in
#' the Monte Carlo test for Fisher's Exact test and Chi-square test.
#' @param ... Additional arguments. Not used.
#'
#' @importFrom car leveneTest
#' @importFrom dplyr bind_rows
#' @importFrom dplyr count
#' @importFrom dplyr count
#' @importFrom dplyr distinct
#' @importFrom dplyr everything
#' @importFrom dplyr filter
#' @importFrom dplyr group_by
#' @importFrom dplyr left_join
#' @importFrom dplyr mutate
#' @importFrom dplyr mutate_if
#' @importFrom dplyr n
#' @importFrom dplyr n_distinct
#' @importFrom dplyr one_of
#' @importFrom dplyr pull
#' @importFrom dplyr relocate
#' @importFrom dplyr rename
#' @importFrom dplyr rename
#' @importFrom dplyr select
#' @importFrom dplyr summarise
#' @importFrom dplyr transmute
#' @importFrom dplyr ungroup
#' @importFrom forcats fct_na_value_to_level
#' @importFrom glue glue
#' @importFrom purrr map
#' @importFrom purrr map_chr
#' @importFrom purrr map_dbl
#' @importFrom purrr map2
#' @importFrom nortest ad.test
#' @importFrom purrr pluck
#' @importFrom rlang ensym
#' @importFrom stats as.formula
#' @importFrom stats bartlett.test
#' @importFrom stats chisq.test
#' @importFrom stats fisher.test
#' @importFrom stats kruskal.test
#' @importFrom stats ks.test
#' @importFrom stats oneway.test
#' @importFrom stats quantile
#' @importFrom stats sd
#' @importFrom stats shapiro.test
#' @importFrom tableone CreateTableOne
#' @importFrom tableone ExtractSmd
#' @importFrom testit has_warning
#' @importFrom tibble as_tibble
#' @importFrom tibble tibble
#' @importFrom tidyr complete
#' @importFrom tidyr gather
#'
#' @return
#' A tibble with the following results
#' \describe{
#' \item{strata}{Level of the stratifying variable}
#' \item{var}{Variable/column name}
#' \item{n}{Number of records}
#' \item{n_distinct}{Numer of distinct values}
#' \item{complete}{Number of non-missing observations}
#' \item{missing}{Number of missing observations}
#' \item{mean}{Mean}
#' \item{sd}{Standard deviation}
#' \item{p0}{Minimum}
#' \item{p25}{25th percentile}
#' \item{p50}{Median}
#' \item{p75}{75th percentile}
#' \item{p100}{Maximum}
#' \item{cv}{Coefficient of variation}
#' \item{shapiro_test}{Shapiro-Wilkes test: p-value}
#' \item{ks_test}{Kolmogorov-Smirnov test: p-value}
#' \item{ad_test}{Anderson-Darling test for normality: p-value}
#' \item{level}{Level of the variable}
#' \item{n_level}{Total number in the variable's group}
#' \item{n_strata}{Total number in the variable group and strata}
#' }
#' @export
#'
#' @examples
#' library(dplyr)
#'
#' tab1 <- create_tidy_table_one_no_strata(data = pbc_mayo,
#' strata = NULL,
#' vars = c("time",
#' "status",
#' "trt",
#' "age",
#' "sex",
#' "ascites",
#' "hepato",
#' "spiders",
#' "edema",
#' "bili",
#' "chol",
#' "albumin",
#' "copper",
#' "alk_phos",
#' "ast",
#' "trig",
#' "platelet",
#' "protime",
#' "stage"))
#'
#' dplyr::glimpse(tab1)
#'
#'
#' library(ggplot2) # diamonds data set
#'
#' tab2 <- create_tidy_table_one_no_strata(data = diamonds,
#' vars = c("carat",
#' "cut",
#' "color",
#' "clarity",
#' "depth",
#' "table",
#' "price"))
#'
#' dplyr::glimpse(tab2)
create_tidy_table_one_no_strata <- function(data,
strata = NULL,
vars,
na_level = "(Missing)",
b_replicates = 2000, ...) {
# Silence no visible binding for global variable
dat <- res <- n_level_valid <- n_strata_valid <- label <- sort1 <- sort2 <- NULL
# Ensure strata is NULL since this function does not handle stratified data
if (!is.null(strata)) {
stop("Strata must be NULL for create_tidy_table_one_no_stata.")
}
# Convert ordered factors to regular factors
data <- data |>
dplyr::mutate_if(.predicate = ~ ("ordered" %in% class(.)),
.funs = ~ factor(., ordered = FALSE))
# If no variables are specified, use all variables in the dataset
if (missing(vars)) {
vars <- names(data)
}
#### Get variable info --------------------------------
# Extract variable labels for later use
var_lbls <- tibble::tibble(var = names(data)) |>
mutate(label = purrr::map_chr(.x = data[, var],
.f = ~ get_var_labels(x = .x)))
# Get variable types and other meta-information
var_info <- get_var_info(data = data,
.vars = vars)
# Create sorting variables to maintain order in the output
var_info <- var_info |>
mutate(sort1 = cumsum(var != dplyr::lag(var, default = dplyr::first(var))),
sort1 = sort1 + 1) |>
group_by(var) |>
mutate(sort2 = dplyr::row_number()) |>
ungroup()
# Identify categorical and continuous variables
cat_vars <- var_info |>
dplyr::filter(var_type == "categorical") |>
dplyr::pull(var) |>
unique()
con_vars <- var_info |>
dplyr::filter(var_type == "continuous") |>
dplyr::pull(var) |>
unique()
#### Get tables stats --------------------------------
res_stats <- list()
# Process categorical variables, if any
if (length(cat_vars) > 0) {
suppressWarnings(
cat_stats <- tibble::tibble(var = cat_vars) |>
mutate(dat = purrr::map(.x = var,
.f = ~ dplyr::select(data,
dplyr::all_of(.x))),
res = purrr::map(.x = dat,
.f = ~ do_one_cat(.x))) |>
dplyr::select(var, res) |>
tidyr::unnest(res) |>
mutate(pct = n_level / n_strata,
pct_valid = n_level_valid / n_strata_valid)
)
res_stats <- dplyr::bind_rows(res_stats,
cat_stats)
}
# Process continuous variables, if any
if (length(con_vars) > 0) {
con_stats <- data |>
dplyr::select(dplyr::one_of(con_vars)) |>
tidyr::pivot_longer(cols = dplyr::all_of(con_vars),
names_to = "var",
values_to = "value") |>
group_by(var) |>
summarise(n = dplyr::n(),
n_distinct = dplyr::n_distinct(value),
complete = sum(!is.na(value)),
missing = sum(is.na(value)),
mean = mean(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
p0 = custom_min(value, na.rm = TRUE),
p25 = quantile(value, probs = 0.25, na.rm = TRUE),
p50 = quantile(value, probs = 0.50, na.rm = TRUE),
p75 = quantile(value, probs = 0.75, na.rm = TRUE),
p100 = custom_max(value, na.rm = TRUE),
cv = sd / mean,
shapiro_test = calc_shapiro_test(var = value),
ks_test = calc_ks_test(var = value),
ad_test = calc_ad_test(var = value)) |>
ungroup()
res_stats <- dplyr::bind_rows(res_stats,
con_stats)
}
#### Clean up and arrange --------------------------------
# Add variable info (class, type) and labels to the results
class_and_type <- var_info |>
dplyr::select(-level,
-sort1,
-sort2) |>
dplyr::distinct()
res_stats <- res_stats |>
dplyr::left_join(class_and_type,
by = "var") |>
dplyr::left_join(var_lbls,
by = "var")
# Arrange results by variable and level, maintaining the original order
if (length(cat_vars) > 0) {
sort_vars <- var_info |>
dplyr::select(var, level, sort1, sort2)
res_stats <- res_stats |>
dplyr::left_join(sort_vars,
by = c("var", "level")) |>
dplyr::arrange(sort1, sort2) |>
mutate(var = forcats::fct_inorder(var),
level = forcats::fct_inorder(level)) |>
dplyr::select(-sort1, -sort2) |>
dplyr::relocate(class,
var_type,
label,
.after = dplyr::everything())
} else {
sort_vars <- var_info |>
dplyr::select(var, sort1, sort2)
res_stats <- res_stats |>
dplyr::left_join(sort_vars,
by = c("var")) |>
dplyr::arrange(sort1, sort2) |>
mutate(var = forcats::fct_inorder(var)) |>
dplyr::select(-sort1, -sort2) |>
dplyr::relocate(class,
var_type,
label,
.after = dplyr::everything())
}
#### Return results --------------------------------
return(res_stats)
}
#### Helper function --------------------------------
do_one_cat <- function(x) {
# Silence no visible binding for global variable
n_level_valid <- NULL
# Summarize categorical variable, filling in missing levels if necessary
x |>
dplyr::rename("level" = 1) |>
dplyr::count(level,
name = "n_level",
.drop = FALSE) |>
tidyr::complete(level,
fill = list(n_level = 0)) |>
mutate(n_strata = sum(n_level, na.rm = TRUE)) |>
mutate(n_level_valid = dplyr::if_else(is.na(level), NA_integer_, n_level),
n_strata_valid = sum(n_level_valid, na.rm = TRUE))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.