## NOTE: stray extraction residue ("Nothing") kept as a comment so the file can be sourced
#' Create Publication-Ready Descriptive Statistics Tables
#'
#' Generates comprehensive descriptive statistics tables with automatic variable
#' type detection, group comparisons, and appropriate statistical testing. This
#' function is designed to create "Table 1"-style summaries commonly used in
#' clinical and epidemiological research, with full support for continuous,
#' categorical, and time-to-event variables.
#'
#' @param data Data frame or data.table containing the dataset to summarize.
#' Automatically converted to a data.table for efficient processing.
#'
#' @param by Character string specifying the column name of the grouping
#' variable for stratified analysis (\emph{e.g.}, treatment arm, exposure
#' status). When \code{NULL} (default), produces overall summaries only
#' without group comparisons or statistical tests.
#'
#' @param variables Character vector of variable names to summarize. Can
#' include standard column names for continuous or categorical variables,
#' and survival expressions using \code{Surv()} syntax (\emph{e.g.},
#' \code{"Surv(os_months, os_status)"}). Variables are processed in the
#' order provided.
#'
#' @param stats_continuous Character vector specifying which statistics to
#' compute for continuous variables. Multiple values create separate rows
#' for each variable. Options:
#' \itemize{
#' \item \code{"mean_sd"} - Mean \eqn{\pm} standard deviation
#' \item \code{"median_iqr"} - Median [interquartile range]
#' \item \code{"median_range"} - Median (minimum-maximum)
#' \item \code{"range"} - Minimum-maximum only
#' }
#' Default is \code{"median_iqr"}.
#'
#' @param stats_categorical Character string specifying the format for
#' categorical variable summaries:
#' \itemize{
#' \item \code{"n"} - Count only
#' \item \code{"percent"} - Percentage only
#' \item \code{"n_percent"} - Count (percentage) [default]
#' }
#'
#' @param digits Integer specifying the number of decimal places for
#' continuous statistics. Default is 1.
#'
#' @param p_digits Integer specifying the number of decimal places for
#' \emph{p}-values. Values smaller than \code{10^(-p_digits)} are displayed
#' as \code{"< 0.001"} (for \code{p_digits = 3}), \code{"< 0.0001"} (for
#' \code{p_digits = 4}), etc. Default is 3.
#'
#' @param conf_level Numeric confidence level for confidence intervals in
#' survival variable summaries (median survival time with CI). Must be
#' between 0 and 1. Default is 0.95 (95\% confidence intervals).
#'
#' @param p_per_stat Logical. If \code{TRUE}, displays \emph{p}-values on each
#' row (per statistic) rather than only on the first row of each variable.
#' Useful when different statistics within a variable warrant separate
#' significance testing. Default is \code{FALSE}.
#'
#' @param na_include Logical. If \code{TRUE}, missing values (NAs) are
#' displayed as a separate category/row for each variable. If \code{FALSE},
#' missing values are silently excluded from calculations. Default is
#' \code{FALSE}.
#'
#' @param na_label Character string used to label the missing values row when
#' \code{na_include = TRUE}. Default is \code{"Unknown"}.
#'
#' @param na_percent Logical. Controls how percentages are calculated for
#' categorical variables when \code{na_include = TRUE}:
#' \itemize{
#' \item If \code{TRUE}, percentages include NAs in the denominator (all
#' categories sum to 100\%)
#' \item If \code{FALSE}, percentages exclude NAs from the denominator
#' (non-missing categories sum to 100\%, missing shown separately)
#' }
#' Only affects categorical variables. Default is \code{FALSE}.
#'
#' @param test Logical. If \code{TRUE}, performs appropriate statistical tests
#' for group comparisons and adds a \emph{p}-value column. Requires
#' \code{by} to be specified. Tests are automatically selected based on
#' variable type and test parameters. Default is \code{TRUE}.
#'
#' @param test_continuous Character string specifying the statistical test for
#' continuous variables:
#' \itemize{
#' \item \code{"auto"} - Automatic selection: \emph{t}-test/ANOVA for means,
#' Wilcoxon/Kruskal-Wallis for medians [default]
#' \item \code{"t"} - Independent samples \emph{t}-test (2 groups only)
#' \item \code{"aov"} - One-way ANOVA (2+ groups)
#' \item \code{"wrs"} - Wilcoxon rank-sum test (2 groups only)
#' \item \code{"kwt"} - Kruskal-Wallis test (2+ groups)
#' }
#'
#' @param test_categorical Character string specifying the statistical test
#' for categorical variables:
#' \itemize{
#' \item \code{"auto"} - Automatic selection: Fisher exact test if any
#' expected cell frequency < 5, otherwise \eqn{\chi^2} test [default]
#' \item \code{"fisher"} - Fisher exact test
#' \item \code{"chisq"} - \eqn{\chi^2} test
#' }
#'
#' @param total Logical or character string controlling the total column:
#' \itemize{
#' \item \code{TRUE} or \code{"first"} - Include total column as first
#' column after Variable/Group [default]
#' \item \code{"last"} - Include total column as last column before
#' \emph{p}-value
#' \item \code{FALSE} - Exclude total column
#' }
#'
#' @param total_label Character string for the total column header.
#' Default is \code{"Total"}.
#'
#' @param labels Named character vector or list providing custom display
#' labels for variables. Names should match variable names (or \code{Surv()}
#' expressions), values are the display labels. Variables not in
#' \code{labels} use their original names. Can also label the grouping
#' variable specified in \code{by}. Default is \code{NULL}.
#'
#' @param number_format Character string or two-element character vector
#' controlling thousand and decimal separators in formatted output. Named
#' presets:
#' \itemize{
#' \item \code{"us"} - Comma thousands, period decimal: \code{1,234.56} [default]
#' \item \code{"eu"} - Period thousands, comma decimal: \code{1.234,56}
#' \item \code{"space"} - Thin-space thousands, period decimal: \code{1 234.56}
#' (SI/ISO 31-0)
#' \item \code{"none"} - No thousands separator: \code{1234.56}
#' }
#' Or provide a custom two-element vector \code{c(big.mark, decimal.mark)},
#' \emph{e.g.}, \code{c("'", ".")} for Swiss-style: \verb{1'234.56}.
#'
#' When \code{NULL} (default), uses
#' \code{getOption("summata.number_format", "us")}. Set the global option
#' once per session to avoid passing this argument repeatedly:
#' \preformatted{
#' options(summata.number_format = "eu")
#' }
#'
#' @param ... Additional arguments passed to the underlying statistical test
#' functions (\emph{e.g.}, \code{var.equal = TRUE} for \emph{t}-tests,
#' \code{simulate.p.value = TRUE} for Fisher test).
#'
#' @return A data.table with S3 class \code{"desctable"} containing formatted
#' descriptive statistics. The table structure includes:
#' \describe{
#' \item{Variable}{Variable name or label (from \code{labels})}
#' \item{Group}{For continuous variables: statistic type (\emph{e.g.},
#' "Mean \eqn{\pm} SD", "Median [IQR]"). For categorical variables:
#' category level. Empty for variable name rows.}
#' \item{Total}{Statistics for the total sample (if
#' \code{total = TRUE})}
#' \item{Group columns}{Statistics for each group level (when \code{by}
#' is specified). Column names match group levels.}
#' \item{\emph{p}-value}{Formatted \emph{p}-values from statistical tests
#' (when \code{test = TRUE} and \code{by} is specified)}
#' }
#'
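#' The first row shows sample sizes for each column; it is omitted only when
#' \code{by = NULL} and no total column is present (\emph{e.g.},
#' \code{total = FALSE}). All numeric output (counts, statistics,
#' \emph{p}-values) respects the \code{number_format} setting for
#' locale-appropriate formatting.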
#'
#' The returned object includes the following attributes accessible via
#' \code{attr()}:
#' \describe{
#' \item{raw_data}{A data.table containing unformatted numeric values
#' suitable for further statistical analysis or custom formatting.
#' Includes additional columns for standard deviations, quartiles,
#' etc.}
#' \item{by_variable}{The grouping variable name used (value of
#' \code{by})}
#' \item{variables}{The variables analyzed (value of
#' \code{variables})}
#' }
#'
#' @details
#' \strong{Variable Type Detection:}
#'
#' The function automatically detects variable types and applies appropriate
#' summaries:
#' \itemize{
#' \item \strong{Continuous}: Numeric variables (integer or double) receive
#' statistics specified in \code{stats_continuous}
#' \item \strong{Categorical}: Character, factor, or logical variables
#' receive frequency counts and percentages
#' \item \strong{Time-to-Event}: Variables specified as
#' \code{Surv(time, event)} display median survival with confidence
#' intervals (level controlled by \code{conf_level})
#' }
#'
#' \strong{Statistical Testing:}
#'
#' When \code{test = TRUE} and \code{by} is specified:
#' \itemize{
#' \item \strong{Continuous with "auto"}: Parametric tests (\emph{t}-test, ANOVA)
#' for mean-based statistics; non-parametric tests (Wilcoxon,
#' Kruskal-Wallis) for median-based statistics
#' \item \strong{Categorical with "auto"}: Fisher exact test when any
#' expected cell frequency < 5; \eqn{\chi^2} test otherwise
#' \item \strong{Survival}: Log-rank test for comparing survival curves
#' \item \strong{Range statistics}: No \emph{p}-value computed (ranges
#' are descriptive)
#' }
#'
#' \strong{Missing Data Handling:}
#'
#' Missing values are handled differently by variable type:
#' \itemize{
#' \item \strong{Continuous}: NAs excluded from calculations; optionally
#' shown as count when \code{na_include = TRUE}
#' \item \strong{Categorical}: NAs can be included as a category when
#' \code{na_include = TRUE}. The \code{na_percent} parameter controls
#' whether percentages are calculated with or without NAs in the
#' denominator
#' \item \strong{Survival}: NAs in time or event excluded from analysis
#' }
#'
#' \strong{Formatting Conventions:}
#'
#' All numeric output respects the \code{number_format} parameter. Separators
#' within ranges and confidence intervals adapt automatically to avoid
#' ambiguity:
#' \itemize{
#' \item Mean \eqn{\pm} SD: \code{"45.2 \eqn{\pm} 12.3"} (US) or
#' \code{"45,2 \eqn{\pm} 12,3"} (EU)
#' \item Median [IQR]: \code{"38.0 [28.0-52.0]"} (US) or
#' \code{"38,0 [28,0-52,0]"} (EU, en-dash separator)
#' \item Range: \code{"18.0-75.0"} (positive, US),
#' \code{"-5.0 to 10.0"} (when bounds are negative)
#' \item Survival: \code{"24.5 (21.2-28.9)"} (US) or
#' \code{"24,5 (21,2-28,9)"} (EU)
#' \item Counts \eqn{\ge} 1000: \code{"1,234"} (US) or \code{"1.234"} (EU)
#' \item \emph{p}-values: \code{"< 0.001"} (US) or \code{"< 0,001"} (EU)
#' }
#'
#' @seealso
#' \code{\link{survtable}} for detailed survival summary tables,
#' \code{\link{fit}} for regression modeling,
#' \code{\link{table2pdf}} for PDF export,
#' \code{\link{table2docx}} for Word export,
#' \code{\link{table2html}} for HTML export
#'
#' @examples
#' # Load example clinical trial data
#' data(clintrial)
#'
#' # Example 1: Basic descriptive table without grouping
#' desctable(clintrial,
#' variables = c("age", "sex", "bmi"))
#'
#' \donttest{
#'
#' # Example 2: Grouped comparison with default tests
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "sex", "race", "bmi"))
#'
#' # Example 3: Customize continuous statistics
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "bmi", "creatinine"),
#' stats_continuous = c("median_iqr", "range"))
#'
#' # Example 4: Change categorical display format
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("sex", "race", "smoking"),
#' stats_categorical = "n") # Show counts only
#'
#' # Example 5: Include missing values
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "smoking", "hypertension"),
#' na_include = TRUE,
#' na_label = "Missing")
#'
#' # Example 6: Disable statistical testing
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "sex", "bmi"),
#' test = FALSE)
#'
#' # Example 7: Force specific tests
#' desctable(clintrial,
#' by = "surgery",
#' variables = c("age", "sex"),
#' test_continuous = "t", # t-test instead of auto
#' test_categorical = "fisher") # Fisher test instead of auto
#'
#' # Example 8: Adjust decimal places
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "bmi"),
#' digits = 2, # 2 decimals for continuous
#' p_digits = 4) # 4 decimals for p-values
#'
#' # Example 9: Custom variable labels
#' labels <- c(
#' age = "Age (years)",
#' sex = "Sex",
#' bmi = "Body Mass Index (kg/m\u00b2)",
#' treatment = "Treatment Arm"
#' )
#'
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "sex", "bmi"),
#' labels = labels)
#'
#' # Example 10: Position total column last
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "sex"),
#' total = "last")
#'
#' # Example 11: Exclude total column
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "sex"),
#' total = FALSE)
#'
#' # Example 12: Survival analysis
#' desctable(clintrial,
#' by = "treatment",
#' variables = "Surv(os_months, os_status)")
#'
#' # Example 13: Multiple survival endpoints
#' desctable(clintrial,
#' by = "treatment",
#' variables = c(
#' "Surv(pfs_months, pfs_status)",
#' "Surv(os_months, os_status)"
#' ),
#' labels = c(
#' "Surv(pfs_months, pfs_status)" = "Progression-Free Survival",
#' "Surv(os_months, os_status)" = "Overall Survival"
#' ))
#'
#' # Example 14: Mixed variable types
#' desctable(clintrial,
#' by = "treatment",
#' variables = c(
#' "age", "sex", "race", # Demographics
#' "bmi", "creatinine", # Labs
#' "smoking", "hypertension", # Risk factors
#' "Surv(os_months, os_status)" # Survival
#' ))
#'
#' # Example 15: Three or more groups
#' desctable(clintrial,
#' by = "stage", # Assuming stage has 3+ levels
#' variables = c("age", "sex", "bmi"))
#' # Automatically uses ANOVA/Kruskal-Wallis and chi-squared
#'
#' # Example 16: Access raw unformatted data
#' result <- desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "bmi"))
#' raw_data <- attr(result, "raw_data")
#' print(raw_data)
#' # Raw data includes unformatted numbers, SDs, quartiles, etc.
#'
#' # Example 17: Check which grouping variable was used
#' result <- desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "sex"))
#' attr(result, "by_variable") # "treatment"
#'
#' # Example 18: NA percentage calculation options
#' # Include NAs in percentage denominator (all sum to 100%)
#' desctable(clintrial,
#' by = "treatment",
#' variables = "smoking",
#' na_include = TRUE,
#' na_percent = TRUE)
#'
#' # Exclude NAs from denominator (non-missing sum to 100%)
#' desctable(clintrial,
#' by = "treatment",
#' variables = "smoking",
#' na_include = TRUE,
#' na_percent = FALSE)
#'
#' # Example 19: Passing additional test arguments
#' # Equal variance t-test
#' desctable(clintrial,
#' by = "sex",
#' variables = "age",
#' test_continuous = "t",
#' var.equal = TRUE)
#'
#' # Example 20: European number formatting
#' desctable(clintrial,
#' by = "treatment",
#' variables = c("age", "sex", "bmi"),
#' number_format = "eu")
#'
#' # Example 21: Complete Table 1 for publication
#' table1 <- desctable(
#' data = clintrial,
#' by = "treatment",
#' variables = c(
#' "age", "sex", "race", "ethnicity", "bmi",
#' "smoking", "hypertension", "diabetes",
#' "ecog", "creatinine", "hemoglobin",
#' "site", "stage", "grade",
#' "Surv(os_months, os_status)"
#' ),
#' labels = clintrial_labels,
#' stats_continuous = c("median_iqr", "range"),
#' total = TRUE,
#' na_include = FALSE
#' )
#' print(table1)
#'
#' }
#'
#' @family descriptive functions
#' @export
desctable <- function(data,
                      by = NULL,
                      variables,
                      stats_continuous = c("median_iqr"),
                      stats_categorical = "n_percent",
                      digits = 1,
                      p_digits = 3,
                      conf_level = 0.95,
                      p_per_stat = FALSE,
                      na_include = FALSE,
                      na_label = "Unknown",
                      na_percent = FALSE,
                      test = TRUE,
                      test_continuous = "auto",
                      test_categorical = "auto",
                      total = TRUE,
                      total_label = "Total",
                      labels = NULL,
                      number_format = NULL,
                      ...) {

    ## Overall flow:
    ##   1. coerce input to data.table and resolve number formatting marks
    ##   2. summarise each variable via process_variable()
    ##   3. stack per-variable results, standardise column names, format p-values
    ##   4. prepend the "N" (sample size) row and position the total column
    ##   5. attach raw results and metadata as attributes
    ##
    ## NOTE(review): the roxygen @return text mentions an S3 class "desctable",
    ## but no class is assigned anywhere in this function - confirm whether a
    ## helper or a caller is expected to set it.

    if (!data.table::is.data.table(data)) {
        data <- data.table::as.data.table(data)
    }

    ## Fail early on a malformed grouping variable: it is used below as a
    ## single column name (data[[group_var]])
    if (!is.null(by)) {
        if (!is.character(by) || length(by) != 1L || is.na(by)) {
            stop("'by' must be NULL or a single column name", call. = FALSE)
        }
        if (!by %in% names(data)) {
            stop("Grouping variable '", by, "' not found in data",
                 call. = FALSE)
        }
    }

    ## Resolve number formatting marks
    validate_number_format(number_format)
    marks <- resolve_number_marks(number_format)

    group_var <- by

    ## 'total' may be TRUE/FALSE or "first"/"last"; only an explicit FALSE
    ## disables the total column. Using `total` directly in `&&` would error
    ## for the character forms, so always go through isFALSE().
    show_total <- !isFALSE(total)

    ## Summarise each variable in the order supplied (pre-allocated lists)
    n_vars <- length(variables)
    result_list <- vector("list", n_vars)
    raw_result_list <- vector("list", n_vars)

    for (i in seq_along(variables)) {
        var_data <- process_variable(
            data = data,
            var = variables[i],
            group_var = group_var,
            stats_continuous = stats_continuous,
            stats_categorical = stats_categorical,
            digits = digits,
            p_digits = p_digits,
            conf_level = conf_level,
            p_per_stat = p_per_stat,
            na_include = na_include,
            na_label = na_label,
            na_percent = na_percent,
            test = test,
            test_continuous = test_continuous,
            test_categorical = test_categorical,
            total = total,
            total_label = total_label,
            labels = labels,
            marks = marks,
            ...
        )
        result_list[[i]] <- var_data$formatted
        raw_result_list[[i]] <- var_data$raw
    }

    ## Stack per-variable tables; fill = TRUE tolerates differing columns
    result <- data.table::rbindlist(result_list, fill = TRUE)
    raw_result <- data.table::rbindlist(raw_result_list, fill = TRUE)

    ## Standardize column names
    old_names <- c("variable", "level")
    new_names <- c("Variable", "Group")
    if ("variable" %chin% names(result)) {
        data.table::setnames(result, old_names, new_names, skip_absent = TRUE)
        data.table::setnames(raw_result, old_names, new_names,
                             skip_absent = TRUE)
    }

    ## Format p-values only when a group comparison was requested
    ## (done after the column names have been standardized)
    if (test && !is.null(group_var)) {
        result <- format_pvalues_desctable(result, p_digits, marks)
    }

    ## Build the leading "N" row (sample sizes per column), if applicable
    n_row <- NULL

    if (!is.null(group_var)) {
        group_values <- data[[group_var]]

        ## Factor levels give the intended ordering; otherwise fall back to
        ## the observed non-missing values in order of appearance
        if (is.factor(group_values)) {
            groups <- levels(group_values)
        } else {
            groups <- unique(group_values)
            groups <- groups[!is.na(groups)]
        }

        n_row <- data.table::data.table(Variable = "N", Group = "")

        ## Total N counts every row, including rows with a missing group
        if (total_label %chin% names(result)) {
            n_row[[total_label]] <- format_count(nrow(data), marks)
        }

        ## Per-group N, only for groups that actually have a column
        for (grp in groups) {
            grp_col <- as.character(grp)
            if (grp_col %chin% names(result)) {
                n_group <- sum(group_values == grp, na.rm = TRUE)
                n_row[[grp_col]] <- format_count(n_group, marks)
            }
        }

        ## Keep the p-value column blank on the N row
        if ("p-value" %chin% names(result)) {
            n_row[["p-value"]] <- ""
        }
    } else if (show_total && total_label %chin% names(result)) {
        ## Ungrouped table: N row carries only the total column
        n_row <- data.table::data.table(Variable = "N", Group = "")
        n_row[[total_label]] <- format_count(nrow(data), marks)
    }

    if (!is.null(n_row)) {
        result <- rbind(n_row, result, fill = TRUE)
    }

    ## Reorder columns if total position specified
    if (show_total && !is.null(group_var)) {
        result <- reorder_total_column(result, total, total_label)
    }

    ## Attach raw data and metadata as attributes
    data.table::setattr(result, "raw_data", raw_result)
    data.table::setattr(result, "by_variable", group_var)
    data.table::setattr(result, "variables", variables)

    ## `[]` after set*() calls so the returned table auto-prints normally
    result[]
    result
}
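## NOTE: the lines below are web-page boilerplate picked up during extraction;
## they are not part of the R source and are kept only as comments.
## Any scripts or data that you put into this service are public.
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.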