Nothing
#' Describe a variable, optionally by groups
#'
#' Returns a dataframe with one row per group.
#'
#' The dependent variable (`y`) must be numeric. If you pass an existing
#' non-numeric variable (e.g., character, factor), `desc_var()` will stop with
#' a clear error indicating that the variable must be numeric.
#'
#' @param y A numeric vector of values, a column name (character string or unquoted) if \code{data} is provided,
#' or a formula of the form \code{y ~ x} or \code{y ~ x1 + x2} (for multiple grouping variables).
#' @param group Optional grouping variable, if not provided computed for the full data.
#' Ignored if \code{y} is a formula.
#' @param data Optional data frame containing the variable(s).
#' @param digits Number of decimal places to round to. Default is 3.
#'
#' @return A data frame with one row per group (or one row if no group is specified) containing:
#' \itemize{
#' \item \code{group}: Group identifier
#' \item \code{mean}: Mean
#' \item \code{sd}: Standard deviation
#' \item \code{se}: Standard error
#' \item \code{median}: Median
#' \item \code{min}: Minimum
#' \item \code{max}: Maximum
#' \item \code{mode}: Most frequent value
#' \item \code{freq_mode}: Frequency of mode
#' \item \code{mode2}: 2nd most frequent value
#' \item \code{freq_mode2}: Frequency of 2nd mode
#' \item \code{n.total}: Number of observations
#' \item \code{n.missing}: Number of observations with missing (NA) values
#' \item \code{n.unique}: Number of unique values
#' }
#' Columns may also carry a \code{"label"} attribute, which provides a short
#' human-readable description of each column.
#'
#'
#' @examples
#' # With grouping
#' df <- data.frame(y = rnorm(100), group = rep(c("A", "B"), 50))
#' desc_var(y, group, data = df)
#'
#' # Without grouping (full dataset)
#' desc_var(y, data = df)
#'
#' # Direct vectors
#' y <- rnorm(100)
#' group <- rep(c("A", "B"), 50)
#' desc_var(y, group)
#'
#' # With custom decimal places
#' desc_var(y, group, data = df, digits = 2)
#'
#' # Using formula syntax: y ~ x
#' desc_var(y ~ group, data = df)
#'
#' # Using formula syntax with multiple grouping variables: y ~ x1 + x2
#' df2 <- data.frame(y = rnorm(200), x1 = rep(c("A", "B"), 100), x2 = rep(c("X", "Y"), each = 100))
#' desc_var(y ~ x1 + x2, data = df2)
#'
#' @export
desc_var <- function(y, group = NULL, data = NULL, digits = 3) {
# FUNCTION OUTLINE:
# 1. Check if y is a formula and extract variables (formula vs non-formula syntax)
# 2. Validate inputs (numeric check, length checks, perfect overlap check)
# 3. Define helper function to compute statistics for a vector
# 4. Compute statistics (either for full dataset or by group)
# 5. Round numeric columns to specified decimal places
# 6. Add descriptive labels to columns as attributes
# 7. Return result dataframe
# Helper function to format error messages with conditional newline
format_msg <- function(msg) {
total_length <- nchar(msg) + nchar("desc_var() says: ")
if (total_length > 40) {
return(paste0("desc_var() says:\n", msg))
} else {
return(paste0("desc_var() says: ", msg))
}
}
# Helper function to validate that variables have the same length
validate_lengths <- function(vars, y_len, var_names = NULL) {
for (i in seq_along(vars)) {
if (length(vars[[i]]) != y_len) {
message2(format_msg("All variables must have the same length"), col = 'red', stop = TRUE)
}
}
}
# Helper function to extract variable from data or environment
extract_var <- function(var_name, data, calling_env, error_msg) {
if (!is.null(data)) {
if (!var_name %in% names(data)) {
message2(format_msg(sprintf("'%s' not found in data", var_name)), col = 'red', stop = TRUE)
}
return(data[[var_name]])
} else {
tryCatch({
get(var_name, envir = calling_env)
}, error = function(e) {
message2(format_msg(sprintf("'%s' not found", var_name)), col = 'red', stop = TRUE)
})
}
}
# Helper function to extract and validate grouping variables
extract_group_vars <- function(group_var_names, data, calling_env, y_len) {
if (is.null(group_var_names) || length(group_var_names) == 0) {
return(list(group_list = NULL, use_separate_cols = FALSE))
}
# Extract grouping variables
group_list <- lapply(group_var_names, function(gv) {
extract_var(gv, data, calling_env, sprintf("'%s' not found", gv))
})
names(group_list) <- group_var_names
# Validate all variables have the same length
validate_lengths(group_list, y_len)
# For multiple grouping variables, keep them separate
# For single grouping variable, create a single group vector
use_separate_cols <- length(group_var_names) > 1
return(list(group_list = group_list, use_separate_cols = use_separate_cols))
}
# 1. Check if y is a formula and extract variables
# Use unevaluated call to detect formula so we never evaluate y (avoids
# "restarting interrupted promise evaluation" when the object does not exist)
calling_env <- parent.frame()
call_match <- match.call()
y_quote <- call_match[["y"]]
is_formula <- is.call(y_quote) && identical(y_quote[[1]], as.symbol("~"))
if (is_formula) {
# Parse formula: y ~ x or y ~ x1 + x2 (evaluate only now that we know it's a formula)
formula_vars <- all.vars(y)
if (length(formula_vars) < 1) {
message2(format_msg("Formula must have at least one variable"), col = 'red', stop = TRUE)
}
# First variable is response, rest are grouping variables
y_var_name <- formula_vars[1]
group_var_names <- if (length(formula_vars) > 1) formula_vars[-1] else NULL
y_name <- y_var_name # For consistent error messages
# Validate data frame if provided
if (!is.null(data) && !is.data.frame(data)) {
message2(format_msg("'data' must be a data frame"), col = 'red', stop = TRUE)
}
# Extract response variable
y <- extract_var(y_var_name, data, calling_env, sprintf("'%s' not found", y_var_name))
y_len <- length(y)
# Extract grouping variables
group_result <- extract_group_vars(group_var_names, data, calling_env, y_len)
group_list <- group_result$group_list
use_separate_group_cols <- group_result$use_separate_cols
# Create single group vector if single grouping variable, otherwise keep separate
if (!is.null(group_list)) {
if (use_separate_group_cols) {
group <- NULL # Keep separate, don't create interaction
} else {
group <- group_list[[1]] # Single grouping variable
}
} else {
group <- NULL
}
} else {
# Not a formula: use match.call() and deparse(substitute()) like other functions
group_was_provided <- !missing(group)
# Get variable names from the call (before any evaluation)
y_expr <- call_match$y
y_name_raw <- if (!is.null(y_expr)) deparse(y_expr) else "y"
# Clean variable name: remove df$ prefix if present
y_name <- if (grepl("\\$", y_name_raw)) {
strsplit(y_name_raw, "\\$")[[1]][length(strsplit(y_name_raw, "\\$")[[1]])]
} else {
y_name_raw
}
group_name <- NULL
group_name_raw <- NULL
if (group_was_provided) {
group_expr <- call_match$group
group_name_raw <- if (!is.null(group_expr)) deparse(group_expr) else "group"
# Clean variable name: remove df$ prefix if present
group_name <- if (grepl("\\$", group_name_raw)) {
strsplit(group_name_raw, "\\$")[[1]][length(strsplit(group_name_raw, "\\$")[[1]])]
} else {
group_name_raw
}
}
# Validate data frame if provided
if (!is.null(data) && !is.data.frame(data)) {
message2(format_msg("'data' must be a data frame"), col = 'red', stop = TRUE)
}
# Extract y variable
if (!is.null(data)) {
# Data provided: extract from data frame
if (!y_name %in% names(data)) {
message2(format_msg(sprintf("'%s' not found in data", y_name)), col = 'red', stop = TRUE)
}
y <- data[[y_name]]
} else {
# No data: evaluate from calling environment
# If the call was df$x (or other call), always eval that expression — do not
# use get("x"), which can pick up a wrong object when bare `x` exists.
if (is.call(y_expr)) {
y <- tryCatch(
eval(y_expr, envir = calling_env),
error = function(e) {
message2(
format_msg(sprintf("'%s' could not be evaluated", y_name_raw)),
col = "red",
stop = TRUE
)
}
)
} else {
y_exists <- exists(y_name, envir = calling_env, inherits = TRUE)
if (!y_exists) {
y_eval <- tryCatch(
eval(y_expr, envir = calling_env),
error = function(e) NULL
)
if (is.null(y_eval)) {
message2(format_msg(sprintf("'%s' not found", y_name)), col = 'red', stop = TRUE)
} else {
y <- y_eval
}
} else {
y <- get(y_name, envir = calling_env)
}
}
}
y_len <- length(y)
# Extract group variable if provided
if (group_was_provided) {
if (!is.null(data)) {
# Data provided: extract from data frame
if (!group_name %in% names(data)) {
message2(format_msg(sprintf("'%s' not found in data", group_name)), col = 'red', stop = TRUE)
}
group <- data[[group_name]]
} else {
# No data: evaluate from calling environment (same rule as y: calls like df$g first)
if (is.call(group_expr)) {
group <- tryCatch(
eval(group_expr, envir = calling_env),
error = function(e) {
message2(
format_msg(sprintf("'%s' could not be evaluated", group_name_raw)),
col = "red",
stop = TRUE
)
}
)
} else {
group_exists <- exists(group_name, envir = calling_env, inherits = TRUE)
if (!group_exists) {
if (is.vector(group)) {
# group is already evaluated, use it as-is
} else {
message2(format_msg(sprintf("'%s' not found", group_name)), col = 'red', stop = TRUE)
}
} else {
group <- get(group_name, envir = calling_env)
}
}
}
# Validate length
if (!is.null(group)) {
validate_lengths(list(group), y_len)
}
} else {
group <- NULL
}
# For non-formula case, always use single group column
group_list <- NULL
use_separate_group_cols <- FALSE
}
# 2. Validate inputs
# 2.1. Validate that y is numeric
if (!is.numeric(y)) {
y_class <- paste(class(y), collapse = "/")
message2(
format_msg(sprintf("'%s' must be numeric; currently %s", y_name, y_class)),
col = "red",
stop = TRUE
)
}
# 2.2. Check that multiple grouping variables do not overlap perfectly (formula syntax only)
if (use_separate_group_cols && !is.null(group_list) && length(group_list) > 1) {
# Check all pairs of grouping variables for perfect overlap
for (i in 1:(length(group_list) - 1)) {
for (j in (i + 1):length(group_list)) {
var1 <- group_list[[i]]
var2 <- group_list[[j]]
# Perfect overlap: one-to-one mapping means unique combinations equals both unique value counts
# This means each value in var1 maps to exactly one value in var2, and vice versa
n_unique_combos <- length(unique(paste(var1, var2, sep = "|")))
n_unique_v1 <- length(unique(var1))
n_unique_v2 <- length(unique(var2))
# Perfect overlap occurs when: n_unique_combos == n_unique_v1 == n_unique_v2
# This ensures one-to-one mapping (no many-to-one or one-to-many)
if (n_unique_combos == n_unique_v1 && n_unique_combos == n_unique_v2 && n_unique_v1 == n_unique_v2) {
var1_name <- names(group_list)[i]
var2_name <- names(group_list)[j]
message2(format_msg(sprintf("Multiple grouping variables should not overlap perfectly: '%s' and '%s' overlap perfectly", var1_name, var2_name)), col = 'red', stop = TRUE)
}
}
}
}
# 3. Define helper function to compute statistics for a vector
compute_stats <- function(x) {
n_total <- length(x) # Total count including missing values
na_count <- sum(is.na(x))
x <- x[!is.na(x)] # Remove NAs for statistics computation
n <- length(x) # Count of non-missing values
# Handle empty case (all values are missing)
if (n == 0) {
return(list(
n = as.integer(n_total), # Total count including missing
mean = NA_real_,
sd = NA_real_,
se = NA_real_,
median = NA_real_,
missing = as.integer(na_count),
unique = 0L,
min = NA_real_,
max = NA_real_,
mode = NA_real_,
freq_mode = NA_integer_,
mode2 = NA_real_,
freq_mode2 = NA_integer_
))
}
# Compute basic statistics
mean_val <- mean(x)
sd_val <- sd(x)
se_val <- sd_val / sqrt(n)
median_val <- median(x)
min_val <- min(x)
max_val <- max(x)
# Compute number of unique values
n_unique <- length(unique(x))
# Compute mode statistics only if there are repeated values
has_repeats <- n_unique < n
if (has_repeats) {
tab <- table(x)
ord <- order(tab, decreasing = TRUE)
mode_val <- as.numeric(names(tab)[ord[1]])
freq_mode <- as.integer(tab[ord[1]])
mode2nd_val <- if (length(ord) > 1) as.numeric(names(tab)[ord[2]]) else NA_real_
freq_mode2nd <- if (length(ord) > 1) as.integer(tab[ord[2]]) else NA_integer_
} else {
mode_val <- NA_real_
freq_mode <- NA_integer_
mode2nd_val <- NA_real_
freq_mode2nd <- NA_integer_
}
# Return consistent structure
return(list(
n = as.integer(n_total), # Total count including missing values
mean = as.numeric(mean_val),
sd = as.numeric(sd_val),
se = as.numeric(se_val),
median = as.numeric(median_val),
missing = as.integer(na_count),
unique = as.integer(n_unique),
min = as.numeric(min_val),
max = as.numeric(max_val),
mode = as.numeric(mode_val),
freq_mode = as.integer(freq_mode),
mode2 = as.numeric(mode2nd_val),
freq_mode2 = as.integer(freq_mode2nd)
))
}
# 4. Compute statistics
has_grouping <- !is.null(group) || (use_separate_group_cols && !is.null(group_list))
if (!has_grouping) {
# No grouping: compute for full dataset
stats <- compute_stats(y)
# Build base data frame (counts at end: n, n.missing, n.unique)
result_df <- data.frame(
group = "All",
mean = stats$mean,
sd = stats$sd,
se = stats$se,
median = stats$median,
min = stats$min,
max = stats$max,
mode = stats$mode,
freq_mode = stats$freq_mode,
mode2 = stats$mode2,
freq_mode2 = stats$freq_mode2,
n.total = stats$n,
n.missing = stats$missing,
n.unique = stats$unique,
stringsAsFactors = FALSE
)
} else {
# Grouping: compute for each group
result_list <- list()
if (use_separate_group_cols) {
# Multiple grouping variables: iterate over unique combinations
# Create a data frame to get unique combinations efficiently
group_df <- as.data.frame(group_list, stringsAsFactors = FALSE)
unique_combos <- unique(group_df)
# Check for missing combinations (combinations that don't exist in data)
all_possible_combos <- expand.grid(lapply(group_list, function(x) sort(unique(x))), stringsAsFactors = FALSE)
all_possible_str <- do.call(paste, c(all_possible_combos, sep = "|"))
unique_combos_str <- do.call(paste, c(unique_combos, sep = "|"))
missing_indices <- which(!all_possible_str %in% unique_combos_str)
if (length(missing_indices) > 0) {
missing_combos_df <- all_possible_combos[missing_indices, , drop = FALSE]
# Format missing combinations as readable strings
missing_str <- apply(missing_combos_df, 1, function(row) {
paste(paste(names(missing_combos_df), row, sep = "="), collapse = ", ")
})
missing_list <- paste(missing_str, collapse = "\n")
message2(format_msg(sprintf("Some possible group combinations are not observed:\n%s", missing_list)), col = 'blue')
}
# Build data frames for each combination
for (i in seq_len(nrow(unique_combos))) {
combo <- unique_combos[i, , drop = FALSE]
# Create mask: all grouping variables match this combination
mask <- Reduce(`&`, lapply(names(group_list), function(nm) {
group_list[[nm]] == combo[[nm]]
}))
y_group <- y[mask]
stats <- compute_stats(y_group)
# Build data frame with grouping variables as separate columns (counts at end)
result_row <- data.frame(
combo,
mean = stats$mean,
sd = stats$sd,
se = stats$se,
median = stats$median,
min = stats$min,
max = stats$max,
mode = stats$mode,
freq_mode = stats$freq_mode,
mode2 = stats$mode2,
freq_mode2 = stats$freq_mode2,
n.total = stats$n,
n.missing = stats$missing,
n.unique = stats$unique,
stringsAsFactors = FALSE
)
result_list[[length(result_list) + 1]] <- result_row
}
} else {
# Single group column: use group vector directly
unique_groups <- sort(unique(group))
# Second pass: build data frames
for (g in unique_groups) {
y_group <- y[group == g]
stats <- compute_stats(y_group)
# Build data frame (counts at end: n, n.missing, n.unique)
result_row <- data.frame(
group = as.character(g),
mean = stats$mean,
sd = stats$sd,
se = stats$se,
median = stats$median,
min = stats$min,
max = stats$max,
mode = stats$mode,
freq_mode = stats$freq_mode,
mode2 = stats$mode2,
freq_mode2 = stats$freq_mode2,
n.total = stats$n,
n.missing = stats$missing,
n.unique = stats$unique,
stringsAsFactors = FALSE
)
result_list[[length(result_list) + 1]] <- result_row
}
}
result_df <- do.call(rbind, result_list)
# Check if any group has 0 observations
if (any(result_df$n.total == 0)) {
zero_groups <- sum(result_df$n.total == 0)
if (zero_groups == 1) {
message2(format_msg("1 group has 0 observations"), col = "red2")
} else {
message2(format_msg(sprintf("%d groups have 0 observations", zero_groups)), col = "red2")
}
}
# Sort by grouping variables
if (use_separate_group_cols && !is.null(group_list)) {
# Sort by grouping variables from right to left in formula
# y ~ x1 + x2 + x3 => sort by x3, then x2, then x1
sort_cols <- rev(names(group_list))
result_df <- result_df[do.call(order, result_df[sort_cols]), , drop = FALSE]
} else if (!is.null(group)) {
# Sort by single group column
result_df <- result_df[order(result_df$group), , drop = FALSE]
}
}
# 5. Round numeric columns to specified decimal places
numeric_cols <- c("mean", "sd", "se", "median", "mode", "mode2", "min", "max")
numeric_cols <- numeric_cols[numeric_cols %in% names(result_df)]
result_df[numeric_cols] <- lapply(result_df[numeric_cols], round, digits = digits)
# 5b. Replace NA mode values with "-" and flag if all modes are NA
all_modes_na <- all(is.na(result_df$mode))
# Convert mode columns to character and replace NA with "-"
result_df$mode <- ifelse(is.na(result_df$mode), "-", as.character(result_df$mode))
result_df$freq_mode <- ifelse(is.na(result_df$freq_mode), "-", as.character(result_df$freq_mode))
result_df$mode2 <- ifelse(is.na(result_df$mode2), "-", as.character(result_df$mode2))
result_df$freq_mode2 <- ifelse(is.na(result_df$freq_mode2), "-", as.character(result_df$freq_mode2))
# 6. Add descriptive labels to columns as attributes
label_list <- list(
group = "Group identifier",
mean = "Mean",
sd = "Standard deviation",
se = "Standard error",
median = "Median (50th percentile)",
min = "Minimum value",
max = "Maximum value",
mode = "Most frequent value",
freq_mode = "Frequency of mode",
mode2 = "2nd most frequent value",
freq_mode2 = "Frequency of 2nd mode",
n.total = "Number of observations",
n.missing = "Number of observations with missing values (NA)",
n.unique = "Number of unique values"
)
# Add labels for individual grouping variable columns if they exist
if (use_separate_group_cols && !is.null(group_list)) {
for (gv in names(group_list)) {
if (gv %in% names(result_df)) {
label_list[[gv]] <- paste("Grouping variable:", gv)
}
}
}
# Only include labels for columns that exist
var_labels(result_df) <- label_list
rownames(result_df) <- NULL
# 7. Add class and attributes for print method, then return
attr(result_df, "all_modes_na") <- all_modes_na
attr(result_df, "y_name") <- y_name
class(result_df) <- c("desc_var", class(result_df))
return(result_df)
}
#' Print method for desc_var objects
#'
#' @param x An object of class \code{desc_var}
#' @param ... Additional arguments passed to print.data.frame
#'
#' @return Invisibly returns the original object
#' @export
print.desc_var <- function(x, ...) {
# Remove the desc_var class temporarily to use default data.frame printing
class(x) <- class(x)[class(x) != "desc_var"]
print(x, ...)
# Print note if all modes were NA
if (isTRUE(attr(x, "all_modes_na"))) {
y_name <- attr(x, "y_name")
if (is.null(y_name) || y_name == "") y_name <- "y"
cat(sprintf("\nNote: mode not reported because all values of %s are unique\n", y_name))
}
invisible(x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.