Nothing
#' Univariate Tests for Multi‑Level Categorical Predictors
#'
#' @description
#' `cyt_univariate_multi`provides univariate statistical testing for
#' categorical predictors with more than two levels. For each
#' categorical predictor and numeric outcome pair, a global test is
#' performed, followed by pairwise comparisons if the global test is
#' significant. Users may choose between two methods,
#' classical ANOVA with Tukey’s Honest Significant Difference (HSD)
#' or a non‑parametric Kruskal–Wallis test followed by pairwise
#' Wilcoxon rank–sum tests. The return format can either be a list
#' of adjusted p‑values for each outcome–predictor pair or, if
#' `format_output = TRUE`, a tidy data frame summarizing all
#' pairwise comparisons.
#'
#' @param data A data frame or matrix containing both categorical
#' and continuous variables. Character columns will be converted
#' to factors.
#' @param method Character specifying the type of global test to
#' perform. Use "anova" (default) for one‑way ANOVA with Tukey
#' HSD or "kruskal" for Kruskal–Wallis with pairwise Wilcoxon
#' tests.
#' @param cat_vars Optional character vector of predictor column
#' names. When `NULL`, all factor or character columns in `data`
#' are used.
#' @param cont_vars Optional character vector of numeric outcome
#' variable names. When `NULL`, all numeric columns in `data` are
#' used.
#' @param p_adjust_method Character string specifying the method for
#' p‑value adjustment across pairwise comparisons. Passed to
#' `p.adjust`. Default is "BH".
#' @param format_output Logical. If `TRUE`, returns a tidy data
#' frame; otherwise (default) returns a list of numeric vectors
#' keyed by "Outcome_Categorical". Each numeric vector contains
#' adjusted p‑values for the pairwise comparisons.
#' @return Either a list (if `format_output = FALSE`) or a data
#' frame (if `format_output = TRUE`).
#' @examples
#' data("ExampleData1")
#' cyt_univariate_multi(ExampleData1[, c(1:2, 5:6)], method = "kruskal",
#' format_output = TRUE)
#' @author Shubh Saraswat
#' @importFrom stats aov as.formula kruskal.test pairwise.wilcox.test TukeyHSD
#' @export
cyt_univariate_multi <- function(
data,
method = c("anova", "kruskal"),
cat_vars = NULL,
cont_vars = NULL,
p_adjust_method = "BH",
format_output = FALSE
) {
names(data) <- make.names(names(data), unique = TRUE)
method <- match.arg(method)
# Convert to data frame
x1_df <- as.data.frame(data)
# Convert character variables to factors
char_cols <- names(x1_df)[sapply(x1_df, is.character)]
if (length(char_cols) > 0) {
x1_df[char_cols] <- lapply(x1_df[char_cols], as.factor)
}
# Identify categorical predictors
if (is.null(cat_vars)) {
cat_vars <- names(x1_df)[sapply(x1_df, is.factor)]
}
# Identify continuous variables
if (is.null(cont_vars)) {
cont_vars <- names(x1_df)[sapply(x1_df, is.numeric)]
}
# Initialize list to store p-value vectors
test_results <- list()
for (cat_var in cat_vars) {
# Skip predictors with insufficient levels
levs <- levels(x1_df[[cat_var]])
n_lvls <- length(levs)
if (n_lvls <= 1) {
next
}
# For ANOVA restrict to <=10 levels (as original)
if (method == "anova" && n_lvls > 10) {
next
}
for (outcome in cont_vars) {
key <- paste(outcome, cat_var, sep = "_")
if (method == "anova") {
model <- stats::aov(
stats::as.formula(paste(outcome, "~", cat_var)),
data = x1_df
)
tuk <- TukeyHSD(model)[[cat_var]]
p_vals <- tuk[, "p adj"]
names(p_vals) <- rownames(tuk)
test_results[[key]] <- round(p_vals, 4)
} else {
# Kruskal-Wallis global test
kw <- stats::kruskal.test(
stats::as.formula(paste(outcome, "~", cat_var)),
data = x1_df
)
# Pairwise Wilcoxon with adjustment
pw <- stats::pairwise.wilcox.test(
x1_df[[outcome]],
x1_df[[cat_var]],
p.adjust.method = p_adjust_method
)
p_mat <- pw$p.value
# Flatten symmetric matrix into named vector (remove NA values)
pairs <- NULL
pvals <- NULL
for (i in seq_len(nrow(p_mat))) {
for (j in seq_len(ncol(p_mat))) {
if (!is.na(p_mat[i, j])) {
pairs <- c(
pairs,
paste0(rownames(p_mat)[i], "-", colnames(p_mat)[j])
)
pvals <- c(pvals, p_mat[i, j])
}
}
}
names(pvals) <- pairs
# Round values
test_results[[key]] <- round(pvals, 4)
}
}
}
# If no tests performed
if (length(test_results) == 0) {
return(
"No valid comparisons were performed. Check that your data has numeric columns and factors with sufficient levels."
)
}
if (!format_output) {
return(test_results)
}
# Create tidy data frame
out_df <- data.frame(
Outcome = character(),
Categorical = character(),
Comparison = character(),
P_adj = numeric(),
stringsAsFactors = FALSE
)
for (key in names(test_results)) {
parts <- strsplit(key, "_")[[1]]
outcome <- parts[1]
cat_var <- parts[2]
p_vec <- test_results[[key]]
for (comp in names(p_vec)) {
out_df <- rbind(
out_df,
data.frame(
Outcome = outcome,
Categorical = cat_var,
Comparison = comp,
P_adj = p_vec[comp],
stringsAsFactors = FALSE,
row.names = NULL
)
)
}
}
out_df
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.