Nothing
#' @title Exploratory Data Analysis, Normality Testing, and
#' Visualization
#'
#' @description
#'
#' `r lifecycle::badge("experimental")`
#'
#' `is_it_normal()` calculates descriptive statistics and conducts univariate
#' normality testing on one or more numeric variables in a dataset using a
#' selected statistical test. Optional plots are included for one variable at a
#' time, only. Results are returned as a named list containing summaries and,
#' optionally, normality tests and/or diagnostic plots.
#'
#' @param df A `data.frame` or `tibble` containing the variables to assess.
#' @param ... One or more unquoted column names from `df` to be analyzed.
#' @param group_vars Optional. A character vector of column names in `df` to
#' group results by (e.g., `c("year", "hospital_level")`). If `NULL`, no
#' grouping is applied. Grouped summaries and normality tests are computed
#' within each unique combination of values across these variables.
#' @param seed A numeric value passed to `set.seed()` to ensure reproducibility.
#' Default is `10232015`.
#' @param normality_test A character string specifying the statistical test to
#' use. Must be one of: `"shapiro-wilk" or "shapiro" or "sw"`,
#' `"kolmogorov-smirnov" or "ks"`, `"anderson-darling" or "ad"`, `"lilliefors"
#' or "lilli"`, `"cramer-von-mises" or "cvm"`, `"pearson" or "p"`, or
#' `"shapiro-francia" or "sf"`. If `NULL`, no normality test is performed,
#' which is the default.
#' @param include_plots Logical. If `TRUE`, plots are generated for a single
#' variable. Plotting is disabled if multiple variables are passed.
#' @param plot_theme A `ggplot2::theme` function to apply to all plots. Default
#' is `traumar::theme_cleaner`.
#'
#' @return A named list with the following elements:
#' \describe{
#' \item{descriptive_statistics}{A `tibble` of summary statistics for each
#' variable.}
#' \item{normality_test}{A `tibble` of test statistics and p-values
#' (if `normality_test == TRUE`).}
#' \item{plots}{A patchwork object containing four plots (if `include_plots =
#' TRUE` and one variable supplied).}
#' }
#'
#' @details
#' \itemize{
#' \item If the data do not meet the test requirements for a chosen test of
#' normality, `is_it_normal()` will not run the tests.
#' \item Normality tests may yield differing results. Each test has distinct
#' assumptions and sensitivity. Users should verify assumptions and consult
#' test-specific guidance to ensure appropriate use.
#' \item The function will abort with helpful CLI messages if input types or
#' structures are incorrect.
#' \item If plotting is enabled, and `nrow(df) > 10000`, a warning is issued
#' as plotting may become computationally expensive.
#'}
#'
#' @note Supported normality tests are below. Please check the specifications of
#' these tests in the corresponding documentation.
#' \itemize{
#' \item Shapiro-Wilk (`stats::shapiro.test()`)
#' \item Kolmogorov-Smirnov (`stats::ks.test()`)
#' \item Anderson-Darling (`nortest::ad.test()`)
#' \item Lilliefors (`nortest::lillie.test()`)
#' \item Cramer-von Mises (`nortest::cvm.test()`)
#' \item Pearson (`norest::pearson.test()`)
#' \item Shapiro-Francia (`nortest::sf.test()`)
#'}
#'
#' Please note that if grouped plotting is enabled, each group will generate its
#' own set of plots. This may flood your IDE or console. Plan your use of this
#' functionality with care to avoid lags or unwanted outputs.
#'
#' @author Nicolas Foss, Ed.D., MS
#'
#' @export
is_it_normal <- function(
df,
...,
group_vars = NULL,
seed = 10232015,
normality_test = NULL,
include_plots = FALSE,
plot_theme = traumar::theme_cleaner
) {
###___________________________________________________________________________
### Data validation
###___________________________________________________________________________
# Validate the `df` argument
if (!is.data.frame(df) && !tibble::is_tibble(df)) {
cli::cli_abort(c(
"Input to argument {.strong {.var df}} was not of the correct class. {.fn is_it_normal} does not know what to do with it!",
"x" = "You supplied an object of class {.cls {class(df)}}.",
"i" = "Please use an object of class {.cls data.frame} or {.cls tibble}."
))
}
# Validate group_vars
if (!is.null(group_vars)) {
if (!is.character(group_vars)) {
cli::cli_abort(c(
"Argument {.var group_vars} must be {.cls character} or {.val NULL}.",
"x" = "You supplied an object of class {.cls {class(group_vars)}}."
))
}
missing_vars <- setdiff(group_vars, names(df))
if (length(missing_vars) > 0) {
cli::cli_abort(c(
"Some values in {.var group_vars} are not columns in {.var df}.",
"x" = "Missing column(s): {.val {missing_vars}}.",
"i" = "Check that all values in {.var group_vars} match column names in {.var df}."
))
}
}
# validate the `normality_test`
if (!is.null(normality_test)) {
# Normalize aliases
normality_test <- tolower(normality_test)
normality_test <- switch(
normality_test,
"shapiro" = "shapiro-wilk",
"sw" = "shapiro-wilk",
"sf" = "shapiro-francia",
"ks" = "kolmogorov-smirnov",
"ad" = "anderson-darling",
"lilli" = "lilliefors",
"cvm" = "cramer-von mises",
"p" = "pearson",
normality_test # if not an alias, keep original
)
attempt <- try(
match.arg(
normality_test,
choices = c(
"shapiro-wilk",
"kolmogorov-smirnov",
"anderson-darling",
"lilliefors",
"cramer-von mises",
"pearson",
"shapiro-francia"
)
),
silent = TRUE
)
if (inherits(attempt, "try-error")) {
cli::cli_abort(
c(
"If {.var normality_test} is not {cli::col_blue('NULL')}, it must be one of {.val shapiro-wilk}, {.val kolmogorov-smirnov}, {.val anderson-darling}, {.val lilliefors}, {.val cramer-von mises}, {.val pearson}, or {.val shapiro-francia}.",
"i" = "{.var normality_test} was {.val {normality_test}}."
)
)
}
normality_test <- attempt
}
# Check if plot_theme is a function
if (include_plots && !is.function(plot_theme)) {
cli::cli_abort(c(
"Input to argument {.strong {.var plot_theme}} was not of the correct class.",
"x" = "You supplied an object of class {.cls {class(plot_theme)}} to {.var plot_theme}.",
"i" = "Please use an object of class {.cls function}, usually a {.fn ggplot2::theme} object."
))
} else {
chosen_theme <- as.function(plot_theme)
}
if (!is.numeric(seed)) {
cli::cli_abort(c(
"In order to set the random seed, {.var seed} must have class {.cls numeric}.",
"i" = "{.var seed} had class {.cls {class(seed)}}."
))
}
# Set up variables to use in purrr::map()
vars <- rlang::enquos(...)
# Dynamically generate the variable names
var_names <- purrr::map_chr(vars, ~ rlang::as_label(.))
# Check if multiple columns were provided
multiple_columns <- if (length(vars) > 1) TRUE else FALSE
# Callout for the variables passed
cli::cli_h1("Output Summary")
cli::cli_h3("Descriptive Statistics")
cli::cli_inform(c(
"*" = paste0(
"Exploratory data analysis of the variable(s): ",
cli::col_green(cli::style_bold(paste(var_names, collapse = ", ")))
)
))
# Messaging related to normality testing
cli::cli_h3("Normality Tests")
# Issue a warning if data violates any normality test choice assumptions
if (
!is.null(normality_test) &&
normality_test %in%
c(
"shapiro-wilk",
"shapiro-francia",
"anderson-darling",
"cramer-von mises",
"lilliefors"
)
) {
# Default values for tests
test_info <- list(
"shapiro-wilk" = list(
min_n = 3,
max_n = 5000,
fn = "stats::shapiro.test"
),
"shapiro-francia" = list(
min_n = 5,
max_n = 5000,
fn = "nortest::sf.test"
),
"anderson-darling" = list(
min_n = 8,
max_n = Inf,
fn = "nortest::ad.test"
),
"cramer-von mises" = list(
min_n = 8,
max_n = Inf,
fn = "nortest::cvm.test"
),
"lilliefors" = list(
min_n = 5,
max_n = Inf,
fn = "nortest::lillie.test"
)
)[[normality_test]]
# Perform the test within each group
group_status <- df |>
dplyr::group_by(dplyr::across(dplyr::all_of(group_vars))) |>
dplyr::group_modify(
~ {
group_data <- .x
group_keys <- .y
var_status <- purrr::map_chr(vars, function(var) {
var_name <- rlang::as_label(var)
vec <- dplyr::pull(group_data, !!var)
n_complete <- sum(!is.na(vec))
# control structure for no group
if (!is.null(group_vars)) {
group_label <- paste(
purrr::map2_chr(
names(group_keys),
group_keys,
~ glue::glue("{.x} = {.y}")
),
collapse = ", "
)
} else {
group_label <- "full dataset"
}
if (n_complete < test_info$min_n) {
glue::glue(
"{var_name} ({group_label}): n_complete = {n_complete}, problem: n_complete < {test_info$min_n}"
)
} else if (n_complete > test_info$max_n) {
glue::glue(
"{var_name} ({group_label}): n_complete = {n_complete}, problem: n_complete > {test_info$max_n}"
)
} else {
NA_character_
}
})
tibble::tibble(var_status = var_status)
}
)
# Explicit check for any actual messages
violations_exist <- any(!is.na(group_status$var_status))
if (violations_exist) {
cli::cli_alert_warning(c(
"One or more variables do not meet the sample size requirements for {.fn {test_info$fn}}.\n",
"The test will not run for overall data or any groups that fail to meet the requirements.\n",
"The test will be performed for groups that meet the sample size requirements.\n",
"Please select another test if desired."
))
cli::cli_alert_info("Problematic variables:")
purrr::walk(stats::na.omit(group_status$var_status), cli::cli_li)
} else {
# When all is well, let the user know.
cli::cli_alert_info(
"No issues reported with the univariate tests of normality. Proceeding."
)
}
} else if (is.null(normality_test)) {
# When no normality test procedure is requested, return a message
cli::cli_alert_info("No normality test requested.")
} else {
# Data limits do not apply to other tests in these ways.
cli::cli_alert_info(
"No issues reported with the univariate tests of normality. Proceeding."
)
}
# Summary on grouping behavior of the function
cli::cli_h3("Strata")
# Callout for the grouping applied (or no grouping if NULL)
if (is.null(group_vars) || length(group_vars) == 0) {
cli::cli_inform(
c(
"*" = "No grouping applied. Descriptive statistics and normality tests (as applicable) are computed on the entire dataset."
)
)
} else {
cli::cli_inform(
c(
"*" = paste0(
"Grouping by: ",
cli::col_blue(cli::style_bold(paste(group_vars, collapse = ", ")))
)
)
)
}
###___________________________________________________________________________
### Create some convention functions for plotting
###___________________________________________________________________________
# A utility function to check the length of a vector
is_valid_for_plotting <- function(vec, min_n = 2) {
vec <- vec[!is.na(vec)]
length(vec) >= min_n && all(is.finite(vec))
}
# Utility function for a density plot
make_density_plot <- function(data, vec, var, var_name, group_label = NULL) {
if (!is_valid_for_plotting(vec)) {
cli::cli_alert_danger(
"Skipping density plot for {.var {var_name}}{if (!is.null(group_label)) paste0(' (', group_label, ')')}: insufficient valid data."
)
return(NULL)
}
ggplot2::ggplot(data, ggplot2::aes(x = !!var)) +
ggplot2::geom_density(
fill = "#F27026",
color = "black",
alpha = 0.5,
na.rm = TRUE
) +
ggplot2::geom_segment(
x = median(vec, na.rm = TRUE),
xend = median(vec, na.rm = TRUE),
y = 0,
yend = Inf,
linetype = "dashed",
color = "darkslategray",
alpha = 0.25,
linewidth = 1.5
) +
ggplot2::ggtitle(
if (is.null(group_label)) {
paste0(
"Kernel Density Plot of ",
var_name,
" with horizontal line at the median"
)
} else {
paste0("Density Plot with Median (", group_label, ")")
}
) +
ggplot2::labs(
x = var_name,
caption = paste0("n = ", sum(!is.na(vec)), " non-missings")
) +
chosen_theme()
}
# Utility function for a histogram plot
make_hist_plot <- function(data, vec, var, var_name, group_label = NULL) {
if (!is_valid_for_plotting(vec)) {
cli::cli_alert_danger(
"Skipping histogram for {.var {var_name}}{if (!is.null(group_label)) paste0(' (', group_label, ')')}: insufficient valid data."
)
return(NULL)
}
bin_width <- (max(vec, na.rm = TRUE) - min(vec, na.rm = TRUE)) / 15
ggplot2::ggplot(data, ggplot2::aes(x = !!var)) +
ggplot2::geom_histogram(
binwidth = bin_width,
fill = "#C6D667",
color = "black",
na.rm = TRUE
) +
ggplot2::ggtitle(
if (is.null(group_label)) paste0("Histogram of ", var_name) else
paste0("Histogram (", group_label, ")")
) +
ggplot2::labs(x = var_name) +
chosen_theme()
}
# Utility function for a boxplot
make_boxplot <- function(data, vec, var, var_name, group_label = NULL) {
if (!is_valid_for_plotting(vec)) {
cli::cli_alert_danger(
"Skipping boxplot for {.var {var_name}}{if (!is.null(group_label)) paste0(' (', group_label, ')')}: insufficient valid data."
)
return(NULL)
}
ggplot2::ggplot(data, ggplot2::aes(x = !!var, y = "")) +
ggplot2::geom_jitter(
color = "#03617A",
alpha = 0.25,
na.rm = TRUE,
height = 0.35
) +
ggplot2::geom_boxplot(
fill = "#B9E1DA",
color = "black",
alpha = 0.2,
na.rm = TRUE,
orientation = "y"
) +
ggplot2::stat_boxplot(geom = "errorbar", width = 0.5) +
ggplot2::ggtitle(
if (is.null(group_label))
paste0("Boxplot with scatterplot of ", var_name) else
paste0("Boxplot (", group_label, ")")
) +
ggplot2::labs(x = var_name, y = "") +
chosen_theme()
}
# Utility function for a Q-Q plot
make_qq_plot <- function(data, vec, var, var_name, group_label = NULL) {
if (!is_valid_for_plotting(vec)) {
cli::cli_alert_danger(
"Skipping Q-Q plot for {.var {var_name}}{if (!is.null(group_label)) paste0(' (', group_label, ')')}: insufficient valid data."
)
return(NULL)
}
ggplot2::ggplot(data, ggplot2::aes(sample = !!var)) +
ggplot2::geom_qq_line(
linewidth = 1.25,
color = "#70C8B8",
alpha = 0.7,
na.rm = TRUE
) +
ggplot2::stat_qq(
color = "#19405B",
alpha = 0.4,
size = 2,
na.rm = TRUE
) +
ggplot2::ggtitle(
if (is.null(group_label)) paste0("Normal Q-Q Plot of ", var_name) else
paste0("Normal Q-Q Plot (", group_label, ")")
) +
chosen_theme()
}
###___________________________________________________________________________
### Initialize the output list
###___________________________________________________________________________
output_list <- list()
###___________________________________________________________________________
### Generate descriptive statistics
###___________________________________________________________________________
# Iterate over the provided columns
results <- purrr::map(vars, function(var) {
var_name <- rlang::as_label(var)
df_summarized <- df |>
dplyr::select(dplyr::all_of(c(group_vars, var_name))) |>
dplyr::group_by(dplyr::across(dplyr::all_of(group_vars))) |>
dplyr::summarize(
variable = var_name,
mean = round(mean(.data[[var_name]], na.rm = TRUE), 3),
std_dev = round(sd(.data[[var_name]], na.rm = TRUE), 3),
min = min(.data[[var_name]], na.rm = TRUE),
quantile_25 = quantile(.data[[var_name]], 0.25, na.rm = TRUE),
median = quantile(.data[[var_name]], 0.5, na.rm = TRUE),
quantile_75 = quantile(.data[[var_name]], 0.75, na.rm = TRUE),
max = max(.data[[var_name]], na.rm = TRUE),
non_missings = sum(!is.na(.data[[var_name]])),
missings = sum(is.na(.data[[var_name]])),
p_complete = round(mean(!is.na(.data[[var_name]])), 3),
p_missing = round(mean(is.na(.data[[var_name]])), 3),
n_obs = dplyr::n(),
.groups = "drop"
)
df_summarized
})
# set names
names(results) <- purrr::map_chr(vars, rlang::as_label)
# Pivot results wider
results <- results |>
purrr::list_rbind()
# Assign results to the output_list
output_list$descriptive_statistics <- results
###___________________________________________________________________________
### Optional: Perform the test of normality on variables
###___________________________________________________________________________
if (!is.null(normality_test)) {
# Only set the random seed if normality test is done
set.seed(seed)
# Grouping (if any) applied before normality test
if (!is.null(group_vars) && length(group_vars) > 0) {
# Group data by the specified group_vars
df <- df |>
dplyr::group_by(dplyr::across(dplyr::all_of(group_vars)))
}
# Iterate over the variables of interest to perform the requested normality
# test within each group (or overall if no grouping)
test_results <- purrr::map(vars, function(var) {
var_name <- rlang::as_label(var)
# Perform normality test within each group
test_result <- df |>
dplyr::group_modify(function(sub_df, group) {
vec <- dplyr::pull(sub_df, !!var)
# get complete observations
n_complete <- sum(!is.na(vec))
# Try to perform the normality test and handle any errors
test_result <- tryCatch(
{
# Shapiro-Wilk
if (normality_test == "shapiro-wilk") {
test_result <- shapiro.test(vec) |>
broom::tidy() |>
dplyr::mutate(
result = dplyr::if_else(
p.value < 0.05,
"Non-normal distribution",
"Normal distribution"
)
)
# Kolmogorov-Smirnov
} else if (normality_test == "kolmogorov-smirnov") {
test_result <- ks.test(vec, y = "pnorm") |>
broom::tidy() |>
dplyr::mutate(
result = dplyr::if_else(
p.value < 0.05,
"Non-normal distribution",
"Normal distribution"
)
)
# Anderson-Darling
} else if (normality_test == "anderson-darling") {
test_result <- nortest::ad.test(vec) |>
broom::tidy() |>
dplyr::mutate(
result = dplyr::if_else(
p.value < 0.05,
"Non-normal distribution",
"Normal distribution"
)
)
# Lilliefors
} else if (normality_test == "lilliefors") {
test_result <- nortest::lillie.test(vec) |>
broom::tidy() |>
dplyr::mutate(
result = dplyr::if_else(
p.value < 0.05,
"Non-normal distribution",
"Normal distribution"
)
)
# Cramer-von Mises
} else if (normality_test == "cramer-von mises") {
test_result <- nortest::cvm.test(vec) |>
broom::tidy() |>
dplyr::mutate(
result = dplyr::if_else(
p.value < 0.05,
"Non-normal distribution",
"Normal distribution"
)
)
# Pearson
} else if (normality_test == "pearson") {
test_result <- nortest::pearson.test(vec) |>
broom::tidy() |>
dplyr::mutate(
result = dplyr::if_else(
p.value < 0.05,
"Non-normal distribution",
"Normal distribution"
)
)
# Shapiro-Francia
} else if (normality_test == "shapiro-francia") {
test_result <- nortest::sf.test(vec) |>
broom::tidy() |>
dplyr::mutate(
result = dplyr::if_else(
p.value < 0.05,
"Non-normal distribution",
"Normal distribution"
)
)
}
},
error = function(e) {
# Return a tibble indicating the test was skipped
tibble::tibble(
variable = var_name,
test = "Test Skipped",
statistic = NA_real_,
p_value = NA_real_,
result = "Test Skipped"
)
}
)
# Return the results in a tibble, including the group
suppressWarnings(
tibble::tibble(
variable = var_name,
test = test_result$method,
statistic = test_result$statistic,
p_value = test_result$p.value,
result = test_result$result
)
)
})
# Return the test results for the variable
test_result
})
# Combine the individual variable results into a single data frame
test_results <- purrr::list_rbind(test_results)
# Store the final results in the output list
output_list$normality_test <- test_results
} else {
output_list$normality_test <- NULL
}
###___________________________________________________________________________
### Optional plots - only available if one variable is passed to `variables`
###___________________________________________________________________________
if (include_plots && !multiple_columns) {
# Extract data programmatically
var <- vars[[1]]
var_name <- rlang::as_label(var)
# Header for the plotting section
cli::cli_h3("Visualizations")
if (nrow(df) > 5000) {
# Issue a warning
cli::cli_alert_warning(
c(
"Large Data Warning! The number of observations within {.var df} is large, plotting may become computationally expensive."
)
)
# Issue an alternative
cli::cli_alert_success(
"Consider downsampling your data using {.fn base::sample}, {.fn dplyr::slice_sample}, or a function with a similar purpose."
)
}
# Grouped plotting can be computationally expensive
# Ensure the user is aware and provide them a way out
if (!is.null(group_vars)) {
cli::cli_alert_warning(
c(
"Grouped plotting is enabled. Each group will generate its own set of plots. This may flood your IDE or console."
)
)
cli::cli_alert_info(
"To avoid this, assign the function result to an object and inspect plots selectively from the list output."
)
###_______________________________________________________________________
### Plotting functionality
###_______________________________________________________________________
# Group the data
grouped_data <- df |>
dplyr::group_by(dplyr::across(dplyr::all_of(group_vars))) |>
dplyr::group_split()
# Get labels for outside the loop
plot_list_labels <- grouped_data |>
purrr::map_chr(
~ {
keys <- dplyr::distinct(
.x,
dplyr::across(dplyr::all_of(group_vars))
)
paste(
purrr::map2_chr(names(keys), keys, \(n, v) glue::glue("{n}={v}")),
collapse = ", "
)
}
)
# Create the plots with dynamic names
output_list$plots <- grouped_data |>
purrr::map(function(group_data) {
group_keys <- dplyr::distinct(
group_data,
dplyr::across(dplyr::all_of(group_vars))
)
group_label <- paste(
purrr::map2_chr(
names(group_keys),
group_keys,
~ glue::glue("{.x} = {.y}")
),
collapse = ", "
)
# Get the data as a vector
vec <- dplyr::pull(group_data, !!var)
# QQ Plot
qqplot <- make_qq_plot(group_data, vec, var, var_name, group_label)
# Histogram
hist_plot <- make_hist_plot(
group_data,
vec,
var,
var_name,
group_label
)
# Kernel Density Plot with line at the median
density_plot <- make_density_plot(
group_data,
vec,
var,
var_name,
group_label
)
# Boxplot with scatterplot
boxplot_plot <- make_boxplot(
group_data,
vec,
var,
var_name,
group_label
)
# Set up list of plots, remove NULL objects
valid_plots <- purrr::compact(
list(
qqplot,
hist_plot,
density_plot,
boxplot_plot
)
)
# Set up the patchwork
patchwork::wrap_plots(valid_plots) +
patchwork::plot_annotation(
title = paste0(
"Normality Test of the variable '",
var_name,
"'"
),
theme = chosen_theme()
)
}) |>
purrr::set_names(plot_list_labels) |>
purrr::compact()
# Plot a single variable without grouped output
} else {
# Get the data of interest as a vector
vec <- df |> dplyr::pull(!!var)
# QQ Plot
qqplot <- make_qq_plot(df, vec, var, var_name)
# Histogram
hist_plot <- make_hist_plot(df, vec, var, var_name)
# Kernel Density Plot
density_plot <- make_density_plot(df, vec, var, var_name)
# Boxplot with scatterplot
boxplot_plot <- make_boxplot(df, vec, var, var_name)
# Combine, remove NULL objects
plot_list <- purrr::compact(list(
qqplot,
hist_plot,
density_plot,
boxplot_plot
))
# Patchwork
combined_plots <- patchwork::wrap_plots(plot_list) +
patchwork::plot_annotation(
title = paste0(
"Normality Test of the variable '",
var_name,
"'"
),
theme = chosen_theme()
)
# Add to the list
output_list$plots <- combined_plots
}
} else if (include_plots && multiple_columns) {
# Header for the plotting section
cli::cli_h3("Visualizations")
# If multiple_columns, then do not include plots
include_plots <- FALSE
cli::cli_alert_info(
"More than one column was passed to {.fn is_it_normal}, and plotting is only available for one column at a time. {.var include_plots} is now set to {.val FALSE}."
)
}
# Line to separate output for cleaner presentation
cli::cli_rule()
# Return outputs as a list
return(output_list)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.