R/vis_group_normality.R
In visStatistics: Automated Selection and Visualisation of Statistical Hypothesis Tests

Documented in vis_group_normality

### Header vis_group_normality -----

#' Visualisation of the normality assumption for Welch ANOVA/t-test
#'
#' \code{vis_group_normality} checks for normality of each group separately using 
#' the Shapiro-Wilk and Anderson-Darling tests. The null hypothesis is that 
#' each group is normally distributed. The function generates histograms 
#' with normal distribution overlays and Q-Q plots to visually assess normality.
#' The layout is always 2 rows × k columns (histograms on top, Q-Q plots on bottom).
#'
#' @param samples Numeric vector; the dependent variable.
#' @param groups Factor or vector; the grouping variable (2 to 8 groups for visual display).
#' @param conf.level Numeric; confidence level (default: 0.95). Used to determine 
#'   alpha = 1 - conf.level for normality test interpretation.
#' @param samplename Character; label for the y-axis (default: "").
#' @param groupname Character; label for the x-axis (default: "").
#' @param cex Numeric; scaling factor for plot text and symbols (default: 1).
#'
#' @return A list containing:
#' \describe{
#'   \item{shapiro_tests}{List of Shapiro-Wilk test results for each group}
#'   \item{ad_tests}{List of Anderson-Darling test results for each group}
#'   \item{n_groups}{Number of groups}
#'   \item{group_names}{Names of the groups}
#' }
#'
#' @details
#' Layout is always 2 rows × k columns:
#' \itemize{
#'   \item Top row: Histograms with normal overlay for each group
#'   \item Bottom row: Q-Q plots for each group
#' }
#' For more than 8 groups, a tabular summary is provided instead of plots.
#'
#' @examples
#' # Two groups (like t-test)
#' vis_group_normality(ToothGrowth$len, ToothGrowth$supp)
#' 
#' # Three groups
#' ToothGrowth$dose <- as.factor(ToothGrowth$dose)
#' vis_group_normality(ToothGrowth$len, ToothGrowth$dose)
#'
#' @export

vis_group_normality <- function(samples, 
                                groups, 
                                conf.level = 0.95, 
                                samplename = "", 
                                groupname = "", 
                                cex = 1) {
  
  # Store original par settings
  oldpar <- par(no.readonly = TRUE)
  on.exit(par(oldpar))
  
  # Rename for internal consistency (legacy variable names)
  y <- samples
  g <- groups
  
  # Clean data
  complete_cases <- complete.cases(y, g)
  y <- y[complete_cases]
  g <- as.factor(g[complete_cases])
  
  # Get group information
  group_levels <- levels(g)
  k <- length(group_levels)
  
  # Check minimum requirements
  if (k < 2) {
    stop("At least 2 groups required")
  }
  
  # Check if too many groups for meaningful visualization
  if (k > 8) {
    cat("\n")
    cat("===============================================\n")
    cat("  TOO MANY GROUPS FOR VISUAL DISPLAY (k = ", k, ")\n")
    cat("===============================================\n\n")
    cat("Visual plots are not meaningful with more than 8 groups.\n")
    cat("Providing normality test results in tabular format instead.\n\n")
    
    # Calculate test results for all groups
    shapiro_tests <- list()
    ad_tests <- list()
    
    results_table <- data.frame(
      Group = character(k),
      n = integer(k),
      Shapiro_W = numeric(k),
      Shapiro_p = numeric(k),
      AD_A = numeric(k),
      AD_p = numeric(k),
      stringsAsFactors = FALSE
    )
    
    for (i in seq_along(group_levels)) {
      gname <- group_levels[i]
      gdata <- y[g == gname]
      n <- length(gdata)
      
      # Shapiro-Wilk test
      if (n >= 3 && n <= 5000) {
        sh_test <- shapiro.test(gdata)
        shapiro_tests[[gname]] <- sh_test
        results_table$Shapiro_W[i] <- sh_test$statistic
        results_table$Shapiro_p[i] <- sh_test$p.value
      } else {
        shapiro_tests[[gname]] <- list(
          statistic = NA,
          p.value = NA,
          method = paste0("Shapiro-Wilk test requires n: 3-5000 (n = ", n, ")")
        )
        results_table$Shapiro_W[i] <- NA
        results_table$Shapiro_p[i] <- NA
      }
      
      # Anderson-Darling test
      if (n >= 7) {
        ad_test <- nortest::ad.test(gdata)
        ad_tests[[gname]] <- ad_test
        results_table$AD_A[i] <- ad_test$statistic
        results_table$AD_p[i] <- ad_test$p.value
      } else {
        ad_tests[[gname]] <- list(
          statistic = NA,
          p.value = NA,
          method = paste0("Anderson-Darling test requires n >= 7 (n = ", n, ")")
        )
        results_table$AD_A[i] <- NA
        results_table$AD_p[i] <- NA
      }
      
      results_table$Group[i] <- gname
      results_table$n[i] <- n
    }
    
    # Format and print table
    results_table$Shapiro_W <- round(results_table$Shapiro_W, 4)
    results_table$Shapiro_p <- format.pval(results_table$Shapiro_p, digits = 3, eps = 0.001)
    results_table$AD_A <- round(results_table$AD_A, 4)
    results_table$AD_p <- format.pval(results_table$AD_p, digits = 3, eps = 0.001)
    
    cat("Normality Test Results by Group:\n")
    cat("================================\n\n")
    print(results_table, row.names = FALSE)
    
    cat("\n")
    cat("Interpretation:\n")
    alpha <- 1 - conf.level
    cat("  - p >", alpha, ": Data consistent with normality\n")
    cat("  - p <=", alpha, ": Evidence against normality\n")
    cat("\n")
    
    # Count how many groups fail normality
    n_fail_shapiro <- sum(results_table$Shapiro_p != "NA" & 
                            as.numeric(sub("<", "", results_table$Shapiro_p)) < alpha, 
                          na.rm = TRUE)
    n_fail_ad <- sum(results_table$AD_p != "NA" & 
                       as.numeric(sub("<", "", results_table$AD_p)) < alpha, 
                     na.rm = TRUE)
    
    cat("Summary:\n")
    cat("  ", n_fail_shapiro, "out of", k, "groups fail Shapiro-Wilk test (p <", alpha, ")\n")
    cat("  ", n_fail_ad, "out of", k, "groups fail Anderson-Darling test (p <", alpha, ")\n")
    cat("\n")
    
    if (n_fail_shapiro > 0 || n_fail_ad > 0) {
      cat("Recommendation: Consider using Kruskal-Wallis test (non-parametric)\n")
    } else {
      cat("Recommendation: Normality assumption appears reasonable for Welch ANOVA\n")
    }
    cat("\n")
    
    # Return results invisibly
    result <- list(
      shapiro_tests = shapiro_tests,
      ad_tests = ad_tests,
      n_groups = k,
      group_names = group_levels,
      results_table = results_table,
      note = "Too many groups for visual display"
    )
    
    class(result) <- "vis_group_normality"
    return(invisible(result))
  }
  
  # For k <= 8: Visual plots with 2 rows × k columns layout
  
  # Split data by groups
  group_data <- lapply(group_levels, function(lev) y[g == lev])
  names(group_data) <- group_levels
  
  # Run normality tests for each group
  shapiro_tests <- list()
  ad_tests <- list()
  
  for (i in seq_along(group_levels)) {
    gname <- group_levels[i]
    gdata <- group_data[[i]]
    n <- length(gdata)
    
    # Shapiro-Wilk test
    if (n >= 3 && n <= 5000) {
      shapiro_tests[[gname]] <- shapiro.test(gdata)
    } else if (n < 3) {
      shapiro_tests[[gname]] <- list(
        statistic = NA, 
        p.value = NA,
        method = paste0("Shapiro-Wilk test requires n >= 3 (n = ", n, ")")
      )
    } else {
      shapiro_tests[[gname]] <- list(
        statistic = NA,
        p.value = NA,
        method = paste0("Shapiro-Wilk test allows max n = 5000 (n = ", n, ")")
      )
    }
    
    # Anderson-Darling test
    if (n >= 7) {
      ad_tests[[gname]] <- nortest::ad.test(gdata)
    } else {
      ad_tests[[gname]] <- list(
        statistic = NA,
        p.value = NA,
        method = paste0("Anderson-Darling test requires n >= 7 (n = ", n, ")")
      )
    }
  }
  
  # Simple 2 × k layout: histograms on top row, Q-Q plots on bottom row
  # Adjust cex based on number of groups
  if (k <= 3) {
    plot_cex <- 0.6 * cex
    mar_val <- c(4, 4, 3, 1)
  } else if (k <= 5) {
    plot_cex <- 0.55 * cex
    mar_val <- c(3.5, 3.5, 2.5, 0.5)
  } else {
    # k = 6, 7, 8
    plot_cex <- 0.5 * cex
    mar_val <- c(3, 3, 2, 0.5)
  }
  
  par(mfrow = c(2, k),
      mar = mar_val,
      oma = c(0, 0, 2.5, 0),
      cex = plot_cex,
      font.main = 1,  # 1 = plain, not bold
      font.lab = 1,
      font.axis = 1)
  
  # Row 1: All histograms
  for (i in seq_along(group_levels)) {
    gname <- group_levels[i]
    gdata <- group_data[[i]]
    
    # Skip if no data
    if (length(gdata) == 0) {
      plot.new()
      next
    }
    
    # Calculate statistics
    gmean <- mean(gdata, na.rm = TRUE)
    gsd <- sd(gdata, na.rm = TRUE)
    
    # Get p-values for this group
    p_s <- shapiro_tests[[gname]]$p.value
    p_a <- ad_tests[[gname]]$p.value
    
    # --- Histogram with normal overlay ---
    hist_data <- hist(gdata, plot = FALSE)
    max_hist_density <- max(hist_data$density)
    max_normal_peak <- dnorm(gmean, mean = gmean, sd = gsd)
    y_max <- max(max_hist_density, max_normal_peak) * 1.1
    
    # Create title with p-values
    p_s_text <- if (!is.na(p_s)) signif(p_s, 2) else "N/A"
    p_a_text <- if (!is.na(p_a)) signif(p_a, 2) else "N/A"
    main_text <- paste0(gname, "\nSW p=", p_s_text, ", AD p=", p_a_text)
    
    hist(gdata, 
         freq = FALSE,
         main = main_text,
         xlab = "",
         ylab = if (i == 1) "Density" else "",
         ylim = c(0, y_max),
         col = "lightblue", 
         border = "black")
    
    # Overlay normal distribution
    data_range <- range(gdata)
    normal_range <- c(gmean - 3*gsd, gmean + 3*gsd)
    full_range <- c(min(data_range[1], normal_range[1]), 
                    max(data_range[2], normal_range[2]))
    x_seq <- seq(full_range[1], full_range[2], length = 200)
    normal_curve <- dnorm(x_seq, mean = gmean, sd = gsd)
    lines(x_seq, normal_curve, col = "red", lwd = 2)
  }
  
  # Row 2: All Q-Q plots
  for (i in seq_along(group_levels)) {
    gname <- group_levels[i]
    gdata <- group_data[[i]]
    
    # Skip if no data
    if (length(gdata) == 0) {
      plot.new()
      next
    }
    
    # --- Q-Q plot ---
    qqnorm(gdata, 
           main = paste0(gname, " Q-Q"),
           xlab = "Theoretical",
           ylab = if (i == 1) "Sample" else "")
    qqline(gdata, col = "red", lwd = 2)
  }
  
  # Outer title
  mtext("Normality Diagnostics by Group", outer = TRUE, cex = 1.0, line = 0.5)
  
  # Return results
  result <- list(
    shapiro_tests = shapiro_tests,
    ad_tests = ad_tests,
    n_groups = k,
    group_names = group_levels
  )
  
  class(result) <- "vis_group_normality"
  return(invisible(result))
}

Any scripts or data that you put into this service are public.

visStatistics documentation built on May 13, 2026, 1:08 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

visStatistics
Automated Selection and Visualisation of Statistical Hypothesis Tests

R/vis_group_normality.R
In visStatistics: Automated Selection and Visualisation of Statistical Hypothesis Tests

Defines functions vis_group_normality

Documented in vis_group_normality

Try the visStatistics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

visStatistics Automated Selection and Visualisation of Statistical Hypothesis Tests

R/vis_group_normality.R In visStatistics: Automated Selection and Visualisation of Statistical Hypothesis Tests

Defines functions vis_group_normality

Documented in vis_group_normality

Try the visStatistics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

visStatistics
Automated Selection and Visualisation of Statistical Hypothesis Tests

R/vis_group_normality.R
In visStatistics: Automated Selection and Visualisation of Statistical Hypothesis Tests