R/modified.chisq.test.R
In Upsilon: Another Test of Association for Count Data

Documented in modified.chisq.test

# modified.chisq.test.R
#
# Author: Xuye Luo, Mingzhou Song
#
# Modified:
#   December 20, 2025. Updated documentation
#
#   December 14, 2025. Renamed the method 
#     name to improve clarity
#
#   December 11, 2025
#
#' @title Zero-Tolerant Pearson's Chi-squared Test for Contingency Tables
#'
#' @description Performs Pearson's chi-squared test 
#'   \insertCite{pearson1900}{Upsilon} 
#'   on contingency tables, slightly modified 
#'   to handle rows or columns of all zeros. 
#' 
#' @references 
#' \insertRef{pearson1900}{Upsilon}
#'   
#' @details
#'   This test is useful if \emph{p}-value must be returned
#'   on a contingency table with valid non-negative counts,
#'   where the build-in R implementation of 
#'   \code{\link[stats]{chisq.test}} could return \code{NA}
#'   as \emph{p}-value, regardless of a pattern being
#'   strong or weak. See Examples.
#'   
#'   Unlike \code{\link[stats]{chisq.test}}, this
#'   function handles tables with empty rows or columns (where
#'   expected values are 0) by calculating the test
#'   statistic over non-zero entries only. This prevents
#'   the result from becoming \code{NA}, while giving
#'   meaningful \emph{p}-values.
#' 
#' @note This function only takes contingency table
#'   as input. It does not support goodness-of-fit
#'   test on vectors.
#'   It does \strong{not} offer an option
#'   to apply Yates's continuity correction 
#'   on 2 \eqn{\times} 2 tables.
#'   
#' @param x a matrix or data frame of floating or integer
#'  numbers to specify a contingency table. Entries
#'  must be non-negative.
#'  
#' @param log.p a logical. If \code{TRUE}, 
#'   the \emph{p}-value is calculated in
#'   closed form to \strong{natural logarithm} of \emph{p}-value 
#'   to improve numerical precision when
#'   \emph{p}-value approaches zero.
#'   Defaults to \code{FALSE}.
#'
#' @return A list with class \code{"htest"} containing:
#' \item{statistic}{the chi-squared test statistic (calculated ignoring entries of 0-expected count).}
#' \item{parameter}{the degrees of freedom.}
#' \item{p.value}{the \emph{p}-value by the test.}
#' \item{estimate}{Cramér's \emph{V} statistic.}
#' \item{observed}{the observed counts.}
#' \item{expected}{the expected counts under the null hypothesis.}
#'
#' @importFrom stats pchisq
#' @export
#'
#' @examples
#' library("Upsilon")
#' 
#' # A table with a dominant function and an empty column
#' x <- matrix(
#'   c(0, 3, 0,
#'     3, 0, 0), 
#'    nrow = 2, byrow = TRUE)
#' print(x)
#' 
#' # Standard chisq.test fails or returns NA warning
#' chisq.test(x)
#' 
#' # Modified chi-squared test is significant:
#' modified.chisq.test(x)

modified.chisq.test <- function(x, log.p = FALSE) {
  
  METHOD <- "Zero-tolerant chi-squared test"
  DNAME  <- deparse(substitute(x))
  
  x <- as.matrix(x)
  
  # Calculate totals
  n  <- sum(x)
  nr <- nrow(x)
  nc <- ncol(x)
  k  <- min(nr, nc)
  
  # Calculate Expected values matrix
  sr <- rowSums(x)
  sc <- colSums(x)
  E  <- outer(sr, sc, "*") / n
  
  # Initialize variables
  STATISTIC <- 0
  ESTIMATE  <- 0
  PARAMETER <- (nr - 1L) * (nc - 1L)
  
  # Empty table, insufficient dimensions, or zero total count
  if (n == 0 || k <= 1) {
    PVAL <- if (log.p) 0 else 1
  } else {
    # Use na.rm=TRUE to ignore division by zero errors where E=0
    term <- (x - E)^2 / E
    STATISTIC <- sum(term, na.rm = TRUE)
    
    # Calculate Cramer's V
    # Avoid division by zero in effect size calculation
    denom <- n * (k - 1)
    if (denom > 0) {
      ESTIMATE <- sqrt(STATISTIC / denom)
    }
    
    PVAL <- stats::pchisq(STATISTIC, PARAMETER, lower.tail = FALSE, log.p = log.p)
  }
  
  names(STATISTIC) <- "X-squared"
  names(ESTIMATE)  <- "Cram\uE9r's V"
  names(PARAMETER) <- "df"
  names(PVAL)      <- "p.value"
  
  structure(
    list(
      statistic = STATISTIC,
      estimate  = ESTIMATE,
      parameter = PARAMETER,
      p.value   = PVAL,
      method    = METHOD,
      data.name = DNAME,
      observed  = x,
      expected  = E
    ),
    class = "htest"
  )
}