R/modified.gtest.R
In Upsilon: Another Test of Association for Count Data

Documented in modified.gtest

# modified.gtest.R
#
# Author: Xuye Luo, Mingzhou Song
#
# Modified:
#   December 20, 2025. Updated documentation
#
#   December 14, 2025. Renamed the method 
#     name to improve clarity
#
#   December 11, 2025
#
#' @title Zero-Tolerant G-Test for Contingency Tables
#'
#' @description Performs \emph{G}-test 
#'   \insertCite{WOOLF:1957aa}{Upsilon} 
#'   on contingency tables, slightly modified 
#'   to handle rows or columns of all zeros.
#' 
#' @references 
#' \insertRef{WOOLF:1957aa}{Upsilon}
#' 
#' @details
#'   This test is useful if a \emph{p}-value must be returned
#'   on a contingency table with valid non-negative counts,
#'   where other implementations of \emph{G}-test could 
#'   return \code{NA} as the \emph{p}-value, regardless of a 
#'   pattern being strong or weak.
#'   
#'   This function handles tables with empty rows
#'   or columns (where expected values are 0) by
#'   calculating the test statistic over non-zero
#'   entries only. This prevents the result from
#'   becoming \code{NA}, while giving meaningful
#'   \emph{p}-values.
#'
#' @param x a matrix or data frame of floating or integer
#'  numbers to specify a contingency table. Entries
#'  must be non-negative.
#'  
#' @param log.p a logical. If \code{TRUE}, 
#'   the \emph{p}-value is calculated in
#'   closed form to \strong{natural logarithm} of \emph{p}-value 
#'   to improve numerical precision when
#'   \emph{p}-value approaches zero.
#'   Defaults to \code{FALSE}.
#'
#' @return A list with class \code{"htest"} containing:
#' \item{statistic}{the \emph{G} statistic (log-likelihood ratio).}
#' \item{parameter}{the degrees of freedom.}
#' \item{p.value}{the \emph{p}-value of the test.}
#' \item{estimate}{the value of mutual information.}
#' \item{method}{a character string indicating the method used.}
#' \item{data.name}{a character string, name of the input data.}
#' \item{observed}{the observed counts.}
#' \item{expected}{the expected counts under the null hypothesis.}
#'
#' @examples
#' library("Upsilon")
#' 
#' # Create a sparse table with empty rows/cols
#' x <- matrix(
#'   c(0, 3, 0, 
#'     3, 0, 0), 
#'   nrow = 2, byrow = TRUE
#' )
#' print(x)
#' # Perform the modified G-test
#' modified.gtest(x)
#' @importFrom stats pchisq
#' @export
modified.gtest <- function(x, log.p = FALSE) {
  
  METHOD <- "Zero-tolerant G-Test"
  DNAME  <- deparse(substitute(x))
  
  x <- as.matrix(x)
  
  # Basic properties
  n  <- sum(x)
  nr <- nrow(x)
  nc <- ncol(x)
  k  <- min(nr, nc)
  
  # empty table or insufficient dimensions
  if (n == 0 || k <= 1) {
    STATISTIC <- 0
    ESTIMATE  <- 0
    PARAMETER <- (nr - 1L) * (nc - 1L)
    PVAL      <- if (log.p) 0 else 1
    E         <- x
  } else {
    # Calculate Expected Frequencies
    sr <- rowSums(x)
    sc <- colSums(x)
    E  <- outer(sr, sc, "*") / n
    term <- x * log(x / E)
    STATISTIC <- 2 * sum(term, na.rm = TRUE)
    
    # Degrees of Freedom
    PARAMETER <- (nr - 1L) * (nc - 1L)
    
    # P-value
    PVAL <- stats::pchisq(STATISTIC, PARAMETER, lower.tail = FALSE, log.p = log.p)
    
    # Mutual Information: I = G / 2N
    ESTIMATE <- STATISTIC / (2 * n)
  }
  
  names(STATISTIC) <- "Log-likelihood Ratio (G)"
  names(PARAMETER) <- "df"
  names(PVAL)      <- "p.value"
  names(ESTIMATE)  <- "Mutual Information"
  
  structure(
    list(
      statistic = STATISTIC,
      parameter = PARAMETER,
      estimate  = ESTIMATE,
      p.value   = PVAL,
      method    = METHOD,
      data.name = DNAME,
      observed  = x,
      expected  = E
    ),
    class = "htest"
  )
}