R/upsilon.test.R
In Upsilon: Another Test of Association for Count Data

Documented in upsilon.test

# upsilon.test.R
#
# Author: Xuye Luo, Mingzhou Song
#
# Created: March 19, 2021
# Updated: 
#   December 20, 2025. Updated documentation
#
#   December 12, 2025

#' @title Upsilon Test of Association for Count Data
#'
#' @description Performs the Upsilon test to evaluate
#'  association among categorical variables represented
#'  by a contingency table.
#' 
#' @details The Upsilon test is designed to promote 
#'  dominant function patterns. In contrast to other
#'  tests of association to favor all function
#'  patterns, it is unique in demoting non-dominant
#'  function patterns.
#'  
#'  Null hypothesis (\eqn{H_0}): Row and column variables are 
#'    statistically independent.
#'  
#'  Null population: A discrete uniform distribution,
#'    where each entry in the table has the same
#'    probability.
#'
#'  Null distribution: The Upsilon test statistic 
#'    asymptotically follows a chi-squared distribution
#'    with \code{(nrow(x) - 1)(ncol(x) - 1)} degrees of freedom,
#'    under the null hypothesis on the null population.
#'    
#'  See \insertCite{luo2021upsilon}{Upsilon} for full 
#'  details of the Upsilon test. 
#'
#' @param x a matrix or data frame of floating or integer
#'  numbers to specify a contingency table. Entries
#'  must be non-negative.
#'  
#' @param log.p a logical. If \code{TRUE}, 
#'   the \emph{p}-value is calculated in
#'   closed form to \strong{natural logarithm} of \emph{p}-value 
#'   to improve numerical precision when
#'   \emph{p}-value approaches zero.
#'   Defaults to \code{FALSE}.
#'
#' @return A list with class \code{"htest"} containing:
#' \item{statistic}{the value of the Upsilon statistic.}
#' \item{parameter}{the degrees of freedom.}
#' \item{p.value}{the \emph{p}-value.}
#' \item{estimate}{the effect size.}
#' \item{method}{a character string giving the test name.}
#' \item{data.name}{a character string giving the name of input data.}
#' \item{observed}{the observed counts, a matrix copy of the input data.}
#' \item{expected}{the expected counts under the null
#'  hypothesis using the observed marginals.}
#'
#' @references
#' \insertRef{luo2021upsilon}{Upsilon}
#' @importFrom stats pchisq
#' @export
#'
#' @examples
#' library("Upsilon")
#' 
#' # A contingency table with independent row and column variables
#' x <- matrix(
#'   c(1, 1, 0, 
#'     1, 1, 0,
#'     1, 1, 0), 
#'   nrow = 3, byrow = TRUE
#'  )
#'  
#' print(x)
#' 
#' upsilon.test(x)
#' 
#' # A contingency table with a non-dominant function
#' x <- matrix(
#'   c(4, 0, 0, 
#'     0, 1, 0,
#'     0, 0, 1), 
#'   nrow = 3, byrow = TRUE
#'  )
#'  
#' print(x)
#' 
#' upsilon.test(x)
#' 
#' # A contingency table with a dominant function
#' x <- matrix(
#'   c(2, 0, 0, 
#'     0, 2, 0,
#'     0, 0, 2), 
#'   nrow = 3, byrow = TRUE)
#'   
#' print(x)
#' 
#' upsilon.test(x)
#' 
#' # Another contingency table with a dominant function
#' x <- matrix(
#'   c(3, 0, 0, 
#'     0, 3, 0,
#'     0, 0, 0), 
#'   nrow = 3, byrow = TRUE)
#' 
#' print(x)
#' 
#' upsilon.test(x)

upsilon.test <- function(x, log.p = FALSE) {
  
  METHOD <- "Upsilon test"
  DNAME  <- deparse(substitute(x))
  
  x <- as.matrix(x)
  
  # Input Validation
  if (any(x < 0, na.rm = TRUE)) {
    stop("Observed counts 'x' must be non-negative.")
  }
  
  n  <- sum(x)
  nr <- as.numeric(nrow(x))
  nc <- as.numeric(ncol(x))
  k  <- min(nr, nc)
  
  # Handle degenerate cases (empty table or insufficient dimensions)
  if (n == 0 || k <= 1) {
    STATISTIC <- 0
    ESTIMATE  <- 0
    PARAMETER <- (nr - 1L) * (nc - 1L)
    PVAL      <- if (log.p) 0 else 1
    E         <- x
  } else {
    # Expected values under independence
    sr <- rowSums(x)
    sc <- colSums(x)
    E  <- outer(sr, sc, "*") / n
    
    # Upsilon Calculation
    # Normalization factor: Average count per cell
    avg <- n / (nr * nc)
    
    # Statistic: sum((O - E)^2) / avg
    term <- (x - E)^2
    STATISTIC <- sum(term, na.rm = TRUE) / avg
    
    # Effect Size
    ESTIMATE <- sqrt(STATISTIC / (n * nr * nc / 4))
    
    PARAMETER <- (nr - 1L) * (nc - 1L)
    PVAL <- stats::pchisq(STATISTIC, PARAMETER, lower.tail = FALSE, log.p = log.p)
  }
  
  names(STATISTIC) <- "Upsilon"
  names(ESTIMATE)  <- "Effect size"
  names(PARAMETER) <- "df"
  names(PVAL)      <- "p.value"
  
  structure(
    list(
      statistic = STATISTIC,
      estimate  = ESTIMATE,
      parameter = PARAMETER,
      p.value   = PVAL,
      method    = METHOD,
      data.name = DNAME,
      observed  = x,
      expected  = E
    ),
    class = "htest"
  )
}