R/upsilon.gof.test.R
In Upsilon: Another Test of Association for Count Data

Documented in upsilon.gof.test

# upsilon.gof.test.R
#
# Author: Xuye Luo, Joe Song
#
# Updated:
# December 20, 2025. 
#   - Updated the documentation
#
# December 12, 2025

#' @title Upsilon Goodness-of-Fit Test for Count Data
#'
#' @description (FOR INTERNAL USE ONLY) Performs the Upsilon Goodness-of-Fit test to determine if a sample of 
#' observed counts fits a specified probability distribution.
#' The Upsilon statistic uses a specific normalization (dividing by the average expected count)
#' which differs from the standard Pearson's Chi-squared test.
#'
#' @param x A numeric vector representing observed counts. Must be non-negative.
#' @param p A numeric vector of probabilities of the same length as \code{x}.
#'          Defaults to a uniform distribution (1/length(x)).
#' @param rescale.p Logical. If \code{TRUE} (default), \code{p} is rescaled to sum to 1.
#'          If \code{FALSE}, \code{p} must sum to 1, otherwise an error is raised.
#'
#' @param log.p a logical. If \code{TRUE}, 
#'   the \emph{p}-value is calculated in
#'   closed form to \strong{natural logarithm} of \emph{p}-value 
#'   to improve numerical precision when
#'   \emph{p}-value approaches zero.
#'   Defaults to \code{FALSE}.
#'
#' @return A list with class \code{"htest"} containing:
#' \item{statistic}{The Upsilon test statistic.}
#' \item{parameter}{The degrees of freedom (k - 1).}
#' \item{p.value}{The p-value of the test.}
#' \item{estimate}{The effect size.}
#' \item{method}{A character string indicating the method used.}
#' \item{data.name}{A character string giving the name(s) of the data.}
#' \item{observed}{The observed counts.}
#' \item{expected}{The expected counts.}
#' \item{residuals}{The Pearson residuals.}
#' \item{p.normalized}{The probability vector used (after rescaling if applicable).}
#'
#' @importFrom stats pchisq
#' @export
#'
#' @examples
#' library("Upsilon")
#' 
#' # Test against uniform distribution
#' counts <- c(10, 20, 30)
#' upsilon.gof.test(counts)

#' @keywords internal
upsilon.gof.test <- function(
    x, 
    p = rep(1/length(x), length(x)), 
    rescale.p = TRUE,
    log.p = FALSE) 
{
  METHOD <- "Upsilon Goodness-of-Fit Test"
  DNAME  <- deparse(substitute(x))
  
  x <- as.vector(x)
  
  # Input Validation
  if (any(x < 0, na.rm = TRUE)) {
    stop("Observed counts 'x' must be non-negative.")
  }
  if (length(x) != length(p)) {
    stop("Observed counts 'x' and probabilities 'p' must have the same length.")
  }
  
  # Rescaling Logic
  p_sum <- sum(p)
  if (rescale.p) {
    if (abs(p_sum - 1) > 1e-9) {
      p <- p / p_sum
    }
  } else {
    if (abs(p_sum - 1) > 1e-9) {
      stop("Probabilities 'p' must sum to 1 when rescale.p = FALSE.")
    }
  }
  
  n <- sum(x)
  k <- length(x)
  PARAMETER <- k - 1L
  
  # Handle empty data case
  if (n == 0) {
    STATISTIC <- 0
    ESTIMATE  <- 0
    PVAL      <- if (log.p) 0 else 1
    E         <- x
    RESIDUALS <- x
  } else {
    # Calculate Expected Counts
    E <- n * p
    
    term <- (x - E)^2 / (n / k)
    STATISTIC <- sum(term, na.rm = TRUE)
    
    max_statistic <- k * n * (1 + sum(p^2) - 2 * min(p))
    
    # Effect Size
    ESTIMATE <- if (max_statistic > 0) sqrt(STATISTIC / max_statistic) else 0
    
    # P-value and Residuals
    PVAL <- stats::pchisq(STATISTIC, PARAMETER, lower.tail = FALSE, log.p = log.p)
    RESIDUALS <- ifelse(E > 0, (x - E) / sqrt(E), 0)
  }
  
  names(STATISTIC) <- "Upsilon"
  names(ESTIMATE)  <- "Effect size"
  names(PARAMETER) <- "df"
  names(PVAL)      <- "p.value"
  
  structure(
    list(
      statistic    = STATISTIC,
      estimate     = ESTIMATE,
      parameter    = PARAMETER,
      p.value      = PVAL,
      method       = METHOD,
      data.name    = DNAME,
      observed     = x,
      expected     = E,
      residuals    = RESIDUALS,
      p.normalized = p
    ),
    class = "htest"
  )
}