R/multinom.test.R
In ExactMultinom: Multinomial Goodness-of-Fit Tests

Documented in multinom.test

#' @title Multinomial Goodness-of-Fit Tests
#' @aliases ExactMultinom
#' @importFrom Rcpp evalCpp
#' @importFrom stats rmultinom pchisq
#' @useDynLib ExactMultinom, .registration=TRUE
#'
#' @description Computes exact p-values for multinomial goodness-of-fit tests based on multiple test statistics, namely, 
#' Pearson's chi-square, the log-likelihood ratio and the probability mass statistic. 
#' Implements the algorithm detailed in Resin (2023). 
#' Estimates based on the classical asymptotic chi-square approximation or Monte-Carlo simulation can also be computed.
#'
#' @param x Vector of non-negative integers - number of times each outcome was observed.
#' @param p A vector of positive numbers - the hypothesized probabilities for each outcome. Need not sum to 1, but only encode hypothesized proportions correctly.
#' @param stat Test statistic to be used. Can be "Prob" for the probability mass, "Chisq" for Pearson's chi-square and "LLR" for the log-likelihood ratio.
#' @param method Method used to compute the p-values. Can be "exact", "asymptotic" and "Monte-Carlo".
#' @param theta Parameter used with the exact method. p-values less than theta will not be determined precisely. 
#' Values >= 10^-8 are recommended. Very small p-values (< 10^-8) may be falsified by rounding errors.
#' @param timelimit Time limit in seconds, after which the exact method is interrupted to avoid long runtime.
#' @param N Number of samples generated by the Monte-Carlo approach.
#'
#' @details The "exact" method implements the algorithm detailed in Resin (2023). The method improves on the full enumeration implemented in some other R packages.
#' It should work well if the number of categories is small. To avoid long runtimes the exact computation is interrupted after \code{timelimit} seconds. 
#' However, it may take longer than the specified time limit for the actual interrupt to occur. Only p-values greater \code{theta} are determined precisely. 
#' For p-values less than \code{theta}, the algorithm will only determine that the p-value is smaller than \code{theta}.
#' 
#' The "asymptotic" method returns classical chi-square approximations. The asymptotic approximation to the probability mass statistic is detailed in Section 2 of Resin (2020).
#' 
#' The "Monte-Carlo" method returns estimates based on \code{N} random draws from the hypothesized distribution. 
#' 
#' @return A list with class "mgof" containing
#'   \item{\code{x}}{As input by user.}
#'   \item{\code{p}}{As input by user.}
#'   \item{\code{stat}}{As input by user or default.}
#'   \item{\code{method}}{As input by user or default.}
#'   \item{\code{theta}}{As input by user or default.}
#'   \item{\code{pvals_ex}}{Exact p-values or NA. WARNING: Values less than theta are NOT exact p-values, but only imply that the actual p-value is less than that value.}
#'   \item{\code{pvals_as}}{Asymptotic approximation to p-values or NA.}
#'   \item{\code{pvals_mc}}{Monte-Carlo estimated p-values or NA.}
#' The first p-value (e.g., \code{pvals_ex[1]}) is obtained from the probability mass, the second from Pearson's chi-square and the third from the log-likelihood ratio.
#' 
#' 
#' @note Each method computes p-values for all three test statistics simultaneously.
#' 
#' @references Resin, J. (2023). A Simple Algorithm for Exact Multinomial Tests. \emph{Journal of Computational and Graphical Statistics} {\strong{32}, 539-550}.
#' 
#' @rdname multinom.test
#' @export
#'
#' @examples
#' # Test fairness of a die (that is, whether each side has the same probability)
#' p_fair = rep(1/6,6) # Hypothesized probabilities for each side
#' x = c(16,17,12,15,15,25) # Observed number of times each side appeared on 100 throws
#' # Exact multinomial test (using probability ordering by default):
#' multinom.test(x,p_fair)
#' # Exact multinomial test using log-likelihood ratio:
#' multinom.test(x,p_fair,stat = "LLR")
#' # Classical chi-square test (using asymptotics to estimate p-value and Pearson's chi-square):
#' multinom.test(x,p_fair,stat = "Chisq",method = "asymptotic")
#' # Using Monte-Carlo approach and probability ordering 
#' multinom.test(x,p_fair,stat = "Prob",method = "Monte-Carlo")

multinom.test = function(x,p,stat = "Prob",method = "exact",theta = 0.0001,timelimit = 10,N = 10000){
  if(length(which(c("Prob","Chisq","LLR") == stat)) == 0) stop("stat must be \"Prob\", \"Chisq\" or \"LLR\"")
  if(length(x)!=length(p)) stop("x and p are not of same length")
  if(any(is.na(x)) || any(x < 0) || max(abs(x-round(x))) > 10^-8) stop("x must be non-negative integers")
  if(any(p <= 0)) stop("Entries of p must be greater 0")
  p = p/sum(p)
  called_method = FALSE
  
  pvals_ex = NA
  if(is.element("exact",method)){
    called_method = TRUE
    if(length(x) > 15) stop("The exact method is not suited for this many outcomes. Use asymptotic or Monte-Carlo method instead.")
    setTimeLimit(cpu = timelimit,elapsed = timelimit)
    on.exit(setTimeLimit(cpu = Inf,elapsed = Inf))
    pvals_ex = multinom_test_cpp(x[order(p,decreasing = TRUE)],p[order(p,decreasing = TRUE)],theta)
    setTimeLimit(cpu = Inf,elapsed = Inf)
  }
  
  pvals_as = NA
  if(is.element("asymptotic",method)){
    called_method = TRUE
    m = length(x)
    pvals_as = 1-c(pchisq(t_prob(x,p),m-1),pchisq(t_chisq(x,p),m-1),pchisq(t_llr(x,p),m-1))
  }

  pvals_mc = NA
  if(is.element("Monte-Carlo",method)){
    called_method = TRUE
    m = length(x)
    n = sum(x)
    x_sim = rmultinom(N,n,p)
    mc_pval = function(t_fun){
      stat_x = t_fun(x,p)
      stat_sim = apply(x_sim,2,function(x) t_fun(x,p))
      pval = sum(stat_sim >= stat_x)/N
      return(pval)
    }
    pvals_mc = sapply(c(t_prob,t_chisq,t_llr),mc_pval)
  }

  if(!called_method) stop("method must be \"exact\",\"asymptotic\" or \"Monte-Carlo\"")
  
  structure(list(x = x,p = p,stat = stat,method = method,theta = theta,pvals_ex = pvals_ex,pvals_as = pvals_as,pvals_mc = pvals_mc),class = "mgof")
}


#' @export
print.mgof = function(x, ...){
  statNo = which(c("Prob","Chisq","LLR") == x$stat)
  methodNo = which(c("exact","asymptotic","Monte-Carlo") == x$method[1])
  pval = rbind(x$pvals_ex,x$pvals_as,x$pvals_mc)[methodNo,statNo]
  
  if(methodNo == 1 && pval < x$theta) cat("\nMultinomial Goodness-of-Fit Test\n\nP-value less than theta =",x$theta, "with",x$stat,"statistic using",x$method[1],"method.\n\n") 
  else cat("\nMultinomial Goodness-of-Fit Test\n\nP-value of",pval, "with",x$stat,"statistic using",x$method[1],"method.\n\n")
  
  invisible(x)
}