R/ci_cramersv.R
In confintr: Confidence Intervals

Documented in ci_chisq_ncp ci_cramersv cramersv

#' Cramer's V
#'
#' This function calculates Cramer's V, a measure of association between two categorical
#' variables.
#'
#' Cramer's V is a scaled version of the chi-squared test statistic \eqn{\chi^2} and
#' takes values in \eqn{[0, 1]}. It is calculated as
#' \eqn{\sqrt{\chi^2 / (n \cdot (k - 1))}}, where \eqn{n} is the number of observations,
#' and \eqn{k} is the smaller of the number of levels of the two variables.
#'
#' Yates continuity correction is never applied. So in the 2x2 case, if `x` is the
#' result of [stats::chisq.test()], make sure no continuity correction was applied.
#' Otherwise, results can be inconsistent.
#'
#' @param x The result of [stats::chisq.test()], a matrix/table of counts, or
#'   a `data.frame` with exactly two columns representing the two variables.
#' @returns A numeric vector of length one.
#' @export
#' @examples
#' cramersv(mtcars[c("am", "vs")])
#' @references
#'   Cramer, Harald. 1946. Mathematical Methods of Statistics. Princeton: Princeton
#'     University Press, page 282 (Chapter 21. The two-dimensional case).
#' @seealso [ci_cramersv()]
cramersv <- function(x) {
  x <- cramersv_align_input(x, correct = FALSE)
  stat <- as.numeric(x[["statistic"]])
  n <- sum(x[["observed"]])
  k <- min(dim(x[["observed"]]))
  sqrt(stat / (n * (k - 1)))
}

#' CI for the Population Cramer's V
#'
#' This function calculates CIs for the population Cramer's V.
#' By default, a parametric approach based on the non-centrality parameter (NCP)
#' of the chi-squared distribution is utilized. Alternatively, bootstrap CIs are
#' available (default "bca"), also by boostrapping CIs for the NCP and then mapping
#' the result back to Cramer's V.
#'
#' A positive lower \eqn{(1 - \alpha) \cdot 100\%}-confidence limit for the NCP goes
#' hand-in-hand with a significant association test at level \eqn{\alpha}. In order to
#' allow such test approach also with Cramer's V, if the lower bound for the NCP is 0,
#' we round down to 0 the lower bound for Cramer's V as well.
#' Without this slightly conservative adjustment, the lower limit for V would always be
#' positive since the CI for V is found by
#' \eqn{\sqrt{(\textrm{CI for NCP} + \textrm{df})/(n \cdot (k - 1))}}, where \eqn{k} is the
#' smaller number of levels in the two variables (see Smithson, p.40).
#' Use `test_adjustment = FALSE` to switch off this behaviour. Note that this is
#' also a reason to bootstrap V via NCP instead of directly bootstrapping V.
#'
#' Further note that no continuity correction is applied for 2x2 tables,
#' and that large chi-squared test statistics might provide unreliable results with
#' method "chi-squared", see [stats::pchisq()].
#'
#' @inheritParams cramersv
#' @inheritParams ci_mean
#' @param type Type of CI. One of "chi-squared" (default) or "bootstrap".
#' @param test_adjustment Adjustment to allow for test of association, see Details.
#'   The default is `TRUE`.
#' @returns An object of class "cint", see [ci_mean()] for details.
#' @export
#' @examples
#' # Example from Smithson, M., page 41
#' test_scores <- as.table(
#'   rbind(
#'     Private = c(6, 14, 17, 9),
#'     Public = c(30, 32, 17, 3)
#'   )
#' )
#' suppressWarnings(X2 <- stats::chisq.test(test_scores))
#' ci_cramersv(X2)
#' @references
#'   Smithson, M. (2003). Confidence intervals. Series: Quantitative Applications in the
#'     Social Sciences. New York, NY: Sage Publications.
#' @seealso [cramersv()], [ci_chisq_ncp()]
ci_cramersv <- function(x, probs = c(0.025, 0.975),
                        type = c("chi-squared", "bootstrap"),
                        boot_type = c("bca", "perc", "norm", "basic"),
                        R = 9999L, seed = NULL, test_adjustment = TRUE, ...) {
  # Input check and initialization
  check_probs(probs)
  x <- cramersv_align_input(x, correct = FALSE)
  stat <- as.numeric(x[["statistic"]])
  n <- sum(x[["observed"]])
  k <- min(dim(x[["observed"]]))
  df <- x[["parameter"]]

  # ci for NCP -> ci for V
  out <- ci_chisq_ncp(
    x,
    probs = probs,
    type = type,
    boot_type = boot_type,
    R = R,
    seed = seed,
    correct = FALSE,
    ...
  )
  ci <- sqrt((out$interval + df) / (n * (k - 1)))  # Smithson p40

  # Modification to allow hypothesis test of association
  if (test_adjustment && out$interval[1L] == 0) {
    ci[1L] <- 0
  }

  # Replace NCP by Cramer's V
  out$estimate <- cramersv(x)
  out$interval <- check_output(ci, probs = probs, parameter_range = 0:1)
  out$parameter <- "population Cramer's V"
  out
}

#' CI for the NCP of the Chi-Squared Distribution
#'
#' This function calculates CIs for the non-centrality parameter (NCP) of the
#' \eqn{\chi^2}-distribution. A positive lower \eqn{(1 - \alpha) \cdot 100\%}-confidence
#' limit for the NCP goes hand-in-hand with a significant association test at level
#' \eqn{\alpha}.
#'
#' By default, CIs are computed by Chi-squared test inversion. This can be unreliable
#' for very large test statistics. The default bootstrap type is "bca".
#'
#' @inheritParams ci_cramersv
#' @param correct Should Yates continuity correction be applied to the 2x2 case? The
#'   default is `TRUE` (also used in the bootstrap), which differs from [ci_cramersv()].
#' @returns An object of class "cint", see [ci_mean()] for details.
#' @export
#' @examples
#' ci_chisq_ncp(mtcars[c("am", "vs")])
#' ci_chisq_ncp(mtcars[c("am", "vs")], type = "bootstrap", R = 999)  # Use larger R
#' @references
#'   Smithson, M. (2003). Confidence intervals. Series: Quantitative Applications in the
#'     Social Sciences. New York, NY: Sage Publications.
#' @seealso [ci_cramersv()]
ci_chisq_ncp <- function(x, probs = c(0.025, 0.975), correct = TRUE,
                         type = c("chi-squared", "bootstrap"),
                         boot_type = c("bca", "perc", "norm", "basic"),
                         R = 9999L, seed = NULL, ...) {
  # Input checks and initialization
  type <- match.arg(type)
  boot_type <- match.arg(boot_type)
  check_probs(probs)
  limits <- c(0, Inf)
  x <- cramersv_align_input(x, correct = correct)
  stat <- x[["statistic"]]
  df <- x[["parameter"]]
  estimate <- chi2_to_ncp(stat, df = df)

  # Calculate ci
  if (type == "chi-squared") {
    iprobs <- 1 - probs
    if (probs[1L] == 0) {
      lci <- limits[1L]
    } else {
      f1 <- function(ncp) stats::pchisq(stat, df = df, ncp = ncp) - iprobs[1L]
      lci <- try(stats::uniroot(f1, interval = c(0, estimate))$root, silent = TRUE)
      if (inherits(lci, "try-error")) {
        lci <- limits[1L]
      }
    }
    if (probs[2L] == 1) {
      uci <- limits[2L]
    } else {
      # Upper limit might be improved
      n <- sum(x[["observed"]])
      k <- min(dim(x[["observed"]]))
      upper_limit <- pmax(4 * estimate, df + n * (k - 1), 100)
      f2 <- function(ncp) stats::pchisq(stat, df = df, ncp = ncp) - iprobs[2L]
      uci <- try(
        stats::uniroot(f2, interval = c(estimate, upper_limit))$root, silent = TRUE
      )
      if (inherits(uci, "try-error")) {
        warning("Upper limit outside search range. Set to the maximum of the parameter range.")
        uci <- limits[2L]
      }
    }
    cint <- c(lci, uci)
  } else { # bootstrap
    # Reconstruct data
    tab <- data.frame(x[["observed"]])
    X <- tab[rep(seq_len(nrow(tab)), times = tab[, "Freq"]), 1:2]

    # Bootstrap the chi-squared test statistic
    check_bca(boot_type, n = nrow(X), R = R)
    set_seed(seed)
    S <- boot::boot(
      X,
      statistic = function(x, id) suppressWarnings(
        stats::chisq.test(table(x[id, 1L], x[id, 2L]), correct = correct)$statistic
      ),
      R = R,
      ...
    )
    cint <- ci_boot(S, boot_type = boot_type, probs = probs)

    # Map chi-squared to ncp
    cint <- chi2_to_ncp(cint, df = df)
  }

  # Organize output
  cint <- check_output(cint, probs = probs, parameter_range = limits)
  out <- list(
    parameter = "non-centrality parameter of the chi-squared distribution",
    interval = cint,
    estimate = estimate,
    probs = probs,
    type = type,
    info = boot_info(type, boot_type = boot_type, R = R)
  )
  class(out) <- "cint"
  out
}

# Helper functions

# Map chi-squared statistic to non-centrality parameter
chi2_to_ncp <- function(stat, df) {
  pmax(0, stat - df)
}

# Checks input and turns df into table/matrix
cramersv_align_input <- function(x, correct = FALSE) {
  stopifnot(inherits(x, "htest") || is.matrix(x) || is.data.frame(x))
  if (inherits(x, "htest")) {
    stopifnot("X-squared" %in% names(x[["statistic"]]))
  } else {
    if (is.data.frame(x)) {
      stopifnot(ncol(x) == 2L)
      x <- table(x[, 1L], x[, 2L])
    }
    stopifnot(all(x >= 0))
    x <- stats::chisq.test(x, correct = correct)
  }
  return(x)
}