#' @title Measures of association for contingency tables
#' @name crosstable_statistics
#'
#' @description This function calculates various measure of association for
#' contingency tables and returns the statistic and p-value.
#' Supported measures are Cramer's V, Phi, Spearman's rho,
#' Kendall's tau and Pearson's r.
#'
#' @param data A data frame or a table object. If a table object, \code{x1} and
#' \code{x2} will be ignored. For Kendall's \emph{tau}, Spearman's \emph{rho}
#' or Pearson's product moment correlation coefficient, \code{data} needs
#' to be a data frame. If \code{x1} and \code{x2} are not specified,
#' the first two columns of the data frames are used as variables
#' to compute the crosstab.
#' @param formula A formula of the form \code{lhs ~ rhs} where \code{lhs} is a
#' numeric variable giving the data values and \code{rhs} a factor giving the
#' corresponding groups.
#' @param tab A \code{\link{table}} or \code{\link[stats]{ftable}}. Tables of class
#' \code{\link[stats]{xtabs}} and other will be coerced to \code{ftable}
#' objects.
#' @param x1 Name of first variable that should be used to compute the
#' contingency table. If \code{data} is a table object, this argument
#' will be irgnored.
#' @param x2 Name of second variable that should be used to compute the
#' contingency table. If \code{data} is a table object, this argument
#' will be irgnored.
#' @param statistics Name of measure of association that should be computed. May
#' be one of \code{"auto"}, \code{"cramer"}, \code{"phi"}, \code{"spearman"},
#' \code{"kendall"}, \code{"pearson"} or \code{"fisher"}. See 'Details'.
#' @param ci.lvl Scalar between 0 and 1. If not \code{NULL}, returns a data
#' frame including lower and upper confidence intervals.
#' @param ... Other arguments, passed down to the statistic functions
#' \code{\link[stats]{chisq.test}}, \code{\link[stats]{fisher.test}} or
#' \code{\link[stats]{cor.test}}.
#'
#' @inheritParams means_by_group
#' @inheritParams bootstrap
#' @inheritParams boot_ci
#'
#' @return For \code{phi()}, the table's Phi value. For \code{cramer()}, the
#' table's Cramer's V.
#' \cr \cr
#' For \code{crosstable_statistics()}, a list with following components:
#' \describe{
#' \item{\code{estimate}}{the value of the estimated measure of association.}
#' \item{\code{p.value}}{the p-value for the test.}
#' \item{\code{statistic}}{the value of the test statistic.}
#' \item{\code{stat.name}}{the name of the test statistic.}
#' \item{\code{stat.html}}{if applicable, the name of the test statistic, in HTML-format.}
#' \item{\code{df}}{the degrees of freedom for the contingency table.}
#' \item{\code{method}}{character string indicating the name of the measure of association.}
#' \item{\code{method.html}}{if applicable, the name of the measure of association, in HTML-format.}
#' \item{\code{method.short}}{the short form of association measure, equals the \code{statistics}-argument.}
#' \item{\code{fisher}}{logical, if Fisher's exact test was used to calculate the p-value.}
#' }
#'
#' @details The p-value for Cramer's V and the Phi coefficient are based
#' on \code{chisq.test()}. If any expected value of a table cell is
#' smaller than 5, or smaller than 10 and the df is 1, then \code{fisher.test()}
#' is used to compute the p-value, unless \code{statistics = "fisher"}; in
#' this case, the use of \code{fisher.test()} is forced to compute the
#' p-value. The test statistic is calculated with \code{cramer()} resp.
#' \code{phi()}.
#' \cr \cr
#' Both test statistic and p-value for Spearman's rho, Kendall's tau
#' and Pearson's r are calculated with \code{cor.test()}.
#' \cr \cr
#' When \code{statistics = "auto"}, only Cramer's V or Phi are calculated,
#' based on the dimension of the table (i.e. if the table has more than
#' two rows or columns, Cramer's V is calculated, else Phi).
#'
#' @examples
#' # Phi coefficient for 2x2 tables
#' tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE))
#' phi(tab)
#'
#' # Cramer's V for nominal variables with more than 2 categories
#' tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE))
#' cramer(tab)
#'
#' # formula notation
#' data(efc)
#' cramer(e16sex ~ c161sex, data = efc)
#'
#' # bootstrapped confidence intervals
#' cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100)
#'
#' # 2x2 table, compute Phi automatically
#' crosstable_statistics(efc, e16sex, c161sex)
#'
#' # more dimensions than 2x2, compute Cramer's V automatically
#' crosstable_statistics(efc, c172code, c161sex)
#'
#' # ordinal data, use Kendall's tau
#' crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall")
#'
#' # calcilate Spearman's rho, with continuity correction
#' crosstable_statistics(efc,
#' e42dep,
#' quol_5,
#' statistics = "spearman",
#' exact = FALSE,
#' continuity = TRUE
#' )
#' @export
crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), weights = NULL, ...) {
# match arguments
statistics <- match.arg(statistics)
# name for test statistics in HTML
stat.html <- NULL
# check if data is a table
if (!is.table(data)) {
# evaluate unquoted names
x1 <- deparse(substitute(x1))
x2 <- deparse(substitute(x2))
weights <- deparse(substitute(weights))
# if names were quotes, remove quotes
x1 <- gsub("\"", "", x1, fixed = T)
x2 <- gsub("\"", "", x2, fixed = T)
weights <- gsub("\"", "", weights, fixed = T)
if (sjmisc::is_empty(weights) || weights == "NULL")
weights <- NULL
else
weights <- data[[weights]]
# check for "NULL" and get data
if (x1 != "NULL" && x2 != "NULL")
data <- data[, c(x1, x2)]
else
data <- data[, 1:2]
if (!is.null(weights)) data <- cbind(data, weights)
# make table
if (!is.null(weights)) {
tab <- as.table(round(stats::xtabs(data[[3]] ~ data[[1]] + data[[2]])))
class(tab) <- "table"
} else
tab <- table(data)
} else {
# 'data' is a table - copy to table object
tab <- data
# check if statistics are possible to compute
if (statistics %in% c("spearman", "kendall", "pearson")) {
stop(
sprintf(
"Need arguments `data`, `x1` and `x2` to compute %s-statistics.",
statistics
),
call. = F
)
}
}
# get expected values
tab.val <- table_values(tab)
# remember whether fisher's exact test was used or not
use.fisher <- FALSE
# select statistics automatically, based on number of rows/columns
if (statistics %in% c("auto", "cramer", "phi", "fisher")) {
# get chisq-statistics, for df and p-value
chsq <- suppressWarnings(stats::chisq.test(tab, ...))
pv <- chsq$p.value
test <- chsq$statistic
# set statistics name
names(test) <- "Chi-squared"
stat.html <- "χ<sup>2</sup>"
# check row/columns
if ((nrow(tab) > 2 || ncol(tab) > 2 || statistics %in% c("cramer", "fisher")) && statistics != "phi") {
# get cramer's V
s <- cramer(tab)
# if minimum expected values below 5, compute fisher's exact test
if (statistics == "fisher" ||
min(tab.val$expected) < 5 ||
(min(tab.val$expected) < 10 && chsq$parameter == 1)) {
pv <- stats::fisher.test(tab, simulate.p.value = TRUE, ...)$p.value
use.fisher <- TRUE
}
# set statistics
statistics <- "cramer"
} else {
# get Phi
s <- phi(tab)
# if minimum expected values below 5 and df=1, compute fisher's exact test
if (min(tab.val$expected) < 5 ||
(min(tab.val$expected) < 10 && chsq$parameter == 1)) {
pv <- stats::fisher.test(tab, ...)$p.value
use.fisher <- TRUE
}
# set statistics
statistics <- "phi"
}
} else {
# compute correlation coefficient
cv <- stats::cor.test(x = data[[1]], y = data[[2]], method = statistics, ...)
# get statistics and p-value
s <- cv$estimate
pv <- cv$p.value
test <- cv$statistic
stat.html <- names(test)
}
# compute method string
method <- dplyr::case_when(
statistics == "kendall" ~ "Kendall's tau",
statistics == "spearman" ~ "Spearman's rho",
statistics == "pearson" ~ "Pearson's r",
statistics == "cramer" ~ "Cramer's V",
statistics == "phi" ~ "Phi"
)
# compute method string
method.html <- dplyr::case_when(
statistics == "kendall" ~ "Kendall's τ",
statistics == "spearman" ~ "Spearman's ρ",
statistics == "pearson" ~ "Pearson's r",
statistics == "cramer" ~ "Cramer's V",
statistics == "phi" ~ "φ"
)
# return result
structure(class = "sj_xtab_stat", list(
estimate = s,
p.value = pv,
statistic = test,
stat.name = names(test),
stat.html = stat.html,
df = (nrow(tab) - 1) * (ncol(tab) - 1),
n_obs = sum(tab, na.rm = TRUE),
method = method,
method.html = method.html,
method.short = statistics,
fisher = use.fisher
))
}
#' @rdname crosstable_statistics
#' @export
xtab_statistics <- crosstable_statistics
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.