#' Association between categorical variables
#'
#' @description
#' Compute the Cramer's V, a descriptive statistic that measures the association between categorical
#' variables.
#'
#' @param x Data frame or matrix with a set of categorical variables.
#'
#' @details
#' Any integer variable is internally converted to a factor.
#'
#' @return
#' A matrix with the Cramer's V between the categorical variables.
#'
#' @author Alessandro Samuel-Rosa \email{alessandrosamuelrosa@@gmail.com}
#'
#' @note
#' The original code is available at <https://sas-and-r.blogspot.com/>, Example 8.39:
#' calculating Cramer's V, posted by Ken Kleinman on Friday, June 3, 2011. As such, Ken Kleinman
#' \email{Ken_Kleinman@@hms.harvard.edu} is entitled a \sQuote{contributor} to the R-package
#' __pedometrics__.
#'
# The function \code{bigtabulate} used to compute the chi-squared test is the main bottleneck in
# the current version of [pedometrics::cramer()]. Ideally it will be implemented in C++.
#'
#' @references
#' Cramér, H. _Mathematical methods of statistics_. Princeton: Princeton University Press, p. 575,
#' 1946.
#'
#' Everitt, B. S. _The Cambridge dictionary of statistics_. Cambridge: Cambridge University Press,
#' p. 432, 2006.
#'
#' @section Reverse dependencies:
#' The __spsann__ package, provider of methods for the optimization of sample configurations using
#' spatial simulated annealing in R, requires [pedometrics::cramer()] for some of its functions to
#' work. The development version of the __spsann__ package is available on
#' <https://github.com/Laboratorio-de-Pedometria/spsann-package>.
#'
#' @examples
#' if (interactive()) {
#' data(meuse, package = "sp")
#' str(meuse)
#' test <- cramer(meuse[, c("ffreq", "soil", "lime", "landuse")])
#' }
# FUNCTION #########################################################################################
#' @export
cramer <-
function(x) {
nam <- colnames(x)
# perform some checks
check <- sapply(x, is.factor)
if (any(check == FALSE)) {
check <- which(check == FALSE)
for (i in check)
x[, i] <- as.data.frame(sapply(x[, i], as.factor))
}
check <- sapply(x, nlevels)
if (any(check < 2L)) {
stop("each variable must have at least 2 levels")
}
# the statistics is calculated using a for loop
res <- matrix(NA, nrow = dim(x)[2], ncol = dim(x)[2])
for (i in 1:dim(x)[2]) {
for (j in 1:dim(x)[2]) {
n <- length(x[, i])
x2 <- .chisqStat(x[, i], x[, j]) / n
#x2 <- suppressWarnings(chisq.test(x[, i], x[, j], correct = FALSE))
#x2 <- x2$statistic / n
den <- min(length(unique(x[, i])), length(unique(x[, j]))) - 1
#res[i, j] <- as.numeric(sqrt(x2 / den))
res[i, j] <- sqrt(x2 / den)
}
}
colnames(res) <- nam
rownames(res) <- nam
return(res)
}
# Pearson's Chi-squared statistic ##############################################
.chisqStat <-
function(x, y) {
#x <- table(x, y)
OBSERVED <- table(x, y)
# OBSERVED <- bigtabulate::bigtabulate(cbind(x, y), ccols = c(1, 2))
n <- sum(OBSERVED)
sr <- rowSums(OBSERVED)
sc <- colSums(OBSERVED)
EXPECTED <- outer(sr, sc, "*") / n
#YATES <- 0
#STATISTIC <- sum((abs(OBSERVED - EXPECTED) - YATES) ^ 2 / EXPECTED)
#STATISTIC <- sum(abs(OBSERVED - EXPECTED) ^ 2 / EXPECTED)
STATISTIC <- sum(abs(OBSERVED - EXPECTED) ^ 2 / EXPECTED, na.rm = TRUE)
return(STATISTIC)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.