Nothing
#' Compute pairwise distances for categorical data
#'
#' Internal helper function to compute distances between observations based on
#' the matching coefficient, which measures the proportion of matching
#' attributes between two categorical vectors. This approach is particularly
#' useful for multiclass categorical variables.
#'
#' The distance between two observations \eqn{i} and \eqn{j} is defined as:
#' \deqn{d(i, j) = 1 - \frac{\alpha}{p^\prime}}
#' where \eqn{\alpha} is the number of matching attributes (agreements) and
#' \eqn{p'} is the number of non-missing comparisons between the two
#' observations.
#'
#' @param x A data frame whose columns are all factor or character, or a
#'   character matrix. Rows are observations, columns are attributes.
#' @param method Currently only \code{"matching_coefficient"} is supported.
#'   The argument is accepted for interface compatibility; its value is not
#'   consulted by the computation.
#'
#' @return A symmetric numeric \eqn{n \times n} matrix of pairwise distances
#'   (without dimnames), where \eqn{n} is the number of rows of \code{x}.
#'   Distance is in the range [0, 1], where 0 indicates complete agreement and
#'   1 indicates complete disagreement. \code{NA} is returned for pairs with no
#'   valid comparisons (all NA entries). The diagonal is always 0. Inputs with
#'   zero or one row yield a \eqn{0 \times 0} or \eqn{1 \times 1} zero matrix.
#'
#' @details
#' \itemize{
#'   \item Only categorical columns (factor or character) are supported;
#'     numeric columns must be converted prior to using this function,
#'     otherwise an error is raised.
#'   \item Missing values (NA) are ignored pairwise. If all attributes are
#'     missing for a given pair, the distance is returned as NA.
#'   \item This distance is equivalent to the normalized Hamming distance when
#'     applied to binary variables.
#'   \item The matching coefficient satisfies metric properties and can be used
#'     as a building block for mixed-type distances (e.g., combined with
#'     quantitative distances via Gower's similarity).
#' }
#'
#' @examples
#' # Small categorical dataset
#' df <- data.frame(
#'   A = factor(c("red", "blue", "red")),
#'   B = factor(c("circle", "circle", "square"))
#' )
#' # Compute matching coefficient
#' dbrobust:::dist_categorical(df)
#'
#' @keywords internal
dist_categorical <- function(x, method = "matching_coefficient") {
  # Validate input: every column must be categorical. vapply() guarantees a
  # logical vector even for a zero-column data frame.
  if (is.data.frame(x)) {
    is_categorical <- vapply(
      x,
      function(col) is.factor(col) || is.character(col),
      logical(1)
    )
    valid_input <- all(is_categorical)
  } else {
    valid_input <- is.matrix(x) && is.character(x)
  }
  if (!valid_input) {
    stop("Categorical methods require factor or character columns")
  }

  n <- nrow(x)
  # With fewer than two observations there are no pairs to compare; the
  # result is just the (possibly empty) zero diagonal.
  if (n < 2L) {
    return(matrix(0, n, n))
  }

  # Work on a plain character matrix: factor levels become their labels, NA
  # stays NA, and dimnames are dropped so they do not leak into the result.
  vals <- unname(as.matrix(x))

  agree <- matrix(0, n, n)     # number of matching attributes per pair
  compared <- matrix(0, n, n)  # number of non-missing comparisons per pair
  for (k in seq_len(ncol(vals))) {
    v <- vals[, k]
    same <- outer(v, v, "==")  # NA wherever either observation is missing
    observed <- !is.na(same)
    agree <- agree + (observed & same)
    compared <- compared + observed
  }

  # 1 - matching proportion; pairs without any valid comparison become NA
  d <- 1 - agree / compared
  d[compared == 0] <- NA_real_
  diag(d) <- 0  # zero distance to self
  d
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.