R/dist_binary.R

Defines functions dist_binary

Documented in dist_binary

#' Compute pairwise binary distances
#'
#' Internal helper function to compute pairwise distances between binary vectors
#' using standard binary distance/similarity measures. Delegates to
#' \code{ade4::dist.binary} when available for performance.
#'
#' Supported methods (for two binary vectors \eqn{x_i} and \eqn{x_j}):
#' \itemize{
#'   \item \code{"jaccard"}: \deqn{d = 1 - \frac{a}{a + b + c}}
#'   \item \code{"dice"}: \deqn{d = 1 - \frac{2a}{2a + b + c}}
#'   \item \code{"sokal_michener"}: \deqn{d = 1 - \frac{a + d}{a + b + c + d}}
#'   \item \code{"russell_rao"}: \deqn{d = 1 - \frac{a}{a + b + c + d}}
#'   \item \code{"sokal_sneath"}: \deqn{d = 1 - \frac{a}{a + 1/2(b + c)}}
#'   \item \code{"kulczynski"}: \deqn{d = 1 - \frac{1}{2}\left(\frac{a}{a+b} + \frac{a}{a+c}\right)}
#'   \item \code{"hamming"}: \deqn{d = 1 - \frac{a + d}{a + b + c + d}}
#' }
#'
#' Where:
#' \itemize{
#'   \item \eqn{a} = number of positions where both vectors are 1
#'   \item \eqn{b} = number of positions where x_i = 1 and x_j = 0
#'   \item \eqn{c} = number of positions where x_i = 0 and x_j = 1
#'   \item \eqn{d} = number of positions where both vectors are 0
#' }
#'
#' The Sokal-Michener coefficient is equivalent to the normalized Hamming distance.
#'
#' @param x A numeric matrix or data frame of binary values (0/1, TRUE/FALSE, or NA)
#' @param method A character string specifying the binary distance measure to use.
#'
#' @return A symmetric numeric matrix of pairwise distances. NA is returned for pairs
#'   with no valid comparisons (all NA entries).
#'
#' @details
#' \itemize{
#'  \item Factors or character columns are converted to numeric 0/1.
#'  \item Missing values (NA) are ignored pairwise; if all entries are missing, distance is NA.
#'  \item Methods supported by \code{ade4} (e.g., Jaccard, Dice, Sokal-Michener, etc.) are
#'   computed via \code{ade4::dist.binary} for efficiency.
#'  \item Manual computations are implemented for Hamming and Kulczynski if \code{ade4} is unavailable.
#' }
#'
#' @examples
#' # Small example with binary matrix
#' mat <- matrix(c(
#'   1, 0, 1,
#'   1, 1, 0,
#'   0, 1, 1
#' ), nrow = 3, byrow = TRUE)
#'
#' # Example with Jaccard
#' dbrobust:::dist_binary(mat, method = "jaccard")
#'
#' # Example with Hamming
#' dbrobust:::dist_binary(mat, method = "hamming")
#'
#' @keywords internal
dist_binary <- function(x, method) {
  # Convert data.frame or matrix to numeric 0/1 if factor/character
  if (is.data.frame(x)) {
    x <- data.frame(lapply(x, function(col) {
      if (is.factor(col) || is.character(col)) {
        as.numeric(as.character(col))
      } else {
        col
      }
    }))
  } else {
    if (is.factor(x) || is.character(x)) {
      x <- as.numeric(as.character(x))
    }
  }

  # Ensure it's a numeric matrix
  x <- as.matrix(x)

  # Validate binary values
  if (!all(x %in% c(0, 1, TRUE, FALSE, NA))) {
    stop("Binary methods require binary (0/1 or TRUE/FALSE) data")
  }

  # Map of ade4-compatible methods
  ade4_methods <- c(
    jaccard         = 1,
    dice            = 5,
    sokal_michener  = 2,
    sokal_sneath    = 3,
    ochiai          = 7,
    russell_rao     = 10
  )

  method_std <- tolower(gsub("-", "_", method))  # Normalize input

  if (method_std %in% names(ade4_methods)) {
    if (!requireNamespace("ade4", quietly = TRUE)) {
      stop("The 'ade4' package is required for method: ", method)
    }
    dist_mat <- ade4::dist.binary(x, method = ade4_methods[[method_std]])
    return(as.matrix(dist_mat))
  }

  # Manual implementations for unsupported methods
  n <- nrow(x)
  d <- matrix(0, n, n)

  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      xi <- x[i, ]
      xj <- x[j, ]
      valid <- !is.na(xi) & !is.na(xj)

      a <- sum(xi[valid] == 1 & xj[valid] == 1)
      b <- sum(xi[valid] == 1 & xj[valid] == 0)
      c <- sum(xi[valid] == 0 & xj[valid] == 1)
      d_ <- sum(xi[valid] == 0 & xj[valid] == 0)
      total <- a + b + c + d_

      if (total == 0) {
        # Si no hay valores vĂ¡lidos, la distancia es NA
        dist_val <- NA_real_
      } else {
        dist_val <- switch(method_std,
                           hamming     = (b + c) / total,
                           kulczynski  = 1 - 0.5 * ((a / (a + b)) + (a / (a + c))),
                           stop("Unsupported binary method: ", method)
        )
      }

      d[i, j] <- d[j, i] <- dist_val
    }
  }

  diag(d) <- 0
  return(d)
}

Try the dbrobust package in your browser

Any scripts or data that you put into this service are public.

dbrobust documentation built on Nov. 5, 2025, 6:24 p.m.