R/dist_continuous.R

Defines functions dist_continuous

Documented in dist_continuous

#' Compute pairwise distances for continuous numeric data
#'
#' Internal helper function to compute pairwise distance matrices for purely numeric datasets.
#' Supports standard metrics, including Euclidean, Manhattan, Chebyshev (maximum), Canberra,
#' Minkowski, standardized Euclidean, and Mahalanobis distances, as well as cosine and
#' correlation distances computed through the \pkg{proxy} package.
#'
#' Supported methods and formulas (for observations \eqn{\mathbf{z}_i} and \eqn{\mathbf{z}_j}):
#' \itemize{
#'   \item \code{"euclidean"}: \deqn{\delta_E(i,j) = \sqrt{\sum_{k=1}^{p} (z_{ik} - z_{jk})^2}}
#'   \item \code{"minkowski"}: \deqn{\delta_q(i,j) = \left( \sum_{k=1}^{p} |z_{ik} - z_{jk}|^q \right)^{1/q}}
#'     the exponent \eqn{q} is supplied through the argument \code{p}
#'   \item \code{"manhattan"}: \deqn{\delta_1(i,j) = \sum_{k=1}^{p} |z_{ik} - z_{jk}|}
#'   \item \code{"maximum"}: \deqn{\delta_\infty(i,j) = \max_k |z_{ik} - z_{jk}|}
#'   \item \code{"canberra"}: \deqn{\delta_C(i,j) = \sum_{k=1}^{p} \frac{|z_{ik} - z_{jk}|}{|z_{ik}| + |z_{jk}|}} convention: \eqn{0/0 := 0}
#'   \item \code{"euclidean_standardized"}: \deqn{\delta_K(i,j) = \sqrt{\sum_{k=1}^{p} \frac{(z_{ik} - z_{jk})^2}{s_k^2}}} \eqn{s_k^2} is the variance of variable k
#'   \item \code{"mahalanobis"}: \deqn{\delta_M(i,j) = \sqrt{ (\mathbf{z}_i - \mathbf{z}_j)' \mathbf{S}^{-1} (\mathbf{z}_i - \mathbf{z}_j) }} \eqn{\mathbf{S}} is the covariance matrix
#'   \item \code{"cosine"}, \code{"correlation"}: delegated to \code{proxy::dist}; a warning is
#'     issued because these are not guaranteed to be Euclidean.
#' }
#'
#' @param x A numeric data frame or matrix with rows as observations and columns as variables.
#' @param method A single character string naming the distance metric: one of
#'   \code{"euclidean"}, \code{"manhattan"}, \code{"maximum"}, \code{"canberra"},
#'   \code{"minkowski"}, \code{"euclidean_standardized"}, \code{"mahalanobis"},
#'   \code{"cosine"} or \code{"correlation"}. Any other value is passed on to
#'   \code{stats::dist}.
#' @param p Numeric, the power parameter for Minkowski distance (required if \code{method = "minkowski"}).
#'
#' @return A symmetric numeric matrix of pairwise distances between rows of \code{x}.
#'   The diagonal contains zeros. Rows and columns are labelled with the row names of
#'   \code{x}, or with the row indices when \code{x} has no row names.
#'
#' @details
#' Considerations when choosing a distance metric:
#' \itemize{
#'  \item For \code{"euclidean_standardized"}, columns are standardized to mean 0 and variance 1 before
#'   computing Euclidean distances.
#'  \item Cosine and correlation distances rely on the \pkg{proxy} package; these are not guaranteed to be
#'   strictly Euclidean.
#'  \item Minkowski distance requires specifying the parameter \code{p} (e.g., \code{p = 3} for L3 norm).
#'  \item Mahalanobis distance uses the inverse of the covariance matrix. If the covariance matrix
#'   cannot be inverted, a warning is issued and the generalized inverse from \code{MASS::ginv}
#'   is used. At least two variables and two observations are required.
#'  \item Standard metrics (Euclidean, Manhattan, Maximum, Canberra) are computed using \code{stats::dist}.
#' }
#'
#' An error is raised when \code{x} is not numeric, when \code{method} is not a single
#' character string, when \code{p} is missing for the Minkowski distance, when the
#' \pkg{proxy} package is needed but not installed, or when the Mahalanobis distance is
#' requested for fewer than two variables or fewer than two observations.
#' @examples
#' # Small numeric matrix
#' mat <- matrix(c(1, 2, 3,
#'                 4, 5, 6,
#'                 7, 8, 9), nrow = 3, byrow = TRUE)
#'
#' # Euclidean distance
#' dbrobust:::dist_continuous(mat, method = "euclidean")
#'
#' # Standardized Euclidean
#' dbrobust:::dist_continuous(mat, method = "euclidean_standardized")
#'
#' # Minkowski distance with p = 3
#' dbrobust:::dist_continuous(mat, method = "minkowski", p = 3)
#'
#' # Mahalanobis distance
#' set.seed(123)
#' mat <- matrix(rnorm(5*3), nrow = 5, ncol = 3)
#' colnames(mat) <- c("X1","X2","X3")
#' # Compute the mahalanobis distance
#' dbrobust:::dist_continuous(mat, method = "mahalanobis")
#'
#' # Cosine distance (requires 'proxy' package)
#' dbrobust:::dist_continuous(mat, method = "cosine")
#'
#' @keywords internal
dist_continuous <- function(x, method, p = NULL) {
  # Coerce data frames so that every branch below operates on a matrix
  if (is.data.frame(x)) x <- as.matrix(x)

  # A data frame with non-numeric columns coerces to a non-numeric matrix
  # and is rejected here
  if (!is.numeric(x)) {
    stop("Continuous methods require all columns to be numeric")
  }

  # `method` is used in scalar `if` conditions below, so it must be exactly
  # one non-missing string
  if (!is.character(method) || length(method) != 1L || is.na(method)) {
    stop("'method' must be a single character string")
  }

  # Euclidean with column standardization
  if (method == "euclidean_standardized") {
    x_scaled <- scale(x)
    return(as.matrix(stats::dist(x_scaled, method = "euclidean")))
  }

  # Cosine / Correlation distances: rely on the optional 'proxy' package
  if (method %in% c("cosine", "correlation")) {
    if (!requireNamespace("proxy", quietly = TRUE)) {
      stop("Package 'proxy' is required for method: ", method)
    }
    warning(
      "The chosen method '", method,
      "' does not guarantee Euclidean distances. Proceeding with calculation.",
      call. = FALSE
    )
    return(as.matrix(proxy::dist(x, method = method)))
  }

  # Minkowski distance requires exponent parameter p
  if (method == "minkowski") {
    if (is.null(p)) stop("You must specify parameter 'p' for Minkowski distance")
    return(as.matrix(stats::dist(x, method = "minkowski", p = p)))
  }

  # Mahalanobis distance: uses covariance structure
  if (method == "mahalanobis") {
    # NCOL()/NROW() also handle a bare vector, for which ncol() is NULL
    if (NCOL(x) < 2) {
      stop("Mahalanobis distance requires at least two variables")
    }
    n <- NROW(x)
    if (n < 2) {
      stop("Mahalanobis distance requires at least two observations")
    }

    covmat <- stats::cov(x, use = "pairwise.complete.obs")
    inv_covmat <- tryCatch(
      solve(covmat),
      error = function(e) {
        # call. = FALSE keeps the internal handler call out of the message,
        # matching the cosine/correlation warning above
        warning(
          "Covariance matrix singular. Using generalized inverse (MASS::ginv).",
          call. = FALSE
        )
        MASS::ginv(covmat)
      }
    )

    # The diagonal stays at its initial value of zero
    d <- matrix(0, n, n)

    # For each row i, compute the distances to all later rows in one call and
    # mirror them, so the result is exactly symmetric
    for (i in seq_len(n - 1L)) {
      later <- (i + 1L):n
      sq_dist <- stats::mahalanobis(
        x[later, , drop = FALSE],
        center = x[i, ],
        cov = inv_covmat,
        inverted = TRUE
      )
      d[i, later] <- d[later, i] <- sqrt(sq_dist)
    }

    # Label rows/columns the same way as.matrix() labels a 'dist' object, so
    # all branches return matrices with the same dimnames
    labels <- rownames(x)
    if (is.null(labels)) labels <- seq_len(n)
    dimnames(d) <- list(labels, labels)
    return(d)
  }

  # Fallback: use base R 'dist' for standard methods (euclidean, manhattan, etc.)
  as.matrix(stats::dist(x, method = method))
}

Try the dbrobust package in your browser

Any scripts or data that you put into this service are public.

dbrobust documentation built on Nov. 5, 2025, 6:24 p.m.