R/make_euclidean.R

Defines functions make_euclidean

Documented in make_euclidean

#' Force a Pairwise Squared Distance Matrix to Euclidean Form
#'
#' Given a pairwise squared distance matrix \eqn{D} (where \eqn{D[i,j] = d(i,j)^2}),
#' this function ensures that \eqn{D} corresponds to a valid Euclidean squared
#' distance matrix. The correction is based on the weighted Gram matrix
#' \eqn{G_w = -\frac{1}{2} J_w D J_w^\top}, where \eqn{J_w = I_n - \mathbf{1} w^\top}
#' is the centering matrix defined by the weight vector \eqn{w}.
#'
#' If the smallest eigenvalue \eqn{\lambda_{\min}} of \eqn{G_w} is below the
#' negative tolerance \code{-tol}, the function corrects \eqn{D} by adding a
#' constant shift to guarantee positive semi-definiteness of the Gram matrix,
#' following the approach of \insertCite{lingoes1971some}{dbrobust} and
#' \insertCite{mardia1978some}{dbrobust}:
#' \deqn{
#' D_{\text{new}} = D + 2 c \mathbf{1} \mathbf{1}^\top - 2 c I_n,
#' }
#' where \eqn{c = |\lambda_{\min}|}.
#'
#' @param D Numeric square matrix (n x n) of pairwise squared distances.
#'   Must be symmetric with zeros on the diagonal.
#' @param w Numeric vector of weights (length n). Internally normalized to sum to 1.
#' @param tol Numeric tolerance for detecting negative eigenvalues (default: 1e-10).
#'
#' @return A list with components:
#' \describe{
#'   \item{D_euc}{Corrected pairwise squared Euclidean distance matrix (n x n).}
#'   \item{eigvals_before}{Eigenvalues of the weighted Gram matrix before correction.}
#'   \item{eigvals_after}{Eigenvalues of the weighted Gram matrix after correction.}
#'   \item{transformed}{Logical, TRUE if correction was applied, FALSE otherwise.}
#' }
#'
#' @seealso
#' \code{\link[stats]{dist}}, \code{\link[base]{eigen}}, \code{\link{cmdscale}}
#'
#' @examples
#' # Load example dataset
#' data("Data_HC_contamination")
#'
#' # Reduce dataset to first 50 rows
#' Data_small <- Data_HC_contamination[1:50, ]
#'
#' # Select only continuous variables
#' cont_vars <- names(Data_small)[1:4]
#' Data_cont <- Data_small[, cont_vars]
#'
#' # Compute squared Euclidean distance matrix
#' dist_mat <- as.matrix(dist(Data_cont))^2
#'
#' # Introduce a small non-Euclidean distortion
#' dist_mat[1, 2] <- dist_mat[1, 2] * 0.5
#' dist_mat[2, 1] <- dist_mat[1, 2]
#'
#' # Uniform weights
#' weights <- rep(1, nrow(Data_cont))
#'
#' # Apply Euclidean correction
#' res <- make_euclidean(dist_mat, weights)
#'
#' # Check results (minimum eigenvalues before/after)
#' res$transformed
#' min(res$eigvals_before)
#' min(res$eigvals_after)
#'
#' # First 5x5 block of corrected matrix
#' round(res$D_euc[1:5, 1:5], 4)
#'
#' @references
#' \insertRef{lingoes1971some}{dbrobust}
#' \insertRef{mardia1978some}{dbrobust}
#'
#' @export
make_euclidean <- function(D, w, tol = 1e-10) {
  if (!is.matrix(D) || nrow(D) != ncol(D)) {
    stop("D must be a square matrix of pairwise squared distances.")
  }
  n <- nrow(D)
  if (length(w) != n) {
    stop("Length of weights vector must equal number of observations.")
  }

  if (!isSymmetric(D)) {
    stop("D must be symmetric.")
  }
  if (!all(diag(D) == 0)) {
    stop("Diagonal entries of D must be zero.")
  }

  w <- w / sum(w)  # Normalize weights

  # Weighted centering matrix
  Jw <- diag(n) - rep(1, n) %*% t(w)
  Gw <- -0.5 * Jw %*% D %*% t(Jw)

  eigvals_before <- eigen(Gw, symmetric = TRUE, only.values = TRUE)$values
  min_eig <- min(eigvals_before)

  transformed <- FALSE
  if (min_eig < -tol) {
    c <- abs(min_eig)
    Ones <- matrix(1, n, n)
    D_euc <- D + 2 * c * Ones - 2 * c * diag(n)
    transformed <- TRUE
  } else {
    D_euc <- D
  }

  # Compute eigenvalues after correction for info
  Gw_after <- -0.5 * Jw %*% D_euc %*% t(Jw)
  eigvals_after <- eigen(Gw_after, symmetric = TRUE, only.values = TRUE)$values

  return(list(
    D_euc = D_euc,
    eigvals_before = eigvals_before,
    eigvals_after = eigvals_after,
    transformed = transformed
  ))
}

Try the dbrobust package in your browser

Any scripts or data that you put into this service are public.

dbrobust documentation built on Nov. 5, 2025, 6:24 p.m.