R/robust_distances.R

Defines functions robust_distances

Documented in robust_distances

#' Compute Robust Squared Distances for Mixed Data
#'
#' Computes a weighted, robust squared distance matrix for datasets
#' containing continuous, binary, and categorical variables. Continuous
#' variables are handled via a robust Mahalanobis distance, and binary
#' and categorical variables are transformed via similarity coefficients.
#' The output is suitable for Euclidean correction with \code{\link{make_euclidean}}.
#'
#' @param data Data frame or numeric matrix containing the observations.
#' @param cont_vars Character vector of column names for continuous variables.
#' @param bin_vars Character vector of column names for binary variables.
#' @param cat_vars Character vector of column names for categorical variables.
#' @param w Numeric vector of observation weights. If NULL, uniform weights are used.
#' @param p Integer vector of length 3: \code{c(#cont, #binary, #categorical)}. Overrides variable type selection if provided.
#' @param method Character string: either \code{"ggower"} or \code{"relms"} for distance computation.
#' @param robust_cov Optional. Precomputed robust covariance matrix for continuous variables.
#'   If NULL, it will be estimated internally using the specified trimming proportion \code{alpha}.
#' @param alpha Numeric trimming proportion for robust covariance of continuous variables.
#' @param return_dist Logical. If TRUE, returns an object of class \code{dist}; otherwise, returns a squared distance matrix.
#'
#' @examples
#' # Example: Robust Squared Distances for Mixed Data
#'
#' # Load example data and subset
#' data("Data_HC_contamination", package = "dbrobust")
#' Data_small <- Data_HC_contamination[1:50, ]
#'
#' # Define variable types
#' cont_vars <- c("V1", "V2", "V3", "V4")  # continuous
#' cat_vars  <- c("V5", "V6", "V7")        # categorical
#' bin_vars  <- c("V8", "V9")              # binary
#'
#' # Use column w_loop as weights
#' w <- Data_small$w_loop
#'
#' # -------------------------------
#' # Method 1: Gower distances
#' # -------------------------------
#' dist_sq_ggower <- robust_distances(
#'   data = Data_small,
#'   cont_vars = cont_vars,
#'   bin_vars  = bin_vars,
#'   cat_vars  = cat_vars,
#'   w = w,
#'   alpha = 0.10,
#'   method = "ggower"
#' )
#'
#' # Apply Euclidean correction if needed
#' res_ggower <- make_euclidean(dist_sq_ggower, w)
#'
#' # Show first 5x5 block of original and corrected distances
#' cat("GGower original squared distances (5x5 block):\n")
#' print(round(dist_sq_ggower[1:5, 1:5], 4))
#' cat("\nGGower corrected squared distances (5x5 block):\n")
#' print(round(res_ggower$D_euc[1:5, 1:5], 4))
#'
#' # -------------------------------
#' # Method 2: RelMS distances
#' # -------------------------------
#' dist_sq_relms <- robust_distances(
#'   data = Data_small,
#'   cont_vars = cont_vars,
#'   bin_vars  = bin_vars,
#'   cat_vars  = cat_vars,
#'   w = w,
#'   alpha = 0.10,
#'   method = "relms"
#' )
#'
#' # Apply Euclidean correction if needed
#' res_relms <- make_euclidean(dist_sq_relms, w)
#'
#' # Show first 5x5 block of original and corrected distances
#' cat("RelMS original squared distances (5x5 block):\n")
#' print(round(dist_sq_relms[1:5, 1:5], 4))
#' cat("\nRelMS corrected squared distances (5x5 block):\n")
#' print(round(res_relms$D_euc[1:5, 1:5], 4))
#'
#' @return A numeric matrix of squared robust distances (n x n) or a \code{dist} object if \code{return_dist = TRUE}.
#' @export
robust_distances <- function(
    data = NULL,
    cont_vars = NULL,
    bin_vars = NULL,
    cat_vars = NULL,
    w = NULL,
    p = NULL,
    method = c("ggower", "relms"),
    robust_cov = NULL,
    alpha = 0.1,
    return_dist = FALSE # if TRUE returns simple distances (not squared)
) {
  method <- match.arg(method)

  if (is.null(data)) stop("'data' must be provided")

  if (!is.null(p) && (!is.null(cont_vars) || !is.null(bin_vars) || !is.null(cat_vars))) {
    stop("Provide either 'p' or the variable groups (cont_vars, bin_vars, cat_vars), but not both.")
  }

  if (is.null(p)) {
    if (is.null(cont_vars) || is.null(bin_vars) || is.null(cat_vars)) {
      stop("You must provide either 'p' or all variable groups (cont_vars, bin_vars, cat_vars).")
    }
    vars_to_check <- c(cont_vars, bin_vars, cat_vars)
    if (!all(vars_to_check %in% colnames(data))) {
      missing_vars <- vars_to_check[!(vars_to_check %in% colnames(data))]
      stop(paste("The following variables are missing in 'data':", paste(missing_vars, collapse = ", ")))
    }
    if (any(is.na(data[, vars_to_check]))) stop("NA values found in selected variables. Please clean or impute them first.")

    # Reorder columns as indicated
    df_ordered <- data[, vars_to_check]

    # Convert factors/ordered to numeric for calculations
    df_ordered <- data.frame(lapply(df_ordered, function(x) {
      if (is.factor(x) || is.character(x)) as.numeric(as.factor(x)) else x
    }))
    data_mat <- as.matrix(df_ordered)
    p <- c(length(cont_vars), length(bin_vars), length(cat_vars))
  } else {
    # If p is defined, assume data is matrix ready
    if (any(is.na(data))) stop("NA values found in 'data' matrix. Please clean or impute them first.")
    data_mat <- data
  }

  n <- nrow(data_mat)
  if (sum(p) != ncol(data_mat)) stop("Sum of p must equal number of columns in 'data'.")

  # Weights
  if (is.null(w)) {
    w <- rep(1/n, n)
  } else {
    if (length(w) != n) stop("Length of 'w' must match number of rows in 'data'.")
    if (any(w < 0)) stop("Weights must be non-negative.")
    if (abs(sum(w) - 1) > sqrt(.Machine$double.eps)) w <- w / sum(w)
  }

  # Robust covariance for continuous variables if not provided
  if (p[1] > 0) {
    if (is.null(robust_cov)) {
      X_cont <- as.matrix(data_mat[, 1:p[1], drop = FALSE])
      rob_res <- robust_covariance_gv(X_cont, w, alpha)
      robust_cov <- rob_res$S
    } else {
      if (!is.matrix(robust_cov)) stop("'robust_cov' must be a matrix")
      if (nrow(robust_cov) != p[1] || ncol(robust_cov) != p[1]) {
        stop(paste0("'robust_cov' must be a square matrix with dimension ", p[1]))
      }
    }
  }

  # Calculate squared distances
  D2 <- switch(
    method,
    ggower = robust_ggower(data_mat, w, p, robust_cov),
    relms = robust_RelMS(data_mat, w, p, robust_cov)
  )
  D2 <- as.matrix(D2)
  attr(D2, "class") <- "matrix"

  # Attach robust covariance trimming info if available
  if (exists("rob_res")) {
    if (!is.null(rob_res$central_idx)) attr(D2, "central_idx") <- rob_res$central_idx
    if (!is.null(rob_res$outlier_idx)) attr(D2, "outlier_idx") <- rob_res$outlier_idx
    if (!is.null(rob_res$phi)) attr(D2, "phi") <- rob_res$phi
    if (!is.null(rob_res$q)) attr(D2, "q") <- rob_res$q
  }

  # Return simple distances if requested
  if (return_dist) {
    return(dbstats::D2toDist(D2))
  }

  return(D2)
}

Try the dbrobust package in your browser

Any scripts or data that you put into this service are public.

dbrobust documentation built on Nov. 5, 2025, 6:24 p.m.