# R/robust_covariance_gv.R

#' Robust Covariance Estimation Based on Geometric Variability
#'
#' Computes a robust covariance matrix for a weighted dataset by selecting
#' the most central subset of observations according to geometric variability.
#' Observations are ranked based on a proximity function measuring how far each
#' individual is from the rest of the data. The most central subset is then used
#' to compute a covariance matrix.
#'
#' @param X Numeric matrix of dimension n x p, where n is the number of observations
#'   and p is the number of variables.
#' @param w Numeric vector of weights of length n. Weights will be normalized to sum to 1.
#' @param alpha Numeric trimming proportion between 0 and 1 (e.g., 0.05, 0.10, 0.15)
#'   indicating the fraction of most extreme observations to discard.
#'
#' @return A list containing:
#' \describe{
#'   \item{S}{Robust covariance matrix of dimension p x p.}
#'   \item{central_idx}{Indices of observations selected as the central subset.}
#'   \item{outlier_idx}{Indices of observations considered outliers.}
#'   \item{phi}{Proximity function values for all observations.}
#'   \item{q}{Threshold value used for trimming (quantile of phi).}
#' }
#'
#' @examples
#' # Load a small subset of the example dataset
#' data("Data_HC_contamination", package = "dbrobust")
#' Data_small <- Data_HC_contamination[1:20, ]
#'
#' # Select only continuous variables
#' cont_vars <- names(Data_small)[1:4]
#' Data_cont <- Data_small[, cont_vars]
#'
#' # Set uniform weights and trimming proportion
#' weights <- rep(1, nrow(Data_cont))
#' alpha <- 0.10
#'
#' # Compute robust covariance with trimming
#' res <- dbrobust:::robust_covariance_gv(Data_cont, weights, alpha)
#'
#' # Inspect results: central observations, outliers, covariance, threshold, proximity
#' res$central_idx
#' res$outlier_idx
#' round(res$S, 4)
#' res$q
#' round(res$phi[1:10], 4)
#'
#' @importFrom stats quantile
#' @keywords internal
robust_covariance_gv <- function(X, w, alpha) {
  # Coerce inputs up front so all later algebra works on a numeric matrix.
  X <- as.matrix(X)
  w <- as.numeric(w)
  n <- nrow(X)  # number of individuals

  # Fail fast on inputs that would otherwise produce silent garbage.
  stopifnot(
    is.numeric(X),
    length(w) == n,
    all(w >= 0), sum(w) > 0,
    alpha >= 0, alpha < 1
  )

  # Normalize weights to sum to 1.
  w <- w / sum(w)

  # Weighted centering operator: Jw subtracts the weighted mean from each
  # row, and premultiplying by sqrt(Dw) rescales rows so that
  # t(Xw) %*% Xw equals the weighted covariance matrix.
  Dw <- diag(w)
  Jw <- diag(n) - rep(1, n) %*% t(w)
  Jw <- sqrt(Dw) %*% Jw  # sqrt of a diagonal matrix = element-wise sqrt
  Xw <- Jw %*% X

  # Pairwise squared Mahalanobis distances between (centered) observations.
  D2 <- StatMatch::mahalanobis.dist(Xw)^2

  # Geometric variability: VG = (1/2) * w' D2 w. drop() makes it a scalar
  # rather than a 1x1 matrix so it recycles cleanly below.
  VG <- drop(t(w) %*% D2 %*% w) / 2

  # Proximity function: weighted mean squared distance of each observation
  # to all others, minus the geometric variability. This is the vectorized
  # form of phi[i] = sum_j D2[i, j] * w[j] - VG (no per-row loop needed).
  phi <- drop(D2 %*% w) - VG

  # Keep the most central (1 - alpha) fraction of observations.
  # unname() strips the "90%"-style label quantile() attaches to its result.
  q <- unname(quantile(phi, probs = 1 - alpha, type = 5))
  central_idx <- which(phi <= q)
  outlier_idx <- setdiff(seq_len(n), central_idx)

  # Trimmed dataset with renormalized weights.
  Xtrim <- X[central_idx, , drop = FALSE]
  w2 <- w[central_idx]
  w2 <- w2 / sum(w2)

  # Robust covariance: same weighted-centering construction as above,
  # applied to the central subset only.
  m <- length(central_idx)
  Jw2 <- diag(m) - rep(1, m) %*% t(w2)
  Xw2 <- sqrt(diag(w2)) %*% Jw2 %*% Xtrim
  S <- crossprod(Xw2)  # t(Xw2) %*% Xw2, in one BLAS call

  list(
    S = S,
    central_idx = central_idx,
    outlier_idx = outlier_idx,
    phi = phi,
    q = q
  )
}

# --- Scraped site boilerplate (not R code); kept as comments for provenance ---
# Try the dbrobust package in your browser
#
# Any scripts or data that you put into this service are public.
#
# dbrobust documentation built on Nov. 5, 2025, 6:24 p.m.