Nothing
#' Compute Robust Squared Distances for Mixed Data
#'
#' Computes a weighted, robust squared distance matrix for datasets
#' containing continuous, binary, and categorical variables. Continuous
#' variables are handled via a robust Mahalanobis distance, and binary
#' and categorical variables are transformed via similarity coefficients.
#' The output is suitable for Euclidean correction with \code{\link{make_euclidean}}.
#'
#' @param data Data frame or numeric matrix containing the observations.
#' @param cont_vars Character vector of column names for continuous variables.
#' @param bin_vars Character vector of column names for binary variables.
#' @param cat_vars Character vector of column names for categorical variables.
#' @param w Numeric vector of observation weights. If NULL, uniform weights are used.
#' @param p Integer vector of length 3: \code{c(#cont, #binary, #categorical)}. Overrides variable type selection if provided.
#' @param method Character string: either \code{"ggower"} or \code{"relms"} for distance computation.
#' @param robust_cov Optional. Precomputed robust covariance matrix for continuous variables.
#' If NULL, it will be estimated internally using the specified trimming proportion \code{alpha}.
#' @param alpha Numeric trimming proportion for robust covariance of continuous variables.
#' @param return_dist Logical. If TRUE, returns an object of class \code{dist}; otherwise, returns a squared distance matrix.
#'
#' @examples
#' # Example: Robust Squared Distances for Mixed Data
#'
#' # Load example data and subset
#' data("Data_HC_contamination", package = "dbrobust")
#' Data_small <- Data_HC_contamination[1:50, ]
#'
#' # Define variable types
#' cont_vars <- c("V1", "V2", "V3", "V4") # continuous
#' cat_vars <- c("V5", "V6", "V7") # categorical
#' bin_vars <- c("V8", "V9") # binary
#'
#' # Use column w_loop as weights
#' w <- Data_small$w_loop
#'
#' # -------------------------------
#' # Method 1: Gower distances
#' # -------------------------------
#' dist_sq_ggower <- robust_distances(
#' data = Data_small,
#' cont_vars = cont_vars,
#' bin_vars = bin_vars,
#' cat_vars = cat_vars,
#' w = w,
#' alpha = 0.10,
#' method = "ggower"
#' )
#'
#' # Apply Euclidean correction if needed
#' res_ggower <- make_euclidean(dist_sq_ggower, w)
#'
#' # Show first 5x5 block of original and corrected distances
#' cat("GGower original squared distances (5x5 block):\n")
#' print(round(dist_sq_ggower[1:5, 1:5], 4))
#' cat("\nGGower corrected squared distances (5x5 block):\n")
#' print(round(res_ggower$D_euc[1:5, 1:5], 4))
#'
#' # -------------------------------
#' # Method 2: RelMS distances
#' # -------------------------------
#' dist_sq_relms <- robust_distances(
#' data = Data_small,
#' cont_vars = cont_vars,
#' bin_vars = bin_vars,
#' cat_vars = cat_vars,
#' w = w,
#' alpha = 0.10,
#' method = "relms"
#' )
#'
#' # Apply Euclidean correction if needed
#' res_relms <- make_euclidean(dist_sq_relms, w)
#'
#' # Show first 5x5 block of original and corrected distances
#' cat("RelMS original squared distances (5x5 block):\n")
#' print(round(dist_sq_relms[1:5, 1:5], 4))
#' cat("\nRelMS corrected squared distances (5x5 block):\n")
#' print(round(res_relms$D_euc[1:5, 1:5], 4))
#'
#' @return A numeric matrix of squared robust distances (n x n) or a \code{dist} object if \code{return_dist = TRUE}.
#' @export
robust_distances <- function(
data = NULL,
cont_vars = NULL,
bin_vars = NULL,
cat_vars = NULL,
w = NULL,
p = NULL,
method = c("ggower", "relms"),
robust_cov = NULL,
alpha = 0.1,
return_dist = FALSE # if TRUE returns simple distances (not squared)
) {
method <- match.arg(method)
if (is.null(data)) stop("'data' must be provided")
if (!is.null(p) && (!is.null(cont_vars) || !is.null(bin_vars) || !is.null(cat_vars))) {
stop("Provide either 'p' or the variable groups (cont_vars, bin_vars, cat_vars), but not both.")
}
if (is.null(p)) {
if (is.null(cont_vars) || is.null(bin_vars) || is.null(cat_vars)) {
stop("You must provide either 'p' or all variable groups (cont_vars, bin_vars, cat_vars).")
}
vars_to_check <- c(cont_vars, bin_vars, cat_vars)
if (!all(vars_to_check %in% colnames(data))) {
missing_vars <- vars_to_check[!(vars_to_check %in% colnames(data))]
stop(paste("The following variables are missing in 'data':", paste(missing_vars, collapse = ", ")))
}
if (any(is.na(data[, vars_to_check]))) stop("NA values found in selected variables. Please clean or impute them first.")
# Reorder columns as indicated
df_ordered <- data[, vars_to_check]
# Convert factors/ordered to numeric for calculations
df_ordered <- data.frame(lapply(df_ordered, function(x) {
if (is.factor(x) || is.character(x)) as.numeric(as.factor(x)) else x
}))
data_mat <- as.matrix(df_ordered)
p <- c(length(cont_vars), length(bin_vars), length(cat_vars))
} else {
# If p is defined, assume data is matrix ready
if (any(is.na(data))) stop("NA values found in 'data' matrix. Please clean or impute them first.")
data_mat <- data
}
n <- nrow(data_mat)
if (sum(p) != ncol(data_mat)) stop("Sum of p must equal number of columns in 'data'.")
# Weights
if (is.null(w)) {
w <- rep(1/n, n)
} else {
if (length(w) != n) stop("Length of 'w' must match number of rows in 'data'.")
if (any(w < 0)) stop("Weights must be non-negative.")
if (abs(sum(w) - 1) > sqrt(.Machine$double.eps)) w <- w / sum(w)
}
# Robust covariance for continuous variables if not provided
if (p[1] > 0) {
if (is.null(robust_cov)) {
X_cont <- as.matrix(data_mat[, 1:p[1], drop = FALSE])
rob_res <- robust_covariance_gv(X_cont, w, alpha)
robust_cov <- rob_res$S
} else {
if (!is.matrix(robust_cov)) stop("'robust_cov' must be a matrix")
if (nrow(robust_cov) != p[1] || ncol(robust_cov) != p[1]) {
stop(paste0("'robust_cov' must be a square matrix with dimension ", p[1]))
}
}
}
# Calculate squared distances
D2 <- switch(
method,
ggower = robust_ggower(data_mat, w, p, robust_cov),
relms = robust_RelMS(data_mat, w, p, robust_cov)
)
D2 <- as.matrix(D2)
attr(D2, "class") <- "matrix"
# Attach robust covariance trimming info if available
if (exists("rob_res")) {
if (!is.null(rob_res$central_idx)) attr(D2, "central_idx") <- rob_res$central_idx
if (!is.null(rob_res$outlier_idx)) attr(D2, "outlier_idx") <- rob_res$outlier_idx
if (!is.null(rob_res$phi)) attr(D2, "phi") <- rob_res$phi
if (!is.null(rob_res$q)) attr(D2, "q") <- rob_res$q
}
# Return simple distances if requested
if (return_dist) {
return(dbstats::D2toDist(D2))
}
return(D2)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.