Nothing
#' Compute Gower dissimilarity for mixed-type data
#'
#' Internal helper function to compute pairwise dissimilarities for datasets
#' containing a mix of continuous, binary, and categorical variables using
#' Gower's method \insertCite{gower1971general}{dbrobust}.
#'
#' Continuous, binary, and categorical columns can be automatically detected,
#' or explicitly specified by the user via \code{continuous_cols}, \code{binary_cols},
#' and \code{categorical_cols}.
#'
#' @param x A data frame with rows as observations and columns as variables.
#' @param continuous_cols Optional numeric indices or column names for continuous variables.
#' @param binary_cols Optional numeric indices or column names for binary variables.
#' @param categorical_cols Optional numeric indices or column names for categorical/multiclass variables.
#' @param binary_asym Logical; if TRUE, binary variables are treated as asymmetric (only 1/1 counts as match).
#'
#' @return A symmetric numeric matrix of pairwise dissimilarities in [0,1].
#'
#' @details
#' \itemize{
#' \item Continuous, binary, and categorical columns are combined into a single dissimilarity
#' measure following Gower's approach.
#' \item Continuous variables are scaled by their range.
#' \item Binary variables can be treated as symmetric (0/0 and 1/1 count as match)
#' or asymmetric (only 1/1 counts as match).
#' \item Categorical variables are compared using simple matching.
#' \item Missing values are ignored pairwise.
#' }
#'
#' \strong{Advantages:}
#' \itemize{
#' \item Low computational cost.
#' \item Works naturally with mixed-type data.
#' }
#'
#' \strong{Limitations:}
#' \itemize{
#' \item Neglects potential correlations among quantitative variables.
#' \item Sensitive to outliers, which can affect robustness.
#' \item May overemphasize categorical differences in mixed-data settings.
#' }
#'
#' @examples
#' # Small example: Compute classical Gower for a simulated data frame
#' df <- data.frame(
#' height = c(170, 160, 180),
#' gender = factor(c("M", "F", "M")),
#' smoker = c(1, 0, 1)
#' )
#'
#' # Compute Gower dissimilarities automatically detecting types
#' dbrobust:::dist_mixed(df)
#'
#' # Manual type specification
#' cont_cols <- "height"
#' cat_cols <- NULL
#' bin_cols <- c("gender","smoker")
#' dbrobust:::dist_mixed(
#' x = df,
#' continuous_cols = cont_cols,
#' categorical_cols = cat_cols,
#' binary_cols = bin_cols
#' )
#'
#' @references
#' \insertRef{gower1971general}{dbrobust}
#'
#' @keywords internal
dist_mixed <- function(x,
continuous_cols = NULL,
binary_cols = NULL,
categorical_cols = NULL,
binary_asym = FALSE) {
x <- as.data.frame(x)
n <- nrow(x)
p <- ncol(x)
# Helper logic inline for matching column names/indices
resolve_cols <- function(cols, df) {
if (is.null(cols)) return(NULL)
if (is.character(cols)) {
idx <- match(cols, names(df))
if (any(is.na(idx))) stop("Some column names not found in data.frame")
return(idx)
} else if (is.numeric(cols)) {
return(cols)
} else {
stop("continuous_cols, binary_cols, categorical_cols must be numeric or character")
}
}
# Automatic type detection if any type is NULL
if (is.null(continuous_cols) || is.null(binary_cols) || is.null(categorical_cols)) {
all_cols <- 1:p
# Continuous: numeric columns not 0/1
cont_auto <- which(sapply(x, is.numeric) & !sapply(x, function(col) all(col %in% c(0,1))))
# Binary: numeric 0/1 or factor/char with 2 levels
bin_auto <- which(
(sapply(x, is.numeric) & sapply(x, function(col) all(col %in% c(0,1)))) |
(sapply(x, is.factor) & sapply(x, nlevels) == 2) |
(sapply(x, is.character) & sapply(x, function(col) length(unique(col))==2))
)
# Categorical: factors/char with >2 levels
cat_auto <- setdiff(all_cols, union(cont_auto, bin_auto))
# Apply user overrides if provided
if (!is.null(continuous_cols)) cont_auto <- resolve_cols(continuous_cols, x)
if (!is.null(binary_cols)) bin_auto <- resolve_cols(binary_cols, x)
if (!is.null(categorical_cols)) cat_auto <- resolve_cols(categorical_cols, x)
continuous_cols <- cont_auto
binary_cols <- bin_auto
categorical_cols <- cat_auto
} else {
# Resolve names/indices if user provided
continuous_cols <- resolve_cols(continuous_cols, x)
binary_cols <- resolve_cols(binary_cols, x)
categorical_cols <- resolve_cols(categorical_cols, x)
}
# Compute ranges for continuous vars
ranges <- sapply(continuous_cols, function(j) {
r <- diff(range(x[[j]], na.rm=TRUE))
if (r == 0) 1 else r
})
# Initialize distance matrix
d <- matrix(0, n, n)
for (i in 1:(n-1)) {
for (j in (i+1):n) {
sim_vals <- numeric(p)
for (k in 1:p) {
xi <- x[i,k]; xj <- x[j,k]
if (is.na(xi) || is.na(xj)) {
sim_vals[k] <- NA
} else if (k %in% continuous_cols) {
sim_vals[k] <- 1 - abs(xi - xj)/ranges[which(continuous_cols==k)]
} else if (k %in% binary_cols) {
if (binary_asym) {
sim_vals[k] <- ifelse(xi==1 & xj==1, 1, 0)
} else {
sim_vals[k] <- ifelse(xi==xj, 1, 0)
}
} else if (k %in% categorical_cols) {
sim_vals[k] <- ifelse(xi==xj, 1, 0)
}
}
valid <- !is.na(sim_vals)
if (any(valid)) {
d[i,j] <- d[j,i] <- 1 - mean(sim_vals[valid])
} else {
d[i,j] <- d[j,i] <- NA
}
}
}
diag(d) <- 0
return(d)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.