R/dist_mixed.R
In dbrobust: Robust Distance-Based Visualization and Analysis of Mixed-Type Data

Documented in dist_mixed

#' Compute Gower dissimilarity for mixed-type data
#'
#' Internal helper function to compute pairwise dissimilarities for datasets
#' containing a mix of continuous, binary, and categorical variables using
#' Gower's method \insertCite{gower1971general}{dbrobust}.
#'
#' Continuous, binary, and categorical columns can be automatically detected,
#' or explicitly specified by the user via \code{continuous_cols}, \code{binary_cols},
#' and \code{categorical_cols}.
#'
#' @param x A data frame with rows as observations and columns as variables.
#' @param continuous_cols Optional numeric indices or column names for continuous variables.
#' @param binary_cols Optional numeric indices or column names for binary variables.
#' @param categorical_cols Optional numeric indices or column names for categorical/multiclass variables.
#' @param binary_asym Logical; if TRUE, binary variables are treated as asymmetric (only 1/1 counts as match).
#'
#' @return A symmetric numeric matrix of pairwise dissimilarities in [0,1].
#'
#' @details
#' \itemize{
#'  \item Continuous, binary, and categorical columns are combined into a single dissimilarity
#'   measure following Gower's approach.
#'  \item Continuous variables are scaled by their range.
#'  \item Binary variables can be treated as symmetric (0/0 and 1/1 count as match)
#'   or asymmetric (only 1/1 counts as match).
#'  \item Categorical variables are compared using simple matching.
#'  \item Missing values are ignored pairwise.
#' }
#'
#' \strong{Advantages:}
#' \itemize{
#'   \item Low computational cost.
#'   \item Works naturally with mixed-type data.
#' }
#'
#' \strong{Limitations:}
#' \itemize{
#'   \item Neglects potential correlations among quantitative variables.
#'   \item Sensitive to outliers, which can affect robustness.
#'   \item May overemphasize categorical differences in mixed-data settings.
#' }
#'
#' @examples
#' # Small example: Compute classical Gower for a simulated data frame
#' df <- data.frame(
#'   height = c(170, 160, 180),
#'   gender = factor(c("M", "F", "M")),
#'   smoker = c(1, 0, 1)
#' )
#'
#' # Compute Gower dissimilarities automatically detecting types
#' dbrobust:::dist_mixed(df)
#'
#' # Manual type specification
#' cont_cols <- "height"
#' cat_cols <- NULL
#' bin_cols <- c("gender","smoker")
#' dbrobust:::dist_mixed(
#'   x = df,
#'   continuous_cols = cont_cols,
#'   categorical_cols = cat_cols,
#'   binary_cols = bin_cols
#' )
#'
#' @references
#' \insertRef{gower1971general}{dbrobust}
#'
#' @keywords internal
dist_mixed <- function(x,
                       continuous_cols = NULL,
                       binary_cols = NULL,
                       categorical_cols = NULL,
                       binary_asym = FALSE) {

  x <- as.data.frame(x)
  n <- nrow(x)
  p <- ncol(x)

  # Helper logic inline for matching column names/indices
  resolve_cols <- function(cols, df) {
    if (is.null(cols)) return(NULL)
    if (is.character(cols)) {
      idx <- match(cols, names(df))
      if (any(is.na(idx))) stop("Some column names not found in data.frame")
      return(idx)
    } else if (is.numeric(cols)) {
      return(cols)
    } else {
      stop("continuous_cols, binary_cols, categorical_cols must be numeric or character")
    }
  }

  # Automatic type detection if any type is NULL
  if (is.null(continuous_cols) || is.null(binary_cols) || is.null(categorical_cols)) {
    all_cols <- 1:p

    # Continuous: numeric columns not 0/1
    cont_auto <- which(sapply(x, is.numeric) & !sapply(x, function(col) all(col %in% c(0,1))))

    # Binary: numeric 0/1 or factor/char with 2 levels
    bin_auto <- which(
      (sapply(x, is.numeric) & sapply(x, function(col) all(col %in% c(0,1)))) |
        (sapply(x, is.factor) & sapply(x, nlevels) == 2) |
        (sapply(x, is.character) & sapply(x, function(col) length(unique(col))==2))
    )

    # Categorical: factors/char with >2 levels
    cat_auto <- setdiff(all_cols, union(cont_auto, bin_auto))

    # Apply user overrides if provided
    if (!is.null(continuous_cols)) cont_auto <- resolve_cols(continuous_cols, x)
    if (!is.null(binary_cols)) bin_auto <- resolve_cols(binary_cols, x)
    if (!is.null(categorical_cols)) cat_auto <- resolve_cols(categorical_cols, x)

    continuous_cols <- cont_auto
    binary_cols <- bin_auto
    categorical_cols <- cat_auto
  } else {
    # Resolve names/indices if user provided
    continuous_cols <- resolve_cols(continuous_cols, x)
    binary_cols <- resolve_cols(binary_cols, x)
    categorical_cols <- resolve_cols(categorical_cols, x)
  }

  # Compute ranges for continuous vars
  ranges <- sapply(continuous_cols, function(j) {
    r <- diff(range(x[[j]], na.rm=TRUE))
    if (r == 0) 1 else r
  })

  # Initialize distance matrix
  d <- matrix(0, n, n)

  for (i in 1:(n-1)) {
    for (j in (i+1):n) {
      sim_vals <- numeric(p)

      for (k in 1:p) {
        xi <- x[i,k]; xj <- x[j,k]

        if (is.na(xi) || is.na(xj)) {
          sim_vals[k] <- NA
        } else if (k %in% continuous_cols) {
          sim_vals[k] <- 1 - abs(xi - xj)/ranges[which(continuous_cols==k)]
        } else if (k %in% binary_cols) {
          if (binary_asym) {
            sim_vals[k] <- ifelse(xi==1 & xj==1, 1, 0)
          } else {
            sim_vals[k] <- ifelse(xi==xj, 1, 0)
          }
        } else if (k %in% categorical_cols) {
          sim_vals[k] <- ifelse(xi==xj, 1, 0)
        }
      }

      valid <- !is.na(sim_vals)
      if (any(valid)) {
        d[i,j] <- d[j,i] <- 1 - mean(sim_vals[valid])
      } else {
        d[i,j] <- d[j,i] <- NA
      }
    }
  }

  diag(d) <- 0
  return(d)
}