R/calculate_distances.R

Defines functions calculate_distances

Documented in calculate_distances

#' Compute Distance or Similarity Matrices
#'
#' Computes a distance or similarity matrix between rows of a data frame or matrix, supporting a wide variety of distance metrics.
#'
#' @param x A matrix or data.frame. Each row represents an observation.
#' @param method A string specifying the distance/similarity method. Supported:
#' \itemize{
#'   \item \strong{Binary}: \code{"jaccard"}, \code{"dice"}, \code{"sokal_michener"}, \code{"russell_rao"},
#'   \code{"sokal_sneath"}, \code{"kulczynski"},\code{"hamming"}.
#'   \item \strong{Categorical}: \code{"matching_coefficient"}.
#'   \item \strong{Continuous}: \code{"euclidean"}, \code{"euclidean_standardized"}, \code{"manhattan"},
#'    \code{"minkowski"}, \code{"canberra"}, \code{"maximum"}, \code{"cosine"},
#'    \code{"correlation"}, \code{"mahalanobis"}.
#'   \item \strong{Mixed}: \code{"gower"}.
#' }
#' @param output_format Output format: \code{"dist"} (distance object), \code{"matrix"} (numeric matrix),
#'        or \code{"similarity"} (only for binary/categorical/mixed methods).
#' @param squared Logical; if \code{TRUE}, returns squared distances (not applied to similarities).
#' @param p Numeric; the power parameter for the Minkowski distance (required if \code{method = "minkowski"}).
#' @param similarity_transform Character string; if \code{output_format = "similarity"}, this specifies the formula to convert distances to similarity scores.
#' Supported:
#' \itemize{
#'   \item \code{"linear"} (default): \eqn{s_{ij} = 1 - \delta_{ij}}
#'   \item \code{"sqrt"}: \eqn{s_{ij} = 1 - \delta_{ij}^2}
#' }
#' @param ... Additional arguments passed to underlying functions.
#'
#' @return Depending on \code{output_format}, returns:
#' \itemize{
#'   \item dist object (if \code{output_format = "dist"})
#'   \item numeric matrix (if \code{output_format = "matrix"} or \code{"output_format = similarity"})
#' }
#' @details
#' When \code{output_format = "similarity"}, the function transforms computed distances into similarity scores using one of the supported transformations.
#'
#' The similarity transformation options are:
#' \describe{
#'   \item{\code{"linear"}}{Direct inversion of distance: \eqn{s_{ij} = 1 - \delta_{ij}}.}
#'   \item{\code{"sqrt"}}{Squared distance inversion: \eqn{s_{ij} = 1 - \delta_{ij}^2}, which may better preserve Euclidean properties.}
#' }
#' @examples
#' # Load example dataset
#' data("Data_HC_contamination", package = "dbrobust")
#' df <- Data_HC_contamination
#'
#' # --- Quick Example ---
#' numeric_data <- df[1:10, 1:4]  # subset for speed
#' d_euclid <- calculate_distances(
#'   numeric_data,
#'   method = "euclidean",
#'   output_format = "matrix"
#' )
#' \donttest{
#' # Load example dataset
#' data("Data_HC_contamination", package = "dbrobust")
#' df <- Data_HC_contamination[1:20,]
#'
#' # Example 1: Euclidean distance (numeric variables only)
#' numeric_data <- df[, 1:4]
#' d_euclid <- calculate_distances(
#'   numeric_data,
#'   method = "euclidean",
#'   output_format = "matrix"
#' )
#'
#' # Example 2: Manhattan distance
#' d_manhattan <- calculate_distances(
#'   numeric_data,
#'   method = "manhattan",
#'   output_format = "matrix"
#' )
#'
#' # Example 3: Categorical distance using Matching Coefficient
#' categorical_data <- df[, 5:7]
#' d_match <- calculate_distances(
#'   categorical_data,
#'   method = "matching_coefficient",
#'   output_format = "matrix"
#' )
#'
#' # Example 4: Mixed data distance using Gower (automatic type detection, asymmetric binary)
#' d_gower_asym <- calculate_distances(
#'   df,
#'   method = "gower",
#'   output_format = "dist",
#'   binary_asym = TRUE
#' )
#'
#' # Example 5: Minkowski distance with p = 3
#' d_minkowski <- calculate_distances(
#'   numeric_data,
#'   method = "minkowski",
#'   p = 3,
#'   output_format = "matrix"
#' )
#'
#' # Example 6: Jaccard distance for binary variables
#' binary_data <- df[, 8:9]
#' d_jaccard <- calculate_distances(
#'   binary_data,
#'   method = "jaccard",
#'   output_format = "matrix"
#' )
#'
#' # Example 7: Mahalanobis distance
#' d_mahal <- calculate_distances(
#'   numeric_data,
#'   method = "mahalanobis",
#'   output_format = "matrix"
#' )
#'
#' # Example 8: Manual selection of predictors for Gower distance
#' continuous_vars <- 1:4
#' binary_vars <- 8:9
#' categorical_vars <- 5:7
#' d_gower_manual <- calculate_distances(
#'   df,
#'   method = "gower",
#'   output_format = "dist",
#'   continuous_cols = continuous_vars,
#'   binary_cols = binary_vars,
#'   categorical_cols = categorical_vars
#' )
#' }
#' @seealso
#'   \code{\link[stats]{dist}} for basic distance measures,
#'   \code{\link[ade4]{dist.binary}} for binary distances,
#'   \code{\link[proxy]{dist}} for advanced metrics like cosine or correlation
#'
#' @importFrom stats dist as.dist
#' @export
calculate_distances <- function(x, method = "gower", output_format = "dist",
                                squared = FALSE, p = NULL, similarity_transform = "linear", ...) {
  # Normalize method and output_format names
  method <- tolower(gsub(" ", "_", method))
  output_format <- tolower(output_format)

  # Available method categories
  binary_methods <- c("jaccard", "dice", "sokal_michener", "russell_rao",
                      "sokal_sneath", "kulczynski", "hamming")
  categorical_methods <- c("matching_coefficient")
  continuous_methods <- c("euclidean", "euclidean_standardized", "manhattan", "minkowski", "canberra",
                          "maximum", "cosine", "correlation", "mahalanobis")

  if (!output_format %in% c("dist", "matrix", "similarity")) {
    stop("Invalid output_format: must be 'dist', 'matrix', or 'similarity'")
  }

  # Validate use of 'p' only for minkowski
  if (method == "minkowski" && is.null(p)) {
    stop("The parameter 'p' must be specified when method = 'minkowski'")
  }
  if (!is.null(p) && method != "minkowski") {
    stop("Parameter 'p' is only applicable when method = 'minkowski'")
  }

  if (output_format == "similarity" && method %in% continuous_methods) {
    stop(sprintf("Similarity output is not supported for continuous method '%s'. Please use output_format = 'dist' or 'matrix'.", method))
  }

  # General input validation
  if (!is.matrix(x) && !is.data.frame(x)) {
    stop("Input must be a matrix or data.frame")
  }

  dist_mat <- NULL
  n <- nrow(x)

  # Dispatch to appropriate distance function
  if (method %in% binary_methods) {
    dist_mat <- dist_binary(x, method)
    if (any(is.nan(dist_mat))) {
      warning("Some binary distances are NaN, likely due to zero denominators (e.g., Jaccard with all-zero rows).")
    }

  } else if (method %in% categorical_methods) {
    dist_mat <- dist_categorical(x, method)

  } else if (method %in% continuous_methods) {
    dist_mat <- dist_continuous(x, method, p)

  } else if (method == "gower") {
    warning("Gower distances may not satisfy the Euclidean property. Use caution with methods assuming euclideanity.")
    dist_mat <- dist_mixed(x)

  } else {
    stop("Unsupported method: ", method)
  }

  # Square distances if needed (only if output is not similarity)
  if (squared && output_format != "similarity") {
    dist_mat <- dist_mat^2
  }

  # Return properly formatted result
  is_similarity <- (output_format == "similarity")

  return(format_output(dist_mat, output_format, similarity = is_similarity, similarity_transform = similarity_transform))
}

Try the dbrobust package in your browser

Any scripts or data that you put into this service are public.

dbrobust documentation built on Nov. 5, 2025, 6:24 p.m.