Nothing
#' Compute Distance or Similarity Matrices
#'
#' Computes a distance or similarity matrix between rows of a data frame or matrix, supporting a wide variety of distance metrics.
#'
#' @param x A matrix or data.frame. Each row represents an observation.
#' @param method A string specifying the distance/similarity method. Supported:
#' \itemize{
#' \item \strong{Binary}: \code{"jaccard"}, \code{"dice"}, \code{"sokal_michener"}, \code{"russell_rao"},
#' \code{"sokal_sneath"}, \code{"kulczynski"},\code{"hamming"}.
#' \item \strong{Categorical}: \code{"matching_coefficient"}.
#' \item \strong{Continuous}: \code{"euclidean"}, \code{"euclidean_standardized"}, \code{"manhattan"},
#' \code{"minkowski"}, \code{"canberra"}, \code{"maximum"}, \code{"cosine"},
#' \code{"correlation"}, \code{"mahalanobis"}.
#' \item \strong{Mixed}: \code{"gower"}.
#' }
#' @param output_format Output format: \code{"dist"} (distance object), \code{"matrix"} (numeric matrix),
#' or \code{"similarity"} (only for binary/categorical/mixed methods).
#' @param squared Logical; if \code{TRUE}, returns squared distances (not applied to similarities).
#' @param p Numeric; the power parameter for the Minkowski distance (required if \code{method = "minkowski"}).
#' @param similarity_transform Character string; if \code{output_format = "similarity"}, this specifies the formula to convert distances to similarity scores.
#' Supported:
#' \itemize{
#' \item \code{"linear"} (default): \eqn{s_{ij} = 1 - \delta_{ij}}
#' \item \code{"sqrt"}: \eqn{s_{ij} = 1 - \delta_{ij}^2}
#' }
#' @param ... Additional arguments passed to underlying functions.
#'
#' @return Depending on \code{output_format}, returns:
#' \itemize{
#' \item dist object (if \code{output_format = "dist"})
#' \item numeric matrix (if \code{output_format = "matrix"} or \code{"output_format = similarity"})
#' }
#' @details
#' When \code{output_format = "similarity"}, the function transforms computed distances into similarity scores using one of the supported transformations.
#'
#' The similarity transformation options are:
#' \describe{
#' \item{\code{"linear"}}{Direct inversion of distance: \eqn{s_{ij} = 1 - \delta_{ij}}.}
#' \item{\code{"sqrt"}}{Squared distance inversion: \eqn{s_{ij} = 1 - \delta_{ij}^2}, which may better preserve Euclidean properties.}
#' }
#' @examples
#' # Load example dataset
#' data("Data_HC_contamination", package = "dbrobust")
#' df <- Data_HC_contamination
#'
#' # --- Quick Example ---
#' numeric_data <- df[1:10, 1:4] # subset for speed
#' d_euclid <- calculate_distances(
#' numeric_data,
#' method = "euclidean",
#' output_format = "matrix"
#' )
#' \donttest{
#' # Load example dataset
#' data("Data_HC_contamination", package = "dbrobust")
#' df <- Data_HC_contamination[1:20,]
#'
#' # Example 1: Euclidean distance (numeric variables only)
#' numeric_data <- df[, 1:4]
#' d_euclid <- calculate_distances(
#' numeric_data,
#' method = "euclidean",
#' output_format = "matrix"
#' )
#'
#' # Example 2: Manhattan distance
#' d_manhattan <- calculate_distances(
#' numeric_data,
#' method = "manhattan",
#' output_format = "matrix"
#' )
#'
#' # Example 3: Categorical distance using Matching Coefficient
#' categorical_data <- df[, 5:7]
#' d_match <- calculate_distances(
#' categorical_data,
#' method = "matching_coefficient",
#' output_format = "matrix"
#' )
#'
#' # Example 4: Mixed data distance using Gower (automatic type detection, asymmetric binary)
#' d_gower_asym <- calculate_distances(
#' df,
#' method = "gower",
#' output_format = "dist",
#' binary_asym = TRUE
#' )
#'
#' # Example 5: Minkowski distance with p = 3
#' d_minkowski <- calculate_distances(
#' numeric_data,
#' method = "minkowski",
#' p = 3,
#' output_format = "matrix"
#' )
#'
#' # Example 6: Jaccard distance for binary variables
#' binary_data <- df[, 8:9]
#' d_jaccard <- calculate_distances(
#' binary_data,
#' method = "jaccard",
#' output_format = "matrix"
#' )
#'
#' # Example 7: Mahalanobis distance
#' d_mahal <- calculate_distances(
#' numeric_data,
#' method = "mahalanobis",
#' output_format = "matrix"
#' )
#'
#' # Example 8: Manual selection of predictors for Gower distance
#' continuous_vars <- 1:4
#' binary_vars <- 8:9
#' categorical_vars <- 5:7
#' d_gower_manual <- calculate_distances(
#' df,
#' method = "gower",
#' output_format = "dist",
#' continuous_cols = continuous_vars,
#' binary_cols = binary_vars,
#' categorical_cols = categorical_vars
#' )
#' }
#' @seealso
#' \code{\link[stats]{dist}} for basic distance measures,
#' \code{\link[ade4]{dist.binary}} for binary distances,
#' \code{\link[proxy]{dist}} for advanced metrics like cosine or correlation
#'
#' @importFrom stats dist as.dist
#' @export
calculate_distances <- function(x, method = "gower", output_format = "dist",
squared = FALSE, p = NULL, similarity_transform = "linear", ...) {
# Normalize method and output_format names
method <- tolower(gsub(" ", "_", method))
output_format <- tolower(output_format)
# Available method categories
binary_methods <- c("jaccard", "dice", "sokal_michener", "russell_rao",
"sokal_sneath", "kulczynski", "hamming")
categorical_methods <- c("matching_coefficient")
continuous_methods <- c("euclidean", "euclidean_standardized", "manhattan", "minkowski", "canberra",
"maximum", "cosine", "correlation", "mahalanobis")
if (!output_format %in% c("dist", "matrix", "similarity")) {
stop("Invalid output_format: must be 'dist', 'matrix', or 'similarity'")
}
# Validate use of 'p' only for minkowski
if (method == "minkowski" && is.null(p)) {
stop("The parameter 'p' must be specified when method = 'minkowski'")
}
if (!is.null(p) && method != "minkowski") {
stop("Parameter 'p' is only applicable when method = 'minkowski'")
}
if (output_format == "similarity" && method %in% continuous_methods) {
stop(sprintf("Similarity output is not supported for continuous method '%s'. Please use output_format = 'dist' or 'matrix'.", method))
}
# General input validation
if (!is.matrix(x) && !is.data.frame(x)) {
stop("Input must be a matrix or data.frame")
}
dist_mat <- NULL
n <- nrow(x)
# Dispatch to appropriate distance function
if (method %in% binary_methods) {
dist_mat <- dist_binary(x, method)
if (any(is.nan(dist_mat))) {
warning("Some binary distances are NaN, likely due to zero denominators (e.g., Jaccard with all-zero rows).")
}
} else if (method %in% categorical_methods) {
dist_mat <- dist_categorical(x, method)
} else if (method %in% continuous_methods) {
dist_mat <- dist_continuous(x, method, p)
} else if (method == "gower") {
warning("Gower distances may not satisfy the Euclidean property. Use caution with methods assuming euclideanity.")
dist_mat <- dist_mixed(x)
} else {
stop("Unsupported method: ", method)
}
# Square distances if needed (only if output is not similarity)
if (squared && output_format != "similarity") {
dist_mat <- dist_mat^2
}
# Return properly formatted result
is_similarity <- (output_format == "similarity")
return(format_output(dist_mat, output_format, similarity = is_similarity, similarity_transform = similarity_transform))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.