Nothing
# ==============================================================================
# Matching Distance - Distance computation and scaling
# ==============================================================================
#' Compute pairwise distance matrix
#'
#' @return Numeric matrix of pairwise distances (n_left x n_right).
#' @keywords internal
compute_distance_matrix <- function(left_mat, right_mat, distance = "euclidean") {
n_left <- nrow(left_mat)
n_right <- nrow(right_mat)
n_vars <- ncol(left_mat)
# Validate dimensions
if (ncol(right_mat) != n_vars) {
stop("left_mat and right_mat must have same number of columns", call. = FALSE)
}
# Handle distance specification
if (is.function(distance)) {
# User-provided distance function
# Assume it takes two matrices and returns a distance matrix
return(distance(left_mat, right_mat))
}
# Built-in distance metrics
distance <- tolower(as.character(distance)[1])
dist_matrix <- matrix(0, nrow = n_left, ncol = n_right)
if (distance %in% c("euclidean", "l2")) {
# Euclidean distance: sqrt(sum((x - y)^2))
for (i in seq_len(n_left)) {
for (j in seq_len(n_right)) {
diff <- left_mat[i, ] - right_mat[j, ]
dist_matrix[i, j] <- sqrt(sum(diff^2))
}
}
} else if (distance %in% c("manhattan", "l1", "cityblock")) {
# Manhattan distance: sum(|x - y|)
for (i in seq_len(n_left)) {
for (j in seq_len(n_right)) {
diff <- abs(left_mat[i, ] - right_mat[j, ])
dist_matrix[i, j] <- sum(diff)
}
}
} else if (distance %in% c("squared_euclidean", "sqeuclidean", "sq")) {
# Squared Euclidean distance: sum((x - y)^2)
for (i in seq_len(n_left)) {
for (j in seq_len(n_right)) {
diff <- left_mat[i, ] - right_mat[j, ]
dist_matrix[i, j] <- sum(diff^2)
}
}
} else if (distance %in% c("chebyshev", "chebychev", "maximum", "max")) {
# Chebyshev distance: max(|x - y|)
for (i in seq_len(n_left)) {
for (j in seq_len(n_right)) {
diff <- abs(left_mat[i, ] - right_mat[j, ])
dist_matrix[i, j] <- max(diff)
}
}
} else if (distance %in% c("mahalanobis", "maha")) {
# Mahalanobis distance: sqrt((x-y)' * Sigma^-1 * (x-y))
# Use pooled covariance matrix
combined <- rbind(left_mat, right_mat)
cov_mat <- stats::cov(combined)
# Check for singularity
if (det(cov_mat) == 0) {
stop("Covariance matrix is singular; cannot compute Mahalanobis distance",
call. = FALSE)
}
inv_cov <- solve(cov_mat)
for (i in seq_len(n_left)) {
for (j in seq_len(n_right)) {
diff <- left_mat[i, ] - right_mat[j, ]
dist_matrix[i, j] <- sqrt(as.numeric(t(diff) %*% inv_cov %*% diff))
}
}
} else {
stop(sprintf("Unknown distance metric: %s", distance), call. = FALSE)
}
dist_matrix
}
#' Apply scaling to matching variables
#'
#' @return List with scaled left/right matrices and scaling parameters.
#' @keywords internal
apply_scaling <- function(left_mat, right_mat, method = "standardize") {
if (method == FALSE || method == "none" || is.null(method)) {
return(list(left = left_mat, right = right_mat, params = NULL))
}
# Compute scaling parameters from combined data
combined <- rbind(left_mat, right_mat)
if (method == TRUE || method == "standardize" || method == "scale") {
# Standardize to mean 0, sd 1
means <- colMeans(combined)
sds <- apply(combined, 2, stats::sd)
# Avoid division by zero
sds[sds == 0] <- 1
left_scaled <- scale(left_mat, center = means, scale = sds)
right_scaled <- scale(right_mat, center = means, scale = sds)
params <- list(method = "standardize", means = means, sds = sds)
} else if (method == "range" || method == "minmax") {
# Scale to [0, 1] based on combined range
mins <- apply(combined, 2, min)
maxs <- apply(combined, 2, max)
ranges <- maxs - mins
# Avoid division by zero
ranges[ranges == 0] <- 1
left_scaled <- sweep(sweep(left_mat, 2, mins, "-"), 2, ranges, "/")
right_scaled <- sweep(sweep(right_mat, 2, mins, "-"), 2, ranges, "/")
params <- list(method = "range", mins = mins, maxs = maxs, ranges = ranges)
} else if (method == "robust") {
# Robust scaling using median and MAD (median absolute deviation)
medians <- apply(combined, 2, stats::median)
mads <- apply(combined, 2, stats::mad)
# Avoid division by zero
mads[mads == 0] <- 1
left_scaled <- sweep(sweep(left_mat, 2, medians, "-"), 2, mads, "/")
right_scaled <- sweep(sweep(right_mat, 2, medians, "-"), 2, mads, "/")
params <- list(method = "robust", medians = medians, mads = mads)
} else {
stop(sprintf("Unknown scaling method: %s", method), call. = FALSE)
}
# Remove attributes added by scale()
left_scaled <- as.matrix(left_scaled)
right_scaled <- as.matrix(right_scaled)
attr(left_scaled, "scaled:center") <- NULL
attr(left_scaled, "scaled:scale") <- NULL
attr(right_scaled, "scaled:center") <- NULL
attr(right_scaled, "scaled:scale") <- NULL
list(left = left_scaled, right = right_scaled, params = params)
}
#' Apply weights to matching variables
#'
#' @return Numeric matrix with columns weighted.
#' @keywords internal
apply_weights <- function(mat, weights) {
if (is.null(weights) || all(weights == 1)) {
return(mat)
}
if (length(weights) != ncol(mat)) {
stop("Length of weights must match number of columns in matrix", call. = FALSE)
}
# Apply weights by scaling columns
# For distance calculation, we multiply each variable by sqrt(weight)
# so that squared differences are weighted correctly
sweep(mat, 2, sqrt(weights), "*")
}
#' Build cost matrix for matching
#'
#' This is the main entry point for distance computation.
#'
#' @return Numeric matrix of distances with optional scaling/weights applied.
#' @keywords internal
build_cost_matrix <- function(left, right, vars, distance = "euclidean",
weights = NULL, scale = FALSE) {
# Extract variable matrices
left_mat <- extract_matching_vars(left, vars)
right_mat <- extract_matching_vars(right, vars)
# Validate and normalize weights
weights <- validate_weights(weights, vars)
# Apply scaling if requested
if (!identical(scale, FALSE)) {
scaled <- apply_scaling(left_mat, right_mat, method = scale)
left_mat <- scaled$left
right_mat <- scaled$right
scaling_params <- scaled$params
} else {
scaling_params <- NULL
}
# Apply weights
left_mat <- apply_weights(left_mat, weights)
right_mat <- apply_weights(right_mat, weights)
# Compute distance matrix
dist_matrix <- compute_distance_matrix(left_mat, right_mat, distance)
# Add metadata as attributes
attr(dist_matrix, "distance") <- distance
attr(dist_matrix, "weights") <- weights
attr(dist_matrix, "scaling") <- scaling_params
attr(dist_matrix, "vars") <- vars
dist_matrix
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.