#' Matching
#'
#' Conduct K-partite or unrestricted (minimum distance) matching to
#' find pairs or groups of similar elements. By default, finding
#' matches is based on the Euclidean distance between data points, but
#' a custom dissimilarity measure can also be employed.
#'
#' @param x The data input. Can be one of two structures: (1) A feature
#' matrix where rows correspond to elements and columns correspond
#' to variables (a single numeric variable can be passed as a
#' vector). (2) An N x N matrix dissimilarity matrix; can be an
#' object of class \code{dist} (e.g., returned by
#' \code{\link{dist}} or \code{\link{as.dist}}) or a \code{matrix}
#' where the entries of the upper and lower triangular matrix
#' represent pairwise dissimilarities.
#' @param p The size of the groups; the default is 2, in which case
#' the function returns pairs.
#' @param match_between An optional vector, \code{data.frame} or
#' matrix representing one or several categorical constraints. If
#' passed, the argument \code{p} is ignored and matches are sought
#' between elements of different categories.
#' @param match_within An optional vector, \code{data.frame} or matrix
#' representing one or several categorical constraints. If passed,
#' matches are sought between elements of the same category.
#' @param match_extreme_first Logical: Determines if matches are first
#' sought for extreme elements first or for central
#' elements. Defaults to \code{TRUE}.
#' @param target_group Currently, the options "none",
#' smallest" and "diverse" are supported. See Details.
#' @param sort_output Boolean. If \code{TRUE} (default), the output clusters
#' are sorted by similarity. See Details.
#'
#'
#' @return An integer vector encoding the matches. See Details for
#' more information.
#'
#'
#' @details
#'
#' If the data input \code{x} is a feature matrix, matching is based
#' on the Euclidean distance between data points. If the argument
#' \code{x} is a dissimilarity matrix, matching is based on the
#' user-specified dissimilarities. To find matches, the algorithm
#' proceeds by selecting a target element and then searching its
#' nearest neighbours. Critical to the behaviour or the algorithm is
#' the order in which target elements are selected. By default, the
#' most extreme elements are selected first, i.e., elements with the
#' highest distance to the centroid of the data set (see
#' \code{\link{balanced_clustering}} that relies on the same
#' algorithm). Set the argument \code{match_extreme_first} to
#' \code{FALSE}, to enforce that elements close to the centroid are
#' first selected as targets.
#'
#' If the argument \code{match_between} is passed and the groups
#' specified via this argument are of different size, target elements
#' are selected from the smallest group by default (because in this
#' group, all elements can be matched). However, it is also possible
#' to specify how matches are selected through the option
#' \code{target_group}. When specifying \code{"none"}, matches are
#' always selected from extreme elements, irregardless of the group
#' sizes (or from central elements first if \code{match_extreme_first
#' = FALSE}). With option \code{"smallest"}, matches are selected from
#' the smallest group. With option \code{"diverse"}, matches are
#' selected from the most heterogenous group according to the sum of
#' pairwise distances within groups.
#'
#' The output is an integer vector encoding which elements have been
#' matched. The grouping numbers are sorted by similarity. That is,
#' elements with the grouping number »1« have the highest intra-group
#' similarity, followed by 2 etc (groups having the same similarity
#' index are still assigned a different grouping number,
#' though). Similarity is measured as the sum of pairwise (Euclidean)
#' distances within groups (see \code{\link{diversity_objective}}). To
#' prevent sorting by similarity (this is some extra computational burden),
#' set \code{sort_output = FALSE}. Some unmatched elements may be \code{NA}.
#' This happens if it is not
#' possible to evenly split the item pool evenly into groups of size
#' \code{p} or if the categories described by the argument
#' \code{match_between} are of different size.
#'
#' @note It is possible to specify grouping restrictions via
#' \code{match_between} and \code{match_within} at the same time.
#'
#' @author
#' Martin Papenberg \email{martin.papenberg@@hhu.de}
#'
#' @examples
#'
#' # Find triplets
#' N <- 120
#' lds <- data.frame(f1 = rnorm(N), f2 = rnorm(N))
#' triplets <- matching(lds, p = 3)
#' plot_clusters(
#' lds,
#' clusters = triplets,
#' within_connection = TRUE
#' )
#'
#' # Bipartite matching with unequal-sized groups:
#' # Only selects matches for some elements
#' N <- 100
#' data <- matrix(rnorm(N), ncol = 1)
#' groups <- sample(1:2, size = N, replace = TRUE, prob = c(0.8, 0.2))
#' matched <- matching(data[, 1], match_between = groups)
#' plot_clusters(
#' cbind(groups, data),
#' clusters = matched,
#' within_connection = TRUE
#' )
#'
#' # Match objects from the same category only
#' matched <- matching(
#' schaper2019[, 3:6],
#' p = 3,
#' match_within = schaper2019$room
#' )
#' head(table(matched, schaper2019$room))
#'
#' # Match between different plant species in the »iris« data set
#' species <- iris$Species != "versicolor"
#' matched <- matching(
#' iris[species, 1],
#' match_between = iris[species, 5]
#' )
#' # Adjust `match_extreme_first` argument
#' matched2 <- matching(
#' iris[species, 1],
#' match_between = iris[species, 5],
#' match_extreme_first = FALSE
#' )
#' # Plot the matching results
#' user_par <- par("mfrow")
#' par(mfrow = c(1, 2))
#' data <- data.frame(
#' Species = as.numeric(iris[species, 5]),
#' Sepal.Length = iris[species, 1]
#' )
#' plot_clusters(
#' data,
#' clusters = matched,
#' within_connection = TRUE,
#' main = "Extreme elements matched first"
#' )
#' plot_clusters(
#' data,
#' clusters = matched2,
#' within_connection = TRUE,
#' main = "Central elements matched first"
#' )
#' par(mfrow = user_par)
#'
#'
#' @export
#'
matching <- function(
x,
p = 2,
match_between = NULL,
match_within = NULL,
match_extreme_first = TRUE,
target_group = NULL,
sort_output = TRUE
) {
input_validation_matching(x, p, match_between, match_within, match_extreme_first, target_group)
data <- to_matrix(x)
match_between <- merge_into_one_variable(match_between)
target_group <- get_target_group(data, match_between, target_group)
if (argument_exists(match_within)) {
cl <- match_within(data, p, match_between, match_within, match_extreme_first, target_group)
} else {
cl <- nn_centroid_clustering(data, p, match_between, match_extreme_first, target_group)
}
# Before returning: order the group numbers by objective - most similar
# matches have lower indices
if (sort_output) {
return(sort_by_objective(cl, data))
}
cl
}
# Determine from which group target elements are selected
get_target_group <- function(data, match_between, target_group) {
if (!argument_exists(match_between)) { # there is no target group
return(FALSE)
}
tab <- table(match_between)
# next: target group was not specified by user
if (!argument_exists(target_group)) {
if (all(tab == tab[1])) {
# if all groups are of same size, there is no target group
return(FALSE)
}
# smallest group is target group
return(which.min(tab))
}
# last: user specified target group ("smallest", "diverse", "none")
if (target_group == "smallest") {
return(which.min(tab))
} else if (target_group == "diverse") {
return(which.max(diversity_objective_by_group(match_between, data)))
} else if (target_group == "none") {
return(FALSE)
} else {
stop("argument `target_group` must be one of 'smallest', 'diverse' or 'none'")
}
}
# conduct a matching for each category if `match_within` is passed
match_within <- function(data, p, match_between, match_within, match_extreme_first, target_group) {
match_within <- merge_into_one_variable(match_within)
N <- nrow(data)
cl <- rep(NA, N)
c <- length(unique(match_within))
for (i in 1:c) {
tmp_data <- subset_data_matrix(data, match_within == i)
cl_tmp <- nn_centroid_clustering(
tmp_data,
p,
match_between[match_within == i],
match_extreme_first
)
# ensure that different cluster numbers are given to different groups
cl[match_within == i] <- ifelse(is.na(cl_tmp), NA, paste0(cl_tmp, "_", i))
}
to_numeric(cl)
}
# After matching was conducted, reorder the group labels by similarity
sort_by_objective <- function(cl, data, N) {
N <- nrow(data)
selected <- (1:N)[!is.na(cl)]
cl_sub <- cl[selected]
cl_sub <- order_cluster_vector(cl_sub)
objectives <- diversity_objective_by_group(cl_sub, subset_data_matrix(data, selected))
# recode original matching labels according to objective
N <- length(cl_sub)
# sorry for this code, but it works and it was really complicated
one <- data.frame(match = cl_sub, order_matches = 1:length(cl_sub))
two <- data.frame(match = order(objectives), objective_id = 1:length(objectives))
merged <- merge(one, two)
new <- rep(NA, N)
new[merged$order_matches] <- merged$objective_id
cl[!is.na(cl)] <- new
cl
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.