Nothing
#' Select matching pairs enforcing one-to-one linkage
#'
#' @param pairs a \code{pairs} object, such as generated by
#' \code{\link{pair_blocking}}
#' @param variable the name of the new variable to create in pairs. This will be a
#' logical variable with a value of \code{TRUE} for the selected pairs.
#' @param score name of the score/weight variable of the pairs. When not given
#' and \code{attr(pairs, "score")} is defined, that is used.
#' @param threshold the threshold to apply. Pairs with a score above the
#' threshold are selected.
#' @param preselect a logical variable with the same length as \code{pairs} has
#' rows, or the name of such a variable in \code{pairs}. Pairs are only
#' selected when \code{preselect} is \code{TRUE}. This interacts with
#' \code{threshold} (pairs have to be selected with both conditions).
#' @param id_x a integer vector with the same length as the number of rows in
#' \code{pairs}, or the name of a column in \code{x}. This vector should
#' identify unique objects in \code{x}. When not specified it is assumed that
#' each element in \code{x} is unique.
#' @param id_y a integer vector with the same length as the number of rows in
#' \code{pairs}, or the name of a column in \code{y}. This vector should
#' identify unique objects in \code{y}. When not specified it is assumed that
#' each element in \code{y} is unique.
#' @param x \code{data.table} with one half of the pairs.
#' @param y \code{data.table} with the other half of the pairs.
#' @param inplace logical indicating whether \code{pairs} should be modified in place. When
#' pairs is large this can be more efficient.
#' @param include_ties when pairs for a given record have an equal weight, should
#' all pairs be included.
#' @param n an integer. Each element of x can be linked to at most n elements of
#' y.
#' @param m an integer. Each element of y can be linked to at most m elements of
#' x.
#' @param ... Used to pass additional arguments to methods
#'
#' @details
#' Both methods force one-to-one matching. \code{select_greedy} uses a greedy
#' algorithm that selects the first pair with the highest weight.
#' \code{select_n_to_m} tries to optimise the total weight of all of the
#' selected pairs. In general this will result in a better selection. However,
#' \code{select_n_to_m} uses much more memory and is much slower and, therefore,
#' can only be used when the number of possible pairs is not too large.
#'
#' Note that when \code{include_ties = TRUE} the same record can still be
#' selected more than once. In that case the pairs will have an equal weight.
#'
#' @return
#' Returns the \code{pairs} with the variable given by \code{variable} added. This
#' is a logical variable indicating which pairs are selected as matches.
#'
#' @examples
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' pairs <- compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' model <- problink_em(~ lastname + firstname + address + sex, data = pairs)
#' pairs <- predict(model, pairs, type = "mpost", add = TRUE, binary = TRUE)
#'
#' # Select pairs with a mpost > 0.5 and force one-to-one linkage
#' pairs <- select_n_to_m(pairs, "ntom", "mpost", 0.5)
#' pairs <- select_greedy(pairs, "greedy", "mpost", 0.5)
#' table(pairs$ntom, pairs$greedy)
#'
#' # The same example as above using a cluster;
#' library(parallel)
#' cl <- makeCluster(2)
#' \dontshow{clusterEvalQ(cl, data.table::setDTthreads(1))}
#' pairs <- cluster_pair_blocking(cl, linkexample1, linkexample2, "postcode")
#' compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' model <- problink_em(~ lastname + firstname + address + sex, data = pairs)
#' predict(model, pairs, type = "mpost", add = TRUE, binary = TRUE)
#' # Select pairs with a mpost > 0.5 and force one-to-one linkage
#' # select_n_to_m and select_greedy only work on pairs that are local;
#' # therefore we first collect the pairs
#' select_threshold(pairs, "selected", "mpost", 0.5)
#' local_pairs <- cluster_collect(pairs, "selected")
#' local_pairs <- select_n_to_m(local_pairs, "ntom", "mpost", 0.5)
#' local_pairs <- select_greedy(local_pairs, "greedy", "mpost", 0.5)
#' table(local_pairs$ntom, local_pairs$greedy)
#'
#' stopCluster(cl)
#'
#' @rdname select_n_to_m
#' @export
select_n_to_m <- function(pairs, variable, score, threshold, preselect = NULL,
id_x = NULL, id_y = NULL, ...) {
UseMethod("select_n_to_m")
}
#' @rdname select_n_to_m
#' @export
select_n_to_m.pairs <- function(pairs, variable, score, threshold, preselect = NULL,
id_x = NULL, id_y = NULL, x = attr(pairs, "x"), y = attr(pairs, "y"),
inplace = FALSE, ...) {
prep <- select_preprocess(pairs, score = score, threshold = threshold,
preselect = preselect, id_x = id_x, id_y = id_y, x = x, y = y)
sel_ind <- match_n_to_m(prep$.x, prep$.y, prep$score, n = 1, m = 1)
sel <- prep$index[sel_ind]
if (inplace) {
pairs[, (variable) := FALSE]
pairs[sel, (variable) := TRUE]
invisible(pairs)
} else {
pairs[[variable]] <- FALSE
pairs[[variable]][sel] <- TRUE
pairs
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.