Nothing
#' Deselect pairs that are linked to multiple records
#'
#' @param pairs a \code{pairs} object, such as generated by
#' \code{\link{pair_blocking}}
#' @param variable the name of the new variable to create in pairs. This will be a
#' logical variable with a value of \code{TRUE} for the selected pairs.
#' @param preselect a logical variable with the same length as \code{pairs} has
#' rows, or the name of such a variable in \code{pairs}. Pairs are only
#' selected when \code{preselect} is \code{TRUE}.
#' @param n do not select pairs with a y-record that is linked to more than
#' \code{n} records.
#' @param m do not select pairs with a m-record that is linked to more than
#' \code{m} records.
#' @param id_x a integer vector with the same length as the number of rows in
#' \code{pairs}, or the name of a column in \code{x}. This vector should
#' identify unique objects in \code{x}. When not specified it is assumed that
#' each element in \code{x} is unique.
#' @param id_y a integer vector with the same length as the number of rows in
#' \code{pairs}, or the name of a column in \code{y}. This vector should
#' identify unique objects in \code{y}. When not specified it is assumed that
#' each element in \code{y} is unique.
#' @param x \code{data.table} with one half of the pairs.
#' @param y \code{data.table} with the other half of the pairs.
#' @param inplace logical indicating whether \code{pairs} should be modified in place. When
#' pairs is large this can be more efficient.
#' @param ... Used to pass additional arguments to methods
#'
#' @details
#' This function can be used to remove pairs for which there is ambiguity. For
#' example when a record from \code{x} is linked to multiple records from
#' \code{y} and we know that there are no duplicate records in \code{y} (records
#' that belong to the same object), then we know that at least on of the two
#' links is incorrect but we cannot decide which of the two. In that case we may
#' want to decide that we will not link both records. Running
#' \code{select_unique} with \code{m == 1} will remove both records.
#'
#' In case one wants to select one of the records randomly: \code{select_greedy}
#' will select the pair with the highest weight and in case of an equal weight
#' the first. Adding a random component to the weights will ensure a random
#' selection.
#'
#' @return
#' Returns the \code{pairs} with the variable given by \code{variable} added. This
#' is a logical variable indicating which pairs are selected as matches.
#'
#' @examples
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' compare_pairs(pairs, on = c("lastname", "firstname", "address", "sex"),
#' default_comparator = jaro_winkler(0.9), inplace = TRUE)
#' score_simple(pairs, "score",
#' on = c("lastname", "firstname", "address", "sex"),
#' w1 = list(lastname = 2), inplace = TRUE)
#' select_threshold(pairs, variable = "select",
#' score = "score", threshold = 4.0, inplace = TRUE)
#' select_unique(pairs, variable = "select_unique", preselect = "select")
#'
#' @rdname select_unique
#' @export
select_unique <- function(pairs, variable, preselect = NULL, n = 1, m = 1,
id_x = NULL, id_y = NULL, ...) {
UseMethod("select_unique")
}
#' @rdname select_unique
#' @export
select_unique.pairs <- function(pairs, variable, preselect = NULL, n = 1, m = 1,
id_x = NULL, id_y = NULL, x = attr(pairs, "x"), y = attr(pairs, "y"),
inplace = FALSE, ...) {
prep <- select_preprocess(pairs, score = rep(1, nrow(pairs)),
preselect = preselect, id_x = id_x, id_y = id_y, x = x, y = y)
# Create a logical vector indicating which records are to be selected;
# for now select all records
sel <- !logical(nrow(prep))
# Check which records from x need to be removed
nx <- prep[, list(M = .N), by = .x]
x_remove <- nx[M > m][[".x"]]
sel[prep$.x %in% x_remove] <- FALSE
# Check which records from y need to be removed
ny <- prep[, list(N = .N), by = .y]
y_remove <- ny[N > n][[".y"]]
sel[prep$.y %in% y_remove] <- FALSE
# Update the selection
if (inplace) {
pairs[, (variable) := FALSE]
index <- prep$index[sel]
pairs[index, (variable) := TRUE]
invisible(pairs)
} else {
pairs[[variable]] <- if (nrow(pairs) > 0) FALSE else logical(0)
pairs[[variable]][prep$index[sel]] <- if (nrow(pairs) > 0) TRUE else logical(0)
pairs
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.