R/select_n_to_m.R
In reclin2: Record Linkage Toolkit

Documented in select_n_to_m select_n_to_m.pairs

#' Select matching pairs enforcing one-to-one linkage
#'
#' @param pairs a \code{pairs} object, such as generated by 
#'   \code{\link{pair_blocking}}
#' @param variable the name of the new variable to create in pairs. This will be a
#'   logical variable with a value of \code{TRUE} for the selected pairs.
#' @param score name of the score/weight variable of the pairs. When not given
#'   and \code{attr(pairs, "score")} is defined, that is used. 
#' @param threshold the threshold to apply. Pairs with a score above the 
#'   threshold are selected. 
#' @param preselect a logical variable with the same length as \code{pairs} has
#'   rows, or the name of such a variable in \code{pairs}. Pairs are only 
#'   selected when \code{preselect} is \code{TRUE}. This interacts with 
#'   \code{threshold} (pairs have to be selected with both conditions).
#' @param id_x a integer vector with the same length as the number of rows in 
#'   \code{pairs}, or the name of a column in \code{x}. This vector should 
#'   identify unique objects in \code{x}. When not specified it is assumed that
#'   each element in \code{x} is unique. 
#' @param id_y a integer vector with the same length as the number of rows in 
#'   \code{pairs}, or the name of a column in \code{y}. This vector should 
#'   identify unique objects in \code{y}. When not specified it is assumed that
#'   each element in \code{y} is unique. 
#' @param x \code{data.table} with one half of the pairs.
#' @param y \code{data.table} with the other half of the pairs.
#' @param inplace logical indicating whether \code{pairs} should be modified in place. When
#'   pairs is large this can be more efficient.
#' @param include_ties when pairs for a given record have an equal weight, should
#'   all pairs be included.
#' @param n an integer. Each element of x can be linked to at most n elements of
#'   y. 
#' @param m an integer. Each element of y can be linked to at most m elements of
#'   x. 
#' @param ... Used to pass additional arguments to methods
#'   
#' @details 
#' Both methods force one-to-one matching. \code{select_greedy} uses a greedy 
#' algorithm that selects the first pair with the highest weight. 
#' \code{select_n_to_m} tries to optimise the total weight of all of the 
#' selected pairs. In general this will result in a better selection. However,
#' \code{select_n_to_m} uses much more memory and is much slower and, therefore,
#' can only be used when the number of possible pairs is not too large. 
#'
#' Note that when \code{include_ties = TRUE} the same record can still be 
#' selected more than once. In that case the pairs will have an equal weight.
#'
#' @return
#' Returns the \code{pairs} with the variable given by \code{variable} added. This
#' is a logical variable indicating which pairs are selected as matches.
#'
#' @examples 
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' pairs <- compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' model <- problink_em(~ lastname + firstname + address + sex, data = pairs)
#' pairs <- predict(model, pairs, type = "mpost", add = TRUE, binary = TRUE)
#' 
#' # Select pairs with a mpost > 0.5 and force one-to-one linkage
#' pairs <- select_n_to_m(pairs, "ntom", "mpost", 0.5)
#' pairs <- select_greedy(pairs, "greedy", "mpost", 0.5)
#' table(pairs$ntom, pairs$greedy)
#' 
#' # The same example as above using a cluster;
#' library(parallel)
#' cl <- makeCluster(2)
#' \dontshow{clusterEvalQ(cl, data.table::setDTthreads(1))}
#' pairs <- cluster_pair_blocking(cl, linkexample1, linkexample2, "postcode")
#' compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' model <- problink_em(~ lastname + firstname + address + sex, data = pairs)
#' predict(model, pairs, type = "mpost", add = TRUE, binary = TRUE)
#' # Select pairs with a mpost > 0.5 and force one-to-one linkage
#' # select_n_to_m and select_greedy only work on pairs that are local; 
#' # therefore we first collect the pairs
#' select_threshold(pairs, "selected", "mpost", 0.5)
#' local_pairs <- cluster_collect(pairs, "selected")
#' local_pairs <- select_n_to_m(local_pairs, "ntom", "mpost", 0.5)
#' local_pairs <- select_greedy(local_pairs, "greedy", "mpost", 0.5)
#' table(local_pairs$ntom, local_pairs$greedy)
#' 
#' stopCluster(cl)
#' 
#' @rdname select_n_to_m
#' @export
select_n_to_m <- function(pairs, variable, score, threshold, preselect = NULL, 
    id_x = NULL, id_y = NULL, ...) { 
  UseMethod("select_n_to_m")
}

#' @rdname select_n_to_m
#' @export
select_n_to_m.pairs <- function(pairs, variable, score, threshold, preselect = NULL, 
    id_x = NULL, id_y = NULL, x = attr(pairs, "x"), y = attr(pairs, "y"), 
    inplace = FALSE, ...) {
  prep <- select_preprocess(pairs, score = score, threshold = threshold, 
    preselect = preselect, id_x = id_x, id_y = id_y, x = x, y = y)
  sel_ind <- match_n_to_m(prep$.x, prep$.y, prep$score, n = 1, m = 1)
  sel <- prep$index[sel_ind]
  if (inplace) {
    pairs[, (variable) := FALSE]
    pairs[sel, (variable) := TRUE]
    invisible(pairs)
  } else {
    pairs[[variable]] <- FALSE
    pairs[[variable]][sel] <- TRUE
    pairs
  }
}