# reclin: Record Linkage Toolkit
#
# Documented in select_n_to_m

#' Select matching pairs enforcing one-to-one linkage
#'
#' @param pairs a \code{pairs} object, such as generated by 
#'   \code{\link{pair_blocking}}
#' @param threshold the threshold to apply. Pairs with a score above the 
#'   threshold are selected. 
#' @param weight name of the score/weight variable of the pairs. When not given
#'   and \code{attr(pairs, "score")} is defined, that is used. 
#' @param var the name of the new variable to create in pairs. This will be a
#'   logical variable with a value of \code{TRUE} for the selected pairs.
#' @param preselect a logical variable with the same length as \code{pairs} has
#'   rows, or the name of such a variable in \code{pairs}. Pairs are only 
#'   selected when \code{preselect} is \code{TRUE}. This interacts with 
#'   \code{threshold} (pairs have to be selected with both conditions).
#' @param n the number of records from \code{x} that can at most be linked to a
#'   record in \code{y}. 
#' @param m the number of records from \code{y} that can at most be linked to a
#'   record in \code{x}.
#' @param id_x an integer vector with the same length as the number of rows in 
#'   \code{pairs}, or the name of a column in \code{x}. This vector should 
#'   identify unique objects in \code{x}. When not specified it is assumed that
#'   each element in \code{x} is unique. 
#' @param id_y an integer vector with the same length as the number of rows in 
#'   \code{pairs}, or the name of a column in \code{y}. This vector should 
#'   identify unique objects in \code{y}. When not specified it is assumed that
#'   each element in \code{y} is unique. 
#' @param ... passed on to other methods.
#'   
#' @details 
#' Both methods force one-to-one matching. \code{select_greedy} uses a greedy 
#' algorithm that selects the first pair with the highest weight. 
#' \code{select_n_to_m} tries to optimise the total weight of all of the 
#' selected pairs. In general this will result in a better selection. However,
#' \code{select_n_to_m} uses much more memory and is much slower and, therefore,
#' can only be used when the number of possible pairs is not too large. 
#'
#' @return
#' Returns the \code{pairs} with the variable given by \code{var} added. This
#' is a logical variable indicating which pairs are selected as matches. The
#' name of that variable is also stored in the \code{"selection"} attribute of
#' the returned object.
#'
#' @examples 
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' pairs <- compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' pairs <- score_simsum(pairs)
#' 
#' # Select pairs with a simsum > 0 and force one-to-one linkage
#' pairs <- select_n_to_m(pairs, 0, var = "ntom")
#' pairs <- select_greedy(pairs, 0, var = "greedy")
#' table(pairs[c("ntom", "greedy")])
#' 
#' \dontshow{gc()}
#'
#' @rdname select_n_to_m
#' @export
select_n_to_m <- function(pairs, threshold = NULL, weight = NULL, var = "select", 
    preselect = NULL, n = 1, m = 1, id_x = NULL, id_y = NULL, ...) {
  # S3 generic: dispatches on the class of `pairs` (the first argument).
  UseMethod("select_n_to_m")
}

#' @export
select_n_to_m.data.frame <- function(pairs, threshold = NULL, weight = NULL, var = "select",
    preselect = NULL, n = 1, m = 1, id_x = NULL, id_y = NULL, ...) {
  # Shared preprocessing. The result is used below as a list holding `x`, `y`
  # and `w` (handed to the matcher) and `select` (used as a logical mask that
  # ends up as the new column of `pairs`).
  candidates <- select_preprocess(pairs, threshold, weight, preselect, id_x, id_y)

  # Positions, relative to the candidate vectors, of the pairs the matcher keeps.
  kept <- match_n_to_m(candidates$x, candidates$y, candidates$w, n = n, m = m)
  is_kept <- replace(logical(length(candidates$x)), kept, TRUE)

  # Only entries that are TRUE in the mask correspond to candidates; overwrite
  # exactly those with the matcher's verdict, leaving the others FALSE.
  mask <- candidates$select
  mask[mask] <- is_kept

  # Store the result under the requested name and record that name on the
  # object via the "selection" attribute.
  pairs[[var]] <- mask
  attr(pairs, "selection") <- var
  pairs
}

#' @export
select_n_to_m.ldat <- function(pairs, threshold = NULL, weight = NULL, var = "select",
    preselect = NULL, n = 1, m = 1, id_x = NULL, id_y = NULL, ...) {
  # Shared preprocessing; `prepared$select` is the mask that is later stored in
  # `pairs`, while `prepared$x`, `prepared$y` and `prepared$w` feed the matcher.
  prepared <- select_preprocess(pairs, threshold, weight, preselect, id_x, id_y)
  n_candidates <- length(prepared$x)

  # Run the n-to-m matcher; its result is used as an index into a vector with
  # one element per candidate.
  matched_idx <- match_n_to_m(prepared$x, prepared$y, prepared$w, n = n, m = m)
  matched <- logical(n_candidates)
  matched[matched_idx] <- TRUE

  # Replace the TRUE entries of the mask by the matcher's decision for the
  # corresponding candidate.
  result <- prepared$select
  result[result] <- matched

  # Add the selection column and remember its name in the "selection" attribute.
  pairs[[var]] <- result
  attr(pairs, "selection") <- var
  pairs
}