R/select_unique.R

Defines functions select_unique.pairs select_unique

Documented in select_unique select_unique.pairs

#' Deselect pairs that are linked to multiple records
#'
#' @param pairs a \code{pairs} object, such as generated by 
#'   \code{\link{pair_blocking}}
#' @param variable the name of the new variable to create in pairs. This will be a
#'   logical variable with a value of \code{TRUE} for the selected pairs.
#' @param preselect a logical variable with the same length as \code{pairs} has
#'   rows, or the name of such a variable in \code{pairs}. Pairs are only 
#'   selected when \code{preselect} is \code{TRUE}. 
#' @param n do not select pairs with a y-record that is linked to more than
#'   \code{n} records.
#' @param m do not select pairs with a m-record that is linked to more than
#'   \code{m} records.
#' @param id_x a integer vector with the same length as the number of rows in 
#'   \code{pairs}, or the name of a column in \code{x}. This vector should 
#'   identify unique objects in \code{x}. When not specified it is assumed that
#'   each element in \code{x} is unique. 
#' @param id_y a integer vector with the same length as the number of rows in 
#'   \code{pairs}, or the name of a column in \code{y}. This vector should 
#'   identify unique objects in \code{y}. When not specified it is assumed that
#'   each element in \code{y} is unique. 
#' @param x \code{data.table} with one half of the pairs.
#' @param y \code{data.table} with the other half of the pairs.
#' @param inplace logical indicating whether \code{pairs} should be modified in place. When
#'   pairs is large this can be more efficient.
#' @param ... Used to pass additional arguments to methods
#'
#' @details
#' This function can be used to remove pairs for which there is ambiguity. For
#' example when a record from \code{x} is linked to multiple records from
#' \code{y} and we know that there are no duplicate records in \code{y} (records
#' that belong to the same object), then we know that at least on of the two
#' links is incorrect but we cannot decide which of the two. In that case we may
#' want to decide that we will not link both records. Running
#' \code{select_unique} with \code{m == 1} will remove both records.
#'
#' In case one wants to select one of the records randomly: \code{select_greedy}
#' will select the pair with the highest weight and in case of an equal weight
#' the first. Adding a random component to the weights will ensure a random
#' selection.
#'
#' @return
#' Returns the \code{pairs} with the variable given by \code{variable} added. This
#' is a logical variable indicating which pairs are selected as matches.
#'
#' @examples
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' compare_pairs(pairs, on = c("lastname", "firstname", "address", "sex"),
#'   default_comparator = jaro_winkler(0.9), inplace = TRUE)
#' score_simple(pairs, "score", 
#'   on = c("lastname", "firstname", "address", "sex"),
#'   w1 = list(lastname = 2), inplace = TRUE)
#' select_threshold(pairs, variable = "select", 
#'   score = "score", threshold = 4.0, inplace =  TRUE)
#' select_unique(pairs, variable = "select_unique", preselect = "select")
#' 
#' @rdname select_unique
#' @export
select_unique <- function(pairs, variable, preselect = NULL, n = 1, m = 1,
    id_x = NULL, id_y = NULL, ...) {
  UseMethod("select_unique")
}


#' @rdname select_unique
#' @export
select_unique.pairs <- function(pairs, variable, preselect = NULL, n = 1, m = 1,
    id_x = NULL, id_y = NULL, x = attr(pairs, "x"), y = attr(pairs, "y"), 
    inplace = FALSE, ...) {
  prep <- select_preprocess(pairs, score = rep(1, nrow(pairs)), 
    preselect = preselect, id_x = id_x, id_y = id_y, x = x, y = y)
  # Create a logical vector indicating which records are to be selected;
  # for now select all records
  sel <- !logical(nrow(prep))
  # Check which records from x need to be removed
  nx <- prep[, list(M = .N), by = .x]
  x_remove <- nx[M > m][[".x"]]
  sel[prep$.x %in% x_remove] <- FALSE
  # Check which records from y need to be removed
  ny <- prep[, list(N = .N), by = .y]
  y_remove <- ny[N > n][[".y"]]
  sel[prep$.y %in% y_remove] <- FALSE
  # Update the selection
  if (inplace) {
    pairs[, (variable) := FALSE]
    index <- prep$index[sel]
    pairs[index, (variable) := TRUE]
    invisible(pairs)
  } else {
    pairs[[variable]] <- if (nrow(pairs) > 0) FALSE else logical(0)
    pairs[[variable]][prep$index[sel]] <- if (nrow(pairs) > 0) TRUE else logical(0)
    pairs
  }
}

Try the reclin2 package in your browser

Any scripts or data that you put into this service are public.

reclin2 documentation built on May 29, 2024, 4:21 a.m.