R/compare_pairs.R

Defines functions extend_to compare_pairs

Documented in compare_pairs

#' Compare all pairs of records 
#' 
#' @param pairs a \code{pairs} object, such as generated by 
#'   \code{\link{pair_blocking}}
#' @param by variables from \code{x} and \code{y} on which to compare the 
#'   records.
#' @param comparators a names list of \link[=identical]{comparator functions}, 
#'   for the named variables the given functions will be used to compare the 
#'   records. For the remaining variables the \code{default_comparator} will 
#'   be used. 
#' @param x the first \code{data.frame}, when missing \code{attr(pairs, "x")} 
#'   is used. 
#' @param y the second \code{data.frame}, when missing \code{attr(pairs, "y")} 
#'   is used. 
#' @param default_comparator the default \link[=identical]{comparison function}.
#' @param overwrite overwrite exiting variables in \code{pairs}
#' 
#' @return 
#' Returns the \code{pairs} object with a column added for each variable in 
#' \code{by}. The value is the column is given by the return value of the 
#' corresponding \link[=identical]{comparison function}.
#' 
#' @examples
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' pairs <- compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' 
#' \dontshow{gc()}
#' 
#' @import ldat
#' @import lvec
#' @export
compare_pairs <- function(pairs, by, comparators = list(default_comparator), 
    x, y, default_comparator = identical(), overwrite = FALSE) {
  # Process and preparare input
  if (missing(x)) x <- attr(pairs, "x")
  if (is.null(x)) stop("Missing x.")
  if (missing(y)) y <- attr(pairs, "y")
  if (is.null(y)) stop("Missing y.")
  if (missing(by) && !missing(by)) by <- names(comparators)
  if (missing(by) || is.null(by)) stop("by is missing.")
  if (!overwrite && any(by %in% names(pairs))) 
    stop("Variable in by already present in pairs.")
  if (!all(by %in% names(x)))
    stop("Not all variables in by are present in x.")
  if (!all(by %in% names(y))) 
    stop("Not all variables in by are present in y.")
  comparators <- extend_to(by, comparators, default = default_comparator)  
  # Compare
  chunks <- chunk(pairs$x)
  for (col in by) {
    res <- if (is_ldat(pairs)) lvec(0, "numeric") else numeric()
    comparator <- comparators[[col]]
    for (c in chunks) {
      x_i <- slice_range(pairs$x, range = c, as_r = TRUE)
      x_chunk <- x[[col]][x_i]
      y_i <- slice_range(pairs$y, range = c, as_r = TRUE)
      y_chunk <- y[[col]][y_i]
      comparison <- comparator(x_chunk, y_chunk)
      if (is.null(res) || length(res) == 0) {
        res <- if (is_ldat(pairs)) as_lvec(comparison) else comparison
        length(res) <- length(pairs$x)
      } else {
        lset(res, range = c, values = comparison)
      }
    }
    pairs[[col]] <- res
  }
  attr(pairs, "by") <- by
  attr(pairs, "comparators") <- comparators
  class(pairs) <- unique(c("compare", class(pairs)))
  pairs
}


extend_to <- function(by, what = list(default), default) {
  if (!is.list(what)) stop("what should be a list.")
  has_names <- !is.null(names(what))
  if (has_names) {
    res <- vector("list", length(by))
    names(res) <- by
    for (el in names(res)) {
      res[[el]] <- if (is.null(what[[el]])) default else what[[el]]
    }
  } else {
    res <- rep(what, length.out = length(by))
    names(res) <- by
  }
  res
}

Try the reclin package in your browser

Any scripts or data that you put into this service are public.

reclin documentation built on Nov. 23, 2021, 9:09 a.m.