# R/deduplicate_equivalence.R
#
# Defines functions deduplicate_equivalence_impl deduplicate_equivalence.ldat deduplicate_equivalence.data.frame deduplicate_equivalence
#
# Documented in deduplicate_equivalence

#' Deduplication using equivalence groups
#' 
#' Generic function; an error is raised when \code{pairs} is not an object of 
#' type \code{pairs}, otherwise the call is dispatched on the class of 
#' \code{pairs}.
#' 
#' @param pairs a \code{pairs} object, such as generated by 
#'   \code{\link{pair_blocking}}
#' @param var name of the variable to create in \code{x} that will contain the
#'   group labels.
#' @param selection a logical variable with the same length as \code{pairs} has
#'   rows, or the name of such a variable in \code{pairs}. Pairs are only 
#'   selected when \code{selection} is \code{TRUE}. When missing 
#'   \code{attr(pairs, "selection")} is used when available. 
#' @param x the first data set; when missing \code{attr(pairs, "x")} is used.
#' 
#' @return 
#' Returns \code{x} with a variable containing the group labels. Records with 
#' the same group label (should) correspond to the same entity. 
#' 
#' @export
deduplicate_equivalence <- function(pairs, var = "duplicate_groups", 
    selection, x) {
  # Validate the input before dispatching. No local variables are created 
  # here on purpose, so nothing besides the arguments reaches the method.
  if (!methods::is(pairs, "pairs")) {
    stop("pairs should be an object of type 'pairs'.")
  }
  UseMethod("deduplicate_equivalence")
}

#' @export
deduplicate_equivalence.data.frame <- function(pairs, var = "duplicate_groups", 
    selection, x) {
  # Arguments the caller left out are forwarded as NULL; the shared 
  # implementation then falls back on the attributes stored on pairs.
  if (missing(x)) {
    x <- NULL
  }
  if (missing(selection)) {
    selection <- NULL
  }
  deduplicate_equivalence_impl(
    pairs = pairs, 
    var = var, 
    selection = selection, 
    x = x
  )
}

#' @export
deduplicate_equivalence.ldat <- function(pairs, var = "duplicate_groups", 
    selection, x) {
  # Arguments the caller left out are forwarded as NULL; the shared 
  # implementation then falls back on the attributes stored on pairs.
  if (missing(x)) {
    x <- NULL
  }
  if (missing(selection)) {
    selection <- NULL
  }
  deduplicate_equivalence_impl(
    pairs = pairs, 
    var = var, 
    selection = selection, 
    x = x
  )
}

# Shared implementation behind the deduplicate_equivalence methods.
#
# Resolves x and selection (falling back on the "x" and "selection" attributes
# of pairs when they are missing or NULL), keeps the pairs flagged by
# selection and stores the result of equivalence() in x[[var]].
#
# pairs      object holding the pairs; its columns "x" and "y" are used.
#            NOTE(review): these presumably hold row indices into x, since
#            they are matched against seq_len(nrow(x)) -- confirm.
# var        name of the variable to create in x.
# selection  NULL, a vector used to select rows of pairs, or the name of a
#            variable in pairs holding such a vector.
# x          NULL, or the data set that receives the group labels.
#
# Returns x with the variable var added. Stops when x or selection cannot be
# determined, or when selection names a variable that pairs does not contain.
deduplicate_equivalence_impl <- function(pairs, var, selection, x) {
  # Process x
  if (missing(x) || is.null(x)) x <- attr(pairs, "x")
  if (is.null(x)) stop("Missing x")
  # Process selection
  if (missing(selection) || is.null(selection)) 
    selection <- attr(pairs, "selection")
  if (is.null(selection)) stop("Missing selection")
  if (is.character(selection)) {
    selection_name <- selection
    selection <- pairs[[selection_name]]
    # A name that is not present in pairs yields NULL; indexing with NULL 
    # would silently select no pairs at all, so fail loudly instead.
    if (is.null(selection)) {
      stop("Variable '", selection_name, "' not found in pairs.")
    }
  }
  # Pairs are only selected when selection is TRUE. An NA in a logical index
  # would otherwise produce an all-NA row that is handed to equivalence().
  # Restricted to plain logical vectors so other index types pass unchanged.
  if (is.logical(selection)) selection[is.na(selection)] <- FALSE
  # Label the records 1..nrow(x) using the selected pairs
  tmp <- as.data.frame(pairs[selection, c("x", "y")])
  x[[var]] <- equivalence(seq_len(nrow(x)), tmp)
  x
}
# djvanderlaan/reclin documentation built on Oct. 4, 2022, 7:03 p.m.