R/select_threshold.R

Defines functions select_threshold.pairs select_threshold

Documented in select_threshold select_threshold.pairs

#' Select matching pairs with a score above or equal to a threshold
#'
#' @param pairs a \code{pairs} object, such as generated by 
#'   \code{\link{pair_blocking}}
#' @param variable the name of the new variable to create in pairs. This will be a
#'   logical variable with a value of \code{TRUE} for the selected pairs.
#' @param score name of the score/weight variable of the pairs. When not given
#'   and \code{attr(pairs, "score")} is defined, that is used. 
#' @param threshold the threshold to apply. Pairs with a score above or equal to the 
#'   threshold are selected. 
#' @param inplace logical indicating whether \code{pairs} should be modified in place. When
#'   pairs is large this can be more efficient.
#' @param new_name name of new object to assign the pairs to on the cluster
#'   nodes.
#' @param ... ignored
#'
#' @return
#' Returns the \code{pairs} with the variable given by \code{variable} added. This
#' is a logical variable indicating which pairs are selected a matches.
#'
#' @examples 
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' pairs <- compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' model <- problink_em(~ lastname + firstname + address + sex, data = pairs)
#' pairs <- predict(model, pairs, type = "mpost", add = TRUE, binary = TRUE)
#' # Select pairs with a mpost > 0.5
#' select_threshold(pairs, "selected", "mpost", 0.5, inplace = TRUE)
#'
#' # Example using cluster;
#' # In general the syntax is exactly the same except for the first call to 
#' # to cluster_pair. Note the in general `inplace = TRUE` is implied when
#' # working with a cluster; therefore the assignment back to pairs can be 
#' # omitted (also not a problem if it is not).
#' library(parallel)
#' data("linkexample1", "linkexample2")
#' cl <- makeCluster(2)
#' \dontshow{clusterEvalQ(cl, data.table::setDTthreads(1))}
#' pairs <- cluster_pair(cl, linkexample1, linkexample2)
#' compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' model <- problink_em(~ lastname + firstname + address + sex, data = pairs)
#' predict(model, pairs, type = "mpost", add = TRUE, binary = TRUE)
#' # Select pairs with a mpost > 0.5
#' # Unlike the regular pairs: inplace = TRUE is implied here
#' select_threshold(pairs, "selected", "mpost", 0.5)
#' stopCluster(cl)
#' 
#' @rdname select_threshold
#' @export
select_threshold <- function(pairs, variable, score, threshold, ...) {
  UseMethod("select_threshold")
}

#' @rdname select_threshold
#' @export
select_threshold.pairs <- function(pairs, variable, score, threshold, 
    inplace = FALSE, ...) {
  if (is.character(score)) {
    stopifnot(score %in% names(pairs))
    score <- pairs[[score]]
  }
  if (inplace) {
    pairs[, (variable) := ..score >= ..threshold]
  } else {
    pairs[[variable]] <- score >= threshold
  }
  if (inplace) invisible(pairs) else pairs
}

Try the reclin2 package in your browser

Any scripts or data that you put into this service are public.

reclin2 documentation built on May 29, 2024, 4:21 a.m.