R/score_simple.R

Defines functions score_simple.pairs score_simple

Documented in score_simple score_simple.pairs

#' Score pairs based on a number of comparison vectors
#'
#' @param pairs a \code{pairs} object, such as generated by 
#'   \code{\link{pair_blocking}}
#' @param variable the name of the new variable to create in pairs. This will be a
#'   logical variable with a value of \code{TRUE} for the selected pairs.
#' @param on character vector of variables on which the score should be based.
#' @param inplace logical indicating whether \code{pairs} should be modified in place. When
#'   pairs is large this can be more efficient. 
#' @param w1 a vector or list with weights for agreement for each of the
#'   variables. It can either be a numeric vector of length 1 in which case the
#'   same weight is used for all variables; A numeric vector of length equal to
#'   the length of \code{on} in which case the weights correspond one-to-one to
#'   the variables in \code{on}; A named numeric vector where the names
#'   correspond to those in \code{on}, missing values are assigned a value of 1;
#'   or a named list with numeric values.  See details for more information.
#' @param w0 a vector or list with weights for non-agreement for each of the
#'   variables. See details for more information. For the format see \code{w1}.
#' @param wna a vector or list with weights for agreement for each of the
#'   variables. See details for more information. For the format see \code{w1}.
#' @param new_name name of new object to assign the pairs to on the cluster
#'   nodes.
#' @param ... ignored
#'
#' @details
#' The individual contribution of a variable \code{x} to the total score is
#' given by \code{x * w1 + (1-x) * w0} in case of non-\code{NA} values and
#' \code{wna} in case of \code{NA}. This assumes that the values 1 corresponds
#' to complete agreement and the value 0 to complete non-agreement. In case of
#' complete agreement a variable contributes \code{w1} to the total score and in
#' case of complete non-agreement it contributes \code{w0} to the total score.
#'
#' @return
#' Returns the \code{data.table} \code{pairs} with the column \code{variable} added in
#' case of \code{score_simple.pairs}. 
#' 
#' In case of \code{score_simple.cluster_pairs}, \code{score_simple.pairs} is called on
#' each cluster node and the resulting pairs are assigned to \code{new_name} in
#' the environment \code{reclin_env}. When \code{new_name} is not given (or
#' equal to NULL) the original pairs on the nodes are overwritten.
#'
#' @examples
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' compare_pairs(pairs, on = c("firstname", "lastname", "sex"), inplace = TRUE)
#' 
#' score_simple(pairs, "score", on = c("firstname", "lastname", "sex"))
#' 
#' # Change the default weights
#' score_simple(pairs, "score", on = c("firstname", "lastname", "sex"), 
#'   w1 = 2, w0 = -1, wna = NA)
#' 
#' # Use a named vector; omited elements from w1 get a weight of 1; those from
#' # w0 and wna a weight of 0.
#' score_simple(pairs, "score", on = c("firstname", "lastname", "sex"), 
#'   w1 = c("firstname" = 2, "lastname" = 3), 
#'   w0 = c("firstname" = -1, "lastname" = -0.5))
#' 
#' # Use a named list; omited elements from w1 get a weight of 1; those from
#' # w0 and wna a weight of 0.
#' score_simple(pairs, "score", on = c("firstname", "lastname", "sex"), 
#'   w1 = list("firstname" = 2, "lastname" = 3), 
#'   w0 = list("firstname" = -1, "lastname" = -0.5))
#'
#' @rdname score_simple
#' @export
score_simple <- function(pairs, variable, on, w1 = 1, w0 = 0, wna = 0, ...) {
  UseMethod("score_simple")
}


#' @rdname score_simple
#' @export
score_simple.pairs <- function(pairs, variable, on, w1 = 1.0, w0 = 0.0, wna = 0.0, 
    inplace = FALSE, ...) {
  if (!all(on %in% names(pairs)))
    stop("Not all variables in on are present in pairs")
  # Process w1
  w1_default <- if (is.numeric(w1) && length(w1) == 1) w1 else 1.0
  if (!is.list(w1)) w1 <- as.list(w1)
  w1 <- extend_to(on, w1, w1_default)
  # Process w0
  w0_default <- if (is.numeric(w0) && length(w0) == 1) w0 else 0.0
  if (!is.list(w0)) w0 <- as.list(w0)
  w0 <- extend_to(on, w0, w0_default)
  # Process wna
  wna_default <- if (is.numeric(wna) && length(wna) == 1) wna else 0.0
  if (!is.list(wna)) wna <- as.list(wna)
  wna <- extend_to(on, wna, wna_default)
  # Calculate score
  score <- numeric(nrow(pairs))
  for (col in on) {
    x <- as.numeric(pairs[[col]])
    s <- x*w1[[col]] + (1-x) * w0[[col]]
    s[is.na(s)] <- wna[[col]]
    score <- score + s
  }
  if (inplace) {
    pairs[, `:=`((variable), score)]
    invisible(pairs)
  } else {
    pairs[[variable]] <- score
    pairs
  }
}

Try the reclin2 package in your browser

Any scripts or data that you put into this service are public.

reclin2 documentation built on May 29, 2024, 4:21 a.m.