R/tabulate_patterns.R

Defines functions tabulate_patterns_impl tabulate_patterns.ldat tabulate_patterns.data.frame tabulate_patterns

Documented in tabulate_patterns

#' Create a table of comparison patterns
#' 
#' @param pairs a \code{pairs} object, such as generated by 
#'   \code{\link{pair_blocking}}
#' @param comparators a list with comparison functions for each of the 
#'   columns. When missing or \code{NULL}, \code{attr(pairs, "comparators")} is
#'   used. Therefore, this parameter usually does not need to be specified. 
#' @param by the columns that should be used for the comparison vectors. When
#'   missing or \code{NULL}, \code{attr(pairs, "by")} is used. Therefore, this
#'   parameter usually does not need to be specified. 
#' @param ... passed on to other methods.
#'   
#' @details 
#' Since comparison vectors can contain continuous numbers (usually between 0
#' and 1), this could result in a very large number of possible comparison 
#' vectors. Therefore, the comparison vectors are passed on to the comparators
#' in order to threshold them. This usually results in values 0 or 1. Missing
#' values are usually codes as 0. However, this all depends on the comparison
#' functions used. For more information see the documentation on the 
#' \link[=identical]{comparison functions}.
#' 
#' @return 
#' Returns a \code{data.frame} with all unique comparison patterns that exist
#' in \code{pairs}, with a column \code{n} added with the number of times each
#' pattern occurs. 
#' 
#' @examples
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' pairs <- compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' tabulate_patterns(pairs)
#'
#' \dontshow{gc()}
#'
#' @export
tabulate_patterns <- function(pairs, ..., comparators = NULL, by = NULL) {
  if (!methods::is(pairs, "pairs")) stop("pairs should be an object of type 'pairs'.")
  UseMethod("tabulate_patterns")
}

#' @export
tabulate_patterns.data.frame <- function(pairs, ..., comparators = NULL, by = NULL) {
  tabulate_patterns_impl(pairs, comparators, by)
}

#' @export
tabulate_patterns.ldat <- function(pairs, ..., comparators = NULL, by = NULL) {
  tabulate_patterns_impl(pairs, comparators, by)
}

#' @import ldat
#' @import lvec
#' @import dplyr
tabulate_patterns_impl <- function(pairs, comparators, by) {
  # Process arguments
  if (missing(comparators) || is.null(comparators)) 
    comparators <- attr(pairs, "comparators")
  if (missing(by) || is.null(by)) 
    by <- if (missing(comparators)) attr(pairs, "by") else names(comparators)
  # Tabulate chunks
  chunks <- chunk(pairs)
  tab <- vector("list", length(chunks))
  for (i in seq_along(chunks)) {
    d <- slice_range(pairs, range = chunks[[i]], as_r = TRUE)
    for (col in by) d[[col]] <- comparators[[col]](d[[col]])
    tab[[i]] <- d %>% group_by_(.dots = by) %>% summarise(n = n())
  }
  # Combine
  bind_rows(tab) %>% group_by_(.dots = by) %>% summarise(n = sum(n)) %>%
    ungroup() %>% as.data.frame()
}
djvanderlaan/reclin documentation built on Oct. 4, 2022, 7:03 p.m.