R/tabulate_patterns.R

Defines functions tabulate_patterns_impl tabulate_patterns.ldat tabulate_patterns.data.frame tabulate_patterns

Documented in tabulate_patterns

#' Create a table of comparison patterns
#' 
#' @param pairs a \code{pairs} object, such as generated by 
#'   \code{\link{pair_blocking}}
#' @param comparators a list with comparison functions for each of the 
#'   columns. When missing or \code{NULL}, \code{attr(pairs, "comparators")} is
#'   used. Therefore, this parameter usually does not need to be specified. 
#' @param by the columns that should be used for the comparison vectors. When
#'   missing or \code{NULL}, \code{attr(pairs, "by")} is used. Therefore, this
#'   parameter usually does not need to be specified. 
#' @param ... passed on to other methods.
#'   
#' @details 
#' Since comparison vectors can contain continuous numbers (usually between 0
#' and 1), this could result in a very large number of possible comparison 
#' vectors. Therefore, the comparison vectors are passed on to the comparators
#' in order to threshold them. This usually results in values 0 or 1. Missing
#' values are usually codes as 0. However, this all depends on the comparison
#' functions used. For more information see the documentation on the 
#' \link[=identical]{comparison functions}.
#' 
#' @return 
#' Returns a \code{data.frame} with all unique comparison patterns that exist
#' in \code{pairs}, with a column \code{n} added with the number of times each
#' pattern occurs. 
#' 
#' @examples
#' data("linkexample1", "linkexample2")
#' pairs <- pair_blocking(linkexample1, linkexample2, "postcode")
#' pairs <- compare_pairs(pairs, c("lastname", "firstname", "address", "sex"))
#' tabulate_patterns(pairs)
#'
#' \dontshow{gc()}
#'
#' @export
tabulate_patterns <- function(pairs, ..., comparators = NULL, by = NULL) {
  if (!methods::is(pairs, "pairs")) stop("pairs should be an object of type 'pairs'.")
  UseMethod("tabulate_patterns")
}

#' @export
tabulate_patterns.data.frame <- function(pairs, ..., comparators = NULL, by = NULL) {
  tabulate_patterns_impl(pairs, comparators, by)
}

#' @export
tabulate_patterns.ldat <- function(pairs, ..., comparators = NULL, by = NULL) {
  tabulate_patterns_impl(pairs, comparators, by)
}

#' @import ldat
#' @import lvec
#' @import dplyr
tabulate_patterns_impl <- function(pairs, comparators, by) {
  # Process arguments
  if (missing(comparators) || is.null(comparators)) 
    comparators <- attr(pairs, "comparators")
  if (missing(by) || is.null(by)) 
    by <- if (missing(comparators)) attr(pairs, "by") else names(comparators)
  # Tabulate chunks
  chunks <- chunk(pairs)
  tab <- vector("list", length(chunks))
  for (i in seq_along(chunks)) {
    d <- slice_range(pairs, range = chunks[[i]], as_r = TRUE)
    for (col in by) d[[col]] <- comparators[[col]](d[[col]])
    tab[[i]] <- d %>% group_by(across(all_of(by))) %>% summarise(n = n())
  }
  # Combine
  tab <- bind_rows(tab) %>% group_by(across(all_of(by))) %>% summarise(n = sum(n)) %>%
    ungroup() %>% as.data.frame()
  # Add patterns not present in dataset
  complete <- TRUE
  if (complete) {
    possible_patterns <- lapply(tab[, by], function(x) {
      u <- unique(x)
      if (is.logical(u)) u <- unique(c(u, c(TRUE, FALSE)))
      if (is.factor(x)) union(x, levels(x)) else u
    })
    possible_patterns <- do.call(expand.grid, possible_patterns)
    tab <- merge(possible_patterns, tab, by = by, all.x = TRUE)
    tab$n[is.na(tab$n)] <- 0
  }
  tab
}

Try the reclin package in your browser

Any scripts or data that you put into this service are public.

reclin documentation built on Nov. 23, 2021, 9:09 a.m.