# matchtools: Tools For Matching Firms From Different Datasets
#
# Documented in generate_matches

#' Generate candidate matches (using fuzzy blocked postal codes)
#'
#' Given blocks of postal codes, join two datasets of firm names and addresses
#' and return likely candidate matches.
#'
#' @param tbl_x A tbl of firm names, addresses and postal codes; for now, must
#' have variables `name`, `address`, `postal_code`
#'
#' @param tbl_y A tbl of firm names, addresses and postal codes; for now, must
#' have variables `name`, `address`, `postal_code`
#'
#' @param name_var Name of the firm-name column. Currently unused: the column
#' `name` is always used. Kept so existing calls keep working.
#'
#' @param address_var Name of the address column. Currently unused: the column
#' `address` is always used. Kept so existing calls keep working.
#'
#' @param block A tbl of (postal code, postal code) blocked pairs, with columns
#' `postalcode.x` and `postalcode.y`, on which tbl_x and tbl_y are merged. If
#' not supplied, it is built with fuzzy_block() from the postal codes found in
#' tbl_x and tbl_y; this requires the `postalcodes` data package.
#'
#' @return A tbl of (firm, firm) candidate matches, one row per distinct
#' combination of `name.x`, `name.y`, `address.x` and `address.y` (the result
#' is still grouped by those columns), with the stringdist measures `name_cos`
#' and `add_cos` (cosine distance on 2-grams).
#' @keywords match, postal codes, addresses, block, fuzzy, join
#' @examples
#'
#' brx <- br %>%
#'         select(name, address, postal_code) %>%
#'         mutate(name = standardize(name, dictionary = company_dictionary),
#'                address = address %>% standardize(dictionary = address_dictionary) %>% fix_unit_names())
#'
#' generate_matches(brx, brx)
#'
#' @export
generate_matches <- function(tbl_x, tbl_y, name_var = 'name', address_var = 'address', block = NULL) {
  # Without a supplied block we need the `postalcodes` data package to build
  # one; fail early with a clear message when neither is available.
  if (is.null(block) && !requireNamespace("postalcodes", quietly = TRUE)) {
    stop("Please supply the blocked postal code table.
         Alternatively, install the `postalcodes` package.",
         call. = FALSE)
  }

  if (is.null(block)) {
    postal_coords <- postalcodes::postal_coords
    # Pool the postal codes of BOTH tables. unique(x, y) would treat `y` as the
    # `incomparables` argument and silently ignore tbl_y's postal codes.
    observed_codes <- unique(c(as.character(tbl_x$postal_code),
                               as.character(tbl_y$postal_code)))
    known <- postal_coords$postalcode %in% observed_codes
    postal_input <- postal_coords[known, ]$postalcode
    block <- fuzzy_block(postal_input = postal_input, postal_coords = postal_coords)
  }

  # Start from the (postal code, postal code) block and attach the firms of
  # tbl_x and tbl_y, expanding it to (firm, firm) pairs. The inner joins drop
  # block pairs whose postal codes have no firm in tbl_x / tbl_y.
  # TODO: check this again with Tim Tom Construction & Concrete.
  matches <- dplyr::inner_join(block, tbl_x, by = c("postalcode.x" = "postal_code"))
  matches <- dplyr::inner_join(matches, tbl_y, by = c("postalcode.y" = "postal_code"))

  # String distances between the paired names and the paired addresses.
  matches <- dplyr::mutate(
    matches,
    name_cos = stringdist::stringdist(name.x, name.y, method = "cosine", q = 2),
    add_cos = stringdist::stringdist(address.x, address.y, method = "cosine", q = 2)
  )

  # Keep a single row per (name.x, name.y, address.x, address.y): after sorting,
  # the first row of each group is the one with the smallest distances.
  matches <- dplyr::group_by(matches, name.x, name.y, address.x, address.y)
  matches <- dplyr::arrange(matches, name.x, name.y, address.x, address.y, name_cos, add_cos)
  dplyr::slice(matches, 1)
}