#' Generate candidate matches (using fuzzy blocked postal codes)
#'
#' Given blocks of postal codes, join two datasets of firm names and addresses
#' and return likely candidate matches.
#'
#' @param tbl_x A tbl of firm names, addresses and postal codes; for now, must
#'   have variables `name`, `address`, `postal_code`
#'
#' @param tbl_y A tbl of firm names, addresses and postal codes; for now, must
#'   have variables `name`, `address`, `postal_code`
#'
#' @param name_var Name of the firm-name variable. Currently not used: the
#'   name columns are hard-coded as `name` in both `tbl_x` and `tbl_y`.
#'
#' @param address_var Name of the address variable. Currently not used: the
#'   address columns are hard-coded as `address` in both `tbl_x` and `tbl_y`.
#'
#' @param block A tbl of (postal code, postal code) blocked pairs, with
#'   variables `postalcode.x` and `postalcode.y`, on which to merge `tbl_x`
#'   and `tbl_y`. If not supplied, the block is calculated with
#'   `fuzzy_block()` from the postal codes found in `tbl_x` and `tbl_y`; this
#'   requires the data package `postalcodes` to be installed.
#'
#' @return A tbl of (firm, firm) candidate matches with the cosine stringdist
#'   measures `name_cos` (names) and `add_cos` (addresses). One row is kept
#'   per distinct (`name.x`, `name.y`, `address.x`, `address.y`) combination,
#'   and the result is still grouped by those four variables.
#' @keywords match, postal codes, addresses, block, fuzzy, join
#' @examples
#'
#' brx <- br %>%
#'   select(name, address, postal_code) %>%
#'   mutate(name = standardize(name, dictionary = company_dictionary),
#'          address = address %>%
#'            standardize(dictionary = address_dictionary) %>%
#'            fix_unit_names())
#'
#' generate_matches(brx, brx)
#'
#' @export
generate_matches <- function(tbl_x, tbl_y, name_var = 'name',
                             address_var = 'address', block = NULL) {
  # If the fuzzy blocked postal code tbl isn't supplied, create it -- which is
  # only possible when the data package `postalcodes` is available.
  if (is.null(block)) {
    if (!requireNamespace("postalcodes", quietly = TRUE)) {
      stop("Please supply the blocked postal code table.\n",
           "Alternatively, install the `postalcodes` package.",
           call. = FALSE)
    }
    postal_coords <- postalcodes::postal_coords
    # Postal codes observed in EITHER input. union() is required here:
    # unique(a, b) would treat `b` as the `incomparables` argument and
    # silently ignore the postal codes of tbl_y.
    observed_codes <- union(tbl_x$postal_code, tbl_y$postal_code)
    known <- postal_coords$postalcode %in% observed_codes
    postal_input <- postal_coords[known, ]$postalcode
    block <- fuzzy_block(postal_input = postal_input,
                         postal_coords = postal_coords)
  }

  # Start with the postal code block and merge on the firms from tbl_x and
  # tbl_y; this expands the block from (pc, pc) pairs to (firm, firm) pairs.
  # The inner joins drop every blocked pair whose postal code has no firm in
  # the corresponding table.
  # TODO: check this again with Tim Tom Construction & Concrete.
  matches <- dplyr::inner_join(block, tbl_x,
                               by = c('postalcode.x' = 'postal_code'))
  matches <- dplyr::inner_join(matches, tbl_y,
                               by = c('postalcode.y' = 'postal_code'))

  # Cosine distance on 2-grams for both the names and the addresses.
  # (do.call might work here, with a list of arguments with different names.)
  matches <- dplyr::mutate(
    matches,
    name_cos = stringdist::stringdist(name.x, name.y,
                                      method = 'cosine', q = 2),
    add_cos = stringdist::stringdist(address.x, address.y,
                                     method = 'cosine', q = 2)
  )

  # Keep a single row per (name, name, address, address) combination; the
  # same firm pair can appear once per blocked postal code pair.
  matches <- dplyr::group_by(matches, name.x, name.y, address.x, address.y)
  matches <- dplyr::arrange(matches, name.x, name.y, address.x, address.y,
                            name_cos, add_cos)
  dplyr::slice(matches, 1)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.