matchtools: Tools For Matching Firms From Different Datasets

Documented in fuzzy_block par_fuzzy_block

#' Fuzzy block Canadian postal codes by geographic distance
#'
#' Generate pairs of postal codes to block matching on. Each input postal
#' code is paired with the postal codes that lie within `max_dist`
#' kilometres of it and that belong either to an FSA (the first three
#' characters of a postal code) present in the input, or to an FSA listed as
#' contiguous with one of those in the package's `fsa_contig` table.
#'
#' @param postal_input A character vector (or list) of postal codes. A data
#'   frame is also accepted: the column named by `input` is used if present,
#'   otherwise the only column of a one-column data frame.
#'
#' @param postal_coords A tbl of all postal codes and coordinates. It must
#'   contain the column named by `input` as well as `latitude` and
#'   `longitude` columns.
#'
#' @param input Name of the postal code column in `postal_coords`, given
#'   either bare or as a string.
#'
#' @param output Prefix of the two output postal code columns; they are
#'   named `<output>.x` (the input code) and `<output>.y` (the paired code).
#'
#' @param distance_col Name of output distance column
#'
#' @param max_dist Maximum distance in fuzzy join, in kilometres
#'
#' @return A tbl of geography pairs to block on, with the columns
#'   `<output>.x`, `<output>.y` and `distance_col`.
#' @keywords match, postal codes, addresses, block, fuzzy, join
#' @examples
#'
#' postal_coords <- postalcodes::postal_coords
#' postal_input <- postal_coords$postalcode[1:10]
#' fuzzy_block(postal_input = postal_input, postal_coords = postal_coords)
#'
#' @export
fuzzy_block <- function(postal_input, postal_coords,
                        input = "postalcode", output = "postalcode",
                        distance_col = "d", max_dist = 1) {
  # Accept the column name either bare or quoted, then standardise the postal
  # code column of `postal_coords` to "postalcode" so the rest of the body can
  # refer to it by a fixed name.
  input <- dplyr::quo_name(dplyr::enquo(input))
  postal_coords <- dplyr::rename(postal_coords, postalcode = !!input)

  # A data frame (e.g. a one-column tibble subset) would otherwise be
  # deparsed by substr() and never match anything, so pull out the codes.
  if (is.data.frame(postal_input)) {
    if (input %in% names(postal_input)) {
      postal_input <- postal_input[[input]]
    } else if (ncol(postal_input) == 1L) {
      postal_input <- postal_input[[1L]]
    } else {
      stop("`postal_input` is a data frame without a `", input, "` column; ",
           "pass a vector of postal codes instead.", call. = FALSE)
    }
  }

  # FSAs present in the input postal codes.
  fsa_inp <- unique(substr(postal_input, 1, 3))

  # Add the FSAs contiguous with the input FSAs. Only one level of contiguity
  # is used for now; neighbours of neighbours are deliberately left out.
  fsa_all <- unique(c(
    fsa_inp,
    fsa_contig$contiguous_fsa[fsa_contig$fsa %in% fsa_inp]
  ))

  # Candidate partners: every known postal code in one of those FSAs.
  candidates <- postal_coords[
    substr(postal_coords$postalcode, 1, 3) %in% fsa_all,
  ]

  # Coordinates of the input postal codes themselves.
  targets <- postal_coords[postal_coords$postalcode %in% postal_input, ]

  # Pair each input code with the candidates within `max_dist` kilometres.
  pairs <- fuzzyjoin::geo_inner_join(
    targets,
    candidates,
    by = c("latitude", "longitude"),
    unit = "km",
    max_dist = max_dist,
    distance_col = distance_col
  )

  # Keep only the two postal code columns and the distance, then apply the
  # requested output prefix.
  pairs <- pairs[, c("postalcode.x", "postalcode.y", distance_col)]
  names(pairs)[1:2] <- paste0(output, c(".x", ".y"))

  pairs
}

# Design notes for the parallel wrapper below.
# It is really just a wrapper that runs fuzzy_block() in parallel over a list
# of inputs. FSAs are a natural unit to split that list on, e.g.:
# fsa <- postal_coords$postalcode %>% substr(1,3) %>% unique()
# fuzzy_block(postal_code_list[1], postal_coords = postal_coords, input = postalcode)
# Open question: what else does the wrapper need as input? Keep in mind the
# other code that merges on data and generates candidates.

#' A parallel wrapper for the fuzzy_block() function; should work on Windows and OSX
#'
#' Split the input postal codes into a list by FSA (the first three
#' characters of each code), run fuzzy_block() on every group on a
#' `parallel` cluster, and return the row-bound blocks of all of them.
#'
#' @param clusters Number of worker processes to start with
#'   parallel::makeCluster()
#' @param postal_input A vector of postal codes similar to the input into
#'   fuzzy_block(), but can be much larger
#' @param postal_coords A tbl similar to the input into fuzzy_block()
#' @param ... Further arguments forwarded to fuzzy_block(), e.g. `max_dist`
#'   or `distance_col`. Pass column names as quoted strings here.
#' @return Geography pairs to block on.
#' @keywords match, postal codes, addresses, block, fuzzy, join, parallel
#' @examples
#'
#' library(postalcodes)
#' par_fuzzy_block(clusters = 4,
#'                 postal_input = postal_coords[1:500, ]$postalcode,
#'                 postal_coords = postal_coords)
#'
#' @export
par_fuzzy_block <- function(clusters = 4, postal_input, postal_coords, ...) {
  # One task per FSA: group the input postal codes by their first three
  # characters.
  postal_code_list <- split(postal_input, substr(postal_input, 1, 3))

  # Start the workers and make sure they are shut down again even when one of
  # the fuzzy_block() calls fails.
  cl <- parallel::makeCluster(clusters)
  on.exit(parallel::stopCluster(cl), add = TRUE)

  # Parallelize the call to fuzzy_block() over the per-FSA groups.
  postal_codes_blocked <- parallel::parLapply(
    cl,
    postal_code_list,
    fuzzy_block,
    postal_coords = postal_coords,
    ...
  )

  # Stack the per-FSA results into a single table.
  dplyr::bind_rows(postal_codes_blocked)
}