matchtools: Tools For Matching Firms From Different Datasets

Documented in fix_unit_names_

#' Non-vectorized version of fix_unit_names()
#'
#' This function standardizes apartment numbers to the format
#' "#-# Street St, Extra Info". It works on a single address string; use
#' fix_unit_names() to process a vector of addresses.
#'
#' Addresses starting with 'rr' or 'hwy', and addresses containing fewer than
#' two numbers, are returned unchanged. Otherwise any leading non-numeric text
#' (e.g. a building name) is moved to the end of the address, and a trailing
#' "unit 166"-style term is rewritten as a "166-" prefix.
#'
#' @param address A single address string
#'
#' @param ignore.case Ignore case for internal regex matching and rearranging.
#'
#' @return The converted address string
#' @keywords standardize, apartment, unit, suite, address
#'
#' @examples
#'
#' fix_unit_names_('halifax shopping centre 7001 mumford rd unit 166')
#'
fix_unit_names_ <- function(address, ignore.case = FALSE) {

  two_nums <- "(\\d+\\b)(.*)(\\b\\d+)" # two numbers in the address
  unit_names_box <- "#|unit|unite|apt|ste|suite|box|pobox|lot|site|bay|hangar|conc|bldg|room"
  unit_names <- "#|unit|unite|apt|ste|suite"
  apt_pattern <- paste0("(.*)(\\b(", unit_names, ")\\s*(\\d+))(.*)")

  prefix_unit_split <- paste0("^(\\D+)(\\b(", unit_names_box, ")(\\s|\\d))")
  prefix_unit_pattern <- paste0("^(\\D+?)(\\s(", unit_names_box, ")(\\s|\\d))(.*)")
  prefix_nounit_pattern <- "^(\\D+)(.*)"
  prefix_unit <- paste0("^(", unit_names_box, "|rte|cmp|((n|s|e|w|ne|nw|se|sw)(\\b|\\d)))")

  # Both helpers read the *current* value of `address`, so they stay correct
  # after the prefix has been rearranged below.
  detect <- function(pattern) {
    grepl(pattern = pattern, x = address, ignore.case = ignore.case)
  }
  # Returns a one-row matrix: column 1 is the whole match, column k + 1 is
  # regex group k. The whole row is NA when the pattern does not match.
  capture <- function(pattern) {
    stringr::str_match(
      address,
      pattern = stringr::regex(pattern, ignore_case = ignore.case)
    )
  }

  # don't bother if address starts with 'rr' or 'hwy', or if it has one or
  # zero numbers in it
  if (detect("^(rr|hwy)(\\d|\\b)") || !detect(two_nums)) {
    return(address)
  }

  if (detect(prefix_unit_split)) {
    # match a prefix when the address also contains a unit term
    parts <- capture(prefix_unit_pattern)

    # prefix_unit_split accepts any word boundary before the unit term while
    # prefix_unit_pattern requires whitespace, so the capture can fail even
    # though the detection succeeded. Leave the address alone in that case
    # instead of pasting NAs into it.
    if (!is.na(parts[1, 1])) {
      # Group 4 swallows the character right after the unit term. Glue it back
      # onto the remainder *before* trimming so that a digit ('unit5 ...') is
      # not lost; a plain space is trimmed away again.
      remainder <- remove_tl_whitespace(paste0(parts[, 5], parts[, 6]))
      extra_info <- remove_tl_whitespace(parts[, 2])

      # rearrange---put the extraneous info at the back
      address <- paste0(parts[, 4], " ", remainder, " ", extra_info)
    }

  } else if (!detect(prefix_unit) && detect(prefix_nounit_pattern)) {
    # match a prefix when the address doesn't contain a unit term
    parts <- capture(prefix_nounit_pattern)

    if (!is.na(parts[1, 1])) {
      parts <- remove_tl_whitespace(parts)
      # rearrange---put the extraneous info at the back (that's parts[, 2];
      # parts[, 3] is everything else)
      address <- paste0(parts[, 3], " ", parts[, 2])
    }
  }

  if (detect(apt_pattern)) {
    # if there is an apartment in the address, switch around the unit names
    parts <- capture(apt_pattern)

    if (!is.na(parts[1, 1])) {
      parts <- remove_tl_whitespace(parts)
      # parts[, 5] is the unit number, parts[, 2] is the normal address, and
      # parts[, 6] is everything else
      street <- if (parts[, 2] == "") "" else paste0(parts[, 2], " ")
      address <- paste0(parts[, 5], "-", street, parts[, 6])
    }
  }

  # return the modified address
  remove_tl_whitespace(address)
}

#' Rearrange unit/apt numbers to standardize addresses
#'
#' This function standardizes apartment numbers to the format
#' "#-# Street St, Extra Info". Vectorized over 'address': each element is
#' passed to fix_unit_names_() on its own.
#'
#' @param address A vector of address strings. It is coerced with
#'   as.character(), so the result is always a character vector.
#' @param ignore.case Ignore case for internal regex matching and rearranging.
#' @return An unnamed character vector of converted address strings, the same
#'   length as 'address' (character(0) for empty input).
#' @keywords standardize, apartment, unit, suite, address
#' @examples
#'
#' fix_unit_names('halifax shopping centre 7001 mumford rd unit 166')
#'
#' fix_unit_names(c('halifax shopping centre 7001 mumford rd',
#'                  'rr 2 box 166'))
#'
#' fix_unit_names('HFX SHOPPING CENTRE 7001 MUMFORD RD UNIT 166',
#'                 ignore.case = TRUE)
#' @export
fix_unit_names <- function(address, ignore.case = FALSE) {
  # vapply() pins the result type: unlike Vectorize()/mapply(), empty input
  # yields character(0) rather than list().
  vapply(
    as.character(address),
    fix_unit_names_,
    FUN.VALUE = character(1),
    ignore.case = ignore.case,
    USE.NAMES = FALSE
  )
}