sabre: Spruce up Addresses and Business REcords

Documented in find_buildings_numbers find_postcodes find_postcodes_in_string

#' Find postcodes in a string.
#'
#' Extracts every substring that looks like a postcode from each element of
#' `string` and flattens all matches into a single vector. Elements without
#' a match contribute nothing, so the result can be empty.
#' The string can be an address or any text string.
#' For large flat files, use `find_postcodes()`.
#'
#' @param string Input vector. Either a character vector, or something
#'   coercible to one (e.g. a data frame).
#' @param locale A string, the country code in format ISO 3. Default is "GBR",
#'   the only locale currently supported; any other value raises an error.
#'
#' @return A character vector containing all the matches.
#' @seealso [find_postcodes()]
#' @export
#'
#' @section TODO:
#' Handle extra white spaces: try trimming ws inside postcode if not match
#' Handle special cases with lower casing without interfering with other
#'   strings such as Ec2M 1aA
#' Handle case: currently only finding postcodes that are in caps (to deal
#'   with possible typos, see test)
#'
#' @examples
#' string <- "The quick brown fox lives at 6 Bridge Road, N17 0RN."
#' find_postcodes_in_string(string)
#'
#' @importFrom stringr str_extract_all
find_postcodes_in_string <- function(string, locale = "GBR") {
  postcode_pattern <- switch(locale,
    "GBR" = paste0(
      "(\\b[A-Z]{1,2}\\d[A-Z\\d]?|",    # district preceded by word
      "\\w{0}[A-Z]{1,2}\\d[A-Z\\d]?) ", # district preceded by word with no ws
      "?\\d[A-Z]{2}"                    # sector + unit
    ),
    # Fail loudly rather than reaching the extraction with no pattern defined.
    stop("Unsupported locale: ", locale, call. = FALSE)
  )

  unlist(str_extract_all(string, postcode_pattern))
}


#' Find postcodes in a list, vector or data frame of strings/addresses.
#'
#' Applies `find_postcodes_in_string()` to each element of `X` and collects
#' the results in a list.
#'
#' @param X list, vector or data frame, appropriate to a call to lapply
#' @param ... optional arguments passed on to `find_postcodes_in_string()`
#'   (e.g. `locale`).
#'
#' @return A list with one element per element of `X`, each holding the
#'   matches found by `find_postcodes_in_string()` for that element.
#' @seealso [find_postcodes_in_string()]
#' @export
#'
#' @examples
#' find_postcodes(businesses)
find_postcodes <- function(X, ...) {
  # Forward `...` so caller-supplied options actually reach the worker.
  lapply(X, find_postcodes_in_string, ...)
}


#' Find the numerical parts of an address (building|house|unit).
#'
#' Returns an empty string where no match is found.
#' The match is performed with a look ahead to match address number patterns
#'   that are not a postcode.
#' All matches found in one input element are joined with `collapse`.
#'
#' @param string Input vector. Either a character vector, or something
#'   coercible to one (e.g. a data frame).
#' @param locale A string, the country code in format ISO 3. Default is "GBR",
#'   the only locale currently supported; any other value raises an error.
#' @param collapse A character string separator used to collapse the matches.
#' @param unlist A boolean. If `TRUE` (the default) the per-element results
#'   are flattened into a vector; if `FALSE` they are returned as a list.
#'
#' @return A character vector of the concatenated values, with the same length
#'   as the input character vector; or a list when `unlist = FALSE`.
#' @export
#'
#' @examples
#' string <- "The quick brown fox's family lives at 22A-22B Bridge Road, N17 0RN."
#' find_buildings_numbers(string)
#'
#' find_buildings_numbers(businesses, unlist = FALSE)
#'
#' @importFrom stringr str_c str_extract_all
find_buildings_numbers <- function(string, locale = "GBR", collapse = "|",
                                   unlist = TRUE) {
  numbers_pattern <- switch(locale,
    "GBR" = paste0(
      "(?![a-zA-Z]{1,2}\\d[a-zA-Z\\d]?)", # area + district (outward)
      "(?!\\d[a-zA-Z]{2})\\b",            # sector + unit (inward)
      "([0-9]+[a-zA-Z]?)\\b",             # numeric possibly with letter
      "|\\b\\d+(?=[a-zA-Z]{3,}\\b)"       # digits glued to a 3+ letter word
    ),
    # Fail loudly rather than reaching the extraction with no pattern defined.
    stop("Unsupported locale: ", locale, call. = FALSE)
  )

  matches <- str_extract_all(string, numbers_pattern)
  collapsed <- lapply(matches, str_c, collapse = collapse)

  # The `unlist` argument shadows base::unlist() in this scope, so the
  # function is called fully qualified. Plain if/else replaces the deprecated
  # purrr::when().
  if (unlist) {
    base::unlist(collapsed)
  } else {
    collapsed
  }
}


#' Strip buildings numbers (building|house|unit) from a string or
#' character vector of addresses.
#'
#' Every match of the building-number pattern is replaced by an empty string.
#' The pattern uses look aheads intended to leave postcode parts untouched.
#'
#' @param string Input vector. Either a character vector, or something
#'   coercible to one.
#' @param locale A string, the country code in format ISO 3. Default is "GBR",
#'   the only locale currently supported; any other value raises an error.
#' @param squish_ws A boolean; determines if trailing white spaces and repeated
#'   whitespace inside a string (generated by the replacement), are trimmed or
#'   not.
#'
#' @return A character vector of the original character strings stripped of
#'   building numbers.
#' @seealso [strip_buildings_numbers()], [strip_buildings_numbers_list()]
#' @export
#'
#' @examples
#' strip_buildings_numbers_vec(businesses[['address']])
#'
#' @importFrom stringr str_replace_all str_squish
strip_buildings_numbers_vec <- function(string, locale = "GBR",
                                        squish_ws = TRUE) {
  numbers_pattern <- switch(locale,
    "GBR" = paste0(
      "(?![a-zA-Z]{1,2}\\d[a-zA-Z\\d]?)", # area + district (outward)
      "(?!\\d[a-zA-Z]{2})\\b",            # sector + unit (inward)
      "([0-9]+[a-zA-Z]?)\\b",             # numeric possibly with letter
      "|\\b\\d+(?=[a-zA-Z]{3,}\\b)"       # digits glued to a 3+ letter word
    ),
    # Fail loudly rather than reaching the replacement with no pattern defined.
    stop("Unsupported locale: ", locale, call. = FALSE)
  )

  stripped <- str_replace_all(string, numbers_pattern, "")

  # Plain if/else replaces the deprecated purrr::when().
  if (squish_ws) {
    str_squish(stripped)
  } else {
    stripped
  }
}


#' Strip buildings numbers (building|house|unit) from a list, or data
#' frame of addresses.
#'
#' Applies `strip_buildings_numbers_vec()` to each element (or column) of `X`.
#'
#' @param X list or data frame, appropriate for a call to lapply
#' @param ... optional arguments passed on to `strip_buildings_numbers_vec()`
#'   (e.g. `locale`, `squish_ws`).
#'
#' @return A list with one element per element of `X`.
#' @seealso [strip_buildings_numbers_vec()]
#' @export
#'
#' @examples
#' strip_buildings_numbers_list(businesses)
strip_buildings_numbers_list <- function(X, ...) {
  # Forward `...` so caller-supplied options actually reach the worker.
  lapply(X, strip_buildings_numbers_vec, ...)
}


#' Strip buildings numbers (building|house|unit).
#'
#' Dispatches on the type of `.data`: data frames are processed column by
#' column, other lists element by element, and anything else is handed to
#' `strip_buildings_numbers_vec()` as a whole. `locale` and `squish_ws` are
#' applied in every case.
#'
#' @param .data character, character vector, list or data frame.
#' @param locale A string, the country code in format ISO 3. Default is "GBR".
#' @param squish_ws A boolean; determines if trailing white spaces and repeated
#'   whitespace inside a string (generated by the replacement), are trimmed or
#'   not.
#'
#' @return For a data frame, the processed columns converted with
#'   `as_tibble()`; for a list, a list; otherwise the result of
#'   `strip_buildings_numbers_vec()`.
#' @export
#'
#' @examples
#' strip_buildings_numbers(businesses[['address']])
#' strip_buildings_numbers(businesses)
#'
#' @importFrom dplyr as_tibble
strip_buildings_numbers <- function(.data, locale = "GBR", squish_ws = TRUE) {
  # Shared worker for list-like inputs; passes the options through explicitly.
  strip_each <- function(x) {
    lapply(
      x,
      strip_buildings_numbers_vec,
      locale = locale,
      squish_ws = squish_ws
    )
  }

  # Data frames are lists too, so they must be tested first.
  if (is.data.frame(.data)) {
    as_tibble(strip_each(.data))
  } else if (is.list(.data)) {
    strip_each(.data)
  } else {
    strip_buildings_numbers_vec(.data, locale, squish_ws)
  }
}


#' Reformat a postcode to standards.
#'
#' The input is squished (surrounding and repeated white space removed) and
#' upper-cased. For GBR postcodes, if the result has no white space between
#' the outward (area + district) and inward (sector + unit) codes, then one is
#' added. Missing values and the literal text "NAN" (in any case) become `NA`.
#'
#' @param postcode A character string or vector of strings.
#' @param locale A string, the country code in format ISO 3. Default is "GBR",
#'   the only locale currently supported; any other value raises an error.
#'
#' @return A character vector with one element per input postcode.
#' @export
#'
#' @examples
#' format_postcode('eC2A0Rn')
#' format_postcode(c("ec2a0rn", " n17  0rn ", "nan"))
#'
#' @importFrom stringr str_squish
format_postcode <- function(postcode, locale = "GBR") {
  postcode_pattern <- switch(locale,
    "GBR" = "([A-Z]{1,2}\\d[A-Z\\d]?)(\\d[A-Z]{2})",
    # Fail loudly rather than reaching the substitution with no pattern.
    stop("Unsupported locale: ", locale, call. = FALSE)
  )

  squished <- str_squish(postcode)
  formatted <- toupper(squished)

  # "NAN" has no space, so it must be flagged as missing before the
  # space-insertion step or it would never be reached.
  is_missing <- is.na(squished) | formatted == "NAN"
  needs_space <- !is_missing & !grepl(" ", formatted, fixed = TRUE)

  # Vectorised masks instead of a scalar `if`, so inputs longer than one
  # element are handled element by element.
  formatted[needs_space] <- gsub(
    postcode_pattern, "\\1 \\2", formatted[needs_space],
    perl = TRUE
  )
  formatted[is_missing] <- NA_character_

  formatted
}


#' Is the string the district part of a postcode.
#'
#' Tests each element of `postcode` against an anchored pattern, so the whole
#' string must be a district code and nothing else.
#'
#' @param postcode A character string (or vector of strings).
#' @param ignore_case A boolean; if `TRUE` the match is case-insensitive.
#'
#' @return A boolean (one per element of `postcode`).
#' @export
#'
#' @examples
#' is_district("EC2A")
#' is_district("se16", ignore_case = TRUE)
is_district <- function(postcode, ignore_case = FALSE) {
  # The first letter is shared by all three alternatives below.
  district_regex <- paste0(
    "^[A-PR-UWYZ]([0-9]{1,2}",             # letter + 1-2 digits (E2)
    "|([A-HK-Y][0-9]([0-9ABEHMNPRV-Y])?)", # 2 letters + digit (+1) (EC2A, SE16)
    "|[0-9][A-HJKPS-UW])$"                 # letter + digit + letter (W1A)
  )

  grepl(pattern = district_regex, x = postcode, ignore.case = ignore_case)
}


#' Is the string the sector part of a postcode.
#'
#' Tests each element of `postcode` against an anchored pattern: a district
#' code, an optional single space, then exactly one digit.
#'
#' @param postcode A character string (or vector of strings).
#' @param ignore_case A boolean; if `TRUE` the match is case-insensitive.
#'
#' @return A boolean (one per element of `postcode`).
#' @export
#'
#' @section FIXME:
#' SE16 is TRUE (should be FALSE) because, with the space being optional, it
#' is read as SE1 6.
#'
#' @examples
#' is_sector("EC2A 0")
#' is_sector("se16 0", ignore_case = TRUE)
is_sector <- function(postcode, ignore_case = FALSE) {
  # District alternatives followed by an optional space and the sector digit.
  sector_regex <- paste0(
    "^[A-PR-UWYZ]([0-9]{1,2}",             # letter + 1-2 digits (E2 0)
    "|([A-HK-Y][0-9]([0-9ABEHMNPRV-Y])?)", # 2 letters + digit (+1) (EC2A 0)
    "|[0-9][A-HJKPS-UW]) ?[0-9]$"          # letter + digit + letter (W1A 0)
  )

  grepl(pattern = sector_regex, x = postcode, ignore.case = ignore_case)
}


#' Is the string a full postcode.
#'
#' Tests each element of `postcode` for a complete postcode (outward code,
#' optional space, inward code) or the special code "GIR 0AA". The pattern is
#' not anchored, so a longer string that contains a full postcode also
#' returns `TRUE`.
#'
#' @param postcode A character string (or vector of strings).
#' @param ignore_case A boolean; if `TRUE` the match is case-insensitive.
#'
#' @return A boolean (one per element of `postcode`).
#' @export
#'
#' @section TODO:
#' Handle case is_postcode_complete("ng22 9qc", ignore_case = TRUE)
#' This postcode is not recognized by the regex but is valid(?)
#' (The unit letter class used here, `[ABD-HJLNP-UW-Z]`, does not include C.)
#'
#' @examples
#' is_postcode_complete("EC2A 3JX")
#' is_postcode_complete("ec2a", ignore_case = TRUE)
is_postcode_complete <- function(postcode, ignore_case = FALSE) {
  # Same expression as a single literal, split only for readability.
  complete_regex <- paste0(
    "(GIR ?0AA|",                          # special case
    "[A-PR-UWYZ]([0-9]{1,2}",              # letter + 1-2 digits
    "|([A-HK-Y][0-9]([0-9ABEHMNPRV-Y])?)", # 2 letters + digit (+1)
    "|[0-9][A-HJKPS-UW])",                 # letter + digit + letter
    " ?[0-9][ABD-HJLNP-UW-Z]{2})"          # optional space + sector + unit
  )

  grepl(pattern = complete_regex, x = postcode, ignore.case = ignore_case)
}


#' Is the string a partial postcode.
#'
#' A postcode is partial if:
#' 1. it is not complete
#' 2. it is the area + district (outward) part of the postcode
#' 3. OR it is the sector + unit (inward) part of the postcode
#'
#' @param postcode A character string (or vector of strings).
#' @param ignore_case A boolean; if `TRUE` the match is case-insensitive.
#'
#' @return A boolean (one per element of `postcode`).
#' @export
#'
#' @examples
#' is_postcode_partial("EC2A 3JX")
#' is_postcode_partial("ec2a", ignore_case = TRUE)
#' is_postcode_partial("3JX")
is_postcode_partial <- function(postcode, ignore_case = FALSE) {
  is_outward <- is_district(postcode, ignore_case = ignore_case)

  # Inward code: one digit followed by two letters, using the same unit
  # letter class as the pattern in is_postcode_complete().
  is_inward <- grepl(
    "^[0-9][ABD-HJLNP-UW-Z]{2}$",
    postcode,
    ignore.case = ignore_case
  )

  !is_postcode_complete(postcode, ignore_case = ignore_case) &
    (is_outward | is_inward)
}