#' Find postcodes in a string.
#'
#' Extracts every substring that looks like a postcode from `string` and
#' returns all the matches as one flat character vector.
#' The string can be an address or any text string.
#' For lists or data frames, use `find_postcodes()`.
#'
#' @param string Input vector. Either a character vector, or something
#'   coercible to one (e.g. a data frame).
#' @param locale A string, the country code in format ISO 3. Default is "GBR",
#'   which is the only locale currently supported; any other value raises an
#'   error.
#'
#' @return A character vector containing all the matches. It has length zero
#'   when no match is found.
#' @seealso [find_postcodes()]
#' @export
#'
#' @section TODO:
#' Handle extra white spaces: try trimming ws inside postcode if not match
#' Handle special cases with lower casing without interfering with other
#' strings such as Ec2M 1aA
#' Handle case: currently only finding postcodes that are in caps (to deal
#' with possible typos, see test)
#'
#' @examples
#' string <- "The quick brown fox lives at 6 Bridge Road, N17 0RN."
#' find_postcodes_in_string(string)
#'
#' @importFrom stringr str_extract_all
find_postcodes_in_string <- function(string, locale = "GBR") {
  postcode_pattern <- switch(locale,
    "GBR" = paste0(
      "(\\b[A-Z]{1,2}\\d[A-Z\\d]?|", # district starting at a word boundary
      "\\w{0}[A-Z]{1,2}\\d[A-Z\\d]?) ", # district glued to a preceding word
      "?\\d[A-Z]{2}" # optional space, then sector + unit
    ),
    # Fail loudly instead of hitting "object 'postcode_pattern' not found".
    stop("Unsupported locale: ", locale, call. = FALSE)
  )
  # One vector of matches per input element, flattened into a single vector.
  unlist(str_extract_all(string, postcode_pattern))
}
#' Find postcodes in a list, vector or data frame of strings/addresses.
#'
#' Applies `find_postcodes_in_string()` to every element of `X` (every column
#' when `X` is a data frame) and returns the results as a list.
#'
#' @param X list, vector or data frame, appropriate to a call to lapply
#' @param ... optional arguments passed on to `find_postcodes_in_string()`
#'   (e.g. `locale`).
#'
#' @return A list with one element per element of `X`, each holding the
#'   result of `find_postcodes_in_string()` for that element.
#' @seealso [find_postcodes_in_string()]
#' @export
#'
#' @examples
#' find_postcodes(businesses)
find_postcodes <- function(X, ...) {
  # Forward `...` so that options such as `locale` actually reach the worker.
  lapply(X, find_postcodes_in_string, ...)
}
#' Find the numerical parts of an address (building|house|unit).
#'
#' Returns an empty string where no match is found.
#' The match is performed with look-aheads so that number patterns belonging
#' to a postcode are not reported.
#'
#' @param string Input vector. Either a character vector, or something
#'   coercible to one (e.g. a data frame).
#' @param locale A string, the country code in format ISO 3. Default is "GBR",
#'   which is the only locale currently supported; any other value raises an
#'   error.
#' @param collapse A character string separator used to collapse the matches
#'   found in one input element.
#' @param unlist A boolean. If `TRUE` (the default) the per-element results
#'   are flattened with `unlist()`; otherwise they are returned as a list.
#'
#' @return When `unlist` is `TRUE`, a character vector of the concatenated
#'   values, with the same length as the input character vector. Otherwise a
#'   list holding one collapsed string per input element.
#' @export
#'
#' @examples
#' string <- "The quick brown fox's family lives at 22A-22B Bridge Road, N17 0RN."
#' find_buildings_numbers(string)
#'
#' find_buildings_numbers(businesses, unlist = FALSE)
#'
#' @importFrom stringr str_c str_extract_all
find_buildings_numbers <- function(string, locale = "GBR", collapse = "|", unlist = TRUE) {
  numbers_pattern <- switch(locale,
    "GBR" = paste0(
      "(?![a-zA-Z]{1,2}\\d[a-zA-Z\\d]?)", # not at an area + district (outward)
      "(?!\\d[a-zA-Z]{2})\\b", # not at a sector + unit (inward)
      "([0-9]+[a-zA-Z]?)\\b", # numeric possibly with letter
      "|\\b\\d+(?=[a-zA-Z]{3,}\\b)" # digits glued to a word of 3+ letters
    ),
    # Fail loudly instead of hitting "object 'numbers_pattern' not found".
    stop("Unsupported locale: ", locale, call. = FALSE)
  )
  matches <- str_extract_all(string, numbers_pattern)
  # Join the matches of each input element into one string.
  joined <- lapply(matches, str_c, collapse = collapse)
  # Plain `if` replaces the deprecated purrr::when(); base:: is explicit
  # because the argument `unlist` shadows the function name.
  if (isTRUE(unlist)) {
    base::unlist(joined)
  } else {
    joined
  }
}
#' Strip buildings numbers (building|house|unit) from a string or
#' character vector of addresses.
#'
#' Numbers that are part of a postcode are protected by look-aheads and are
#' left in place.
#'
#' @param string Input vector. Either a character vector, or something
#'   coercible to one.
#' @param locale A string, the country code in format ISO 3. Default is "GBR",
#'   which is the only locale currently supported; any other value raises an
#'   error.
#' @param squish_ws A boolean; determines if trailing white spaces and repeated
#'   whitespace inside a string (generated by the replacement), are trimmed or
#'   not.
#'
#' @return A character vector of the original character strings stripped of
#'   building numbers.
#' @seealso [strip_buildings_numbers_list()], [strip_buildings_numbers()]
#' @export
#'
#' @examples
#' strip_buildings_numbers_vec(businesses[['address']])
#'
#' @importFrom stringr str_replace_all str_squish
strip_buildings_numbers_vec <- function(string, locale = "GBR", squish_ws = TRUE) {
  numbers_pattern <- switch(locale,
    "GBR" = paste0(
      "(?![a-zA-Z]{1,2}\\d[a-zA-Z\\d]?)", # not at an area + district (outward)
      "(?!\\d[a-zA-Z]{2})\\b", # not at a sector + unit (inward)
      "([0-9]+[a-zA-Z]?)\\b", # numeric possibly with letter
      "|\\b\\d+(?=[a-zA-Z]{3,}\\b)" # digits glued to a word of 3+ letters
    ),
    # Fail loudly instead of hitting "object 'numbers_pattern' not found".
    stop("Unsupported locale: ", locale, call. = FALSE)
  )
  stripped <- str_replace_all(string, numbers_pattern, "")
  # Plain `if` replaces the deprecated purrr::when().
  if (isTRUE(squish_ws)) {
    str_squish(stripped)
  } else {
    stripped
  }
}
#' Strip buildings numbers (building|house|unit) from a list, or data
#' frame of addresses.
#'
#' Applies `strip_buildings_numbers_vec()` to every element of `X` (every
#' column when `X` is a data frame).
#'
#' @param X list or data frame, appropriate for a call to lapply
#' @param ... optional arguments passed on to `strip_buildings_numbers_vec()`
#'   (e.g. `locale`, `squish_ws`).
#'
#' @return A list with one element per element of `X`.
#' @seealso [strip_buildings_numbers_vec()]
#' @export
#'
#' @examples
#' strip_buildings_numbers_list(businesses)
strip_buildings_numbers_list <- function(X, ...) {
  # Forward `...` so that `locale` / `squish_ws` actually reach the worker.
  lapply(X, strip_buildings_numbers_vec, ...)
}
#' Strip buildings numbers (building|house|unit).
#'
#' Dispatches on the type of `.data`: atomic vectors are processed directly,
#' lists and data frames element by element (column by column).
#'
#' @param .data character, character vector, list or data frame.
#' @param locale A string, the country code in format ISO 3. Default is "GBR".
#' @param squish_ws A boolean; determines if trailing white spaces and repeated
#'   whitespace inside a string (generated by the replacement), are trimmed or
#'   not.
#'
#' @return A character vector for an atomic vector input, a list for a list
#'   input and a tibble for a data frame input. Any other input yields `NULL`
#'   (invisibly).
#' @export
#'
#' @examples
#' strip_buildings_numbers(businesses[['address']])
#' strip_buildings_numbers(businesses)
#'
#' @importFrom dplyr as_tibble
strip_buildings_numbers <- function(.data, locale = "GBR", squish_ws = TRUE) {
  if (is.data.frame(.data)) {
    # Pass the options down: they used to be dropped for data frames.
    cleaned <- lapply(
      .data, strip_buildings_numbers_vec,
      locale = locale, squish_ws = squish_ws
    )
    as_tibble(cleaned)
  } else if (is.list(.data)) {
    # Plain lists are documented as supported input.
    lapply(
      .data, strip_buildings_numbers_vec,
      locale = locale, squish_ws = squish_ws
    )
  } else if (is.vector(.data)) {
    strip_buildings_numbers_vec(.data, locale, squish_ws)
  } else {
    # Unsupported input: keep the historical silent NULL.
    invisible(NULL)
  }
}
#' Reformat a postcode to standards.
#'
#' The input is squished (surrounding and repeated whitespace removed) and
#' upper-cased. For GBR postcodes, if it is missing a white space between the
#' outward (area + district) and inward (sector + unit) codes, then add it.
#' The literal string "NAN" (in any case) is treated as missing.
#'
#' @param postcode A character string or vector of strings.
#' @param locale A string, the country code in format ISO 3. Default is "GBR",
#'   which is the only locale currently supported; any other value raises an
#'   error.
#'
#' @return A character vector of the same length as `postcode`, with `NA`
#'   for missing values.
#' @export
#'
#' @examples
#' format_postcode('eC2A0Rn')
#'
#' @importFrom stringr str_squish
format_postcode <- function(postcode, locale = "GBR") {
  postcode_pattern <- switch(locale,
    "GBR" = "([A-Z]{1,2}\\d[A-Z\\d]?)(\\d[A-Z]{2})",
    # Fail loudly instead of hitting "object 'postcode_pattern' not found".
    stop("Unsupported locale: ", locale, call. = FALSE)
  )
  tidy <- toupper(str_squish(postcode))
  # Only values without any space get one inserted between outward and
  # inward codes; values that already contain a space are left as they are.
  needs_space <- !is.na(tidy) & !grepl(" ", tidy, fixed = TRUE)
  tidy[needs_space] <- gsub(
    postcode_pattern, "\\1 \\2", tidy[needs_space],
    perl = TRUE
  )
  # "NAN" has no space, so it must be handled explicitly: the former
  # case_when() branch for it could never be reached.
  tidy[tidy %in% "NAN"] <- NA_character_
  tidy
}
#' Is the string the district part of a postcode.
#'
#' Tests each element of `postcode` against an anchored pattern for the
#' outward (area + district) code; the whole string has to match.
#'
#' @param postcode A character string.
#' @param ignore_case A boolean. If `TRUE`, letter case is ignored.
#'
#' @return A boolean.
#' @export
#'
#' @examples
#' is_district("EC2A")
#' is_district("se16", ignore_case = TRUE)
is_district <- function(postcode, ignore_case = FALSE) {
  district_regex <- paste(
    c(
      "^[A-PR-UWYZ]([0-9]{1,2}", # one-letter area + 1-2 digits (E2, N17)
      "|([A-HK-Y][0-9]([0-9ABEHMNPRV-Y])?)", # two-letter area (SE1, SE16, EC2A)
      "|[0-9][A-HJKPS-UW])$" # one-letter area + digit + letter (W1A)
    ),
    collapse = ""
  )
  grepl(pattern = district_regex, x = postcode, ignore.case = ignore_case)
}
#' Is the string the sector part of a postcode.
#'
#' Tests each element of `postcode` against an anchored pattern made of an
#' outward (area + district) code, an optional space and one sector digit.
#'
#' @param postcode A character string.
#' @param ignore_case A boolean. If `TRUE`, letter case is ignored.
#'
#' @return A boolean.
#' @export
#'
#' @section FIXME:
#' SE16 is TRUE (should be FALSE) because the space is optional, so it is
#' read as SE1 6 without space
#'
#' @examples
#' is_sector("EC2A 0")
#' is_sector("se16 0", ignore_case = TRUE)
is_sector <- function(postcode, ignore_case = FALSE) {
  sector_regex <- paste(
    c(
      "^[A-PR-UWYZ]([0-9]{1,2}", # one-letter area + 1-2 digits (E2 0)
      "|([A-HK-Y][0-9]([0-9ABEHMNPRV-Y])?)", # two-letter area (SE16 0, EC2A 0)
      "|[0-9][A-HJKPS-UW]) ?[0-9]$" # W1A-style district, then the sector digit
    ),
    collapse = ""
  )
  grepl(pattern = sector_regex, x = postcode, ignore.case = ignore_case)
}
#' Is the string a full postcode.
#'
#' Looks for a complete postcode (outward code, optional space, inward code,
#' or the special code GIR 0AA) in each element of `postcode`. The pattern is
#' not anchored, so a string that merely contains a full postcode also
#' yields `TRUE`.
#'
#' @param postcode A character string.
#' @param ignore_case A boolean. If `TRUE`, letter case is ignored.
#'
#' @return A boolean.
#' @export
#'
#' @section TODO:
#' Handle case is_postcode_complete("ng22 9qc", ignore_case = TRUE)
#' This postcode is not recognized by the regex but is valid(?)
#'
#' @examples
#' is_postcode_complete("EC2A 3JX")
#' is_postcode_complete("ec2a", ignore_case = TRUE)
is_postcode_complete <- function(postcode, ignore_case = FALSE) {
  special_code <- "GIR ?0AA"
  outward_code <- paste0(
    "[A-PR-UWYZ]([0-9]{1,2}",
    "|([A-HK-Y][0-9]([0-9ABEHMNPRV-Y])?)",
    "|[0-9][A-HJKPS-UW])"
  )
  inward_code <- "[0-9][ABD-HJLNP-UW-Z]{2}"
  # Reassembles exactly the original single-string pattern.
  full_regex <- paste0("(", special_code, "|", outward_code, " ?", inward_code, ")")
  grepl(pattern = full_regex, x = postcode, ignore.case = ignore_case)
}
#' Is the string a partial postcode.
#'
#' A postcode is meant to be partial if:
#' 1. it is not complete
#' 2. it is the area + district (outward) part of the postcode
#' 3. OR it is the sector + unit (inward) part of the postcode
#'
#' Note: as implemented, only conditions 1 and 2 are tested, through
#' `is_postcode_complete()` and `is_district()`. An inward code on its own
#' is not recognised as partial.
#'
#' @param postcode A character string.
#' @param ignore_case A boolean, passed on to both checks.
#'
#' @return A boolean.
#' @export
#'
#' @examples
#' is_postcode_partial("EC2A 3JX")
#' is_postcode_partial("ec2a", ignore_case = TRUE)
is_postcode_partial <- function(postcode, ignore_case = FALSE) {
  complete <- is_postcode_complete(postcode, ignore_case = ignore_case)
  outward_only <- is_district(postcode, ignore_case = ignore_case)
  !complete & outward_only
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.