# R/standardize.R
#
# Defines functions: remove_punct, remove_tl_whitespace, remove_articles, standardize_

# Delete every character that is not alphanumeric, whitespace, "#" or "-".
#
# Plain-function replacement for the earlier magrittr functional sequence.
# Unwanted characters are removed outright (not replaced by a space), so
# "A.-B." becomes "A-B". Works element-wise on a character vector.
remove_punct <- function(string) {
  # Negated bracket expression: anything outside the kept set is dropped.
  unwanted <- "[^[:alnum:][:space:]#-]"
  gsub(unwanted, "", string)
}

# Strip leading and trailing whitespace from each element of `string`.
#
# Interior whitespace is left untouched. The pattern contains no letters, so
# case-insensitive matching is not needed and is not requested here.
remove_tl_whitespace <- function(string) {
  # Either a run of whitespace at the start, or a run at the end.
  edge_whitespace <- "^\\s+|\\s+$"
  gsub(edge_whitespace, "", string)
}

# Remove French and English articles, then trim leading/trailing whitespace.
#
# Articles are matched as whole words (`\\b...\\b`), case-insensitively, and
# replaced by the empty string. The order matters: trimming must happen AFTER
# the articles are removed, because deleting a leading or trailing article
# ("The Company", "Compagnie la") leaves whitespace at the edge of the string.
# Trimming afterwards also covers whitespace that was already on the input.
#
# Note: only the edges are trimmed; the spaces that surrounded an article in
# the middle of the string are both kept.
remove_articles <- function(string) {
  articles <- "\\b(d|de|le|l|la|aux|au|des|du|les|the)\\b"
  without_articles <- gsub(articles, "", string, ignore.case = TRUE)
  remove_tl_whitespace(without_articles)
}

# Standardize one string against a dictionary (worker for `standardize()`).
#
# Steps: transliterate to ASCII, strip punctuation, drop articles, lowercase,
# split into words on spaces, replace every word that has a non-missing
# "standard" in the dictionary, and join the words back with single spaces.
#
# Temp variables are used instead of a pipe to keep the call stack short when
# debugging.
#
# text:       a single string. Callers with a vector should go through
#             `standardize()`, which calls this once per element.
# dictionary: a data frame (or tibble) with columns "word" and "standard".
#
# Returns a length-one character vector; `NA_character_` when `text` is NA or
# cannot be transliterated.
standardize_ <- function(text, dictionary) {
  # Fail loudly: a missing column would otherwise just mean "no replacements".
  if (!all(c("word", "standard") %in% names(dictionary))) {
    stop(
      "`dictionary` must have columns \"word\" and \"standard\".",
      call. = FALSE
    )
  }

  # Convert encoding so accents etc. are represented in plain ASCII.
  x <- iconv(text, to = "ASCII//TRANSLIT")

  # iconv() gives NA for NA input and for strings it cannot convert. Without
  # this guard the paste0() at the end would return the literal string "NA".
  if (length(x) == 1L && is.na(x)) {
    return(NA_character_)
  }

  x <- remove_punct(x)    # remove punctuation (and accent residue)
  x <- remove_articles(x) # remove articles and edge whitespace
  x <- tolower(x)         # lowercase so dictionary lookups are exact matches

  # Split into words. Leading or doubled spaces (e.g. where an article or a
  # punctuation mark was deleted) produce empty tokens; drop them so they are
  # not re-joined as stray spaces.
  words <- unlist(strsplit(x, " ", fixed = TRUE))
  words <- words[nzchar(words)]

  # Look each word up in the dictionary. match() uses the first entry when a
  # word is listed more than once, so the number of words never changes.
  standard <- as.character(dictionary[["standard"]])[
    match(words, dictionary[["word"]])
  ]

  # If there is an entry in the dictionary, replace the word with the standard.
  has_standard <- !is.na(standard)
  words[has_standard] <- standard[has_standard]

  # Put the string back together.
  paste0(words, collapse = " ")
}

#' Standardize names or addresses
#'
#' Standardizes each string word by word using a dictionary: the text is
#' transliterated to ASCII, stripped of punctuation and articles, lowercased,
#' and every word found in the dictionary is replaced by its standard form.
#' There are two dictionaries supplied (`company_dictionary` and
#' `address_dictionary`), and the user may supply a new one (a two-column
#' data frame with columns "word" and "standard").
#'
#' @param text A character vector of strings to convert using the dictionary.
#'
#' @param dictionary A dictionary of word pairs (word, standard) to convert.
#'
#' @return A character vector of converted strings, the same length as `text`
#'   (`character(0)` for zero-length input).
#' @keywords standardize names addresses
#' @examples
#'
#' library(dplyr)
#'
#' # company_dictionary is a dataset from this package
#' "A.-B. SECURITY LIMITED" %>% standardize(dictionary = company_dictionary)
#'
#' # is vectorized:
#' c("A.-B. SECURITY LIMITED", "RNN Sales & Réntals") %>%
#'   standardize(dictionary = company_dictionary)
#'
#' \dontrun{
#' br %>% mutate(standardized_name = standardize(name, dictionary = company_dictionary))
#' }
#' @export
standardize <- function(text, dictionary) {
  # vapply() rather than Vectorize()/mapply(): the result is always a character
  # vector, including for zero-length input, where mapply() returns list().
  vapply(
    text,
    standardize_,
    FUN.VALUE = character(1),
    dictionary = dictionary,
    USE.NAMES = FALSE
  )
}
# tweed1e/matchtools documentation built on May 29, 2019, 10:51 a.m.