R/dictionary.R

Defines functions pm_append pm_convert_case pm_case pm_dictionary_country pm_dictionary_us_intersection pm_dictionary_us_suffix pm_dictionary_us_dir pm_parse_place pm_get_tidycensus pm_dictionary_us_cities pm_dictionary_us_states pm_dictionary

Documented in pm_append pm_dictionary

#' Create Dictionary Objects
#'
#' @description This function allows for the creation of dictionary objects that are
#'     either optional or required elements of other \code{postmastr} functions. Once created,
#'     dictionary objects should not be modified as the formatting of dictionary elements provides
#'     other \code{postmastr} functions with predictable inputs. The \code{"state"},
#'     \code{"directional"}, and \code{"suffix"} dictionaries are based on tables that are also
#'     exported so that they can be previewed and tested.
#'
#'     Street dictionaries cannot be created using \code{pm_dictionary}. Users who wish to standardize
#'     street names in their data should create a dictionary appendix with \code{pm_append} and use
#'     that as the primary dictionary for functions that accept street dictionaries. The same is true
#'     for house suffix dictionaries.
#'
#' @details The city dictionary functionality is powered by the \link[tidycensus]{get_acs} function
#'     from the \pkg{tidycensus} package. This requires a Census Bureau API key, which can be
#'     obtained at \url{http://api.census.gov/data/key_signup.html}. Once you have a key, the
#'     \link[tidycensus]{census_api_key} function from \pkg{tidycensus} should be used to
#'     set-up the key before proceeding with the creation of any dictionary objects for cities.
#'
#' @usage pm_dictionary(type, append, filter, case = c("title", "lower", "upper"), locale = "us")
#'
#' @param type A string indicating the grammatical address element the dictionary
#'     should represent. Current options are \code{"state"}, \code{"city"},
#'     \code{"directional"}, and \code{"suffix"}.
#' @param append An optional dictionary appendix object created with \code{\link{pm_append}}
#' @param filter An optional character scalar or vector with output elements that should
#'     be retained. Filter inputs are based on the expected output: states should be specified
#'     with their two-letter abbrevation (i.e. \code{"MO"}), cities should be specified in title
#'     case (i.e. \code{"St. Louis"}), directionals should use one- or two-letter abbreivations
#'     (i.e. \code{"N"}), and street suffixes should use the \code{suf.output} for the desired
#'     suffix (i.e. \code{"Ave"} or \code{"Rd"}).
#' @param case An optional character scalar or vector containing one or more of three valid
#'     options - \code{"title"} (e.g. "Missouri"), \code{"lower"} (e.g. "missouri), or
#'     \code{"upper"} (e.g. "MISSOURI"). These are used to create a more robust dictionary of
#'     input terms.
#' @param locale A string indicating the country these data represent; the only
#'     current option is \code{"us"} but this is included to facilitate future expansion.
#'
#' @return A \code{postmastr} dictionary object, which will always include an input column
#'     of possible terms for the given grammatical address element. All dictionary objects
#'     except for city dictionaries will also contain an output column with the
#'     preferred output for each input. For American addresses, these outputs follow United States
#'     Postal Service guidelines.
#'
#' @seealso \code{\link{pm_append}}, \code{\link{dic_us_dir}}, \code{\link{dic_us_states}},
#'     \code{\link{dic_us_suffix}}
#'
#' @examples
#' # build state dictionary, title case only
#' pm_dictionary(type = "state", filter = "MO", case = "title", locale = "us")
#'
#' # build state dictionary, title and upper case
#' pm_dictionary(type = "state", filter = "MO", case = c("title", "upper"), locale = "us")
#'
#' # build New England state dictionary, all cases
#' pm_dictionary(type = "state", filter = c("CT", "MA", "ME", "NH", "RI", "VT"),
#'     case = c("title", "upper", "lower"), locale = "us")
#'
#' # add custom abbreviation for Massachusetts
#' ## create dictionary appendix
#' ma <- pm_append(type = "state", input = "Mass", output = "MA", locale = "us")
#'
#' ## add dictionary appendix to dictionary
#' pm_dictionary(type = "state", filter = c("CT", "MA", "ME", "NH", "RI", "VT"),
#'     append = ma, case = "title", locale = "us")
#'
#' # build Missouri city dictionary
#' \dontrun{
#' # tidycensus::census_api_key("YOUR API KEY GOES HERE")
#' pm_dictionary(type = "city", filter = "MO", case = "upper", locale = "us")
#' }
#'
#' # build directional dictionary
#' pm_dictionary(type = "directional", filter = c("N", "S", "E", "W"), locale = "us")
#'
#' # build suffix dictionary
#' pm_dictionary(type = "suffix", filter = c("Ave", "Rd"), locale = "us")
#'
#' @importFrom dplyr bind_rows
#' @importFrom dplyr filter
#'
#' @export
pm_dictionary <- function(type, append, filter, case = c("title", "lower", "upper"), locale = "us"){

  if (locale == "us"){

    if (type == "state"){

      if (missing(append) == FALSE & missing(filter) == FALSE){
        working <- pm_dictionary_us_states(append = append, filter = filter)
      } else if (missing(append) == FALSE & missing(filter) == TRUE){
        working <- pm_dictionary_us_states(append = append)
      } else if (missing(append) == TRUE & missing(filter) == FALSE){
        working <- pm_dictionary_us_states(filter = filter)
      } else if (missing(append) == TRUE & missing(filter) == TRUE){
        working <- pm_dictionary_us_states()
      }

      out <- pm_case(working, locale = locale, type = type, case = case)

    } else if (type == "city"){

      if (missing(filter) == TRUE){
        stop("The argument 'filter' is required for constructing city dictionaries.")
      }

      if (missing(append) == FALSE){
        working <- pm_dictionary_us_cities(append = append, states = filter)
      } else if (missing(append) == TRUE){
        working <- pm_dictionary_us_cities(states = filter)
      }

     out <- pm_case(working, locale = locale, type = type, case = case)

    } else if (type == "directional"){

      if (missing(append) == FALSE & missing(filter) == FALSE){
        working <- pm_dictionary_us_dir(append = append, filter = filter)
      } else if (missing(append) == FALSE & missing(filter) == TRUE){
        working <- pm_dictionary_us_dir(append = append)
      } else if (missing(append) == TRUE & missing(filter) == FALSE){
        working <- pm_dictionary_us_dir(filter = filter)
      } else if (missing(append) == TRUE & missing(filter) == TRUE){
        working <- pm_dictionary_us_dir()
      }

      out <- pm_case(working, locale = locale, type = type, case = case)

    } else if (type == "suffix"){

      if (missing(append) == FALSE & missing(filter) == FALSE){
        working <- pm_dictionary_us_suffix(append = append, filter = filter)
      } else if (missing(append) == FALSE & missing(filter) == TRUE){
        working <- pm_dictionary_us_suffix(append = append)
      } else if (missing(append) == TRUE & missing(filter) == FALSE){
        working <- pm_dictionary_us_suffix(filter = filter)
      } else if (missing(append) == TRUE & missing(filter) == TRUE){
        working <- pm_dictionary_us_suffix()
      }

      out <- pm_case(working, locale = locale, type = type, case = case)

    } else if (type == "intersection"){

      if (missing(append) == FALSE & missing(filter) == FALSE){
        working <- pm_dictionary_us_intersection(append = append, filter = filter)
      } else if (missing(append) == FALSE & missing(filter) == TRUE){
        working <- pm_dictionary_us_intersection(append = append)
      } else if (missing(append) == TRUE & missing(filter) == FALSE){
        working <- pm_dictionary_us_intersection(filter = filter)
      } else if (missing(append) == TRUE & missing(filter) == TRUE){
        working <- pm_dictionary_us_intersection()
      }

      out <- pm_case(working, locale = locale, type = type, case = case)

    } else if (type == "country"){

      if (missing(append) == FALSE & missing(filter) == FALSE){
        working <- pm_dictionary_country(append = append, filter = filter)
      } else if (missing(append) == FALSE & missing(filter) == TRUE){
        working <- pm_dictionary_country(append = append)
      } else if (missing(append) == TRUE & missing(filter) == FALSE){
        working <- pm_dictionary_country(filter = filter)
      } else if (missing(append) == TRUE & missing(filter) == TRUE){
        working <- pm_dictionary_country()
      }

      out <- pm_case(working, locale = locale, type = type, case = case)

    }

  }

  # return output
  return(out)

}

# us states
pm_dictionary_us_states <- function(append, filter){

  # global bindings
  state.output = NULL

  # load data
  out <- postmastr::dic_us_states

  # optionally append
  if (missing(append) == FALSE){

    # bind rows
    out <- dplyr::bind_rows(out, append)

    # re-order observations
    out <- out[order(out$state.output),]

  }

  # optionally filter
  if (missing(filter) == FALSE){
    out <- dplyr::filter(out, state.output %in% filter)
  }

  # return output
  return(out)

}

# us cities
pm_dictionary_us_cities <- function(append, states){

  out <- pm_get_tidycensus(states = states)

  # optionally append
  if (missing(append) == FALSE){

    # bind rows
    out <- dplyr::bind_rows(out, append)

    # re-order observations
    out <- out[order(out$city.input),]

  }

  # return output
  return(out)

}

# us cities via tidycensus
pm_get_tidycensus <- function(states){

  # global bindings
  state.abb = NAME = NULL

  # download data
  states %>%
    base::split(states) %>%
    purrr::map_df(~ suppressMessages(
      tidycensus::get_acs(year = 2017, state = .x, geography = "place", variable = "B01003_001"))) -> data

  # create state dictionary
  dict <- data.frame(
    state.name = c(datasets::state.name),
    state.abb = c(datasets::state.abb),
    stringsAsFactors = FALSE
  )

  dict <- dplyr::filter(dict, state.abb %in% states)
  dict <- dict$state.name

  # parse state names
  data %>%
    dplyr::select(NAME) %>%
    dplyr::mutate(NAME = stringr::str_replace_all(NAME, pattern = ",", replace = "")) %>%
    pm_parse_place(dictionary = dict) %>%
    dplyr::mutate(NAME = stringr::str_trim(NAME, side = "right")) -> data

  # create directory
  dict <- c("city", "town", "village", "CDP")

  # parse place types
  data %>%
    pm_parse_place(dictionary = dict) -> data

  # clean-up output
  data %>%
    dplyr::mutate(NAME = stringr::str_trim(NAME, side = "right")) %>%
    dplyr::distinct(NAME, .keep_all = TRUE) %>%
    dplyr::rename(city.input = NAME) -> data

  # re-order observations
  data <- data[order(data$city.input),]

  # return output
  return(data)

}

# parse components of tidycensus data
pm_parse_place <- function(.data, dictionary){

  # global bindings
  NAME = pm.place = NULL

  # iterate over observations
  .data %>%
    dplyr::mutate(pm.place = purrr::map(NAME, ~ pm_extract_pattern(.x, dictionary = dictionary, end = TRUE))) -> .data

  # clean address data
  .data %>%
    tidyr::unnest(pm.place) %>%
    dplyr::filter(is.na(pm.place) == FALSE) %>%
    dplyr::mutate(pm.place = as.character(pm.place)) %>%
    dplyr::mutate(NAME = stringr::str_replace(NAME,
                                              pattern = stringr::str_c("\\b", pm.place, "\\b$"),
                                              replacement = "")) %>%
    dplyr::select(-pm.place) -> .data

  return(.data)

}

# us directions
pm_dictionary_us_dir <- function(append, filter){

  # global bindings
  dir.output = NULL

  # load data
  out <- postmastr::dic_us_dir

  # optionally append
  if (missing(append) == FALSE){

    # bind rows
    out <- dplyr::bind_rows(out, append)

    # re-order observations
    out <- out[order(out$dir.output),]

  }

  # optionally filter
  if (missing(filter) == FALSE){
    out <- dplyr::filter(out, dir.output %in% filter)
  }

  # return output
  return(out)

}

# us street suffixes
pm_dictionary_us_suffix <- function(append, filter){

  # global bindings
  suf.output = NULL

  # load data
  out <- postmastr::dic_us_suffix

  # optionally append
  if (missing(append) == FALSE){

    # bind rows
    out <- dplyr::bind_rows(out, append)

    # re-order observations
    out <- out[order(out$suf.output),]

  }

  # optionally filter
  if (missing(filter) == FALSE){
    out <- dplyr::filter(out, suf.output %in% filter)
  }

  # return output
  return(out)

}

# us intersections
pm_dictionary_us_intersection <- function(append, filter){

  # global bindings
  intersect.output = intersect.input = NULL

  # load data
  out <- postmastr::dic_us_intersect

  # optionally append
  if (missing(append) == FALSE){

    # bind rows
    out <- dplyr::bind_rows(out, append)

    # re-order observations
    out <- out[order(out$intersect.output),]

  }

  # optionally filter
  if (missing(filter) == FALSE){
    out <- dplyr::filter(out, intersect.input %in% filter)
  }

  # return output
  return(out)

}

# country names
pm_dictionary_country <- function(append, filter){

  # global bindings
  con.output = NULL

  # load data
  out <- postmastr::dic_country

  # optionally append
  if (missing(append) == FALSE){

    # bind rows
    out <- dplyr::bind_rows(out, append)

    # re-order observations
    out <- out[order(out$con.output),]

  }

  # optionally filter
  if (missing(filter) == FALSE){
    out <- dplyr::filter(out, con.output %in% filter)
  }

  # return output
  return(out)

}

# Dictionary Case
pm_case <- function(.data, locale, type, case){

  if (locale == "us"){

    if (type == "state"){
      out <- pm_convert_case(.data, var = "state.input", orderVar = "state.output", case = case)
    } else if (type == "directional"){
      out <- pm_convert_case(.data, var = "dir.input", orderVar = "dir.output", case = case)
    } else if (type == "suffix"){
      out <- pm_convert_case(.data, var = "suf.input", orderVar = "suf.output", case = case)
    } else if (type == "intersection"){
      out <- pm_convert_case(.data, var = "intersect.input", orderVar = "intersect.output", case = case)
    } else if (type == "country"){
      out <- pm_convert_case(.data, var = "con.input", orderVar = "con.output", case = case)
    } else if (type == "city"){
      out <- pm_convert_case(.data, var = "city.input", orderVar = "city.input", case = case)
    }

  }

  return(out)

}

# Convert Case
pm_convert_case <- function(.data, var, orderVar, case){

  # global binding
  ...case = NULL

  # save parameters to list
  paramList <- as.list(match.call())

  # reformat address variable
  if (!is.character(paramList$var)) {
    varQ <- rlang::enquo(var)
  } else if (is.character(paramList$var)) {
    varQ <- rlang::quo(!! rlang::sym(var))
  }

  varQN <- rlang::quo_name(rlang::enquo(var))

  # convert
  title <- dplyr::mutate(.data, ...case = "title")
  lower <- dplyr::mutate(.data,
                         !!varQN := stringr::str_to_lower(!!varQ),
                         ...case = "lower")
  upper <- dplyr::mutate(.data,
                         !!varQN := stringr::str_to_upper(!!varQ),
                         ...case = "upper")

  dplyr::bind_rows(title, lower, upper) %>%
    dplyr::filter(...case %in% case) %>%
    dplyr::distinct(!!varQ, .keep_all = TRUE) %>%
    dplyr::select(-...case) -> out

  # re-order observations
  out <- out[order(out[[orderVar]]),]

}

#' Create Dictionary Object Appendicies
#'
#' @description This function allows for the creation of dictionary objects that are
#'     either optional or required elements of other \code{postmastr} functions. These
#'     objects are appendicies that can serve either as stand-alone dictionaries or
#'     used in a \code{\link{pm_dictionary}} call as the input  for the \code{append}
#'     parameter.
#'
#'     Once created, dictionary objects should not be modified as the formatting of dictionary
#'     elements provides other \code{postmastr} functions with predictable inputs. The
#'     \code{"state"}, \code{"directional"}, and \code{"suffix"} dictionaries are based on
#'     tables that are also exported so that they can be previewed and tested.
#'
#' @usage pm_append(type, input, output, locale = "us")
#'
#' @param type A string indicating the grammatical address element the dictionary
#'     should represent. Current options are \code{"state"}, \code{"city"},
#'     \code{"street"}, \code{"house suffix"}, \code{"directional"}, and \code{"suffix"}.
#' @param input A character scalar or vector containing possible terms existing in
#'     the data. This should be the same length as \code{output}.
#' @param output A character scalar or vector containing desired output for each input.
#'     This should be the same length as \code{input}. You may use \code{NA} values
#'     in the \code{output} vector for inputs that already are in the preferred,
#'     standardized form. This argument is optional when \code{type} is equal to
#'     \code{"city"}.
#' @param locale A string indicating the country these data represent; the only
#'     current option is \code{"us"} but this is included to facilitate future expansion.
#'
#' @return A \code{postmastr} dictionary object, which will always include an input column
#'     of possible terms for the given grammatical address element. All dictionary objects
#'     except for city dictionaries will also contain an output column with the
#'     preferred output for each input. For city dictionaries, the \code{output} argument
#'     is optional.
#'
#'     For American addresses, these outputs should follow
#'     United States Postal Service guidelines. These dictionary objects can be used as
#'     stand-alone dictionaries or fed into a \code{\link{pm_dictionary}} call as the input
#'     for the \code{append} parameter.
#'
#' @seealso \code{pm_dictionary}
#'
#' @examples
#' # create stand-alone state dictionary
#' mo <- pm_append(type = "state", input = c("Missouri", "MO", "MISSOURI"),
#'     output = c("MO", "MO", "MO"), locale = "us")
#'
#' # add custom abbreviation for Massachusetts to state dictionary
#' ## create dictionary appendix
#' ma <- pm_append(type = "state", input = "Mass", output = "MA", locale = "us")
#'
#' ## add dictionary appendix to dictionary
#' pm_dictionary(type = "state", filter = c("CT", "MA", "ME", "NH", "RI", "VT"),
#'     append = ma, case = "title", locale = "us")
#'
#' # create stand-alone city dictionary, with spelling correction for some cities
#' cities <- pm_append(type = "city",
#'     input = c("Brentwood", "Clayton", "CLAYTON", "Maplewood", "St. Louis",
#'               "SAINT LOUIS", "Webster Groves"),
#'     output = c(NA, NA, "Clayton", NA, NA, "St. Louis", NA),
#'     locale = "us")
#'
#' # create stand-alone street dictionary
#' cities <- pm_append(type = "street",
#'     input = c("MLK", "M.L.K."),
#'     output = c("Dr. Martin Luther King, Jr.", "Dr. Martin Luther King, Jr."),
#'     locale = "us")
#'
#' @importFrom dplyr tibble
#'
#' @export
pm_append <- function(type, input, output, locale = "us"){

  # global binding
  state.output = state.input = city.output = city.input = NULL

  if (locale == "us"){

    if (type == "state"){

      out <- dplyr::tibble(
        state.output = c(output),
        state.input = c(input))

      out <- dplyr::mutate(out,
                           state.output = stringr::str_to_upper(state.output),
                           state.input = stringr::str_to_title(state.input))

      # re-order observations
      out <- out[order(out$state.input),]

    } else if (type == "city"){

      if (missing(output) == TRUE){

        out <- dplyr::tibble(
          city.input = c(input))

      } else if (missing(output) == FALSE){

        out <- dplyr::tibble(
          city.output = c(output),
          city.input = c(input))

      }

      # re-order observations
      out <- out[order(out$city.input),]

    } else if (type == "directional"){

      out <- dplyr::tibble(
        dir.output = c(output),
        dir.input = c(input))

      # re-order observations
      out <- out[order(out$dir.input),]

    } else if (type == "suffix"){

      out <- dplyr::tibble(
        suf.output = c(output),
        suf.input = c(input))

      # re-order observations
      out <- out[order(out$suf.input),]

    } else if (type == "street"){

      out <- dplyr::tibble(
        st.output = c(output),
        st.input = c(input))

      # re-order observations
      out <- out[order(out$st.input),]

    } else if (type == "house suffix"){

      out <- dplyr::tibble(
        houseSuf.output = c(output),
        houseSuf.input = c(input))

      # re-order observations
      out <- out[order(out$houseSuf.input),]

    } else if (type == "intersection"){

      out <- dplyr::tibble(
        intersect.output = c(output),
        intersect.input = c(input))

      # re-order observations
      out <- out[order(out$intersect.input),]

    } else if (type == "country"){

      out <- dplyr::tibble(
        con.output = c(output),
        con.input = c(input))

      # re-order observations
      out <- out[order(out$houseSuf.input),]

    }

  }

  # return output
  return(out)

}

#' State Dictionary, United States of America
#'
#' @description A listing of abbreviations and full names for U.S. states, territories,
#'     and military "states" used for addressing mail to members of the U.S. armed forces.
#'     This dictionary includes all territories given two-letter state (or equivalent)
#'     abbreviations by the U.S. Postal Service.
#'
#' @docType data
#'
#' @usage data(dic_us_states)
#'
#' @format A tibble with 124 rows and 2 variables:
#' \describe{
#'   \item{state.output}{standard two-letter abbreviations}
#'   \item{state.input}{standard full names and two-letter abbreviations}
#' }
#'
#' @seealso \href{https://pe.usps.com/text/pub28/28apb.htm}{U.S. Postal Service, Publication 28, Appendix B}
#'
#' @examples
#' head(dic_us_states)
#'
"dic_us_states"

#' Directional Dictionary, United States of America
#'
#' @description A list of abbreviations for full names for prefix and suffix directionals
#'    in English.
#'
#' @docType data
#'
#' @usage data(dic_us_dir)
#'
#' @format A tibble with 20 rows and 2 variables:
#' \describe{
#'   \item{dir.output}{standard directional abbreviations}
#'   \item{dir.input}{standard full names and directional abbreviations}
#' }
#'
#' @seealso \href{https://pe.usps.com/text/pub28/28c2_014.htm}{U.S. Postal Service, Publication 28, Part 2-33}
#'
#' @examples
#' head(dic_us_dir)
#'
"dic_us_dir"

#' Street Suffix Dictionary, United States of America
#'
#' @description A list of abbreviations for full names for street suffix types for the
#'     United States.
#'
#' @docType data
#'
#' @usage data(dic_us_suffix)
#'
#' @format A tibble with 502 rows and 3 variables:
#' \describe{
#'   \item{suf.type}{suffix type}
#'   \item{suf.output}{standard suffix abbreviation}
#'   \item{suf.input}{standard full names and suffix abbreviations}
#' }
#'
#' @seealso \href{https://pe.usps.com/text/pub28/28apc_002.htm}{U.S. Postal Service, Publication 28, Appendix C1}
#'
#' @examples
#' head(dic_us_suffix)
#'
"dic_us_suffix"

#' Intersection Dictionary, United States of America
#'
#' @description A list of abbreviations used to identify intersections in the United States.
#'
#' @docType data
#'
#' @usage data(dic_us_intersect)
#'
#' @format A tibble with 5 rows and 2 variables:
#' \describe{
#'   \item{intersect.output}{standard intersection identifier}
#'   \item{intersect.input}{possible intersection identifiers}
#' }
#'
#' @examples
#' head(dic_us_intersect)
#'
"dic_us_intersect"

#' Country Dictionary
#'
#' @description A list of abbreviations for the United States.
#'
#' @docType data
#'
#' @usage data(dic_country)
#'
#' @format A tibble with 502 rows and 3 variables:
#' \describe{
#'   \item{con.output}{standard output}
#'   \item{con.input}{full names abbreviations}
#' }
#'
#' @examples
#' head(dic_country)
#'
"dic_country"
slu-openGIS/postmastr documentation built on Dec. 9, 2020, 3:32 a.m.