#' Control site names
#'
#' This function ensures site names conform to a controlled vocabulary.
#' It compares a vector of names against a thesaurus of variant names,
#' misspellings, and character encoding inconsistencies, and replaces matches
#' with a canonical form.
#'
#' @param x Character vector of site names.
#' @param quiet Logical. If `TRUE`, suppresses messages about replaced values.
#'   Default: `FALSE`.
#'
#' @details
#' Elements of `x` are matched to the site name thesaurus using the following
#' rules (in order of priority):
#'
#' 1. **Exact match** to a variant name in the thesaurus
#' 2. **Plain text match** to a thesaurus variant, allowing for substitution
#'    or omission of non-ASCII characters (e.g. "Catalhoyuk" → "Çatalhöyük"),
#'    punctuation (e.g. "Baja" → "Ba'ja"), or whitespace (e.g. "El-Wad" →
#'    "El Wad").
#'
#' Ambiguous site names—elements of `x` that match multiple thesaurus entries—
#' will cause an error.
#'
#' @return
#' `x` with elements matched to the thesaurus replaced by their canonical forms.
#'
#' Unless `quiet = TRUE`, the function also prints a list of values that were
#' changed. A warning will be issued if elements of `x` were not matched to the
#' thesaurus (regardless of the `quiet` argument).
#'
#' @family controlled vocabulary functions
#' @export
#'
#' @examples
#' sites <- c("Catalhoyuk", "Tepe Ganj Dareh", "Jericho")
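#' # "Catalhoyuk" should be matched to its canonical form via the plain text
#' # rule (assuming the bundled thesaurus covers it); names not found in the
#' # thesaurus are returned unchanged with a warning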
#' swap_control_site_name(sites)
swap_control_site_name <- function(x, quiet = FALSE) {
  # Uses internal dataset `swap_site_name_thesaurus`
  # Generated by data-raw/swap_site_name_thesaurus.R
  # TODO: case insensitive versions of both?
  thesaurus <- dplyr::transmute(
    swap_site_name_thesaurus,
    .data$canon,
    exact_match = .data$variant,
    ascii_match = stringr::str_replace_all(.data$variant,
                                           "[[^[:ascii:]][:punct:][:space:]]",
                                           ".?"),
    ascii_match = paste0("^", .data$ascii_match, "$")
  )
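  # Illustrative example (assuming "Çatalhöyük" is listed as a variant): its
  # ascii_match becomes "^.?atalh.?y.?k$", so plain-ASCII spellings such as
  # "Catalhoyuk" match it in the regex join below.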
  # Match x to thesaurus
  # Turn into a factor first so we can distinguish duplicates in the data from
  # duplicates from an ambiguous match
  x <- factor(x)
  tibble::tibble(x = levels(x)) %>%
    dplyr::left_join(dplyr::select(thesaurus, .data$canon, .data$exact_match),
                     by = c("x" = "exact_match"), keep = FALSE) %>%
    dplyr::rename(exact_match = .data$canon) %>%
    fuzzyjoin::regex_left_join(dplyr::select(thesaurus, .data$canon, .data$ascii_match),
                               by = c("x" = "ascii_match")) %>%
    dplyr::select(-.data$ascii_match) %>%
    dplyr::rename(ascii_match = .data$canon) %>%
    dplyr::mutate(canon = dplyr::coalesce(.data$exact_match, .data$ascii_match)) ->
    y
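  # At this point `y` has one row per unique value of `x`, with columns
  # `exact_match` and `ascii_match` holding the canonical name found by each
  # rule (or NA), and `canon` combining them with preference for the exact match.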
  # Attempt to recover from duplicated matches
  if (length(levels(x)) != nrow(y)) {
    y <- dplyr::distinct(y, .data$x, .data$canon, .keep_all = TRUE)
  }
  # Check for remaining ambiguous matches
  # Errors here are probably a problem with the thesaurus. Check for:
  # * Duplicate entries
  # * Variants that are already covered by the plain text matching heuristic
  if (length(levels(x)) != nrow(y)) {
    dupes <- y[duplicated(y$x) | duplicated(y$x, fromLast = TRUE), ]
    dupes %>%
      dplyr::group_by(.data$x) %>%
      dplyr::mutate(canon = glue::glue('"{canon}"')) %>%
      dplyr::summarise(canon = glue::glue_collapse(.data$canon, ", ")) %>%
      glue::glue_data('"{x}" \u2192 {canon}') %>%
      magrittr::set_names(rep("x", length(.))) %>%
      c("Elements of `x` matched multiple thesaurus entries:",
        .,
        i = "This is probably a problem with swapdata's thesaurus, not your data! Please submit an issue at https://github.com/joeroe/swapdata/issues") %>%
      rlang::abort(class = "swap_control_error")
  }
  # Message for replaced names
  # TODO: Group messages by match type?
  if (!quiet) {
    replaced <- dplyr::filter(y, .data$x != .data$canon)
    if (nrow(replaced) > 0) {
      replaced %>%
        glue::glue_data('"{x}" \u2192 "{canon}"') %>%
        magrittr::set_names(rep("i", length(.))) %>%
        c("Replaced site names:", .) %>%
        rlang::inform()
    }
    else {
      rlang::inform("No site names replaced.")
    }
  }
  # Warning for unmatched names
  y %>%
    dplyr::filter(is.na(.data$exact_match), is.na(.data$ascii_match)) %>%
    glue::glue_data('"{x}"') %>%
    magrittr::set_names(rep("x", length(.))) %>%
    c("Site names not matched in thesaurus:", .) %>%
    {if (length(.) > 1) rlang::warn(.)}
  # Return: `x` is a factor, so its integer codes index into levels(x), which
  # is the row order of `y`. This maps the per-level results back onto the
  # original vector, leaving unmatched elements unchanged.
  y <- dplyr::coalesce(y$canon, y$x)
  y[as.numeric(x)]
}
#' Generate site codes
#'
#' Generates abbreviated site codes from their full names.
#' The codes follow a set pattern designed to be predictable and probably (but
#' not necessarily) unique.
#'
#' @param x Character vector of site names.
#' @param max_length Numeric. Desired length of the generated codes. Must be
#'   at least 2. Default: `4`.
#'
#' @details
#' `swap_site_code()` attempts to generate a fixed-width code consisting of the
#' abbreviated site name and, if present in the site name, a numeric suffix.
#' The pattern is determined by the value of the `max_length` parameter:
#'
#' * `max_length = 2`: `AB` or `A1`
#' * `max_length = 3`: `ABC` or `AB1`
#' * `max_length = 4`: `ABCD` or `AB01` (the default)
#' * `max_length = 5`: `ABCDE` or `ABC01`
#' * ...
#'
#' A fixed-width output isn't guaranteed, for example if the name of the site
#' is shorter than `max_length`.
#'
#' As long as they are preceded by a space, Roman numerals up to XXXIX (39) are
#' also recognised as a numeric suffix, and are converted to Arabic numbers.
#'
#' @return
#' A character vector of site codes derived from `x`.
#'
#' @export
#'
#' @examples
#' sites <- c("Çatalhöyük", "Chia Sabz", "Azraq 31", "Gilgal I", "Tell es-Sultan")
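#' # With the default max_length = 4, trailing numbers and Roman numerals
#' # ("Azraq 31", "Gilgal I") become zero-padded two-digit suffixes in the code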
#' swap_site_code(sites)
swap_site_code <- function(x, max_length = 4) {
  checkmate::assert_number(max_length, lower = 2)
  if (max_length <= 3) {
    suffix_length <- 1
  }
  else {
    suffix_length <- 2
  }
  tibble::tibble(x = x) %>%
    # Work in uppercase from now on
    dplyr::mutate(
      x = toupper(x)
    ) %>%
    # Numeric suffix
    dplyr::mutate(
      num_suffix = as.numeric(stringr::str_extract(.data$x, "[0-9]+$")),
      roman_suffix = stringr::str_extract(.data$x, " [XVI]+$"),
      roman_suffix = stringr::str_remove_all(.data$roman_suffix, " "),
      roman_suffix = as.numeric(utils::as.roman(.data$roman_suffix)),
      suffix = dplyr::coalesce(.data$num_suffix, .data$roman_suffix),
      suffix = stringr::str_pad(.data$suffix, suffix_length, pad = "0"),
      suffix = stringr::str_replace_na(.data$suffix, "")
    ) %>%
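    # Illustrative (with the default max_length = 4, so suffix_length = 2):
    # "AZRAQ 31" gets suffix "31" and "GILGAL I" gets suffix "01"; names with
    # no trailing number or Roman numeral get an empty suffix.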
    # Abbreviated site name
    dplyr::mutate(
      root_length = dplyr::if_else(.data$suffix == "",
                                   max_length,
                                   max_length - suffix_length),
      root = stringr::str_remove(.data$x, "( ?[0-9]+| [XVI]+)$"),
      words = stringr::str_split(.data$root, stringr::boundary("word")),
      short = purrr::map2_chr(.data$words, .data$root_length, swap_acronym)
    ) %>%
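    # Illustrative (again with the defaults): "AZRAQ 31" has root "AZRAQ" and
    # root_length 2, so swap_acronym() shortens it to "AZ" and the final code
    # becomes "AZ31".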
    # Concatenate
    dplyr::mutate(
      code = stringr::str_c(.data$short, .data$suffix)
    ) %>%
    dplyr::pull(.data$code)
}
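# Internal helper: builds a fixed-length acronym from the words of an
# (uppercased) site name. Illustrative trace, not a tested output:
# swap_acronym(c("TELL", "ES", "SULTAN"), 4) drops the article "ES", recodes
# "TELL" to "T", and abbreviates "SULTAN" to "SLT", giving "TSLT".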
swap_acronym <- function(x, max_length) {
  # Remove articles (al, es, etc)
  articles <- c("AL", "AT", "ATH", "AD", "ADH", "AR", "AZ", "AS", "ASH",
                "EL", "ET", "ETH", "ED", "EDH", "ER", "EZ", "ES", "ESH",
                "E", "I")
  y <- x[!x %in% articles]
  # Set abbreviations for common topographic elements
  y <- dplyr::recode(y,
                     TELL = "T",
                     TEL = "T",
                     TALL = "T",
                     TAL = "T",
                     TEPE = "T",
                     CHOGA = "CH",
                     CHOGHA = "CH",
                     CHAGA = "CH",
                     CHAGHA = "CH",
                     HOYUK = "H",
                     AIN = "A",
                     AYN = "A",
                     "'AIN" = "A",
                     "'AYN" = "A",
                     NAHAL = "N",
                     "H\u00D6Y\u00DCK" = "H",
                     JEBEL = "J",
                     JABAL = "J",
                     ABU = "AB",
                     UMM = "UM",
                     WADI = "W",
                     CAVE = "C",
                     TERRACE = "T",
                     ROCKSHELTER = "S",
                     NORTH = "N",
                     SOUTH = "S",
                     EAST = "E",
                     WEST = "W",
                     SURVEY = "S"
  )
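  # e.g. c("TELL", "ES", "SULTAN") becomes c("T", "SULTAN") at this point:
  # the article "ES" is dropped and "TELL" is recoded to "T".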
  # Determine desired length of abbreviations
  if (length(y) >= max_length) {
    y <- y[1:max_length]
    abbr_lengths <- rep(1, length(y))
  }
  else {
    # Fewer words than max_length: start from the full word lengths and trim
    # one character at a time, cycling backwards from the last word, until the
    # total fits (words already reduced to one character are left alone)
    abbr_lengths <- nchar(y)
    i <- length(y)
    while (sum(abbr_lengths) > max_length) {
      if (abbr_lengths[i] != 1) {
        abbr_lengths[i] <- abbr_lengths[i] - 1
      }
      i <- i - 1
      if (i == 0) i <- length(y)
    }
  }
  # Abbreviate individual words
  y <- swap_abbr(y, abbr_lengths)
  # Concatenate & return
  paste0(y, collapse = "")
}
#' Abbreviate a single word in a site name
#' @keywords internal
#' @noRd
swap_abbr <- function(x, max_length) {
  # Remove H at the end of words
  y <- stringr::str_remove(x, "H$")
  # Remove vowels unless they're initial
  # TODO: Turkish dotted/dotless I
  # TODO: Francophone romanizations of Arabic
  # TODO: Kurdish vowels
  y <- stringr::str_remove_all(y, "(?<!^)[AEIOU\u00D6\u00DCY']")
  # Check length
  # TODO: Re-add vowels if underlength?
  y <- stringr::str_sub(y, 1, max_length)
  x[nchar(x) > max_length] <- y[nchar(x) > max_length]
  return(x)
}