swapdata: Southwest Asian Palaeoarchaeology Data

Documented in swap_control_site_name

#' Control site names
#'
#' This function ensures site names conform to a controlled vocabulary.
#' It compares a vector of names to a thesaurus of variant names, misspellings
#' and character encoding inconsistencies and replaces matches with a canonical
#' form.
#'
#' @param x Character vector of site names.
#' @param quiet Logical. If `TRUE`, suppresses messages about replaced values.
#'  Default: `FALSE`.
#'
#' @details
#' Elements of `x` are matched to the site name thesaurus using the following
#' rules (in order of priority):
#'
#' 1. **Exact match** to a variant name in the thesaurus
#' 2. **Plain text match** to a thesaurus variant, allowing for substitution
#'    or omission of non-ASCII characters (e.g. "Catalhoyuk" → "Çatalhöyük"),
#'    punctuation (e.g. "Baja" → "Ba'ja"), or whitespace (e.g. "El-Wad" →
#'    "El Wad").
#'
#' Ambiguous site names—elements of `x` that match multiple thesaurus entries—
#' will cause an error.
#'
#' @return
#' `x` with elements matched to the thesaurus replaced by their canonical forms.
#'
#' Unless `quiet = TRUE`, the function also prints a list of values that were
#' changed. A warning will be issued if elements of `x` were not matched to the
#' thesaurus (regardless of the `quiet` argument).
#'
#' @family controlled vocabulary functions
#' @export
#'
#' @examples
#' sites <- c("Catalhoyuk", "Tepe Ganj Dareh", "Jericho")
#' swap_control_site_name(sites)
swap_control_site_name <- function(x, quiet = FALSE) {
  # Uses internal dataset `swap_site_name_thesaurus`
  # Generated by data-raw/swap_site_name_thesaurus.R
  # TODO: case insensitive versions of both?
  thesaurus <- dplyr::transmute(
      swap_site_name_thesaurus,
      .data$canon,
      exact_match = .data$variant,
      ascii_match = stringr::str_replace_all(.data$variant,
                                             "[[^[:ascii:]][:punct:][:space:]]",
                                             ".?"),
      ascii_match = paste0("^", .data$ascii_match, "$")
    )

  # Match x to thesaurus
  # Turn into a factor first so we can distinguish duplicates in the data from
  # duplicates from an ambiguous match
  x <- factor(x)

  tibble::tibble(x = levels(x)) %>%
    dplyr::left_join(dplyr::select(thesaurus, .data$canon, .data$exact_match),
                     by = c("x" = "exact_match"), keep = FALSE) %>%
    dplyr::rename(exact_match = .data$canon) %>%
    fuzzyjoin::regex_left_join(dplyr::select(thesaurus, .data$canon, .data$ascii_match),
                               by = c("x" = "ascii_match")) %>%
    dplyr::select(-.data$ascii_match) %>%
    dplyr::rename(ascii_match = .data$canon) %>%
    dplyr::mutate(canon = dplyr::coalesce(.data$exact_match, .data$ascii_match)) ->
    y

  # Attempt to recover from duplicated matches
  if (length(levels(x)) != nrow(y)) {
    y <- dplyr::distinct(y, .data$x, .data$canon, .keep_all = TRUE)
  }

  # Check for remaining ambiguous matches
  # Errors here are probably a problem with the thesaurus. Check for:
  # * Duplicate entries
  # * Variants that are already covered by the plain text matching heuristic
  if (length(levels(x)) != nrow(y)) {
    dupes <- y[duplicated(y$x) | duplicated(y$x, fromLast = TRUE),]

    dupes %>%
      dplyr::group_by(.data$x) %>%
      dplyr::mutate(canon = glue::glue('"{canon}"')) %>%
      dplyr::summarise(canon = glue::glue_collapse(.data$canon, ", ")) %>%
      glue::glue_data('"{x}"  \u2192 {canon}') %>%
      magrittr::set_names(rep("x", length(.))) %>%
      c("Elements of `x` matched multiple thesaurus entries:",
        .,
        i = "This is probably a problem with swapdata's thesaurus, not your data! Please submit an issue at https://github.com/joeroe/swapdata/issues") %>%
      rlang::abort(class = "swap_control_error")
  }

  # Message for replaced names
  # TODO: Group messages by match type?
  if (!quiet) {
    replaced <- dplyr::filter(y, .data$x != .data$canon)

    if (nrow(replaced) > 0) {
      replaced %>%
        glue::glue_data('"{x}" \u2192 "{canon}"') %>%
        magrittr::set_names(rep("i", length(.))) %>%
        c("Replaced site names:", .) %>%
        rlang::inform()
    }
    else {
      rlang::inform("No site names replaced.")
    }
  }

  # Warning for unmatched names
  y %>%
    dplyr::filter(is.na(.data$exact_match), is.na(.data$ascii_match)) %>%
    glue::glue_data('"{x}"') %>%
    magrittr::set_names(rep("x", length(.))) %>%
    c("Site names not matched in thesaurus:", .) %>%
    {if (length(.) > 1) rlang::warn(.)}

  # Return
  y <- dplyr::coalesce(y$canon, y$x)
  y[as.numeric(x)]
}

#' Generate site codes
#'
#' Generates abbreviated site codes from their full names.
#' The codes follow a set pattern designed to be predictable and probably (but
#' not necessarily) unique.
#'
#' @param x Character vector of site names.
#' @param max_length Numeric. Desired length of the generated codes. Must be
#'  greater than 1. Default: `4`.
#'
#' @details
#' `swap_site_code()` attempts to generate a fixed-width code consisting of the
#' abbreviation site name and, if present in the site name, a numeric suffix.
#' The pattern is determined by the value of the `max_length` parameter:
#'
#' * `max_length = 2`: `AB` or `A1`
#' * `max_length = 3`: `ABC` or `AB1`
#' * `max_length = 4`: `ABCD` or `AB01` (the default)
#' * `max_length = 5`: `ABCDE` or `ABC01`
#' * ...
#'
#' A fixed-width output isn't guaranteed, for example if the name of the site
#' is shorter than `max_length`.
#'
#' As long as they are preceded by a space, Roman numerals up to XXXIX (39) are
#' also recognised as a numeric suffix, and are converted to Arabic numbers.
#'
#' @return
#' A character vector of site codes derived from `x`.
#'
#' @export
#'
#' @examples
#' sites <- c("Çatalhöyük", "Chia Sabz", "Azraq 31", "Gilgal I", "Tell es-Sultan")
#' swap_site_code(sites)
swap_site_code <- function(x, max_length = 4) {
  checkmate::assert_number(max_length, lower = 2)

  if (max_length <= 3) {
    suffix_length <- 1
  }
  else {
    suffix_length <- 2
  }

  tibble::tibble(x = x) %>%
    # Work in uppercase from now on
    dplyr::mutate(
      x = toupper(x)
    ) %>%
    # Numeric suffix
    dplyr::mutate(
      num_suffix = as.numeric(stringr::str_extract(.data$x, "[0-9]+$")),
      roman_suffix = stringr::str_extract(.data$x, " [XVI]+$"),
      roman_suffix = stringr::str_remove_all(.data$roman_suffix, " "),
      roman_suffix = as.numeric(utils::as.roman(.data$roman_suffix)),
      suffix = dplyr::coalesce(.data$num_suffix, .data$roman_suffix),
      suffix = stringr::str_pad(.data$suffix, suffix_length, pad = "0"),
      suffix = stringr::str_replace_na(.data$suffix, "")
    ) %>%
    # Abbreviated site name
    dplyr::mutate(
      root_length = dplyr::if_else(.data$suffix == "",
                                   max_length,
                                   max_length - suffix_length),
      root = stringr::str_remove(.data$x, "( ?[0-9]+| [XVI]+)$"),
      words = stringr::str_split(.data$root, stringr::boundary("word")),
      short = purrr::map2_chr(.data$words, .data$root_length, swap_acronym)
    ) %>%
    # Concatenate
    dplyr::mutate(
      code = stringr::str_c(.data$short, .data$suffix)
    ) %>%
    dplyr::pull(.data$code)
}

swap_acronym <- function(x, max_length) {
  # Remove articles (al, es, etc)
  articles <- c("AL", "AT", "ATH", "AD", "ADH", "AR", "AZ", "AS", "ASH",
                "EL", "ET", "ETH", "ED", "EDH", "ER", "EZ", "ES", "ESH",
                "E", "I")
  y <- x[!x %in% articles]

  # Set abbreviations for common topographic elements
  y <- dplyr::recode(y,
                     TELL = "T",
                     TEL = "T",
                     TALL = "T",
                     TAL = "T",
                     TEPE = "T",
                     CHOGA = "CH",
                     CHOGHA = "CH",
                     CHAGA = "CH",
                     CHAGHA = "CH",
                     HOYUK = "H",
                     AIN = "A",
                     AYN = "A",
                     "'AIN" = "A",
                     "'AYN" = "A",
                     NAHAL = "N",
                     "H\u00D6Y\u00DCK" = "H",
                     JEBEL = "J",
                     JABAL = "J",
                     ABU = "AB",
                     UMM = "UM",
                     WADI = "W",
                     CAVE = "C",
                     TERRACE = "T",
                     ROCKSHELTER = "S",
                     NORTH = "N",
                     SOUTH = "S",
                     EAST = "E",
                     WEST = "W",
                     SURVEY = "S"
                     )

  # Determine desired length of abbreviations
  if (length(y) >= max_length) {
    y <- y[1:max_length]
    abbr_lengths <- rep(1, length(y))
  }
  else {
    abbr_lengths <- nchar(y)
    i <- length(y)
    while (sum(abbr_lengths) > max_length) {
      if (abbr_lengths[i] != 1) {
        abbr_lengths[i] <- abbr_lengths[i] - 1
      }
      i <- i - 1
      if (i == 0) i <- length(y)
    }
  }

  # Abbreviate individual words
  y <- swap_abbr(y, abbr_lengths)

  # Concatenate & return
  paste0(y, collapse = "")
}

#' Abbreviate a single word in a site name
#' @keywords internal
#' @noRd
swap_abbr <- function(x, max_length) {
  # Remove H at the end of words
  y <- stringr::str_remove(x, "H$")

  # Remove vowels unless they're initial
  # TODO: Turkish dotted/dotless I
  # TODO: Francophone romanizations of Arabic
  # TODO: Kurdish vowels
  y <- stringr::str_remove_all(y, "(?<!^)[AEIOU\u00D6\u00DCY']")

  # Check length
  # TODO: Re-add vowels if underlength?
  y <- stringr::str_sub(y, 1, max_length)
  x[nchar(x) > max_length] <- y[nchar(x) > max_length]

  return(x)
}