R/utils.R
In acroname: Engine for Acronyms and Initialisms

Documented in find_articles find_candidate first_char

#' Extract the first character from a string
#'
#' @description
#'
#' This helper function will extract the first character from a string. The element may be a letter, number, or special character but will be coerced to a character vector in the output.
#'
#' @param string Character vector from which the first character will be extracted
#'
#' @return Character vector with the first character from each element in the vector passed to the input "string" argument. This value will be the same length as the original vector.
#'
#' @export
#'
#' @md
#'
#' @examples
#'
#' first_char("purple")
#' first_char(c("purple","rain"))
#' first_char(c("nothing","compares","2u"))
#'
first_char <- function(string) {
  string %>%
    stringr::str_split(., pattern = "") %>%
    purrr::map_chr(., .f = function(x) utils::head(x, 1))
}


#' Helper to find articles
#'
#' @description
#'
#' This function will check if an input word is an article in the English language ('a', 'an', 'the').
#'
#' @param word Character vector of length 1 with word to check
#'
#' @return Logical vector of length one, `TRUE` if the word is an article and `FALSE` if not.
#' @md
#' @export
#'
#' @examples
#'
#' find_articles("the")
#' find_articles("then")
#' find_articles("whatever")
#'
find_articles <- function(word) {
  word <- tolower(word)
  any(c("a","an","the") %in% word)
}

#' Find candidate
#'
#' @description
#'
#' This is an unexported helper for \link[acroname]{acronym}. The function is used wrapped in a `tryCatch()` that uses \link[R.utils]{withTimeout} to manage maximum wait time for the candidate acronym search.
#'
#' @param collapsed The collapsed string of characters generated by \link[acroname]{mince}
#' @param acronym_length Number of characters in acronym; default is `3`
#' @param probs Vector of probabilities for selecting each character while generating candidate
#' @param dictionary Character vector containing dictionary of terms from which acronym should be created; default is `NULL` and `hunspell` "en_us" dictionary will be used
#' @param words_len Vector of the length of each word in the input
#'
#' @md
#' @return Named list with three elements:
#'
#' - **formatted**: The candidate acronym and string with letters used capitalized
#' - **prefix**: The candidate acronym
#' - **suffix**: Words used with letters in acronym capitalized
#'#'
find_candidate <- function(collapsed, acronym_length, probs, dictionary, words_len) {
  valid <- FALSE
  while(!valid) {
    ## sample the *indices* of possible letters to use
    ## use the length of the acronym for the size out
    ## use probability weighting
    indices <- sort(sample(1:nchar(collapsed), size = acronym_length, replace = FALSE, prob = probs))

    tmp_collapsed_split <-
      collapsed %>%
      strsplit(., split = "") %>%
      unlist(.)

    candidate <- paste0(tmp_collapsed_split[indices], collapse = "")

    if(candidate %in% dictionary) {
      valid <- TRUE
      ## now format the output to include the capitalized letter
      tmp_collapsed_split <- tolower(tmp_collapsed_split)
      tmp_collapsed_split[indices] <- toupper(tmp_collapsed_split[indices])
      ## now need to split the words up again
      last_letter_ind <- cumsum(words_len)
      tmp_collapsed_split[last_letter_ind[-length(last_letter_ind)]] <- paste0(tmp_collapsed_split[last_letter_ind[-length(last_letter_ind)]], " ")
      name <-  paste0(tmp_collapsed_split, collapse = "")
    } else {
      valid <- FALSE
      candidate <- ""
      name <- ""
    }
  }

  ## format with original
  formatted <- paste0(toupper(candidate), ": ", name)

  res <-
    list(
      formatted = formatted,
      prefix = toupper(candidate),
      suffix = name
    )

  return(res)
}


#' Prepare input string
#'
#' @description
#'
#' This helper is used by both \link[acroname]{acronym} and \link{initialism} to extract elements needed from the input string.
#'
#' If the function is used with `bow = TRUE` the input will be processed with a "bag of words" approach, by which words will be shuffled and sampled without replacement. In this case, the number of characters used will be determined by the proportion passed to "bow_prop".
#'
#' @param input Character vector with text to use as the input for the candidate
#' @param ignore_articles Logical indicating whether or not articles should be ignored ; default is `TRUE`
#' @param alnum_only Logical that specifes whether only alphanumeric should be used; default is `TRUE`
#' @param bow Logical for whether or not a "bag of words" approach should be used for "input" vector; default is `FALSE`
#' @param bow_prop Given `bow = TRUE` this specifies the proportion of words to sample; ignored if `bow = FALSE`; default is `0.5`
#'
#' @return Named list with the following elements:
#'
#' - **words**: Vector with one element per word to be used in the acronym or initialism
#' - **collapsed**: Vector of length 1 containing all characters from words collapsed
#' - **words_len**: Vector containing length of each word
#' - **first_chars**: Vector containing first character from each word
#'
#' @md
#' @export
#'
mince <- function(input, ignore_articles = TRUE, alnum_only = TRUE, bow = FALSE, bow_prop = 0.5) {

  ## check if the input character vector is > 1
  ## if so ... collapse together prior to splitting
  ## this allows input to be either "purple rain" or c("purple","rain")
  if(length(input) > 1) {
    input <- paste0(input, collapse = " ")
  }

  ## get the words split into a vector with one element per word
  words <-
    strsplit(input, " ") %>%
    unlist(.)

  ## handle alphanumeric only option
  ## replace any non-numeric with ""
  if(alnum_only) {
    words <-
      words %>%
      stringr::str_replace_all(., "[^[:alnum:]]", "")
  }

  ## use this vector of words to conditionally check for articles
  if(ignore_articles) {
    ## find the indices for words that are articles
    articles <-
      words %>%
      purrr::map_lgl(., .f = find_articles)

    ## find the indices for words that are *not* articles
    not_articles <- which(!articles)
    ## and keep them
    words <- words[not_articles]
  }

  ## handle the bag-of-words option
  ## if
  if(bow) {
    words <- sample(words, size = ceiling(bow_prop*length(words)), replace = FALSE)
  }

  ## find the number of letters in each word
  char_len <-
    words %>%
    purrr::map_dbl(., nchar)

  ## find the first characters of each word
  first_chars <-
    words %>%
    purrr::map_chr(., first_char)

  ## collapse words a vector with one element and all letters
  collapsed <-
    words %>%
    paste0(., collapse = "")

  ## return a named list with relevant prepped data
  list(
    words = words,
    collapsed = collapsed,
    words_len = char_len,
    first_chars = first_chars
  )

}