R/string.R

Defines functions to_snake_case first_word uc_first string_clean string_strip string_extract_all string_extract string_replace_all string_replace

Documented in first_word string_clean string_extract string_extract_all string_replace string_replace_all string_strip to_snake_case uc_first

#' @title Replace matched patterns in a string
#' @description Vectorised pattern replacement using Perl-compatible regular expressions.
#' @param string A character vector.
#' @param pattern Pattern to look for. A single element vector if `string_replace` or a named vector of patterns and replacements if `string_replace_all`.
#' @param replacement A character vector of replacement. Only applies to `string_replace`.
#' @return A character vector.
#' @examples
#' string_replace("10meters", "\\d+\\K", " ")
#'
#' patterns <- c("_" = " ", "\\pS" = "")
#' string_replace_all("L'Aigle_et_le_Hibou®", patterns)
#' @export
string_replace <- function(string, pattern, replacement) {
  gsub(pattern, replacement, string, perl = TRUE)
}


#' @rdname string_replace
#' @param ignore_case Should letter case be ignored? Only applies to `string_replace_all`.
#' @export
string_replace_all <- function(string, pattern, ignore_case = FALSE) {
  for (i in seq_along(pattern)) {
    x <- pattern[i]
    string <- gsub(names(x), x, string, perl = TRUE, ignore.case = ignore_case)
  }
  string
}


#' @title Replace matching patterns in a string
#' @description Vectorised extraction of matching patterns using Perl-compatible regular expressions.
#' @param string A character vector.
#' @param pattern A pattern to look for.
#' @return A character vector.
#' @examples
#' fables <- c("La cigale et la fourmi", "Le chêne et le roseau")
#' string_extract(fables, "\\b(?i:la|le)")
#'
#' string_extract_all(fables, "\\pL{3,}")
#' @export
string_extract <- function(string, pattern) {
  x <- regexpr(pattern, string, perl = TRUE)
  out <- rep(NA, length(string))
  out[x != -1] <- regmatches(string, x)
  out
}


#' @rdname string_extract
#' @param simplify If `FALSE`, the default, returns a list of character vectors. If `TRUE`, returns a character vector. Only applies to `string_replace_all`.
#' @return A character vector.
#' @export
string_extract_all <- function(string, pattern, simplify = FALSE) {
  x <- regmatches(string, gregexpr(pattern, string, perl = TRUE))
  if (isTRUE(simplify)) unlist(x) else x
}


#' @title Strip non-letter characters from the beginning and end of a string
#' @description Remove any non-letter characters from the beginning and end of a character string.
#' @param string A character vector.
#' @param rm_period Should ending periods be removed?
#' @param keep A single character vector of character(s) to keep. Only retains characters that directly precede the first and follow the last letter.
#' @param side Side of the character string to trim. One of `c("both", "right", "left")`.
#' @return A character vector.
#' @examples
#' string_strip("1. La Lionne et l'Ourse._ ")
#' string_strip("1. La Lionne et l'Ourse._ ", side = "right")
#' string_strip("1. (La Lionne et l'Ourse)_ ", keep = "()")
#' @export
string_strip <- function(string, rm_period = FALSE, keep = NULL, side = c("both", "left", "right")) {
  side <- match.arg(side)
  trim <- "[^\\pL]+"
  period <- if (isTRUE(rm_period)) "" else "(?!\\.)"

  if (is.null(keep)) {
    keep_left <- keep_right <- ""
  } else {
    keep <- vec_to_chr_class(keep)
    keep_left <- glue("(?<!{keep})")
    keep_right <- glue("(?!{keep})")
  }

  left <- glue("^{trim}{keep_left}")
  right <- glue("{period}{keep_right}{trim}$")
  both <- glue("{left}|{right}")

  pattern <- switch(side, "left" = left, "right" = right, "both" = both)
  string_replace(string, pattern, "")
}


#' @title Remove non-letter characters, underscore and repeated whitespace inside a string
#' @description Wrapper around [`string_strip()`] that also removes underscore and repeated whitespace inside a string.
#' @param string A character vector.
#' @param rm_period Should ending periods be removed?
#' @param keep A single character vector of character(s) to keep. Only retains characters that directly follow the first and last letter.
#' @param side Side of the character string to trim. One of `c("both", "right", "left")`.
#' @return A character vector.
#' @examples
#' string_clean("1. Le Loup et   l'Agneau. ")
#' @export
string_clean <- function(string, rm_period = FALSE, keep = NULL, side = c("both", "left", "right")) {
  patterns <- c(
    r"{\B_\B|(?<!['\"])(?<=[\p{Pe}\p{Po}])(?=\pL)|(?<=\pL)(?=[\p{Ps}&])}" = " ",
    "_" = "",
    "\\s+" = " "
  )
  string <- string_strip(string, rm_period, keep, side)
  string_replace_all(string, patterns)
}


#' @title Convert a string's first character to uppercase
#' @description Convert a string's first character to uppercase.
#' @param string A character vector.
#' @param to_lower Should other characters be converted to lowercase?
#' @return A character vector.
#' @export
uc_first <- function(string, to_lower = FALSE) {
  case <- if (isTRUE(to_lower)) "L" else "E"
  string_replace(string, "(\\pL)(.*)", paste0("\\U\\1\\", case, "\\2"))
}


#' @title Extract the first word in a character string
#' @description Extract the first word in a character string.
#' @param string A character vector.
#' @param compound_word If `TRUE`, also captures hyphenated compound word.
#' @return A character vector.
#' @examples
#' first_word("1. Le Corbeau et le Renard")
#' @export
first_word <- function(string, compound_word = TRUE) {
  h <- if (isTRUE(compound_word)) "-" else ""
  string_extract(string, glue(r"{^[^\pL]*\K[\pL{h}]+}"))
}


#' @title To snake case
#' @description Convert a character string to snake case.
#' @param x A character vector.
#' @return A character vector.
#' @examples
#' to_snake_case("laCigaleEtLaFourmi")
#' @export
to_snake_case <- function(x) {
  assert_that(is.character(x))
  x <- string_replace_all(x, c(
    r"{[^\pL\pN]+|(?<=\p{Lu})(?=\p{Lu}\p{Ll})|(?<=\p{Ll})(?=\p{Lu}|\pN)|(?<=\pN)(?!\pN)}" = "_",
    "^_|_$" = ""
  ))
  tolower(x)
}
arnaudgallou/toolkit documentation built on Nov. 25, 2022, 5:42 p.m.