#' @title Replace matched patterns in a string
#' @description Vectorised pattern replacement using Perl-compatible regular expressions.
#' @param string A character vector.
#' @param pattern Pattern to look for. A single element vector if `string_replace` or a named vector of patterns and replacements if `string_replace_all`.
#' @param replacement A character vector of replacement. Only applies to `string_replace`.
#' @return A character vector.
#' @examples
#' string_replace("10meters", "\\d+\\K", " ")
#'
#' patterns <- c("_" = " ", "\\pS" = "")
#' string_replace_all("L'Aigle_et_le_Hibou®", patterns)
#' @export
string_replace <- function(string, pattern, replacement) {
gsub(pattern, replacement, string, perl = TRUE)
}
#' @rdname string_replace
#' @param ignore_case Should letter case be ignored? Only applies to `string_replace_all`.
#' @export
string_replace_all <- function(string, pattern, ignore_case = FALSE) {
for (i in seq_along(pattern)) {
x <- pattern[i]
string <- gsub(names(x), x, string, perl = TRUE, ignore.case = ignore_case)
}
string
}
#' @title Replace matching patterns in a string
#' @description Vectorised extraction of matching patterns using Perl-compatible regular expressions.
#' @param string A character vector.
#' @param pattern A pattern to look for.
#' @return A character vector.
#' @examples
#' fables <- c("La cigale et la fourmi", "Le chêne et le roseau")
#' string_extract(fables, "\\b(?i:la|le)")
#'
#' string_extract_all(fables, "\\pL{3,}")
#' @export
string_extract <- function(string, pattern) {
x <- regexpr(pattern, string, perl = TRUE)
out <- rep(NA, length(string))
out[x != -1] <- regmatches(string, x)
out
}
#' @rdname string_extract
#' @param simplify If `FALSE`, the default, returns a list of character vectors. If `TRUE`, returns a character vector. Only applies to `string_replace_all`.
#' @return A character vector.
#' @export
string_extract_all <- function(string, pattern, simplify = FALSE) {
x <- regmatches(string, gregexpr(pattern, string, perl = TRUE))
if (isTRUE(simplify)) unlist(x) else x
}
#' @title Strip non-letter characters from the beginning and end of a string
#' @description Remove any non-letter characters from the beginning and end of a character string.
#' @param string A character vector.
#' @param rm_period Should ending periods be removed?
#' @param keep A single character vector of character(s) to keep. Only retains characters that directly precede the first and follow the last letter.
#' @param side Side of the character string to trim. One of `c("both", "right", "left")`.
#' @return A character vector.
#' @examples
#' string_strip("1. La Lionne et l'Ourse._ ")
#' string_strip("1. La Lionne et l'Ourse._ ", side = "right")
#' string_strip("1. (La Lionne et l'Ourse)_ ", keep = "()")
#' @export
string_strip <- function(string, rm_period = FALSE, keep = NULL, side = c("both", "left", "right")) {
side <- match.arg(side)
trim <- "[^\\pL]+"
period <- if (isTRUE(rm_period)) "" else "(?!\\.)"
if (is.null(keep)) {
keep_left <- keep_right <- ""
} else {
keep <- vec_to_chr_class(keep)
keep_left <- glue("(?<!{keep})")
keep_right <- glue("(?!{keep})")
}
left <- glue("^{trim}{keep_left}")
right <- glue("{period}{keep_right}{trim}$")
both <- glue("{left}|{right}")
pattern <- switch(side, "left" = left, "right" = right, "both" = both)
string_replace(string, pattern, "")
}
#' @title Remove non-letter characters, underscore and repeated whitespace inside a string
#' @description Wrapper around [`string_strip()`] that also removes underscore and repeated whitespace inside a string.
#' @param string A character vector.
#' @param rm_period Should ending periods be removed?
#' @param keep A single character vector of character(s) to keep. Only retains characters that directly follow the first and last letter.
#' @param side Side of the character string to trim. One of `c("both", "right", "left")`.
#' @return A character vector.
#' @examples
#' string_clean("1. Le Loup et l'Agneau. ")
#' @export
string_clean <- function(string, rm_period = FALSE, keep = NULL, side = c("both", "left", "right")) {
patterns <- c(
r"{\B_\B|(?<!['\"])(?<=[\p{Pe}\p{Po}])(?=\pL)|(?<=\pL)(?=[\p{Ps}&])}" = " ",
"_" = "",
"\\s+" = " "
)
string <- string_strip(string, rm_period, keep, side)
string_replace_all(string, patterns)
}
#' @title Convert a string's first character to uppercase
#' @description Convert a string's first character to uppercase.
#' @param string A character vector.
#' @param to_lower Should other characters be converted to lowercase?
#' @return A character vector.
#' @export
uc_first <- function(string, to_lower = FALSE) {
case <- if (isTRUE(to_lower)) "L" else "E"
string_replace(string, "(\\pL)(.*)", paste0("\\U\\1\\", case, "\\2"))
}
#' @title Extract the first word in a character string
#' @description Extract the first word in a character string.
#' @param string A character vector.
#' @param compound_word If `TRUE`, also captures hyphenated compound word.
#' @return A character vector.
#' @examples
#' first_word("1. Le Corbeau et le Renard")
#' @export
first_word <- function(string, compound_word = TRUE) {
h <- if (isTRUE(compound_word)) "-" else ""
string_extract(string, glue(r"{^[^\pL]*\K[\pL{h}]+}"))
}
#' @title To snake case
#' @description Convert a character string to snake case.
#' @param x A character vector.
#' @return A character vector.
#' @examples
#' to_snake_case("laCigaleEtLaFourmi")
#' @export
to_snake_case <- function(x) {
assert_that(is.character(x))
x <- string_replace_all(x, c(
r"{[^\pL\pN]+|(?<=\p{Lu})(?=\p{Lu}\p{Ll})|(?<=\p{Ll})(?=\p{Lu}|\pN)|(?<=\pN)(?!\pN)}" = "_",
"^_|_$" = ""
))
tolower(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.