# inst/doc/regular-expressions.R

## ----setup, include = FALSE---------------------------------------------------
# Default knitr chunk options used for every chunk when the vignette is built
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(stringr)

## ----eval = FALSE-------------------------------------------------------------
#  # The regular call:
#  str_extract(fruit, "nana")
#  # Is shorthand for
#  str_extract(fruit, regex("nana"))

## -----------------------------------------------------------------------------
# A plain string is matched literally: str_extract() returns the first "an"
# found in each element, or NA when the element does not contain it.
x <- c("apple", "banana", "pear")
str_extract(x, "an")

## -----------------------------------------------------------------------------
# Matching is case sensitive by default, so the plain pattern only matches the
# all-lowercase element; regex(ignore_case = TRUE) matches all three.
bananas <- c("banana", "Banana", "BANANA")
str_detect(bananas, "banana")
str_detect(bananas, regex("banana", ignore_case = TRUE))

## -----------------------------------------------------------------------------
# `.` matches any single character (except a newline), so ".a." is an "a" with
# one character on either side. `x` is the fruit vector defined above.
str_extract(x, ".a.")

## -----------------------------------------------------------------------------
# By default `.` does not match "\n", so the first call finds nothing;
# dotall = TRUE lets `.` match line terminators as well.
str_detect("\nX\n", ".X.")
str_detect("\nX\n", regex(".X.", dotall = TRUE))

## -----------------------------------------------------------------------------
# Two layers of escaping are involved: R's string parser consumes one
# backslash and the regex engine consumes the next. To hand the engine the
# two-character regex \. (a literal dot) we therefore have to type "\\."
dot <- "\\."

# writeLines() shows the string as the engine sees it: a single backslash
# followed by a dot
writeLines(dot)

# Only the element containing an explicit "." between "a" and "c" matches
str_extract(c("abc", "a.c", "bef"), "a\\.c")

## -----------------------------------------------------------------------------
# The string below contains ONE backslash (writeLines() prints a\b) ...
x <- "a\\b"
writeLines(x)

# ... and matching that one backslash needs the regex \\, which in turn has to
# be typed as the four-backslash R string "\\\\"
str_extract(x, "\\\\")

## -----------------------------------------------------------------------------
# `starts_with` is meant as literal text, but it contains the metacharacter "."
x <- c("a.b.c.d", "aeb")
starts_with <- "a.b"

# Pasted in as-is, the "." acts as a wildcard, so "aeb" matches as well
str_detect(x, paste0("^", starts_with))
# Wrapping the text in \Q...\E makes the engine treat everything in between as
# literal characters, so only "a.b.c.d" matches
str_detect(x, paste0("^\\Q", starts_with, "\\E"))

## -----------------------------------------------------------------------------
# "a" followed by U+0301 (combining acute accent): two code points that are
# displayed as a single character
x <- "a\u0301"
# `.` matches one code point, i.e. just the base letter ...
str_extract(x, ".")
# ... whereas \X matches a whole grapheme cluster, accent included
str_extract(x, "\\X")

## -----------------------------------------------------------------------------
# \d+ matches each run of one or more digits. str_extract_all() returns a list
# with one element per input string; [[1]] pulls out the matches for the only
# input.
str_extract_all("1 + 2 = 3", "\\d+")[[1]]

## -----------------------------------------------------------------------------
# Some Khmer numerals (U+17E1 to U+17E3): \d matches any Unicode decimal
# digit, not only ASCII 0-9
str_detect("១២៣", "\\d")

## -----------------------------------------------------------------------------
# The outer parentheses make R print the value as it is assigned
(text <- "Some  \t badly\n\t\tspaced \f text")
# \s+ matches a run of whitespace (spaces, tabs, newlines, form feeds), so
# every run is collapsed into a single space
str_replace_all(text, "\\s+", " ")

## -----------------------------------------------------------------------------
# Three different kinds of quotation mark ...
(text <- c('"Double quotes"', "«Guillemet»", "“Fancy quotes”"))
# ... all selected by the Unicode property class \p{quotation mark} and
# replaced with a plain apostrophe
str_replace_all(text, "\\p{quotation mark}", "'")

## -----------------------------------------------------------------------------
# \w matches "word" characters and \W is its complement. The apostrophe is not
# a word character, so "Don't" is broken into "Don" and "t".
str_extract_all("Don't eat that!", "\\w+")[[1]]
str_split("Don't eat that!", "\\W")[[1]]

## -----------------------------------------------------------------------------
# \b matches the zero-width position at a word edge and \B every position that
# is not a word edge; replacing them with "_" makes those positions visible
str_replace_all("The quick brown fox", "\\b", "_")
str_replace_all("The quick brown fox", "\\B", "_")

## -----------------------------------------------------------------------------
# `|` is alternation: match either "abc" or "def"
str_detect(c("abc", "def", "ghi"), "abc|def")

## -----------------------------------------------------------------------------
# `|` has low precedence: "gre|ay" means "gre" OR "ay", not "grey" or "gray".
# Parentheses restrict the alternation to the single letter that differs.
str_extract(c("grey", "gray"), "gre|ay")
str_extract(c("grey", "gray"), "gr(e|a)y")

## -----------------------------------------------------------------------------
# "(..)" captures any two characters and "\\1" requires the very same two
# characters to follow immediately, i.e. a repeated pair such as "anan"
pattern <- "(..)\\1"

# Elements of `fruit` that contain a repeated pair
str_subset(fruit, pattern)

# For those elements: the full match plus the captured pair
str_match(str_subset(fruit, pattern), pattern)

## -----------------------------------------------------------------------------
# (...) groups AND captures: str_match() returns an extra column holding the
# letter matched inside the parentheses
str_match(c("grey", "gray"), "gr(e|a)y")
# (?:...) groups without capturing, so only the full-match column is returned
str_match(c("grey", "gray"), "gr(?:e|a)y")

## -----------------------------------------------------------------------------
# ^ anchors a match to the start of the string and $ to the end
x <- c("apple", "banana", "pear")
str_extract(x, "^a")
str_extract(x, "a$")

## -----------------------------------------------------------------------------
# One string holding three newline-terminated lines
x <- "Line 1\nLine 2\nLine 3\n"
# By default ^ matches only at the start of the whole input: one match
str_extract_all(x, "^Line..")[[1]]
# With multiline = TRUE, ^ also matches at the start of every line: three
str_extract_all(x, regex("^Line..", multiline = TRUE))[[1]]
# \A always means "start of input", even in multiline mode: one match again
str_extract_all(x, regex("\\ALine..", multiline = TRUE))[[1]]

## -----------------------------------------------------------------------------
# The three chunks below all search this string, whose numeral contains the
# run "CCC" immediately followed by "LXXX"
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
# ? = zero or one of the preceding item: a "C" plus an optional second "C"
str_extract(x, "CC?")
# + = one or more: a "C" followed by at least one more "C"
str_extract(x, "CC+")
# A "C" followed by one or more characters drawn from the set {L, X}
str_extract(x, 'C[LX]+')

## -----------------------------------------------------------------------------
# {n} = exactly n, {n,} = n or more, {n,m} = between n and m repetitions
str_extract(x, "C{2}")
str_extract(x, "C{2,}")
str_extract(x, "C{2,3}")

## -----------------------------------------------------------------------------
# Quantifiers are greedy by default (take as much as possible); appending ?
# makes them lazy (take as little as possible). Each call applies a greedy and
# a lazy pattern to the same `x` so the two results can be compared.
str_extract(x, c("C{2,3}", "C{2,3}?"))
str_extract(x, c("C[LX]+", "C[LX]+?"))

## -----------------------------------------------------------------------------
# (?>...) is an atomic group: once "A" has matched inside it the engine will
# not come back to try ".B", so the following "C" can never match -> FALSE
str_detect("ABC", "(?>A|.B)C")
# An ordinary non-capturing group may backtrack to ".B" (matching "AB"), after
# which "C" matches -> TRUE
str_detect("ABC", "(?:A|.B)C")

## -----------------------------------------------------------------------------
x <- c("1 piece", "2 pieces", "3")
# (?=...) is a positive look-ahead: the digits must be followed by " piece" or
# " pieces", but that text is not part of the match. "3" yields NA.
str_extract(x, "\\d+(?= pieces?)")

y <- c("100", "$400")
# (?<=...) is a positive look-behind: the digits must be preceded by a literal
# "$", which is likewise excluded from the match. "100" yields NA.
str_extract(y, "(?<=\\$)\\d+")

## -----------------------------------------------------------------------------
# (?#...) is an inline comment that the regex engine ignores, so this pattern
# behaves exactly like the plain pattern "x"
str_detect("xyz", "x(?#this is a comment)")

## -----------------------------------------------------------------------------
# Pattern for a 10-digit phone number: a 3-digit area code (optionally wrapped
# in parentheses), three more digits and a final four digits, with an optional
# dash or space between the groups.
#
# comments = TRUE makes the engine ignore whitespace and `#` comments inside
# the pattern, which is why the literal space has to be written as "\\ ".
#
# The last group must be \d{4}: both sample numbers end in four digits
# ("8141"), and \d{3} would silently truncate the match to "...-814".
phone <- regex("
  \\(?       # optional opening parens
  (\\d{3})   # area code
  \\)?       # optional closing parens
  (?:-|\\ )? # optional dash or space
  (\\d{3})   # another three numbers
  (?:-|\\ )? # optional dash or space
  (\\d{4})   # final four numbers
  ", comments = TRUE)

# Columns of the result: full match, area code, middle three digits, last four
str_match(c("514-791-8141", "(514) 791 8141"), phone)

# Try the stringr package in your browser

# Any scripts or data that you put into this service are public.

# stringr documentation built on Nov. 15, 2023, 1:08 a.m.