# Global knitr chunk options used when rendering this README:
# keep source and output in separate blocks, prefix printed output with "#>",
# write figures under man/figures/ and scale them to the full page width.
knitr::opts_chunk$set(
  collapse = FALSE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

# Attach the package so the examples below can call str_capture() directly.
library(fugly)
# Quick logo generation. Borrowed heavily from Nick Tierney's Syn logo process
library(magick)
library(showtext)

# Register the Google font "Allura" under the family name "gf".
font_add_google("Allura", "gf")

# pkgdown::build_site(override = list(destination = "../coolbutuseless.github.io/package/minipdf"))
# Read the base artwork for the hex sticker. The optional clean-up steps
# below are intentionally left commented out.
img <- image_read("man/figures/white.png") #%>%
  # image_transparent(color = "#f9fafb", fuzz = 10) %>%
  # image_trim() %>%
  # image_threshold()

# Build the hex sticker from the artwork and save it to man/figures/logo.png.
hexSticker::sticker(
  subplot = img,
  s_x = 0.92,
  s_y = 1.2,
  s_width = 1.5,
  s_height = 0.95,
  package = "fugly",
  p_x = 1,
  p_y = 1.15,
  p_color = "#223344",
  p_family = "gf",
  p_size = 23,
  h_size = 1.2,
  h_fill = "#ffffff",
  h_color = "#223344",
  filename = "man/figures/logo.png"
)

# Read the freshly written logo back in.
image_read("man/figures/logo.png")
This package provides a single function (str_capture) for using named capture
groups to extract values from strings. A key requirement for readability is that
the names of the capture groups are specified inline as part of the regex,
and not in an external vector or as separate names.
fugly::str_capture() is implemented as a wrapper around stringr. This is because
stringr itself does not yet do named capture groups (see the issues for stringr
and stringi).
fugly::str_capture() is very similar to a number of existing packages. See the
table below for a comparison.
| Method                    | Speed    | Inline capture group naming | robust |
|---------------------------|----------|-----------------------------|--------|
| fugly::str_capture        | Fast     | Yes                         | No     |
| rr4r::rr4r_extract_groups | Fast     | Yes                         | Yes    |
| nc::capture_first_vec     | Fast     | No                          | Yes    |
| tidyr::extract            | Fast     | No                          | Yes    |
| utils::strcapture         | Middling | No                          | Yes    |
| unglue::unglue            | Slow     | Yes                         | Yes    |
| ore::ore_search           | Slow     | Yes                         | Yes    |
fugly::str_capture() is unsafe/dodgy/non-robust?
It doesn't adhere to standard regular expression syntax for named capture groups as used in Perl, Python, etc.
It doesn't really adhere to glue syntax (although it looks similar at a surface level).
If you specify delimiters which appear in your string input, then you're going to have a bad time.
It's generally only been tested on data which is:
fugly::str_capture(string, pattern, delim)
.*?
is used.
You can install from GitHub with:
# The remotes package is needed to install from GitHub; uncomment the next
# line if it is not installed yet.
# install.packages('remotes')
remotes::install_github('coolbutuseless/fugly')
In the following example:
{} by default.
name is unspecified, so .*? will be used
age is \d+
i.e. match must consist of 1-or-more digits

library(fugly)

# Two sample records sharing the same "Name:... Age:..." layout.
string <- c(
  "information: Name:greg Age:27 ",
  "information: Name:mary Age:34 "
)

# {name} has no regex of its own; {age=\\d+} restricts the capture to digits.
str_capture(string, pattern = "Name:{name} Age:{age=\\d+}")
A more complicated example:
.*?
in the pattern which is not returned as a result

# Four GeoJSON-like address records, one per string. Single quotes are used
# so the embedded double quotes need no escaping.
string <- c(
  '{"type":"Feature","properties":{"hash":"1348778913c0224a","number":"27","street":"BANAMBILA STREET","unit":"","city":"ARANDA","district":"","region":"ACT","postcode":"2614","id":"GAACT714851647"},"geometry":{"type":"Point","coordinates":[149.0826143,-35.2545558]}}',
  '{"type":"Feature","properties":{"hash":"dc776871c868bc7e","number":"139","street":"BOUVERIE STREET","unit":"UNIT 711","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC423944917"},"geometry":{"type":"Point","coordinates":[144.9617149,-37.8032551]}}',
  '{"type":"Feature","properties":{"hash":"8197f34a40ccad47","number":"6","street":"MOGRIDGE STREET","unit":"","city":"WARWICK","district":"","region":"QLD","postcode":"4370","id":"GAQLD155949502"},"geometry":{"type":"Point","coordinates":[152.0230999,-28.2230133]}}',
  '{"type":"Feature","properties":{"hash":"18edc96308fc1a8e","number":"22","street":"ORR STREET","unit":"UNIT 507","city":"CARLTON","district":"","region":"VIC","postcode":"3053","id":"GAVIC424282716"},"geometry":{"type":"Point","coordinates":[144.9653484,-37.8063371]}}'
)

# Capture number, street and coords. The bare `.*?` between the street and
# the coordinates is plain regex (not a capture group), so it is skipped over
# without being returned.
str_capture(
  string,
  pattern = '"number":"{number}","street":"{street}".*?"coordinates":\\[{coords}\\]'
)
I acknowledge that this isn't the greatest benchmark, but it is relevant to my current use-case.
nc with the PCRE regex engine is the fastest named capture I could find in R.
For large inputs (1000+ input strings), fugly is significantly faster than
unglue, utils::strcapture and ore.
The Rust regex engine rr4r is slightly faster than fugly.
unglue is the slowest of the methods.
ore lies somewhere between unglue and utils::strcapture.
As pointed out by Michael Barrowman, tidyr::extract() will also do named capture into a data.frame.
utils::strcapture()
, the names are not specified inline with the regex, but are listed separately.

# Packages below that are not on CRAN can be installed from GitHub:
# remotes::install_github("jonclayden/ore")
# remotes::install_github("yutannihilation/rr4r")
# remotes::install_github('qinwf/re2r')
library(ore)
library(rr4r)
library(unglue)
library(ggplot2)
library(tidyr)

# meaningless strings for benchmarking
N <- 1000
# seq_len() is used instead of seq() so the sequence stays well-defined
# (empty) if N is ever set to 0.
string <- paste0("Information name:greg age:", seq_len(N))

# Time each named-capture approach on the same input vector.
# check = FALSE turns off bench::mark()'s comparison of results across
# expressions.
res <- bench::mark(
  `fugly::str_capture()` = fugly::str_capture(
    string,
    "name:{name} age:{age=\\d+}"
  ),
  `unglue::unglue()` = unglue::unglue_data(
    string,
    "Information name:{name} age:{age=\\d+}"
  ),
  `utils::strcapture()` = utils::strcapture(
    "Information name:(.*?) age:(\\d+)",
    string,
    proto = data.frame(name = character(), age = character())
  ),
  # ore returns one match object per input string; pull out the captured
  # groups of each and row-bind them into a single data.frame.
  `ore::ore_search()` = do.call(
    rbind.data.frame,
    lapply(
      ore_search(
        ore('name:(?<name>.*?) age:(?<age>\\d+)', encoding = 'utf8'),
        string,
        all = TRUE
      ),
      function(x) {
        x$groups$matches
      }
    )
  ),
  `rr4r::rr4r_extract_groups()` = rr4r::rr4r_extract_groups(
    string,
    "name:(?P<name>.*?) age:(?P<age>\\d+)"
  ),
  `nc::capture_first_vec() PCRE` = nc::capture_first_vec(
    string,
    "Information name:",
    name = ".*?",
    " age:",
    age = "\\d+",
    engine = 'PCRE'
  ),
  `tidyr::extract()` = tidyr::extract(
    data.frame(x = string),
    x,
    into = c('name', 'age'),
    regex = 'name:(.*?) age:(\\d+)'
  ),
  check = FALSE
)
# Plot the benchmark results using the black-and-white theme, with the
# legend placed underneath the panel.
plot(res) +
  theme_bw() +
  theme(legend.position = 'bottom')
utils::strcapture()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.