# Chunk defaults for this vignette: collapse each chunk's source and
# output into a single block, and prefix printed output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
In this vignette I show comparisons between
`namedCapture::df_match_variable` and its closest cousin in the R
package universe, `tidyr::extract`. The two packages can be used to
compute the same result, but the code/syntax is different.
In this first comparison we use a syntax with each group name on the same line as its pattern. Here are some observations from the comparison:
namedCapture::df_match_variable
Type conversion in tidyr is requested with the `convert` argument of
`tidyr::extract`, which uses `utils::type.convert`. Because
`type.convert` does not know how to convert strings like `111,000` to
integer, we first need to use `remove.commas` to create a new
data.frame to use as input to `tidyr::extract`. In contrast,
namedCapture supports arbitrary group-specific type conversion
functions; we specify `to.int` on the same line as the corresponding
name/pattern for the
chromStart/chromEnd groups.

## First define data.
## Outer parentheses around an assignment print the assigned value.
(sacct.df <- data.frame(
  position=c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA, # neither will this.
    "chr1:110-111 chr2:220-222"), # two possible matches.
  JobID=c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"),
  stringsAsFactors=FALSE))
## Strip thousands separators, e.g. "111,000" -> "111000".
remove.commas <- function(x)gsub(",", "", x)
## Results of both approaches are collected here for comparison.
long.list <- list()

## namedCapture: 29 lines of code.
## Named list elements are capture groups; a function placed right
## after a group is the conversion applied to that group.
range.list <- list(
  "\\[",
  task1="[0-9]+", as.integer,
  "(?:-",#begin optional end of range.
  taskN="[0-9]+", as.integer,
  ")?", #end is optional.
  "\\]")
task.list <- list(
  "(?:",#begin alternate
  task="[0-9]+", as.integer,
  "|",#either one task(above) or range(below)
  range.list,
  ")")#end alternate
to.int <- function(x)as.integer(remove.commas(x))
(long.list$namedCapture <- namedCapture::df_match_variable(
  sacct.df,
  JobID=list(
    job="[0-9]+", as.integer,
    "_",
    task.list,
    "(?:[.]",
    type=".*",
    ")?"),
  position=list(
    chrom="chr.*?",
    ":",
    chromStart=".*?", to.int,
    "-",
    chromEnd="[0-9,]*", to.int)))

## tidyr: 46 lines of code.
## Named vector elements are capture groups; unnamed elements are
## literal pieces of the regex.
range.vec <- c(
  "\\[",
  task1="[0-9]+",
  "(?:-",#begin optional end of range.
  taskN="[0-9]+",
  ")?", #end is optional.
  "\\]")
task.vec <- c(
  "(?:",#begin alternate
  task="[0-9]+",
  "|",#either one task(above) or range(below)
  range.vec,
  ")")#end alternate
regex.list <- list(
  JobID=c(
    job="[0-9]+",
    "_",
    task.vec,
    "(?:[.]",
    type=".*",
    ")?"),
  position=c(
    chrom="chr.*?",
    ":",
    chromStart=".*?",
    "-",
    chromEnd="[0-9,]*"))
## Commas are removed up front so that convert=TRUE can yield integers.
tidyr.input <- transform(
  sacct.df,
  position=remove.commas(position))
tidyr.df.list <- list(sacct.df)
for(col.name in names(regex.list)){
  regex.vec <- regex.list[[col.name]]
  ## Wrap only the named pieces in parentheses, then glue everything
  ## into a single regex string.
  is.group <- names(regex.vec)!=""
  format.vec <- ifelse(is.group, "(%s)", "%s")
  group.vec <- sprintf(format.vec, regex.vec)
  regex <- paste(group.vec, collapse="")
  group.names <- names(regex.vec)[is.group]
  result <- tidyr::extract(
    tidyr.input, col.name, group.names,
    regex, convert=TRUE)
  ## Keep only the new columns, renamed to <column>.<group>.
  to.save <- result[, group.names, drop=FALSE]
  names(to.save) <- paste0(col.name, ".", group.names)
  tidyr.df.list[[col.name]] <- to.save
}
## Drop the list names so cbind() uses the column names as they are
## (named data.frame arguments would get an extra name prefix).
names(tidyr.df.list) <- NULL
long.list$tidyr <- do.call(cbind, tidyr.df.list)

## Make sure the results are the same.
t(sapply(long.list, names))
t(sapply(long.list, sapply, class))
## Replace NA with "" in tidyr's type column before comparing
## (presumably namedCapture reports an unmatched optional group as "").
long.list$tidyr$JobID.type <- ifelse(
  is.na(long.list$tidyr$JobID.type),
  "",
  long.list$tidyr$JobID.type)
with(long.list, identical(tidyr, namedCapture))
Exercise for the reader: use `rematch2::bind_re_match` instead of
`tidyr::extract` (you should only have to change a few lines of code
in the for loop).
This second comparison uses a syntax with the entire regex on one line. In my opinion this syntax makes the regular expressions more difficult to read/understand. Complicated regular expressions like the one used for matching the JobID column are not maintainable/understandable at all using this syntax.
## First define data.
## Outer parentheses around an assignment print the assigned value.
(sacct.df <- data.frame(
  position=c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA, # neither will this.
    "chr1:110-111 chr2:220-222"), # two possible matches.
  JobID=c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"),
  stringsAsFactors=FALSE))
## Results of both approaches are collected here for comparison.
short.list <- list()

## tidyr alternate (13 lines total)
## e() extracts the groups of one column of sacct.df and returns only
## the new columns, renamed to <column>.<group>.
e <- function(col.name, group.names, pattern){
  result <- tidyr::extract(
    sacct.df, col.name, group.names, pattern, convert=TRUE)
  to.save <- result[, group.names, drop=FALSE]
  names(to.save) <- paste0(col.name, ".", group.names)
  to.save
}
short.list$tidyr <- do.call(cbind, list(
  sacct.df,
  e("JobID", c("job", "task", "task1", "taskN", "type"),
    "([0-9]+)_(?:([0-9]+)|\\[([0-9]+)(?:-([0-9]+))?\\])(?:[.](.*))?"),
  e("position", c("chrom", "chromStart", "chromEnd"),
    "(chr.*?):(.*?)-([0-9,]*)")))

## namedCapture alternate (7 lines total)
(short.list$namedCapture <- namedCapture::df_match_variable(
  sacct.df,
  JobID="(?P<job>[0-9]+)_(?:(?P<task>[0-9]+)|\\[(?P<task1>[0-9]+)(?:-(?P<taskN>[0-9]+))?\\])(?:[.](?P<type>.*))?",
  position="(?P<chrom>chr.*?):(?P<chromStart>.*?)-(?P<chromEnd>[0-9,]*)"))
for(N in names(short.list$namedCapture)){
  short.list$namedCapture[[N]] <- type.convert(short.list$namedCapture[[N]], as.is=TRUE)
}

## Make sure the results are the same.
t(sapply(short.list, names))
t(sapply(short.list, sapply, class))
## Replace NA with "" in tidyr's type column before comparing
## (presumably namedCapture reports an unmatched optional group as "").
short.list$tidyr$JobID.type <- ifelse(
  is.na(short.list$tidyr$JobID.type),
  "",
  short.list$tidyr$JobID.type)
with(short.list, identical(tidyr, namedCapture))
`rematch2::bind_re_match` is similar to `tidyr::extract` but
additionally supports named capture regular expressions. Overall the
comparison shows that both packages can use a relatively verbose and
readable syntax to define complex regular expressions piece by piece:
## Build the JobID pattern piece by piece, once as a namedCapture list
## and once as a rematch2 regex string, matching after each step.
## Relies on sacct.df and to.int as defined in the comparisons above.
range.list <- list(
  "\\[",
  task1="[0-9]+", as.integer,
  list(
    "-",#begin optional end of range.
    taskN="[0-9]+", as.integer
  ), "?", #end is optional.
  "\\]")
namedCapture::df_match_variable(sacct.df, JobID=range.list)
range.pat <- paste0(
  "\\[",
  "(?<task1>[0-9]+)",
  "(?:",
  "-",#begin optional end of range.
  "(?<taskN>[0-9]+)",
  ")?", #end is optional.
  "\\]")
rematch2::bind_re_match(sacct.df, JobID, range.pat)
task.list <- list(
  "_",
  list(
    task="[0-9]+", as.integer,
    "|",#either one task(above) or range(below)
    range.list))
namedCapture::df_match_variable(sacct.df, JobID=task.list)
task.pat <- paste0(
  "_",
  "(?:",
  "(?<task>[0-9]+)",
  "|", #either one task(above) or range(below)
  range.pat,
  ")")
rematch2::bind_re_match(sacct.df, JobID, task.pat)
job.list <- list(
  job="[0-9]+", as.integer,
  task.list,
  list(
    "[.]",
    type=".*"
  ), "?")
(job.namedCapture <- namedCapture::df_match_variable(sacct.df, JobID=job.list))
job.pat <- paste0(
  "(?<job>[0-9]+)",
  task.pat,
  "(?:",
  "[.]",
  "(?<type>.*)",
  ")?")
(job.rematch2 <- rematch2::bind_re_match(sacct.df, JobID, job.pat))
## Now match the position column, starting from the JobID results.
pos.namedCapture <- namedCapture::df_match_variable(
  job.namedCapture,
  position=list(
    chrom="chr.*?",
    ":",
    chromStart=".*?", to.int,
    "-",
    chromEnd="[0-9,]*", to.int))
str(pos.namedCapture)
pos.rematch2 <- rematch2::bind_re_match(
  job.rematch2,
  position,
  paste0(
    "(?<chrom>chr.*?)",
    ":",
    "(?<chromStart>.*?)",
    "-",
    "(?<chromEnd>[0-9,]*)"))
str(pos.rematch2)
The main difference in syntax is that group names are specified in
the regular expression string literal for rematch2, whereas group
names are specified as R argument names for namedCapture.

A difference in the result is that all columns of `pos.rematch2` are
character, whereas some columns of `pos.namedCapture` have already
been converted to integer. Using rematch2, type conversion may be
accomplished as a post-processing step:
## Convert the rematch2 character columns after matching, giving the
## new columns the <column>.<group> names used in pos.namedCapture.
## Relies on pos.rematch2, pos.namedCapture and to.int defined above.
converted.rematch2 <- transform(
  pos.rematch2,
  JobID.job=to.int(job),
  JobID.task1=to.int(task1),
  JobID.taskN=to.int(taskN),
  JobID.task=to.int(task),
  JobID.type=type,
  position.chrom=chrom,
  position.chromStart=to.int(chromStart),
  position.chromEnd=to.int(chromEnd),
  stringsAsFactors=FALSE)
## Keep only the columns of pos.namedCapture, in the same order,
## before checking that the two results are identical.
some.rematch2 <- converted.rematch2[, names(pos.namedCapture)]
identical(some.rematch2, pos.namedCapture)
Exercise for the reader: convert all the `rematch2::bind_re_match`
calls in this section to `tidyr::extract` calls.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.