In tdhock/namedCapture: Named Capture Regular Expressions

Comparing regex functions for data frames

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

In this vignette I show comparisons between namedCapture::df_match_variable and its closest cousin in the R package universe, tidyr::extract. The two packages can be used to compute the same result, but the code/syntax is different.

Longer more readable syntax

In this first comparison we use a syntax with each group name on the same line as its pattern. Here are some observations from the comparison:

The namedCapture code is shorter. The tidyr code is longer mostly because the for loop that you see below for tidyr is hidden inside the definition of namedCapture::df_match_variable.
Converting extracted character groups to numeric column types is specified via the convert argument of tidyr::extract, which uses utils::type.convert. Because type.convert does not know how to convert strings like 111,000 to integer, we first need to use remove.commas to create a new data.frame to use as input to tidyr::extract. In contrast namedCapture supports arbitrary group-specific type conversion functions; we specify to.int on the same line as the corresponding name/pattern for the chromStart/chromEnd groups.

## First define data.
(sacct.df <- data.frame(
  position=c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA, # neither will this.
    "chr1:110-111 chr2:220-222"), # two possible matches.
  JobID=c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"),
  stringsAsFactors=FALSE))
remove.commas <- function(x)gsub(",", "", x)
long.list <- list()

## namedCapture: 29 lines of code.
range.list <- list(
  "\\[",
  task1="[0-9]+", as.integer,
  "(?:-",#begin optional end of range.
  taskN="[0-9]+", as.integer,
  ")?", #end is optional.
  "\\]")
task.list <- list(
  "(?:",#begin alternate
  task="[0-9]+", as.integer,
  "|",#either one task(above) or range(below)
  range.list,
  ")")#end alternate
to.int <- function(x)as.integer(remove.commas(x))
(long.list$namedCapture <- namedCapture::df_match_variable(
  sacct.df,
  JobID=list(
    job="[0-9]+", as.integer,
    "_",
    task.list,
    "(?:[.]",
    type=".*",
    ")?"),
  position=list(
    chrom="chr.*?",
    ":",
    chromStart=".*?", to.int,
    "-",
    chromEnd="[0-9,]*", to.int)))

## tidyr: 46 lines of code.
range.vec <- c(
  "\\[",
  task1="[0-9]+", 
  "(?:-",#begin optional end of range.
  taskN="[0-9]+", 
  ")?", #end is optional.
  "\\]")
task.vec <- c(
  "(?:",#begin alternate
  task="[0-9]+", 
  "|",#either one task(above) or range(below)
  range.vec,
  ")")#end alternate
regex.list <- list(
  JobID=c(
    job="[0-9]+", 
    "_",
    task.vec,
    "(?:[.]",
    type=".*",
    ")?"),
  position=c(
    chrom="chr.*?",
    ":",
    chromStart=".*?",
    "-",
    chromEnd="[0-9,]*"))
tidyr.input <- transform(
  sacct.df,
  position=remove.commas(position))
tidyr.df.list <- list(sacct.df)
for(col.name in names(regex.list)){
  regex.vec <- regex.list[[col.name]]
  is.group <- names(regex.vec)!=""
  format.vec <- ifelse(is.group, "(%s)", "%s")
  group.vec <- sprintf(format.vec, regex.vec)
  regex <- paste(group.vec, collapse="")
  group.names <- names(regex.vec)[is.group]
  result <- tidyr::extract(
    tidyr.input, col.name, group.names, regex, convert=TRUE)
  to.save <- result[, group.names, drop=FALSE]
  names(to.save) <- paste0(col.name, ".", group.names)
  tidyr.df.list[[col.name]] <- to.save
}
names(tidyr.df.list) <- NULL
long.list$tidyr <- do.call(cbind, tidyr.df.list)

## Make sure the results are the same.
t(sapply(long.list, names))
t(sapply(long.list, sapply, class))
long.list$tidyr$JobID.type <- ifelse(
  is.na(long.list$tidyr$JobID.type),
  "",
  long.list$tidyr$JobID.type)
with(long.list, identical(tidyr, namedCapture))

Exercise for the reader use rematch2::bind_re_match instead of tidyr::extract (you should only have to change a few lines of code in the for loop).

Shorter less readable syntax

This second comparison uses a syntax with the entire regex on one line. In my opinion this syntax makes the regular expressions more difficult to read/understand. Complicated regular expressions like the one used for matching the JobID column are not maintainable/understandable at all using this syntax.

## First define data.
(sacct.df <- data.frame(
  position=c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA, # neither will this.
    "chr1:110-111 chr2:220-222"), # two possible matches.
  JobID=c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"),
  stringsAsFactors=FALSE))
short.list <- list()

## tidyr alternate (13 lines total)
e <- function(col.name, group.names, pattern){
  result <- tidyr::extract(
    sacct.df, col.name, group.names, pattern, convert=TRUE)
  to.save <- result[, group.names, drop=FALSE]
  names(to.save) <- paste0(col.name, ".", group.names)
  to.save
}
short.list$tidyr <- do.call(cbind, list(
  sacct.df,
  e("JobID", c("job", "task", "task1", "taskN", "type"),
    "([0-9]+)_(?:([0-9]+)|\\[([0-9]+)(?:-([0-9]+))?\\])(?:[.](.*))?"),
  e("position", c("chrom", "chromStart", "chromEnd"),
    "(chr.*?):(.*?)-([0-9,]*)")))

## namedCapture alternate (7 lines total)
(short.list$namedCapture <- namedCapture::df_match_variable(
  sacct.df,
  JobID="(?P<job>[0-9]+)_(?:(?P<task>[0-9]+)|\\[(?P<task1>[0-9]+)(?:-(?P<taskN>[0-9]+))?\\])(?:[.](?P<type>.*))?",
  position="(?P<chrom>chr.*?):(?P<chromStart>.*?)-(?P<chromEnd>[0-9,]*)"))
for(N in names(short.list$namedCapture)){
  short.list$namedCapture[[N]] <- type.convert(short.list$namedCapture[[N]], as.is=TRUE)
}

## Make sure the results are the same.
t(sapply(short.list, names))
t(sapply(short.list, sapply, class))
short.list$tidyr$JobID.type <- ifelse(
  is.na(short.list$tidyr$JobID.type),
  "",
  short.list$tidyr$JobID.type)
with(short.list, identical(tidyr, namedCapture))

Comparison with rematch2

rematch2::bind_re_match is similar to tidyr::extract but additionally supports named capture regular expressions. Overall the comparison shows that both packages can use a relatively verbose and readable syntax to define complex regular expressions piece by piece:

range.list <- list(
  "\\[",
  task1="[0-9]+", as.integer,
  list(
    "-",#begin optional end of range.
    taskN="[0-9]+", as.integer
  ), "?", #end is optional.
  "\\]")
namedCapture::df_match_variable(sacct.df, JobID=range.list)

range.pat <- paste0(
  "\\[",
  "(?<task1>[0-9]+)", 
  "(?:",
  "-",#begin optional end of range.
  "(?<taskN>[0-9]+)",
  ")?", #end is optional.
  "\\]")
rematch2::bind_re_match(sacct.df, JobID, range.pat)
task.list <- list(
  "_",
  list(
    task="[0-9]+", as.integer,
    "|",#either one task(above) or range(below)
    range.list))
namedCapture::df_match_variable(sacct.df, JobID=task.list)

task.pat <- paste0(
  "_",
  "(?:",
  "(?<task>[0-9]+)", 
  "|", #either one task(above) or range(below)
  range.pat,
  ")")
rematch2::bind_re_match(sacct.df, JobID, task.pat)

job.list <- list(
  job="[0-9]+", as.integer,
  task.list,
  list(
    "[.]",
    type=".*"
  ), "?")
(job.namedCapture <- namedCapture::df_match_variable(sacct.df, JobID=job.list))

job.pat <- paste0(
  "(?<job>[0-9]+)", 
  task.pat,
  "(?:",
  "[.]",
  "(?<type>.*)",
  ")?")
(job.rematch2 <- rematch2::bind_re_match(sacct.df, JobID, job.pat))

pos.namedCapture <- namedCapture::df_match_variable(
  job.namedCapture, position=list(
    chrom="chr.*?",
    ":",
    chromStart=".*?", to.int,
    "-",
    chromEnd="[0-9,]*", to.int))
str(pos.namedCapture)

pos.rematch2 <- rematch2::bind_re_match(
  job.rematch2,  position, paste0(
    "(?<chrom>chr.*?)",
    ":",
    "(?<chromStart>.*?)", 
    "-",
    "(?<chromEnd>[0-9,]*)"))
str(pos.rematch2)

The main difference in syntax is that group names are specified in the regular expression string literal for rematch2, whereas group names are specified as R argument names for namedCapture A difference in the result is that all columns of pos.rematch2 are character, whereas some columns of pos.namedCapture have already been converted to integer. Using rematch2 type conversion may be accomplished as a post-processing step:

converted.rematch2 <- transform(
  pos.rematch2,
  JobID.job=to.int(job),
  JobID.task1=to.int(task1),
  JobID.taskN=to.int(taskN),
  JobID.task=to.int(task),
  JobID.type=type,
  position.chrom=chrom,
  position.chromStart=to.int(chromStart),
  position.chromEnd=to.int(chromEnd),
  stringsAsFactors=FALSE)
some.rematch2 <- converted.rematch2[, names(pos.namedCapture)]
identical(some.rematch2, pos.namedCapture)