## inst/doc/v2-recommended-syntax.R

## ----setup, include = FALSE---------------------------------------------------
# Merge each chunk's source and printed output into a single block, and
# mark every output line with the "#>" prefix.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## -----------------------------------------------------------------------------
# Example subjects: two genomic positions that match, a string and a
# missing value that do not, and one string holding two positions.
subject.vec <- c(
  "chr10:213,054,000-213,055,000", # chrom:start-end
  "chrM:111,000",                  # chrom:start only
  "this will not match",
  NA_character_,                   # a missing value does not match either
  "chr1:110-111 chr2:220-222"      # two possible matches
)

# The whole regex written as one string literal: compact but hard to scan.
single.line.pattern <-
  "(?P<chrom>chr.*?):(?P<chromStart>[0-9,]+)(?:-(?P<chromEnd>[0-9,]+))?"

# The same regex assembled from one piece per line, which is easier to
# read and to annotate.
chr.pos.pattern <- paste(c(
  "(?P<chrom>chr.*?)",       # chromosome name, non-greedy
  ":",
  "(?P<chromStart>[0-9,]+)", # start position: digits and commas
  "(?:",                     # begin optional non-capturing group
  "-",
  "(?P<chromEnd>[0-9,]+)",   # end position: digits and commas
  ")?"                       # end optional group
), collapse = "")

# Both constructions must give exactly the same string.
identical(chr.pos.pattern, single.line.pattern)
## Match every subject against the pattern; the group names (chrom,
## chromStart, chromEnd) are written inside the regex itself.
namedCapture::str_match_named(subject.vec, chr.pos.pattern)

## -----------------------------------------------------------------------------
## The same pattern pieces as in chr.pos.pattern, but handed to
## str_match_variable as separate arguments instead of being pasted
## together beforehand.
namedCapture::str_match_variable(
  subject.vec, 
  "(?P<chrom>chr.*?)",
  ":",
  "(?P<chromStart>[0-9,]+)",
  "(?:", # begin optional group.
    "-",
    "(?P<chromEnd>[0-9,]+)",
  ")?") # end optional group.

## -----------------------------------------------------------------------------
## Same subjects again, but the explicit (?P<name>...) groups are
## replaced by named R arguments (chrom=, chromStart=, chromEnd=).
## NOTE(review): presumably each named argument becomes a named capture
## group -- confirm against the namedCapture documentation.
namedCapture::str_match_variable(
  subject.vec, 
  chrom="chr.*?",
  ":",
  chromStart="[0-9,]+",
  "(?:", # begin optional group.
    "-",
    chromEnd="[0-9,]+",
  ")?") # end optional group.

## -----------------------------------------------------------------------------
# Convert text such as "213,054,000" to an integer: every character
# that is not a digit is removed first, then the rest is parsed.
keep.digits <- function(x) {
  digits.only <- gsub("[^0-9]", "", x)
  as.integer(digits.only)
}
## Repeat the named-argument match, now with keep.digits placed right
## after the chromStart and chromEnd patterns. The result is stored in
## match.df, and the outer parentheses print it.
## NOTE(review): presumably a function that follows a named pattern is
## used to convert that group's captured text -- confirm.
(match.df <- namedCapture::str_match_variable(
  subject.vec, 
  chrom="chr.*?",
  ":",
  chromStart="[0-9,]+", keep.digits,
  "(?:", # begin optional group.
    "-",
    chromEnd="[0-9,]+", keep.digits,
  ")?")) # end optional group.

## -----------------------------------------------------------------------------
## Bundle the position regex and keep.digits into one list, so that the
## pair can be reused for both chromStart and chromEnd below.
pos.pattern <- list("[0-9,]+", keep.digits)
namedCapture::str_match_variable(
  subject.vec, 
  chrom="chr.*?",
  ":",
  chromStart=pos.pattern,
  "(?:", # begin optional group.
    "-",
    chromEnd=pos.pattern,
  ")?") # end optional group.

## -----------------------------------------------------------------------------
## Same match, with the literal "(?:" and ")" strings replaced by an
## unnamed list(...) that is followed by "?".
## NOTE(review): presumably an unnamed list is turned into a
## non-capturing group -- confirm against the namedCapture documentation.
namedCapture::str_match_variable(
  subject.vec, 
  chrom="chr.*?",
  ":",
  chromStart=pos.pattern,
  list(
    "-",
    chromEnd=pos.pattern
  ), "?") # the "?" applies to the list just before it.

## -----------------------------------------------------------------------------
## Pass the same pattern arguments (without a subject) to
## variable_args_list, store the result in L and print it.
(L <- namedCapture::variable_args_list(
  chrom="chr.*?",
  ":",
  chromStart=pos.pattern,
  list(
    "-",
    chromEnd=pos.pattern
  ), "?"))
## Check whether the pattern element of L is exactly the hand-written
## single-line regex defined earlier.
identical(L$pattern, single.line.pattern)

## -----------------------------------------------------------------------------
## Locate the example file trackDb.txt.gz in the extdata directory of
## the namedCapture package and read it into a character vector.
## mustWork=TRUE makes system.file stop with an explicit error when the
## file cannot be found, instead of returning "" and deferring the
## failure to readLines.
trackDb.txt.gz <- system.file(
  "extdata", "trackDb.txt.gz", package="namedCapture", mustWork=TRUE)
trackDb.vec <- readLines(trackDb.txt.gz)

## -----------------------------------------------------------------------------
## Print elements 78 through 107 of trackDb.vec, separated by newlines.
cat(trackDb.vec[78:107], sep="\n")

## -----------------------------------------------------------------------------
## Match "track <name>" stanzas in trackDb.vec: name captures the
## non-space text after "track ", and fields captures zero or more
## following non-empty lines; the match ends at a newline.
fields.df <- namedCapture::str_match_all_variable(
  trackDb.vec,
  "track ",
  name="\\S+",
  fields="(?:\n[^\n]+)*", # each repetition: a newline, then a non-empty line.
  "\n")

## -----------------------------------------------------------------------------
## Show the first rows of the result.
head(fields.df)

## -----------------------------------------------------------------------------
## Parse the fields column into name/value pairs: after leading
## whitespace, name is the shortest text before a space and value is the
## rest of that line.
fields.list <- namedCapture::str_match_all_named(
  fields.df[, "fields"], paste0(
    "\\s+",
    "(?P<name>.*?)", # non-greedy, so it stops at the first space.
    " ",
    "(?P<value>[^\n]+)")) # everything up to the end of the line.

## -----------------------------------------------------------------------------
## Inspect elements 12 to 14 of the resulting list.
fields.list[12:14]

## -----------------------------------------------------------------------------
## Look up single rows by track name (list element) and field name (row
## name).
fields.list$bcell_McGill0091Coverage["bigDataUrl",]
fields.list$monocyte_McGill0001Peaks["color",]
## Keep only the elements that have a "bigDataUrl" row. vapply is used
## instead of sapply so the result is always a logical vector: on an
## empty fields.list sapply would return list(), which is not a valid
## subscript for the subsetting on the next line.
has.bigDataUrl <- vapply(
  fields.list,
  function(m) "bigDataUrl" %in% rownames(m),
  logical(1))
bigDataUrl.list <- fields.list[has.bigDataUrl]
## Compare the number of elements with a bigDataUrl to the total.
length(bigDataUrl.list)
length(fields.list)

## -----------------------------------------------------------------------------
## Pattern for a track name with two alternatives separated by "|":
## either cellType, "_", a sampleName made of "McGill" plus a numeric
## sampleID, and a dataType of Coverage or Peaks; or any run of
## non-newline characters.
name.pattern <- list(
  cellType=".*?",
  "_",
  sampleName=list(
    "McGill",
    sampleID="[0-9]+", as.integer),
  dataType="Coverage|Peaks",
  "|", # otherwise fall back to the alternative below.
  "[^\n]+") # any run of non-newline characters.
## Match stanzas that start with "track <name>", followed by any number
## of non-empty lines and then a "bigDataUrl <value>" line whose value is
## captured. Note that this overwrites the match.df assigned in the
## earlier keep.digits example.
match.df <- namedCapture::str_match_all_variable(
  trackDb.vec,
  "track ",
  name=name.pattern,
  "(?:\n[^\n]+)*", # skip over intermediate field lines.
  "\\s+bigDataUrl ",
  bigDataUrl="[^\n]+") # value runs to the end of the line.
head(match.df)

## -----------------------------------------------------------------------------
## Example job table with two character columns: an elapsed time written
## as HH:MM:SS, and a job ID of the form job_task, job_task.type or
## job_[range].
sacct.df <- data.frame(
  Elapsed=c("07:04:42", "07:04:42", "07:04:49", "00:00:00", "00:00:00"),
  JobID=c(
    "13937810_25", "13937810_25.batch", "13937810_25.extern",
    "14022192_[1-3]", "14022204_[4]"),
  stringsAsFactors=FALSE)
sacct.df # print the table.

## -----------------------------------------------------------------------------
## Define some sub-patterns separately for clarity.
## range.pattern describes a bracketed task range such as "[1-3]" or
## "[4]": task1 is the first number, taskN the optional last number.
range.pattern <- list(
  "[[]", # literal opening square bracket.
  task1="[0-9]+", as.integer,
  "(?:-", # begin optional group: a dash, then the end of the range.
  taskN="[0-9]+", as.integer,
  ")?", # close the group and make it optional.
  "[]]") # literal closing square bracket.
## task.pattern describes either a single task number or a range.
task.pattern <- list(
  "(?:", # begin alternation.
  task="[0-9]+", as.integer,
  "|", # either one task (above) or a range (below).
  range.pattern,
  ")") # end alternation.
## Apply one pattern per column of sacct.df: the JobID argument holds the
## pattern for the JobID column and the Elapsed argument the pattern for
## the Elapsed column. The result is stored in task.df and the outer
## parentheses print it.
(task.df <- namedCapture::df_match_variable(
  sacct.df,
  JobID=list(
    job="[0-9]+", as.integer,
    "_",
    task.pattern, # single task or bracketed range, defined above.
    "(?:[.]", # begin optional group: a literal dot, then the type.
    type=".*",
    ")?"), # end optional group.
  Elapsed=list(
    hours="[0-9]+", as.integer,
    ":",
    minutes="[0-9]+", as.integer,
    ":",
    seconds="[0-9]+", as.integer)))

## -----------------------------------------------------------------------------
## Repeat the previous extraction with a data.table built from sacct.df
## as input instead of a data.frame; the patterns are unchanged.
library(data.table)
sacct.dt <- data.table(sacct.df)
(task.dt <- namedCapture::df_match_variable(
  sacct.dt,
  JobID=list(
    job="[0-9]+", as.integer,
    "_",
    task.pattern, # single task or bracketed range, defined above.
    "(?:[.]", # begin optional group: a literal dot, then the type.
    type=".*",
    ")?"), # end optional group.
  Elapsed=list(
    hours="[0-9]+", as.integer,
    ":",
    minutes="[0-9]+", as.integer,
    ":",
    seconds="[0-9]+", as.integer)))

## Try the namedCapture package in your browser
##
## Any scripts or data that you put into this service are public.
##
## namedCapture documentation built on April 2, 2020, 1:07 a.m.