R/capture_first_glob.R

capture_first_glob <- structure(function
### Glob files, then use capture_first_vec to get meta-data from each
### file name, and combine with contents of each file.
(glob, 
### string: glob specifying files to read.
  ..., 
### pattern passed to capture_first_vec, used to get meta-data from
### file names.
  READ=fread
### function of one argument (file name) which returns a data table,
### default data.table::fread.
){
  file.vec <- Sys.glob(glob)
  meta.dt <- capture_first_vec(file.vec, ...)
  result.list <- list()
  for(file.i in seq_along(file.vec)){
    meta.i <- meta.dt[file.i]
    result.i <- READ(file.vec[[file.i]])
    result.list[[file.i]] <- data.table(meta.i, result.i)
  }
  rbindlist(result.list)
### Data table with columns of meta-data specified by pattern, plus
### contents of all files specified by glob.
}, ex=function(){

  data.table::setDTthreads(1)

  ## Example 1: simple pattern.
  db <- system.file("extdata/chip-seq-chunk-db", package="nc", mustWork=TRUE)
  suffix <- if(interactive())"gz" else "head"
  glob <- paste0(db, "/*/*/counts/*", suffix)
  read.bedGraph <- function(f)data.table::fread(
    f, skip=1, col.names = c("chrom","start", "end", "count"))
  data.chunk.pattern <- list(
    data="H.*?",
    "/",
    chunk="[0-9]+", as.integer)
  (data.chunk.dt <- nc::capture_first_glob(glob, data.chunk.pattern, READ=read.bedGraph))

  ## Write same data set in Hive partition, then re-read.
  if(requireNamespace("arrow")){
    path <- tempfile()
    max_rows_per_file <- if(interactive())3 else 1000
    arrow::write_dataset(
      dataset=data.chunk.dt,
      path=path,
      format="csv",
      partitioning=c("data","chunk"),
      max_rows_per_file=max_rows_per_file)
    hive.glob <- file.path(path, "*", "*", "*.csv")
    hive.pattern <- list(
      nc::field("data","=",".*?"),
      "/",
      nc::field("chunk","=",".*?", as.integer),
      "/",
      nc::field("part","-","[0-9]+", as.integer))
    hive.dt <- nc::capture_first_glob(hive.glob, hive.pattern)
    hive.dt[, .(rows=.N), by=.(data,chunk,part)]
  }

  ## Example 2: more complex pattern.
  count.dt <- nc::capture_first_glob(
    glob,
    data.chunk.pattern,
    "/counts/", 
    name=list("McGill", id="[0-9]+", as.integer),
    READ=read.bedGraph)
  count.dt[, .(count=.N), by=.(data, chunk, name, chrom)]

  if(require(ggplot2)){
    ggplot()+
      facet_wrap(~data+chunk+name+chrom, labeller=label_both, scales="free")+
      geom_step(aes(
        start, count),
        data=count.dt)
  }

  ## Example 3: parsing non-CSV data.
  vignettes <- system.file("extdata/vignettes", package="nc", mustWork=TRUE)
  non.greedy.lines <- list(
    list(".*\n"), "*?")
  optional.name <- list(
    list(" ", chunk_name="[^,}]+"), "?")
  chunk.pattern <- list(
    before=non.greedy.lines,
    "```\\{r",
    optional.name,
    parameters=".*",
    "\\}\n",
    code=non.greedy.lines,
    "```")
  chunk.dt <- nc::capture_first_glob(
    paste0(vignettes, "/*.Rmd"),
    "/v",
    vignette_number="[0-9]", as.integer,
    "-",
    vignette_name=".*?",
    ".Rmd",
    READ=function(f)nc::capture_all_str(f, chunk.pattern))
  chunk.dt[, chunk_number := seq_along(chunk_name), by=vignette_number]
  chunk.dt[, .(
    vignette_number, vignette_name, chunk_number, chunk_name, 
    lines=nchar(code))]
  cat(chunk.dt$code[2])
    
})

Try the nc package in your browser

Any scripts or data that you put into this service are public.

nc documentation built on Sept. 1, 2023, 1:07 a.m.