# test-CRAN-df.R
# In nc: Named Capture to Data Tables

library(nc)
library(testthat)
context("df")
source(system.file("test_engines.R", package="nc", mustWork=TRUE), local=TRUE)

# An engine= given inside one column's pattern list should take
# precedence over the engine= argument of capture_first_df. The test is
# only registered when stringi can be loaded (presumably because the ICU
# engine depends on it -- confirm against the package). quietly=TRUE
# keeps the availability probe from printing to the test log.
if(requireNamespace("stringi", quietly=TRUE))test_engines("default/specified engine respected", {
  # stringsAsFactors=FALSE keeps both columns character on R < 4.0,
  # consistent with the other fixtures in this file.
  subject.df <- data.frame(f="foo", b="bar", stringsAsFactors=FALSE)
  match.dt <- capture_first_df(
    subject.df,
    f=list(icu="[\\p{Letter}]"),#only works with ICU
    b=list(re2=".(?R)?", engine="PCRE"),#only works with PCRE
    engine="ICU")
  expect_identical(match.dt[["icu"]], "f")
  expect_identical(match.dt[["re2"]], "bar")
})

# Shared fixture: five job identifiers next to five position strings.
# The position vector carries names, which data.frame() uses as the
# row names of the result.
subject.df <- data.frame(
  JobID=c(
    "13937810_25", "13937810_25.batch", "13937810_25.extern",
    "14022192_[1-3]", "14022204_[4]"),
  subject=structure(
    c("chr10:213,054,000-213,055,000",
      "chrNA:111,000-222,000",
      "foo bar",
      NA,
      "chr1:110-111 chr2:220-222"),
    names=c("ten", "chrNA", "no.match", "missing", "two")),
  stringsAsFactors=FALSE)
# Sub-pattern for a bracketed task range such as [1-3] or [4]; each
# captured bound is followed by as.integer.
range.pattern <- c(
  list("\\[", task1="[0-9]+", as.integer),
  # Optional "-taskN" part: opened by "(?:-" and closed by ")?".
  list("(?:-", taskN="[0-9]+", as.integer, ")?"),
  list("\\]"))
test_engines("capture_first_df returns data.table", {
  # Job id, then either one task number or a bracketed task range,
  # then an optional ".type" suffix.
  job.pattern <- list(
    job="[0-9]+", as.integer,
    "_",
    "(?:", # alternation starts: single task ...
    task="[0-9]+", as.integer,
    "|", # ... or a range of tasks
    range.pattern,
    ")", # alternation ends
    "(?:[.]",
    type=".*",
    ")?")
  # nomatch.error=FALSE: the expectations below show NA in the
  # captured columns for the subjects that do not match.
  position.pattern <- list(
    nomatch.error=FALSE,
    chrom="chr.*?",
    ":",
    chromStart=".*?",
    "-",
    chromEnd="[0-9,]*")
  captured <- capture_first_df(
    subject.df, JobID=job.pattern, subject=position.pattern)
  expect_is(captured, "data.table")
  # Input columns come first, followed by the capture groups.
  expect_identical(
    names(captured),
    c("JobID", "subject",
      "job", "task", "task1", "taskN", "type",
      "chrom", "chromStart", "chromEnd"))
  expect_identical(
    captured[["job"]],
    c(13937810L, 13937810L, 13937810L, 14022192L, 14022204L))
  expect_identical(captured[["task"]], c(25L, 25L, 25L, NA, NA))
  expect_identical(captured[["task1"]], c(NA, NA, NA, 1L, 4L))
  expect_identical(captured[["taskN"]], c(NA, NA, NA, 3L, NA))
  expect_identical(
    captured[["type"]], c("", "batch", "extern", "", ""))
  expect_identical(
    captured[["chrom"]], c("chr10", "chrNA", NA, NA, "chr1"))
  expect_identical(
    captured[["chromStart"]], c("213,054,000", "111,000", NA, NA, "110"))
  expect_identical(
    captured[["chromEnd"]], c("213,055,000", "222,000", NA, NA, "111"))
})

# Same task-range sub-pattern as above, but the literal brackets are
# written as the character classes "[[]" and "[]]".
range.square <- c(
  list("[[]", task1="[0-9]+", as.integer),
  # Optional "-taskN" part: opened by "(?:-" and closed by ")?".
  list("(?:-", taskN="[0-9]+", as.integer, ")?"),
  list("[]]"))
test_engines("capture_first_df square brackets pattern", {
  full.square <- list(
    job="[0-9]+", as.integer,
    "_",
    "(?:", # alternation starts: single task ...
    task="[0-9]+", as.integer,
    "|", # ... or a range of tasks
    range.square,
    ")", # alternation ends
    "(?:[.]",
    type=".*",
    ")?")
  icu.selected <- identical(getOption("nc.engine"), "ICU")
  if(!icu.selected){
    square.dt <- capture_first_df(subject.df, JobID=full.square)
    expect_identical(
      names(square.dt),
      c("JobID", "subject",
        "job", "task", "task1", "taskN", "type"))
    expect_identical(
      square.dt[["job"]],
      c(13937810L, 13937810L, 13937810L, 14022192L, 14022204L))
    expect_identical(square.dt[["task"]], c(25L, 25L, 25L, NA, NA))
    expect_identical(square.dt[["task1"]], c(NA, NA, NA, 1L, 4L))
    expect_identical(square.dt[["taskN"]], c(NA, NA, NA, 3L, NA))
    expect_identical(
      square.dt[["type"]], c("", "batch", "extern", "", ""))
  }else{
    # With the ICU engine this pattern is expected to raise an error.
    expect_error(
      capture_first_df(subject.df, JobID=full.square),
      "when matching pattern above with ICU engine")
  }
})

# Fixture with the same values as the named one above, but both
# columns are un-named vectors, so the data.frame keeps its automatic
# row names.
no.rownames <- data.frame(
  JobID=c(
    "13937810_25", "13937810_25.batch", "13937810_25.extern",
    "14022192_[1-3]", "14022204_[4]"),
  position=c(
    "chr10:213,054,000-213,055,000", "chrNA:111,000-222,000",
    "foo bar", NA,
    "chr1:110-111 chr2:220-222"),
  stringsAsFactors=FALSE)
test_engines("capture_first_df returns data.frame with default rownames", {
  # Job id, then either one task number or a bracketed task range,
  # then an optional ".type" suffix.
  job.pattern <- list(
    job="[0-9]+", as.integer,
    "_",
    "(?:", # alternation starts: single task ...
    task="[0-9]+", as.integer,
    "|", # ... or a range of tasks
    range.pattern,
    ")", # alternation ends
    "(?:[.]",
    type=".*",
    ")?")
  position.pattern <- list(
    nomatch.error=FALSE,
    chrom="chr.*?",
    ":",
    chromStart=".*?",
    "-",
    chromEnd="[0-9,]*")
  captured <- capture_first_df(
    no.rownames, JobID=job.pattern, position=position.pattern)
  expect_identical(
    names(captured),
    c("JobID", "position",
      "job", "task", "task1", "taskN", "type",
      "chrom", "chromStart", "chromEnd"))
  expect_identical(
    captured[["job"]],
    c(13937810L, 13937810L, 13937810L, 14022192L, 14022204L))
  expect_identical(captured[["task"]], c(25L, 25L, 25L, NA, NA))
  expect_identical(captured[["task1"]], c(NA, NA, NA, 1L, 4L))
  expect_identical(captured[["taskN"]], c(NA, NA, NA, 3L, NA))
  expect_identical(
    captured[["type"]], c("", "batch", "extern", "", ""))
  expect_identical(
    captured[["chrom"]], c("chr10", "chrNA", NA, NA, "chr1"))
  expect_identical(
    captured[["chromStart"]], c("213,054,000", "111,000", NA, NA, "110"))
  expect_identical(
    captured[["chromEnd"]], c("213,055,000", "222,000", NA, NA, "111"))
  # Row names of the result should be the default "1".."5".
  expect_identical(rownames(captured), as.character(1:5))
})

# Fixture whose JobID vector is named (unique names), which gives the
# data.frame the row names foo, bar, baz, sars, last. Every position
# string starts with "chr<id>:<start>-<end>".
named.uniq.chr <- data.frame(
  JobID=structure(
    c("13937810_25",
      "13937810_25.batch",
      "13937810_25.extern",
      "14022192_[1-3]",
      "14022204_[4]"),
    names=c("foo", "bar", "baz", "sars", "last")),
  position=c(
    "chr10:213,054,000-213,055,000", "chrNA:111,000-222,000",
    "chr2:1-2", "chr3:4-5",
    "chr1:110-111 chr2:220-222"),
  stringsAsFactors=FALSE)
# Remove every character that is not a digit 0-9, then convert what
# is left to integer.
keep.digits <- function(x){
  digits.only <- gsub("[^0-9]", "", x)
  as.integer(digits.only)
}
test_engines("two name groups not OK with named subject", {
  # Both column patterns define a group called "name"; the call is
  # expected to fail with a column name conflict error.
  job.with.name <- list(
    name="[^.]+[.].|[0-9]+",
    rest=".*")
  position.with.name <- list(
    name="chr.*?",
    ":",
    chromStart=".*?", keep.digits,
    "-",
    chromEnd="[0-9,]*", keep.digits)
  expect_error(
    capture_first_df(
      named.uniq.chr,
      JobID=job.with.name,
      position=position.with.name),
    "must not conflict with existing column names")
})

# type.convert given inside one column's pattern list: per the expected
# classes below, it applies to that list's groups which have no
# conversion function of their own (job, task), while task1/taskN keep
# as.integer from range.pattern, type keeps identity, and the position
# column's "name" group stays character.
test_engines("type.convert OK inside capture_first_df list", {
  type.conv.result <- capture_first_df(
    named.uniq.chr,
    JobID=list(
      job="[0-9]+", 
      "_",
      "(?:",#begin alternate
      task="[0-9]+", 
      "|",#either one task(above) or range(below)
      range.pattern,
      ")",#end alternate
      "(?:[.]",
      type=".*", identity,
      ")?",
      type.convert=as.numeric),
    position=list(
      name="chr.*?",
      ":",
      chromStart=".*?", keep.digits,
      "-",
      chromEnd="[0-9,]*", keep.digits))
  # vapply pins the result to a named character vector with one class
  # per column; sapply could silently return a list instead.
  computed.cls <- vapply(type.conv.result, class, character(1))
  expected.cls <- c(
    JobID = "character",
    position = "character",
    job = "numeric", 
    task = "numeric",
    task1 = "integer",
    taskN = "integer",
    type = "character", 
    name = "character",
    chromStart = "integer",
    chromEnd = "integer")
  expect_identical(computed.cls, expected.cls)
})

# type.convert given as an argument of capture_first_df itself: per the
# expected classes below, it applies to groups in every column pattern
# which have no conversion function of their own (job, task, name),
# while groups with an explicit function keep that function's result.
test_engines("type.convert OK as capture_first_df arg", {
  type.conv.result <- capture_first_df(
    named.uniq.chr,
    JobID=list(
      job="[0-9]+", 
      "_",
      "(?:",#begin alternate
      task="[0-9]+", 
      "|",#either one task(above) or range(below)
      range.pattern,
      ")",#end alternate
      "(?:[.]",
      type=".*", identity,
      ")?"),
    position=list(
      name="chr.*?",
      ":",
      chromStart=".*?", keep.digits,
      "-",
      chromEnd="[0-9,]*", keep.digits),
    type.convert=as.factor)
  # vapply pins the result to a named character vector with one class
  # per column; sapply could silently return a list instead.
  computed.cls <- vapply(type.conv.result, class, character(1))
  expected.cls <- c(
    JobID = "character",
    position = "character",
    job = "factor", 
    task = "factor",
    task1 = "integer",
    taskN = "integer",
    type = "character", 
    name = "factor",
    chromStart = "integer",
    chromEnd = "integer")
  expect_identical(computed.cls, expected.cls)
})

test_engines("error for no pattern", {
  # A subject with no pattern arguments at all is expected to fail.
  expect_error(
    capture_first_df(named.uniq.chr),
    "no patterns specified in ...")
})

test_engines("error for un-named list", {
  # The pattern argument is an empty list passed without a name.
  expect_error(
    capture_first_df(named.uniq.chr, list()),
    "each pattern in ... must be named using a column name of subject")
})

test_engines("error for un-named list with name", {
  # The list has a named element, but the argument itself has no name.
  expect_error(
    capture_first_df(named.uniq.chr, list(foo="bar")),
    "each pattern in ... must be named using a column name of subject")
})

test_engines("error for un-recognized name", {
  # The argument is named "foo", which is not a column of the subject.
  expect_error(
    capture_first_df(named.uniq.chr, foo="bar"),
    "each pattern in ... must be named using a column name of subject")
})

# Two variants of the same failure: the subject is a character vector
# instead of a data.frame. Each test has its own description so a
# failure report identifies which call form broke.
test_engines("error for non-df subject with un-named pattern", {
  expect_error({
    capture_first_df(c("foo", "bar"), list(foo="bar"))
  }, "subject must be a data.frame with character columns to match")
})

test_engines("error for non-df subject with named pattern", {
  expect_error({
    capture_first_df(c("foo", "bar"), foo="bar")
  }, "subject must be a data.frame with character columns to match")
})

test_engines("error for factor column", {
  # stringsAsFactors=TRUE makes the foo column a factor.
  factor.df <- data.frame(foo="bar", stringsAsFactors=TRUE)
  # fixed=TRUE: the expected message is compared as literal text.
  expect_error(
    capture_first_df(factor.df, foo=list(baz="sars")),
    "problem for subject column foo: Error in stop_for_subject(subject): subject has class=factor and length=1 but should be a character vector with length>0",
    fixed=TRUE)
})

test_engines("error for same column name twice", {
  # JobID is used as the name of two different pattern arguments.
  expect_error(
    capture_first_df(named.uniq.chr, JobID=list(f="baz"), JobID="foo"),
    "each argument / subject column name should be unique")
})

test_engines("error for named subject", {
  # The subject data.frame is itself passed as a named argument.
  # fixed=TRUE: the parentheses in the message are literal text.
  expect_error(
    capture_first_df(JobID=named.uniq.chr, JobID=list(f="[0-9]+")),
    "first argument (subject data.frame) should not be named",
    fixed=TRUE)
})

# One row, one character column named bar.
in.df <- data.frame(
  bar="foobar",
  stringsAsFactors=FALSE)
test_engines("df only one group = name", {
  # The pattern has a single capture group, and it is called "name".
  only.name <- list(name="foo")
  result.dt <- capture_first_df(in.df, bar=only.name)
  expect_identical(result.dt[["name"]], "foo")
})

# Each of these subjects contains a chrom:start prefix; only the first
# and last also have a "-end" part.
matching.subjects <- c(
  "chr10:213,054,000-213,055,000",
  "chrM:111,000",
  "chr1:110-111 chr2:220-222") # two possible matches.
test_engines("df subject no error if nomatch.error=TRUE and all matches", {
  all.match.df <- data.frame(
    subject.col=matching.subjects,
    stringsAsFactors=FALSE)
  # "-chromEnd" sub-pattern; the "?" placed after it in the pattern
  # below makes it optional.
  end.group <- list(
    "-",
    chromEnd="[0-9,]+", keep.digits)
  position.pattern <- list(
    nomatch.error=TRUE,
    chrom="chr.*?",
    ":",
    chromStart="[0-9,]+", keep.digits,
    end.group, "?")
  captured <- capture_first_df(
    all.match.df, subject.col=position.pattern)
  # The second subject has no end, so its chromEnd is NA.
  expect_identical(
    captured[["chromEnd"]],
    c(213055000L, NA, 111L))
})
# Same kind of subjects as above, plus two entries (positions 3 and 4)
# that cannot match the chrom:start pattern.
chr.pos.nomatch.vec <- c(
  "chr10:213,054,000-213,055,000",
  "chrM:111,000",
  "this will not match",
  NA, # neither will this.
  "chr1:110-111 chr2:220-222") # two possible matches.
test_engines("df subject stop if nomatch.error=TRUE and no match", {
  # Spell out the column name that data.frame() would otherwise
  # derive from the argument expression.
  nomatch.df <- data.frame(
    chr.pos.nomatch.vec=chr.pos.nomatch.vec,
    stringsAsFactors=FALSE)
  end.group <- list(
    "-",
    chromEnd="[0-9,]+", keep.digits)
  position.pattern <- list(
    nomatch.error=TRUE,
    chrom="chr.*?",
    ":",
    chromStart="[0-9,]+", keep.digits,
    end.group, "?")
  # fixed=TRUE: the expected message is compared as literal text.
  expect_error(
    capture_first_df(nomatch.df, chr.pos.nomatch.vec=position.pattern),
    "subject(s) 3,4 (2 total) did not match regex below",
    fixed=TRUE)
})