tests/testthat/test-tokenizer.R

context("File tokenizer")

test_that("call without filename gives warning and object as with closed file", {
          expect_warning(tok<-Tokenizer$new())
          expect_output(tok$print(), "No file open. Please create a new object for reading.")
          })

test_that("call with inexistent file gives warning and object as with closed file", {
  expect_warning(tok<-Tokenizer$new("nofile.txt"))
  expect_output(tok$print(), "No file open. Please create a new object for reading.")
})

test_that("opening and closing a file works", {
  tok<-Tokenizer$new("token.txt")
  expect_output(tok$print(), "File open.*token.txt")
  tok$close()
  expect_output(tok$print(), "No file open. Please create a new object for reading.")
})

test_that("reading from a closed file returns NA", {
  expect_warning(tok<-Tokenizer$new())
  expect_identical(tok$nextToken(), NA)
})

test_that("new tokenizer has default delimiters", {
  expect_warning(tok<-Tokenizer$new())
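  # default delimiters: 9 = TAB, 10 = LF, 13 = CR, 32 = SPACE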
  expect_identical(tok$getDelimiters(), as.integer(c(9, 10, 13, 32)))
})

test_that("modifying delimiters works", {
  expect_warning(tok<-Tokenizer$new())
  tmp<-tok$getDelimiters()
  tmp[3]<-as.integer(42)
  tok$setDelimiters(tmp)
  expect_identical(tok$getDelimiters(), as.integer(c(9, 10, 42, 32)))
})

test_that("Only ASCII-characters (numerical value < 256) accepted.", {
  expect_warning(tok<-Tokenizer$new())
  expect_error(tok$setDelimiters(c(3, 4, 5, 12345)), "Only ASCII.*")
  expect_identical(tok$getDelimiters(), as.integer(c(9, 10, 13, 32)))
  expect_error(tok$setDelimiters(as.integer(c(3, 4, 5, 12345))), "Only ASCII.*")
  expect_identical(tok$getDelimiters(), as.integer(c(9, 10, 13, 32)))
  expect_error(tok$setDelimiters(c(3, 4, -5, 8)), "Only ASCII.*")
  expect_identical(tok$getDelimiters(), as.integer(c(9, 10, 13, 32)))
  expect_error(tok$setDelimiters(c(3, 4, 7.2, 8)), "Only ASCII.*")
  expect_identical(tok$getDelimiters(), as.integer(c(9, 10, 13, 32)))
})

test_that("reading tokens works as expected", {
  tok<-Tokenizer$new("token.txt")
  expect_identical(tok$nextToken(), "Hi,")
  expect_identical(tok$nextToken(), "use")
  expect_identical(tok$nextToken(), "me")
  tok$close()
})

test_that("EOF is detected", {
  tok<-Tokenizer$new("token.txt", FALSE)
  i<-0
  token <- tok$nextToken()
  while ( (token != "without") && (i < 100)){
    token <- tok$nextToken()
    i<-i+1
  }
  expect_equal(tok$nextToken(), "newline.")
  expect_equal(tok$nextToken(), NA)
  expect_lte(i, 41)
  tok$close()
})

test_that("EOF is detected when the last token is silent", {
  tok<-Tokenizer$new("token.txt")
  i<-0
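  # make "." the sole delimiter: charToRaw(".") is 0x2e, i.e. 46L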
  tok$setDelimiters(as.integer(charToRaw(".")))
  token <- "foo"
  while ( (!is.na(token)) && (i < 100)){
    token <- tok$nextToken()
    i<-i+1
  }
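  # assuming four "." characters in the fixture, four tokens are read plus one final NA read, so i ends at 5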
  expect_equal(i, 5)
  tok$close()
})

test_that("empty tokens are skipped", {
  tok<-Tokenizer$new("token.txt")
  tok$setDelimiters(c(0x69L, 0x2cL, 0x9L))
  expect_equal(tok$nextToken(), "H")
  expect_equal(tok$nextToken(), " use me to")
  tok$close()
})

test_that("empty tokens are not skipped on request", {
  tok<-Tokenizer$new("token.txt", FALSE)
  tok$setDelimiters(c(0x69L, 0x2cL, 0x9L))
  expect_equal(tok$nextToken(), "H")
  expect_equal(tok$nextToken(), "")
  expect_equal(tok$nextToken(), " use me to")
  tok$close()
})

## testing the reading functions
test_that("readTokens() works", {
  ref <-
    c("Hi,", "use", "me", "to", "test", "the", "tokeniser.", "Maybe,use;comma,or;semicolon",
      "or", "XZG", "or", "whatever", "you", "like.", "I", "use", "Windows",
      "line", "endings", "(CR+LF)", "so", "you", "can", "test", "tokenising",
      "on", "different", "combinations.", "xx", "yy", "zz", "EOF",
      "without", "newline.")
  s<-readTokens(Tokenizer$new("token.txt"))
  expect_equal(s, ref)
  t<-readTokens("token.txt")
  expect_equal(t, ref)
  tok<-Tokenizer$new("token.txt")
  tok$nextToken()
  u<-readTokens(tok)
  expect_equal(u, ref[-1])
})

test_that("readTokens() takes custom delimiter", {
  ref<-c("Hi", " use me to\ttest the\r\ntokeniser.\r\nMaybe", "use", "comma",
        "or", "semicolon or XZG or whatever you like.\r\nI use Windows line endings (CR",
        "LF) so you can test tokenising on different combinations.\r\nxx\r\nyy\r\nzz\r\nEOF without newline.")
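  # delimiters: "," (44), ";" (59), "+" (43); "+" is what splits "(CR" from "LF)" in the ref above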
  s<-readTokens(Tokenizer$new("token.txt"),delims=as.integer(charToRaw(",;+")))
  expect_equal(s, ref)
  t<-readTokens("token.txt",delims=as.integer(charToRaw(",;+")))
  expect_equal(t, ref)
  ref[2]<-"use me to\ttest the\r\ntokeniser.\r\nMaybe"
  tok<-Tokenizer$new("token.txt")
  tok$nextToken()
  u<-readTokens(tok,as.integer(charToRaw(",;+")))
  expect_equal(u, ref[-1])
})

test_that("readTokens() handles invalid input gracefully", {
  s<-readTokens()
  expect_identical(s, as.character(c()))
  t<-readTokens(3)
  expect_identical(t, as.character(c()))
  expect_warning(u<-readTokens("foo"))
  expect_identical(u, as.character(c()))
})

test_that("readlines() works", {
  s<-readlines()
  expect_identical(s, as.character(c()))
  ref<-c("Hi, use me to\ttest the", "tokeniser.",
    "Maybe,use;comma,or;semicolon or XZG or whatever you like.",
    "I use Windows line endings (CR+LF) so you can test tokenising on different combinations.",
    "xx", "yy", "zz", "EOF without newline.")
  t<-readlines("token.txt")
  expect_identical(t, ref)
  expect_warning(u<-readlines("foo"))
  expect_identical(u, as.character(c()))
})

test_that("resetting the ptr works", {
  tok<-Tokenizer$new("token.txt")
  t1.p<-tok$getOffset()
  expect_equal(t1.p, c(0,0))
  t1.t<-tok$nextToken()
  t2.p<-tok$getOffset()
  expect_equal(t2.p, c(0,4))
  t2.t<-tok$nextToken()
  tok$setOffset(t1.p)
  expect_identical(t1.p, tok$getOffset())
  expect_identical(t1.t, tok$nextToken())
  expect_identical(t2.p, tok$getOffset())
  expect_identical(t2.t, tok$nextToken())
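  # "tokeniser." is assumed to start at byte 24, so offset 25 lands one byte into it, yielding "okeniser."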
  tok$setOffset(c(0,25))
  expect_identical("okeniser.", tok$nextToken())
})

test_that("ptr boundary checking works", {
  tok<-Tokenizer$new("token.txt")
  tok$setOffset(c(0,12345))
  expect_true(is.na(tok$nextToken()))
  tok$close()
  tok<-Tokenizer$new("token.txt")
  tok$setOffset(c(0,-12345))
  expect_true(is.na(tok$nextToken()))
})