# tests/testthat/test-tokenizer.R

test_that("Can use a tokenizer from a file", {
  
  tok <- tokenizer$from_file(test_path("assets/tokenizer.json"))
  input <- "hello world"
  enc <- tok$encode(input)
  
  expect_equal(enc$ids, c(31373, 995))
  expect_equal(class(enc$ids), "integer")
  
  expect_equal(tok$decode(enc$ids), input)
})

test_that("batch encoder/decoder", {
  
  tok <- tokenizer$from_file(test_path("assets/tokenizer.json"))
  input <- c("hello world", "world hello")
  
  enc <- tok$encode_batch(input)
  
  # returns a list of encoding objects
  expect_equal(class(enc), "list")
  expect_equal(class(enc[[1]]), c("tok_encoding", "R6"))
  
  sequences <- lapply(enc, function(x) x$ids)
  
  expect_equal(tok$decode_batch(sequences), input)
})
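
# A hedged sketch (not in the original suite): with no padding or truncation
# configured, encode_batch() should produce the same ids as calling encode()
# on each string individually.
test_that("encode_batch agrees with encode", {
  
  tok <- tokenizer$from_file(test_path("assets/tokenizer.json"))
  input <- c("hello world", "world hello")
  
  batch_ids <- lapply(tok$encode_batch(input), function(x) x$ids)
  single_ids <- lapply(input, function(x) tok$encode(x)$ids)
  
  expect_equal(batch_ids, single_ids)
})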

test_that("from_pretrained", {
  # from_pretrained() downloads the tokenizer from the Hugging Face Hub,
  # so skip when running on CRAN
  skip_on_cran()
  tok <- tokenizer$from_pretrained("gpt2")
  input <- "hello world"
  enc <- tok$encode(input)
  
  expect_equal(tok$decode(enc$ids), input)
})

test_that("train a tokenizer on files", {
  
  tmp <- tempfile()
  writeLines(c("hello world", "bye bye"), tmp)
  
  tok <- tokenizer$new(model_bpe$new())
  tok$pre_tokenizer <- pre_tokenizer_whitespace$new()
  tok$train(tmp, trainer_bpe$new())
  
  # with this tiny training corpus BPE merges each word into a single token
  expect_equal(tok$encode("hello")$ids, 17)
  expect_equal(tok$encode("world")$ids, 18)
  expect_equal(tok$encode("bye bye")$ids, c(10, 10))
})
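
# A hedged sketch, assuming train() also accepts a character vector of file
# paths (mirroring the underlying tokenizers library); it only checks that
# the trained tokenizer ends up with a usable vocabulary.
test_that("can train a tokenizer on several files", {
  
  tmp1 <- tempfile()
  writeLines("hello world", tmp1)
  tmp2 <- tempfile()
  writeLines("bye bye", tmp2)
  
  tok <- tokenizer$new(model_bpe$new())
  tok$pre_tokenizer <- pre_tokenizer_whitespace$new()
  tok$train(c(tmp1, tmp2), trainer_bpe$new())
  
  expect_gt(tok$get_vocab_size(), 0)
  expect_gt(length(tok$encode("hello world")$ids), 0)
})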

test_that("can train a tokenizer from memory", {
  
  tok <- tokenizer$new(model_bpe$new())
  tok$pre_tokenizer <- pre_tokenizer_whitespace$new()
  tok$train_from_memory(c("hello world", "bye bye"), trainer_bpe$new())
  expect_equal(tok$get_vocab_size(), 19)
  
  expect_equal(tok$encode("hello")$ids, 17)
  expect_equal(tok$encode("world")$ids, 18)
  expect_equal(tok$encode("bye bye")$ids, c(10, 10))
  
  tok <- tokenizer$new(model_bpe$new())
  text <- "model <- hello + world"
  tok$train_from_memory(text, trainer_bpe$new())
  expect_equal(
    tok$decode(tok$encode(text)$ids),
    text
  )
})

test_that("can serialize a tokenizer and load back", {
  
  tok <- tokenizer$from_file(test_path("assets/tokenizer.json"))
  input <- "hello world"
  enc <- tok$encode(input)
  
  tmp <- tempfile(fileext = ".json")
  tok$save(tmp)
  
  tok2 <- tokenizer$from_file(tmp)
  # the reloaded tokenizer must produce the same encoding as the original
  enc2 <- tok2$encode(input)
  
  expect_equal(enc$ids, enc2$ids)
})
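
# A hedged sketch: serialization should also round-trip a freshly trained
# tokenizer, not just the one shipped in the test assets.
test_that("a trained tokenizer survives save and load", {
  
  tok <- tokenizer$new(model_bpe$new())
  tok$train_from_memory(c("hello world", "bye bye"), trainer_bpe$new())
  
  tmp <- tempfile(fileext = ".json")
  tok$save(tmp)
  tok2 <- tokenizer$from_file(tmp)
  
  expect_equal(tok2$encode("hello")$ids, tok$encode("hello")$ids)
})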

test_that("enable padding works", {
  tok <- tokenizer$from_file(test_path("assets/tokenizer.json"))
  
  expect_null(tok$padding)
  
  tok$enable_padding(length = 20, pad_id = 5)
  input <- "hello world"
  enc <- tok$encode(input)
  
  # "hello world" encodes to two tokens, so positions 3 through 20 hold the pad id
  expect_equal(length(enc$ids), 20)
  expect_equal(enc$ids[3], 5)
  
  expect_equal(tok$padding$length, 20)
  
  tok$no_padding()
  expect_null(tok$padding)
})
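
# A hedged sketch: fixed-length padding is assumed to apply to batch encoding
# as well, so every encoding in the batch should come back with the
# configured length.
test_that("padding applies to batches", {
  
  tok <- tokenizer$from_file(test_path("assets/tokenizer.json"))
  tok$enable_padding(length = 20, pad_id = 5)
  
  enc <- tok$encode_batch(c("hello world", "world hello"))
  expect_equal(sapply(enc, function(x) length(x$ids)), c(20, 20))
  
  tok$no_padding()
})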

test_that("truncation works", {
  
  tok <- tokenizer$from_file(test_path("assets/tokenizer.json"))
  expect_null(tok$truncation)
  
  tok$enable_truncation(3)
  
  input <- "hello world I'm a new tokenizer called tok"
  enc <- tok$encode(input)
  
  expect_equal(length(enc$ids), 3)
  
  tok$no_truncation()
  expect_null(tok$truncation)
  
  enc <- tok$encode(input)
  expect_true(length(enc$ids) > 3)
  
})
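
# A hedged sketch: truncation is assumed to follow the library default and
# truncate from the right, so the truncated ids should be a prefix of the
# full ids.
test_that("truncation keeps the leading tokens", {
  
  tok <- tokenizer$from_file(test_path("assets/tokenizer.json"))
  input <- "hello world hello world"
  full_ids <- tok$encode(input)$ids
  
  tok$enable_truncation(3)
  expect_equal(tok$encode(input)$ids, full_ids[1:3])
})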
