tests/testthat/test-3-spacy_tokenize.R

context("test spacy_tokenize")
source("utils.R")

test_that("spacy_tokenize returns either data.frame and list", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- "This is a test for document names."
  tokens_list <- spacy_tokenize(txt, output = "list")
  expect_identical(
    length(tokens_list[[1]]),
    8L
  )
  expect_true(
    is.list(tokens_list)
  )
  
  tokens_df <- spacy_tokenize(txt, output = "data.frame")
  expect_identical(
    nrow(tokens_df),
    8L
  )
  expect_true(
    is.data.frame(tokens_df)
  )
  
})

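# The TIF (Text Interchange Format) corpus standard expects a data.frame whose
# first columns are a character "doc_id" and a character "text". The test below
# checks that a conforming data.frame tokenizes identically to a named character
# vector, and that a non-conforming data.frame is rejected with an error.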
test_that("spacy_tokenize works with a TIF formatted data.frame", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
            doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  txt1_df <- data.frame(doc_id = names(txt1), text = txt1, stringsAsFactors = FALSE)
  
  expect_equal(
    spacy_tokenize(txt1_df, output = "list"),
    spacy_tokenize(txt1, output = "list")
  )
  
  txt1_df_err <- data.frame(doc_name = names(txt1), text = txt1, stringsAsFactors = FALSE)
  expect_error(
    spacy_tokenize(txt1_df_err, output = "data.frame"),
    "input data.frame does not conform to the TIF standard"
  )
  
})


test_that("spacy_tokenize docnames work as expected", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- "This is a test for document names."
  expect_identical(
    names(spacy_tokenize(txt)), "text1"
  )
  expect_identical(
    names(spacy_tokenize(c(onlydoc = txt))),
    "onlydoc"
  )
  expect_identical(
    names(spacy_tokenize(c(doc1 = txt, doc2 = txt))),
    c("doc1", "doc2")
  )
})

test_that("spacy_tokenize remove_punct argument work as expected", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- "This: £ = GBP! 15% not! > 20 percent?"
  expect_equivalent(
    spacy_tokenize(txt, remove_punct = FALSE),
    list(c("This", ":", "£", "=", "GBP", "!", "15", "%", "not", "!", ">", "20", "percent", "?"))
  )
  expect_equivalent(
    spacy_tokenize(txt, remove_punct = TRUE, padding = FALSE),
    list(c("This", "£", "=", "GBP", "15", "not", ">", "20", "percent"))
  )
})

test_that("spacy_tokenize remove_symbols argument work as expected", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- "This: £ = GBP! 15% not! > 20 percent?"
  expect_equivalent(
    spacy_tokenize(txt, remove_symbols = FALSE),
    list(c("This", ":", "£", "=", "GBP", "!", "15", "%", "not", "!", ">", "20", "percent", "?"))
  )
  skip("behaviour of remove_symbols has changed")
  expect_equivalent(
    spacy_tokenize(txt, remove_symbols = TRUE, padding = FALSE),
    list(c("This", ":",  "GBP", "!", "15", "%", "not", "!",
           ">", "20", "percent", "?"))
  )
})


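# padding = TRUE keeps an empty string "" in place of each removed token, so
# token positions are preserved; padding = FALSE drops removed tokens entirely.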
test_that("spacy_tokenize padding argument work as expected", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- "This: a test."
  expect_equivalent(
    spacy_tokenize(txt, remove_punct = FALSE, padding = TRUE),
    list(c("This", ":", "a", "test", "."))
  )
  expect_equivalent(
    spacy_tokenize(txt, remove_punct = TRUE, padding = FALSE),
    list(c("This", "a", "test"))
  )
  expect_equivalent(
    spacy_tokenize(txt, remove_punct = TRUE, padding = TRUE),
    list(c("This", "", "a", "test", ""))
  )
})

test_that("spacy_tokenize remove_punct works as expected", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- "My favorite: the very! nice? ±2 for €5 beers."
  expect_equivalent(
    spacy_tokenize(txt, remove_punct = TRUE, padding = FALSE),
    list(c("My", "favorite", "the", "very", "nice", "±2", "for", "€", "5", "beers"))
  )
})

test_that("spacy_tokenize remove_url works as expected", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- c(doc1 = "test@unicode.org can be seen at https://bit.ly/2RDxcxs?not=FALSE.")
  expect_equivalent(
    spacy_tokenize(txt, remove_url = FALSE, padding = FALSE, remove_punct = FALSE),
    list(c("test@unicode.org", "can", "be", "seen", "at", "https://bit.ly/2RDxcxs?not=FALSE", "."))
  )
  expect_equivalent(
    spacy_tokenize(txt, remove_url = FALSE, padding = FALSE, remove_punct = TRUE),
    list(c("test@unicode.org", "can", "be", "seen", "at", "https://bit.ly/2RDxcxs?not=FALSE"))
  )
  expect_equivalent(
    spacy_tokenize(txt, remove_url = TRUE, padding = FALSE, remove_punct = FALSE),
    list(c("can", "be", "seen", "at", "."))
  )
})

test_that("spacy_tokenize remove_numbers works as expected", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- c(doc1 = "99 red ballons 4ever £5 gr8!!")
  expect_equivalent(
    spacy_tokenize(txt, remove_numbers = FALSE, padding = FALSE),
    list(c("99", "red", "ballons", "4ever", "£", "5", "gr8", "!", "!"))
  )
  expect_equivalent(
    spacy_tokenize(txt, remove_numbers = TRUE, padding = FALSE),
    list(c("red", "ballons", "4ever", "£", "gr8", "!", "!"))
  )
  expect_equivalent(
    spacy_tokenize(txt, remove_numbers = TRUE, padding = FALSE),
    quanteda::tokens(txt, remove_numbers = TRUE) %>% as.list()
  )
})

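# The test string mixes several separator types: a doubled space, a tab, a
# newline, and "\u2029" (the Unicode paragraph separator).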
test_that("spacy_tokenize remove_separators works as expected", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- c(doc1 = "Sentence  one\ttwo\nNew paragraph\u2029Last paragraph")
  expect_equivalent(
    spacy_tokenize(txt, remove_separators = FALSE),
    list(c("Sentence", " ", " ", "one", "\t", "two", "\n",
           "New", " ", "paragraph", "\u2029",
           "Last", " ", "paragraph"))
  )
  expect_equivalent(
    spacy_tokenize(txt, remove_separators = TRUE),
    list(c("Sentence", "one", "two", "New", "paragraph", "Last", "paragraph"))
  )
  expect_equivalent(
    spacy_tokenize(txt, remove_separators = TRUE),
    quanteda::tokens(txt, remove_separators = TRUE) %>% as.list()
  )
})

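# data_char_paragraph is assumed to be the short example paragraph shipped
# with spacyr as a package dataset.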
test_that("spacy_tokenize multithread = TRUE does not change value", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_identical(
    spacy_tokenize(data_char_paragraph, multithread = TRUE),
    spacy_tokenize(data_char_paragraph, multithread = FALSE)
  )
})

test_that("spacy_tokenize multithread = TRUE is faster than when FALSE", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  skip("multithread = TRUE performance test skipped because takes so long")
  txt <- rep(data_char_paragraph, 5000)
  expect_lt(
    system.time(spacy_tokenize(txt, multithread = TRUE))["elapsed"],
    system.time(spacy_tokenize(txt, multithread = FALSE))["elapsed"]
  )
})

test_that("spacy_tokenize what = 'sentence' works as expected", {
  skip_on_cran()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  txt <- "Sentence one!  This: is a test.\n\nYeah, right.  What, Mr. Jones?"
  expect_equivalent(
    spacy_tokenize(txt, what = "sentence", remove_punct = TRUE,
                   remove_separators = TRUE),
    list(c(
      "Sentence one!",
      "This: is a test.",
      "Yeah, right.",
      "What, Mr. Jones?"
    ))
  )
  expect_equivalent(
    spacy_tokenize(txt, what = "sentence", remove_punct = TRUE,
                   remove_separators = FALSE),
    list(c(
      "Sentence one!  ",
      "This: is a test.\n\n",
      "Yeah, right.  ",
      "What, Mr. Jones?"
    ))
  )
  expect_equivalent(
    spacy_tokenize(txt, what = "sentence", remove_separators = TRUE),
    quanteda::tokens(txt, what = "sentence", remove_separators = TRUE) %>% as.list()
  )
})