tests/testthat/test-5-nounphrase-functions.R

context("testing nounphrase functions")
source("utils.R")

test_that("spacy_extract_nounphrases data.frame works", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
            doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  noun_phrases <- spacy_extract_nounphrases(txt1, output = "data.frame")
  skip("behaviour changed in spaCy")
  expect_equal(
    noun_phrases$text,
    c("The history", "natural language processing", "the 1950s",
      "work", "earlier periods", "Alan Turing", "an article", "what",
      "a criterion", "intelligence"))
  expect_equal(
    noun_phrases$root_text,
    c("history", "processing", "1950s", "work", "periods", "Turing",
      "article", "what", "criterion", "intelligence"))
  
  expect_error(
    spacy_extract_nounphrases(c(doc1 = "Hello", doc1 = "world"), output = "data.frame"),
    "duplicated"
  )
  
  expect_error(
    spacy_extract_nounphrases(c(doc1 = "Hello", "world"), output = "data.frame"),
    "missing"
  )
  
  expect_silent(spacy_finalize())
})


test_that("spacy_extract_nounphrases data.frame works properly when there is no noun-phrase", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "Hello")
  expect_message(
    spacy_extract_nounphrases(txt1, output = "data.frame"),
    "No noun phrase")
  expect_equivalent(
    spacy_extract_nounphrases(txt1, output = "data.frame"),
    NULL)
})

test_that("spacy_extract_nounphrases works with a TIF formatted data.frame", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  # comment out following line to increase a coverage
  #expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
            doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  txt1_df <- data.frame(doc_id = names(txt1), text = txt1, stringsAsFactors = FALSE)
  
  expect_equal(
    spacy_extract_nounphrases(txt1_df, output = "data.frame"),
    spacy_extract_nounphrases(txt1, output = "data.frame")
  )
  
  txt1_df_err <- data.frame(doc_name = names(txt1), text = txt1, stringsAsFactors = FALSE)
  expect_error(
    spacy_extract_nounphrases(txt1_df_err, output = "data.frame"),
    "input data.frame does not conform to the TIF standard"
  )
  
  expect_silent(spacy_finalize())
})


test_that("spacy_extract_nounphrases with atomic text string", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods."
  noun_phrases <- spacy_extract_nounphrases(txt1, output = "data.frame")
  
  expect_true(
    all.equal(noun_phrases$doc_id[1], "text1")
  )
  expect_equal(
    noun_phrases$text,
    c("The history", "natural language processing", "the 1950s",
      "work", "earlier periods")
  )
  expect_equal(
    noun_phrases$root_text,
    c("history", "processing", "1950s", "work", "periods")
  )
  
  expect_silent(spacy_finalize())
})


test_that("spacy_extract_nounphrases list works", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
            doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  noun_phrases <- spacy_extract_nounphrases(txt1, output = "list")
  skip("behaviour changed in spaCy")
  expect_equal(
    noun_phrases,
    list(doc1 = c("The history", "natural language processing", "the 1950s", "work", "earlier periods"),
         doc2 = c("Alan Turing", "an article", "what", "a criterion", "intelligence")))
  
  expect_silent(spacy_finalize())
})

test_that("spacy_extract_nounphrases data.frame and list returns the same nounphrases", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
            doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  noun_phrases_dataframe <- spacy_extract_nounphrases(txt1, output = "data.frame")
  noun_phrases_list <- spacy_extract_nounphrases(txt1, output = "list")
  
  expect_equal(
    noun_phrases_dataframe$text,
    unname(unlist(noun_phrases_list))
  )
  
  expect_equal(
    unname(sapply(noun_phrases_list, length)),
    as.vector(unclass(unname(table(noun_phrases_dataframe$doc_id))))
  )
  
  expect_silent(spacy_finalize())
})

test_that("spacy_extract_nounphrases.data.frame() works", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  expect_message(spacy_initialize(), "successfully|already")
  
  txt <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
           doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  txt_df <- data.frame(doc_id = paste0("doc", 1:2),
                       text = txt, stringsAsFactors = FALSE)
  
  expect_equal(
    spacy_extract_nounphrases(txt),
    spacy_extract_nounphrases(txt_df)
  )
})


test_that("spacy_parse nounphrase = TRUE works", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
            doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  parsed <- spacy_parse(txt1, nounphrase = TRUE)
  
  expect_true(
    "nounphrase" %in% names(parsed)
  )
  expect_true(
    "whitespace" %in% names(parsed)
  )
  skip("behaviour changed in spaCy")
  expect_identical(
    sum(grepl("beg", parsed$nounphrase)),
    10L
  )
  
  
  expect_silent(spacy_finalize())
})


test_that("spacy_parse nounphrase = TRUE return message when there is no nounphrase", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "hello",
            doc2 = "hello")
  
  expect_message(
    spacy_parse(txt1, nounphrase = TRUE),
    "No noun phrase"
  )
  expect_false(
    "nounphrase" %in% names(spacy_parse(txt1, nounphrase = TRUE))
  )
  expect_false(
    "whitespace" %in% names(spacy_parse(txt1, nounphrase = TRUE))
  )
})

test_that("nounphrase_extract() on parsed object works", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
            doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  parsed <- spacy_parse(txt1, nounphrase = TRUE)
  
  expect_silent(
    nounphrase_extract(parsed)
  )
  
  parsed_without_nounphrase <- spacy_parse(txt1, nounphrase = FALSE)
  expect_error(
    nounphrase_extract(parsed_without_nounphrase),
    "no nounphrases in parsed object"
  )
})

test_that("compare nounphrase_extract(spacy_parse()) and spacy_extract_nounphrases()", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
            doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  parsed <- spacy_parse(txt1, nounphrase = TRUE)
  
  noun_phrases_1 <- nounphrase_extract(parsed, concatenator = " ")
  noun_phrases_2 <- spacy_extract_nounphrases(txt1, output = "data.frame")
  expect_equal(
    noun_phrases_1$nounphrase,
    noun_phrases_2$text
  )
  expect_equal(
    parsed$token[grep("root", parsed$nounphrase)],
    noun_phrases_2$root_text
  )
  
  expect_silent(spacy_finalize())
})


test_that("nounphrase consolidation works", {
  skip_on_cran()
  # skip_on_appveyor()
  skip_on_os("solaris")
  try_spacy_initialize()
  expect_message(spacy_initialize(), "successfully|already")
  
  txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.",
            doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.")
  
  parsed <- spacy_parse(txt1, entity = TRUE, nounphrase = TRUE)
  expect_equal(
    nounphrase_consolidate(parsed)$token[c(1, 3, 15)],
    c("The_history", "natural_language_processing", "earlier_periods")
  )
  expect_equal(
    nounphrase_consolidate(parsed, concatenator = " ")$token[c(1, 3, 15)],
    c("The history", "natural language processing", "earlier periods")
  )
  skip("behaviour changed in spaCy")
  expect_equal(
    nounphrase_consolidate(parsed)$token_id,
    c(1:16, 1:22)
  )
  
  expect_identical( c("nounphrase", "whitespace", "entity")  %in% names(nounphrase_consolidate(parsed)),
                    rep(FALSE, 3))
  
  parsed <- spacy_parse(txt1, nounphrase = TRUE, pos = TRUE, tag = TRUE)
  expect_equal(
    nounphrase_consolidate(parsed)$pos[c(1, 3, 7, 10)],
    rep("nounphrase", 4)
  )
  expect_equal(
    nounphrase_consolidate(parsed)$tag[c(1, 3, 7, 10)],
    rep("nounphrase", 4)
  )
  expect_equal(
    nounphrase_consolidate(parsed)$lemma[c(1, 3, 7)],
    c("the_history", "natural_language_processing", "the_1950")
  )
  
  parsed <- spacy_parse(txt1, nounphrase = TRUE, dependency = TRUE)
  expect_message(
    nounphrase_consolidate(parsed),
    "emoving head_token_id, dep_rel for nounphrases"
  )
  
  expect_silent(spacy_finalize())
})
quanteda/spacyr documentation built on April 13, 2024, 2:27 p.m.