tests/testthat/test-2-spacy_parse.R

context("test spacy_parse")
source("utils.R")

test_that("spacy_parse handles newlines and tabs ok", {
    skip_on_cran()
    # skip_on_appveyor()
    skip_on_os("solaris")
    skip_if_no_python_or_no_spacy()
    expect_message(spacy_initialize(), "successfully")

    txt1 <- c(doc1 = "Sentence one.\nSentence two.",
              doc2 = "Sentence\tthree.")
    expect_equal(
        dim(spacy_parse(txt1, dependency = TRUE)),
        c(11, 9)
    )
    txt2 <- c(doc1 = "Sentence one.\\nSentence two.",
              doc2 = "Sentence\\tthree.")
    expect_equal(
        dim(spacy_parse(txt2, dependency = TRUE)),
        c(11, 9)
    )

    ## multiple tagsets
    expect_equal(
        names(tag1 <- spacy_parse(txt2, pos = TRUE, tag = FALSE, entity = FALSE)),
        c("doc_id", "sentence_id", "token_id", "token", "lemma", "pos")
    )
    expect_equal(
        names(tag2 <- spacy_parse(txt2, pos = FALSE, tag = TRUE, entity = FALSE)),
        c("doc_id", "sentence_id", "token_id", "token", "lemma", "tag")
    )
    expect_false(any(tag1$pos == tag2$tag))

    expect_silent(spacy_finalize())
})

test_that("spacy_parse handles quotes ok", {
    skip_on_cran()
    # skip_on_appveyor()
    skip_on_os("solaris")
    skip_if_no_python_or_no_spacy()
    expect_message(spacy_initialize(), "successfully")

    txt1 <- c(doc1 = "Sentence \"quoted\" one.",
              doc2 = "Sentence \'quoted\' two.")
    expect_true("dep_rel" %in% names(spacy_parse(txt1, dependency = TRUE)))

    txt2 <- c(doc1 = "Sentence \\\"quoted\\\" one.")
    expect_equal(
        dim(spacy_parse(txt2, dependency = TRUE)),
        c(6, 9)
    )

    txt3 <- c(doc1 = "Second sentence \\\'quoted\\\' example.")
    expect_equal(
        dim(spacy_parse(txt3, dependency = TRUE)),
        c(7, 9)
    )

    txt4 <- c(doc1 = "Sentence \\\"quoted\\\" one.",
              doc2 = "Sentence \\\'quoted\\\' two.")
    expect_equal(
        dim(spacy_parse(txt4, dependency = TRUE)),
        c(12, 9)
    )
    expect_silent(spacy_finalize())
})

test_that("spacy_parse returns the same regardless of the value of 'entity' option at the initialization", {
    skip_on_cran()
    # skip_on_appveyor()
    skip_on_os("solaris")
    skip_if_no_python_or_no_spacy()

    txt1 <- c(doc1 = "Sentence one.\nSentence two.",
              doc2 = "Sentence\tthree.")
    expect_message(spacy_initialize(entity = TRUE), "successfully")
    data_not_omit_entity <- spacy_parse(txt1, entity = FALSE)
    expect_silent(spacy_finalize())

    expect_message(spacy_initialize(entity = FALSE), "successfully")
    data_omit_entity <- spacy_parse(txt1, entity = FALSE)

    expect_equal(
        data_omit_entity,
        data_not_omit_entity
    )

    expect_silent(spacy_finalize())
})

test_that("spacy_parse returns the message if 'entity' option is not consistent with the initialization options", {
    skip_on_cran()
    # skip_on_appveyor()
    skip_on_os("solaris")
    skip_if_no_python_or_no_spacy()

    txt1 <- c(doc1 = "Sentence one.\nSentence two.",
              doc2 = "Sentence\tthree.")
    expect_message(spacy_initialize(entity = FALSE), "successfully")
    expect_message(spacy_parse(txt1, entity = TRUE), "entity == TRUE is ignored")

    expect_silent(spacy_finalize())
    expect_message(spacy_initialize(entity = TRUE), "successfully")
    expect_silent(tmp <- spacy_parse(txt1, entity = TRUE))

    expect_silent(spacy_finalize())
})

test_that("spacy_parse additional_attributes option works as intended", {
    skip_on_cran()
    # skip_on_appveyor()
    skip_on_os("solaris")
    skip_if_no_python_or_no_spacy()

    expect_message(spacy_initialize(), "successfully")

    txt1 <- c(doc1 = "I would have accepted without question the information that Gatsby sprang from the swamps of Louisiana or from the lower East Side of New York.",
              doc2 = "I graduated from New Haven in 1915, just a quarter of a century after my father, and a little later I participated in that delayed Teutonic migration known as the Great War.")

    spacy_parsed <- spacy_parse(txt1, additional_attributes = "is_title")

    expect_true("is_title" %in% names(spacy_parsed))
    expect_equal(sum(spacy_parsed$is_title), 14)

    spacy_parsed <- spacy_parse(txt1, additional_attributes = c("is_title", "like_num"))

    expect_true("like_num" %in% names(spacy_parsed))
    expect_equal(spacy_parsed[spacy_parsed$token == "1915", "like_num"],
                 TRUE)

    expect_error(spacy_parse(txt1, additional_attributes = c("random_attribute_name")),
                 "object has no attribute")

    expect_silent(spacy_finalize())
})

test_that("spacy_parse can handle data.frame properly", {
    skip_on_cran()
    # skip_on_appveyor()
    skip_on_os("solaris")
    skip_if_no_python_or_no_spacy()

    expect_message(spacy_initialize(), "successfully")

    df <- data.frame(doc_id = paste0("doc", seq(2)),
                     text = c("Sentence one.\nSentence two.",
                              "Sentence\tthree."),
                     stringsAsFactors = FALSE)
    expect_equal(dim(spacy_parse(df)), c(11, 7))

    df_impropoer <- data.frame(id = paste0("doc", seq(2)),
                               text = c("Sentence one.\nSentence two.",
                                        "Sentence\tthree."),
                               stringsAsFactors = FALSE)
    expect_error(spacy_parse(df_impropoer),
                 "input data.frame does not conform to the TIF standard")

    expect_silent(spacy_finalize())
})

# see https://stackoverflow.com/questions/55518184/how-to-feed-a-tibble-to-spacyr
test_that("spacy_parse handles numbers/tibbles correctly", {
    skip_on_cran()
    skip_on_os("solaris")
    skip_if_no_python_or_no_spacy()
    skip_if_not_installed("tibble")
    bogustib <- tibble::tibble(doc_id = c(1, 2, 3),
                               text = c("bug", "one love", "838383838"))
    expect_equivalent(
        spacy_parse(bogustib)[, c("doc_id", "sentence_id", "token_id", "token", "pos")],
        data.frame(
            doc_id = as.character(c(1, 2, 2, 3)),
            sentence_id = rep(1, 4),
            token_id = c(1, 1, 2, 1),
            token = c("bug", "one", "love", "838383838"),
            pos = c("NOUN", "NUM", "NOUN", "NUM"),
            stringsAsFactors = FALSE)
    )
})

Try the spacyr package in your browser

Any scripts or data that you put into this service are public.

spacyr documentation built on March 26, 2020, 5:25 p.m.