# tests/testthat/test_workflow_add_anno_tokenized_data.R

library(data.table)

# Label the tests in this file for the testthat reporter.
# NOTE(review): context() is reported as deprecated under testthat's
# 3rd edition -- confirm which edition this package targets.
testthat::context("Adding annotation layer to tokenized data")

test_that("Workflow #1 explained in the vignette", {
  # Collapse the tokens of each document into a single string with one
  # space between tokens, one list element per document id.
  tokens_by_doc <- split(reuters_dt, f = reuters_dt[["doc_id"]])
  doc_strings <- lapply(
    tokens_by_doc,
    function(doc) paste(doc[["word"]], collapse = " ")
  )

  # Configure a pipeline that runs only the tokenizer and the sentence
  # splitter, with the "tokenize.whitespace" property switched on, and
  # request CoNLL output.
  pipeline <- StanfordCoreNLP$new(
    properties = list(
      "annotators" = "tokenize, ssplit",
      "tokenize.whitespace" = "true"
    ),
    output_format = "conll"
  )

  # Annotate the documents and read the result back as a table.
  annotations <- AnnotationList$new(doc_strings)
  pipeline$annotate(annotations)
  retokenized <- annotations$as.data.table()

  # The round trip must neither add nor drop tokens ...
  expect_identical(nrow(reuters_dt), nrow(retokenized))

  # ... and must leave every token unchanged.
  expect_identical(reuters_dt[["word"]], retokenized[["word"]])
})
# PolMine/bignlp documentation built on Jan. 29, 2021, 1:14 a.m.