# test_ngrams.R
# In polmineR: Verbs and Nouns for Corpus Analysis

# Attach the package under test.
library(polmineR)
# Activate the REUTERS corpus via the RcppCWB package so the tests below can
# refer to it by name (presumably the demo corpus shipped with RcppCWB --
# TODO confirm).
use(pkg = "RcppCWB", corpus = "REUTERS")

# Label the tests in this file. NOTE(review): context() is deprecated in the
# third edition of testthat; kept here unchanged.
testthat::context("ngrams-method")

test_that("ngrams", {
  # tm is only needed for its English stop word list; skip this test instead
  # of erroring when that package is not installed.
  testthat::skip_if_not_installed("tm")

  # Check that the bigram counts reported by ngrams() for `x` agree with the
  # counts that count() returns for the matching two-token query on `x`.
  #
  # x:   object to analyse (a corpus or a partition in the calls below).
  # top: number of most frequent bigrams to verify, after dropping bigrams
  #      whose first word is an English stop word.
  expect_bigram_counts_match <- function(x, top = 5L) {
    bigrams <- data.table::as.data.table(ngrams(x, n = 2))
    # Sort in place by descending count so the first rows are the most
    # frequent bigrams.
    data.table::setorderv(bigrams, cols = "count", order = -1L)
    bigrams <- bigrams[!bigrams[["word_1"]] %in% tm::stopwords("en")]

    for (i in seq_len(top)) {
      # Query for the two words of the bigram as adjacent quoted tokens.
      query <- sprintf(
        '"%s" "%s"',
        bigrams[["word_1"]][i], bigrams[["word_2"]][i]
      )
      expect_equal(
        count(x, query = query)[["count"]],
        bigrams[["count"]][i]
      )
    }
  }

  # Whole corpus.
  expect_bigram_counts_match(corpus("REUTERS"))

  # Partition restricted by the "places" attribute.
  expect_bigram_counts_match(
    partition("REUTERS", places = "saudi-arabia", regex = TRUE)
  )
})


test_that("ngrams - character", {
  # ngrams of length 3 computed with char = "" on the whole REUTERS corpus.
  reuters <- corpus("REUTERS")
  char_trigrams <- ngrams(reuters, n = 3, char = "")

  # Reference value: number of entries in the word token stream that
  # match "oil".
  tokens <- get_token_stream(reuters, p_attribute = "word")
  tokens_with_oil <- length(grep("oil", tokens))

  # The count stored for the "oil" ngram must equal the reference value.
  expect_identical(tokens_with_oil, char_trigrams["oil", ][["count"]])
})

test_that("ngrams - character, bundle", {
  # ngrams for every part obtained by splitting the corpus on "id".
  ngrams_by_id <- ngrams(
    split(corpus("REUTERS"), s_attribute = "id"),
    n = 3L, char = ""
  )

  # ngrams for the single subset with id "127", computed directly.
  ngrams_single <- ngrams(
    subset(corpus("REUTERS"), id == "127"),
    n = 3L, char = ""
  )

  # The bundle element for "127" must carry the same statistics table as the
  # directly computed result.
  expect_identical(ngrams_by_id[["127"]]@stat, ngrams_single@stat)
})