mlvocab: Vocabulary and Corpus Preprocessing for Natural Language Pipelines

# Label the tests in this file with the "vocab" context.
# NOTE(review): context() is deprecated in testthat 3rd edition - confirm
# which edition the package targets before removing it.
context("vocab")

# Shared fixtures: the same two documents in four input shapes.
#
# corpus   - named list of pre-tokenized documents (character vectors).
# nms      - unique tokens in order of first appearance.
# scorpus  - named character vector, one space-joined string per document.
# dcorpus  - data.frame with a list column holding the tokenized documents.
# dscorpus - data.frame with a character column holding the joined strings.
corpus <- list(a = c("The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"),
               b = c("the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog",
                     "the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"))
nms <- unique(unlist(corpus))
# vapply() pins the result to one string per document; sapply() would silently
# change its return type on empty or ragged input.
scorpus <- vapply(corpus, paste, character(1), collapse = " ")
dcorpus <- data.frame(names = names(corpus))
# Assigned after construction so the list is stored as a single list column.
dcorpus$corpus <- corpus
dscorpus <- data.frame(names = names(scorpus), corpus = unname(scorpus), stringsAsFactors = FALSE)

test_that("vocab is computed correctly", {

  # Unigrams: "The" and "the" are expected as separate terms, and the expected
  # counts are listed from most to least frequent.
  unigrams <- vocab(corpus, regex = " ")
  unigram_terms <- c("the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog", "The")
  expect_equal(unigrams$term, unigram_terms)
  expect_equal(unigrams$term_count, rep(c(5L, 3L, 1L), times = c(1L, 7L, 1L)))
  expect_equal(unigrams$doc_count, rep(c(2L, 1L), times = c(8L, 1L)))

  # Bigrams and trigrams joined with a single space.
  ngrams <- vocab(corpus, ngram = c(2, 3), ngram_sep = " ", regex = " ")
  ngram_terms <- c(
    "quick brown", "quick brown fox", "brown fox", "brown fox jumps",
    "fox jumps", "fox jumps over", "jumps over", "jumps over the",
    "over the", "over the lazy", "the lazy", "the lazy dog", "lazy dog",
    "the quick brown", "the quick", "dog the quick", "dog the", "The quick",
    "lazy dog the", "The quick brown"
  )
  expect_equal(ngrams$term, ngram_terms)
  # 13 n-grams occur three times, 2 occur twice, 5 occur once.
  expect_equal(ngrams$term_count, rep(c(3L, 2L, 1L), times = c(13L, 2L, 5L)))
  # The first 13 n-grams appear in both documents, the remaining 7 in one.
  expect_equal(ngrams$doc_count, rep(c(2L, 1L), times = c(13L, 7L)))

})

test_that("vocab adds new terms to the end", {
  # The same corpus supplied as a list, a character vector and two data.frame
  # layouts must produce identical vocabularies.
  v <- vocab(corpus, regex = " ")
  sv <- vocab(scorpus, regex = " ")
  dv <- vocab(dcorpus, regex = " ")
  dsv <- vocab(dscorpus, regex = " ")
  expect_equal(v, sv)
  expect_equal(v, dv)
  expect_equal(v, dsv)

  # A document containing only unseen terms is appended to the corpus; its
  # terms must occupy the rows after the original vocabulary.
  extras <- list(extras = c("apples", "oranges"))
  v2 <- vocab(c(corpus, extras), regex = " ")
  # Logical mask instead of `-c(1:nrow(v))`: `1:n` yields c(1, 0) when n is 0
  # and would then silently drop the wrong rows.
  is_new <- seq_len(nrow(v2)) > nrow(v)
  expect_equal(v2$term[is_new], extras$extras)

  # Updating the existing vocabulary must match building it in one pass.
  expect_equal(v2, update_vocab(v, extras))

  # Same check for the string form of the corpus (terms joined by three spaces).
  sv2 <- vocab(c(scorpus, paste(extras[[1]], collapse = "   ")), regex = " ")
  expect_equal(v2, sv2)
})

test_that("prune_vocab works as expected", {
  # Vocabularies built from each input shape of the same corpus.
  v <- vocab(corpus)
  sv <- vocab(scorpus, regex = " ")
  dv <- vocab(dcorpus)
  dsv <- vocab(dscorpus, regex = " ")

  # Keeping the top 8 terms must give the same terms for every input shape.
  top8_terms <- prune_vocab(v, max_terms = 8)$term
  expect_equal(top8_terms, prune_vocab(sv, max_terms = 8)$term)
  expect_equal(top8_terms, prune_vocab(dv, max_terms = 8)$term)
  expect_equal(top8_terms, prune_vocab(dsv, max_terms = 8)$term)

  lowercase_terms <- c("the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog")
  expect_equal(top8_terms, lowercase_terms)

  # A minimum term count of 2 removes only the single-occurrence "The".
  at_least_twice <- prune_vocab(v, term_count_min = 2)
  expect_equal(at_least_twice$term, lowercase_terms)

  # A maximum term count of 3 removes only the most frequent term, "the".
  at_most_thrice <- prune_vocab(v, term_count_max = 3)
  expect_equal(at_most_thrice$term,
               c("quick", "brown", "fox", "jumps", "over", "lazy", "dog", "The"))
})


test_that("prune_vocab adds buckets correctly", {

  # Column sums of columns 2 and 3 of a vocabulary.
  count_totals <- function(x) colSums(x[, 2:3])

  # No buckets: pruned terms are dropped, so both totals must shrink.
  full <- vocab(corpus)
  pruned <- prune_vocab(full, max_terms = 2, nbuckets = 0)
  expect_equal(attr(pruned, "nbuckets"), 0)
  expect_true(all(count_totals(full) >  count_totals(pruned)))
  expect_equal(nrow(pruned), 2)
  expect_true("the" %in% pruned$term)

  # Three buckets: totals are preserved and 2 terms + 3 bucket rows remain.
  full <- vocab(scorpus, regex = " ")
  pruned <- prune_vocab(full, max_terms = 2, nbuckets = 3)
  expect_equal(attr(pruned, "nbuckets"), 3)
  expect_equal(count_totals(full), count_totals(pruned))
  expect_equal(nrow(pruned), 5)
  expect_true("the" %in% pruned$term)

  # Same check on a unigram + bigram vocabulary: 10 terms + 3 bucket rows.
  full <- vocab(scorpus, c(1, 2), regex = " ")
  pruned <- prune_vocab(full, max_terms = 10, nbuckets = 3)
  expect_equal(attr(pruned, "nbuckets"), 3)
  expect_equal(count_totals(full), count_totals(pruned))
  expect_equal(nrow(pruned), 13)
  expect_true("the" %in% pruned$term)

})

test_that("update_vocab fails on pruned vocabularies", {
  # Build a unigram + bigram vocabulary, prune it into buckets, then check
  # that updating the pruned result raises an error.
  unpruned <- vocab(corpus, c(1, 2))
  pruned <- prune_vocab(unpruned, max_terms = 10, nbuckets = 3)
  expect_error(update_vocab(pruned, corpus))
})

test_that("prune_vocab puts unknown buckets at the end", {
  full <- vocab(corpus, c(1, 2))
  bucketed <- prune_vocab(full, max_terms = 10, nbuckets = 3)

  # Re-pruning must give the same result whether or not the rows of the
  # bucketed vocabulary were shuffled beforehand.
  row_order <- sample(nrow(bucketed), nrow(bucketed))
  shuffled <- bucketed[row_order, ]
  from_shuffled <- prune_vocab(shuffled, max_terms = 1)
  from_ordered <- prune_vocab(bucketed, max_terms = 1)
  expect_equal(from_shuffled, from_ordered)
})

test_that("prune_vocab works incrementally", {

  # Column sums of columns 2 and 3 of a vocabulary.
  count_totals <- function(x) colSums(x[, 2:3])

  # Two independently built but identical unigram + bigram vocabularies.
  full <- vocab(corpus, c(1, 2))
  full_twin <- vocab(corpus, c(1, 2))
  top2 <- prune_vocab(full, max_terms = 2, nbuckets = 3)
  top10 <- prune_vocab(full_twin, max_terms = 10, nbuckets = 3)

  # Re-pruning a 3-bucket vocabulary while asking for 2 buckets must fail.
  expect_error(prune_vocab(top10, max_terms = 3, nbuckets = 2))

  # Pruning 10 -> 2 in a second step equals pruning straight to 2.
  top10_then_2 <- prune_vocab(top10, max_terms = 2, nbuckets = 3)
  expect_equal(top2, top10_then_2)

  # With doc_count_min = 3 only three rows are expected to remain, and they
  # must still carry the totals of the full vocabulary.
  by_doc_count <- prune_vocab(top10, doc_count_min = 3)
  expect_equal(nrow(by_doc_count), 3)
  expect_equal(count_totals(by_doc_count), count_totals(full))

  # Buckets added on a second pruning step preserve the totals of the
  # bucket-less first step.
  plain10 <- prune_vocab(full, max_terms = 10, nbuckets = 0)
  plain10_then_2 <- prune_vocab(plain10, max_terms = 2, nbuckets = 3)
  expect_equal(nrow(plain10_then_2), 5)
  expect_equal(count_totals(plain10), count_totals(plain10_then_2))

})


# Checks that the declared encoding of the terms returned by vocab() is the
# same regardless of how the input strings were marked.
test_that("encodding doesn't matter", {

  # Non-ASCII punctuation, symbols and an emoji.
  # NOTE(review): some literals below appear empty or replaced by U+FFFD;
  # they may have been damaged in transit - verify against the upstream file.
  txt <- c("”", "“", "–", "’", "…", "—", "‘", "•", "»",
           "·", "�", "£", "«", "→", "®", "🙂", "←", "€", "™",
           "©", "", "", "", "−", "\u0093", "\u0094", "›", "\u0097",
           "×", "§")

  # v1: input with whatever encoding marks the literals got when parsed.
  v1 <- vocab(txt)
  # v2: every element explicitly marked as UTF-8.
  Encoding(txt) <- "UTF-8"
  v2 <- vocab(txt)
  # v3: mixed marks - the first five elements are re-marked.
  # NOTE(review): "native" is not one of the documented Encoding() values
  # ("latin1", "UTF-8", "bytes", "unknown"); presumably it behaves like
  # "unknown" - confirm.
  Encoding(txt[1:5]) <- "native"
  v3 <- vocab(txt)

  # The encoding marks of the resulting terms must match across all variants.
  expect_equal(Encoding(v1$term), Encoding(v2$term))
  expect_equal(Encoding(v1$term), Encoding(v3$term))

})

vspinu/mlvocab documentation built on June 11, 2021, 7:37 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

vspinu/mlvocab
Vocabulary and Corpus Preprocessing for Natural Language Pipelines

tests/testthat/test_vocab.R
In vspinu/mlvocab: Vocabulary and Corpus Preprocessing for Natural Language Pipelines

R Package Documentation

Browse R Packages

We want your feedback!

vspinu/mlvocab Vocabulary and Corpus Preprocessing for Natural Language Pipelines

tests/testthat/test_vocab.R In vspinu/mlvocab: Vocabulary and Corpus Preprocessing for Natural Language Pipelines

R Package Documentation

Browse R Packages

We want your feedback!

vspinu/mlvocab
Vocabulary and Corpus Preprocessing for Natural Language Pipelines

tests/testthat/test_vocab.R
In vspinu/mlvocab: Vocabulary and Corpus Preprocessing for Natural Language Pipelines