# tests/testthat/test_vocab.R

# Label the tests in this file as "vocab" for testthat's reporters.
# NOTE(review): context() is deprecated in testthat's 3rd edition — confirm
# which edition the package uses before relying on or removing this call.
context("vocab")

# Shared fixtures: the same two documents expressed in four input shapes.

# Pre-tokenized documents as a named list of character vectors. Document "a"
# is the sentence once, starting with a capitalized "The"; document "b" is the
# all-lowercase sentence twice.
corpus <- list(
  a = c("The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"),
  b = c("the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog",
        "the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog")
)

# Distinct tokens across both documents, in order of first appearance.
nms <- unique(unlist(corpus))

# One space-joined string per document; the document names are kept.
# vapply() pins the result to exactly one string per document.
scorpus <- vapply(corpus, paste, character(1), collapse = " ")

# Data frame whose `corpus` column is a list column of token vectors.
dcorpus <- data.frame(names = names(corpus))
dcorpus$corpus <- corpus

# Data frame whose `corpus` column is a plain character column of the joined
# strings. FALSE is spelled out because `F` is an ordinary, reassignable name.
dscorpus <- data.frame(
  names = names(scorpus),
  corpus = unname(scorpus),
  stringsAsFactors = FALSE
)

test_that("vocab is computed correctly", {

  # Unigrams: "The" and "the" are expected as separate terms, with "The"
  # listed last.
  unigrams <- vocab(corpus, regex = " ")
  unigram_terms <- c("the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog", "The")
  expect_equal(unigrams$term, unigram_terms)
  expect_equal(unigrams$term_count, c(5L, rep(3L, 7), 1L))
  expect_equal(unigrams$doc_count, c(rep(2L, 8), 1L))

  # Bigrams and trigrams joined with a single space.
  ngrams <- vocab(corpus, ngram = c(2, 3), ngram_sep = " ", regex = " ")
  ngram_terms <- c(
    "quick brown", "quick brown fox", "brown fox", "brown fox jumps",
    "fox jumps", "fox jumps over", "jumps over", "jumps over the",
    "over the", "over the lazy", "the lazy", "the lazy dog", "lazy dog",
    "the quick brown", "the quick", "dog the quick", "dog the", "The quick",
    "lazy dog the", "The quick brown"
  )
  expect_equal(ngrams$term, ngram_terms)
  # 13 n-grams occur three times, 2 occur twice, 5 occur once.
  expect_equal(ngrams$term_count, c(rep(3L, 13), rep(2L, 2), rep(1L, 5)))
  # The first 13 n-grams appear in both documents, the remaining 7 in one.
  expect_equal(ngrams$doc_count, c(rep(2L, 13), rep(1L, 7)))

})

test_that("vocab adds new terms to the end", {
  # The same documents in four input shapes (token list, joined strings,
  # data frame with a list column, data frame with a string column) must
  # produce equal vocabularies.
  from_list <- vocab(corpus, regex = " ")
  from_strings <- vocab(scorpus, regex = " ")
  from_df_list <- vocab(dcorpus, regex = " ")
  from_df_strings <- vocab(dscorpus, regex = " ")
  expect_equal(from_list, from_strings)
  expect_equal(from_list, from_df_list)
  expect_equal(from_list, from_df_strings)

  # Append a document made only of unseen tokens: the rows after the original
  # vocabulary must be exactly those tokens, in order.
  extras <- list(extras = c("apples", "oranges"))
  extended <- vocab(c(corpus, extras), regex = " ")
  n_known <- nrow(from_list)
  expect_equal(extended$term[-seq_len(n_known)], extras$extras)

  # Updating the existing vocabulary must equal rebuilding it from scratch.
  expect_equal(extended, update_vocab(from_list, extras))

  # Same check with string input; the extra document's tokens are joined with
  # three spaces while the split regex is a single space.
  extra_string <- paste(extras[[1]], collapse = "   ")
  extended_from_strings <- vocab(c(scorpus, extra_string), regex = " ")
  expect_equal(extended, extended_from_strings)
})

test_that("prune_vocab works as expected", {
  from_list <- vocab(corpus)
  from_strings <- vocab(scorpus, regex = " ")
  from_df_list <- vocab(dcorpus)
  from_df_strings <- vocab(dscorpus, regex = " ")

  # Keeping the top 8 terms must give the same terms for every input shape.
  top8 <- prune_vocab(from_list, max_terms = 8)$term
  expect_equal(top8, prune_vocab(from_strings, max_terms = 8)$term)
  expect_equal(top8, prune_vocab(from_df_list, max_terms = 8)$term)
  expect_equal(top8, prune_vocab(from_df_strings, max_terms = 8)$term)
  expect_equal(top8,
               c("the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog"))

  # A minimum term count of 2 is expected to drop only "The".
  at_least_twice <- prune_vocab(from_list, term_count_min = 2)$term
  expect_equal(at_least_twice,
               c("the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog"))

  # A maximum term count of 3 is expected to drop only "the".
  at_most_thrice <- prune_vocab(from_list, term_count_max = 3)$term
  expect_equal(at_most_thrice,
               c("quick", "brown", "fox", "jumps", "over", "lazy", "dog", "The"))
})


test_that("prune_vocab adds buckets correctly", {

  # Column sums over columns 2 and 3 of a vocabulary
  # (presumably term_count and doc_count — confirm against vocab()'s output).
  count_totals <- function(x) colSums(x[, 2:3])

  # Without buckets the pruned counts are simply lost, so both totals shrink
  # and only the 2 kept terms remain.
  full <- vocab(corpus)
  pruned <- prune_vocab(full, max_terms = 2, nbuckets = 0)
  expect_equal(attr(pruned, "nbuckets"), 0)
  expect_true(all(count_totals(full) >  count_totals(pruned)))
  expect_equal(nrow(pruned), 2)
  expect_true("the" %in% pruned$term)

  # With 3 buckets the totals are preserved and the result has
  # 2 kept terms + 3 bucket rows.
  full <- vocab(scorpus, regex = " ")
  pruned <- prune_vocab(full, max_terms = 2, nbuckets = 3)
  expect_equal(attr(pruned, "nbuckets"), 3)
  expect_equal(count_totals(full), count_totals(pruned))
  expect_equal(nrow(pruned), 5)
  expect_true("the" %in% pruned$term)

  # Same with unigrams + bigrams: 10 kept terms + 3 bucket rows.
  full <- vocab(scorpus, c(1, 2), regex = " ")
  pruned <- prune_vocab(full, max_terms = 10, nbuckets = 3)
  expect_equal(attr(pruned, "nbuckets"), 3)
  expect_equal(count_totals(full), count_totals(pruned))
  expect_equal(nrow(pruned), 13)
  expect_true("the" %in% pruned$term)

})

test_that("update_vocab fails on pruned vocabularies", {
  # Build a unigram + bigram vocabulary and prune it into 10 terms + 3 buckets;
  # updating the pruned result must raise an error.
  pruned <- prune_vocab(vocab(corpus, c(1, 2)), max_terms = 10, nbuckets = 3)
  expect_error(update_vocab(pruned, corpus))
})

test_that("prune_vocab puts unknown buckets at the end", {
  full <- vocab(corpus, c(1, 2))
  pruned10 <- prune_vocab(full, max_terms = 10, nbuckets = 3)

  # Shuffle the rows of the pruned vocabulary. The shuffle is unseeded, so the
  # permutation differs between runs; the expectation must hold for any order.
  shuffled <- pruned10[sample(nrow(pruned10), nrow(pruned10)), ]

  # Pruning further must give the same result whatever the incoming row order.
  from_shuffled <- prune_vocab(shuffled, max_terms = 1)
  from_ordered <- prune_vocab(pruned10, max_terms = 1)
  expect_equal(from_shuffled, from_ordered)
})

test_that("prune_vocab works incrementally", {

  # Two identically built unigram + bigram vocabularies.
  base_vocab <- vocab(corpus, c(1, 2))
  same_vocab <- vocab(corpus, c(1, 2))
  direct_top2 <- prune_vocab(base_vocab, max_terms = 2, nbuckets = 3)
  top10 <- prune_vocab(same_vocab, max_terms = 10, nbuckets = 3)

  # Re-pruning an already bucketed vocabulary with nbuckets = 2 must error
  # (presumably because it differs from the 3 buckets already present).
  expect_error(prune_vocab(top10, max_terms = 3, nbuckets = 2))

  # Pruning 10 -> 2 in a second step must equal pruning straight to 2.
  stepwise_top2 <- prune_vocab(top10, max_terms = 2, nbuckets = 3)
  expect_equal(direct_top2, stepwise_top2)

  # Requiring doc_count >= 3 leaves exactly 3 rows (presumably just the
  # buckets), while the column totals still match the unpruned vocabulary.
  only_buckets <- prune_vocab(top10, doc_count_min = 3)
  expect_equal(nrow(only_buckets), 3)
  expect_equal(colSums(only_buckets[, 2:3]), colSums(base_vocab[, 2:3]))

  # Buckets can be introduced in a later step: prune without buckets first,
  # then prune to 2 terms + 3 buckets, keeping the totals of the first step.
  unbucketed_top10 <- prune_vocab(base_vocab, max_terms = 10, nbuckets = 0)
  bucketed_top2 <- prune_vocab(unbucketed_top10, max_terms = 2, nbuckets = 3)
  expect_equal(nrow(bucketed_top2), 5)
  expect_equal(colSums(unbucketed_top10[, 2:3]), colSums(bucketed_top2[, 2:3]))

})


# Checks that the encoding marks of the terms returned by vocab() are the same
# no matter how the input strings are marked.
test_that("encodding doesn't matter", {

  # Single-character tokens: mostly non-ASCII punctuation and symbols, plus
  # some invisible characters and \u escapes.
  txt <- c("”", "“", "–", "’", "…", "—", "‘", "•", "»",
           "·", "�", "£", "«", "→", "®", "🙂", "←", "€", "™",
           "©", "", "­", "​", "−", "\u0093", "\u0094", "›", "\u0097",
           "×", "§")

  # Baseline: the strings with whatever encoding marks the literals received.
  v1 <- vocab(txt)
  # Mark every element as UTF-8.
  Encoding(txt) <- "UTF-8"
  v2 <- vocab(txt)
  # Re-mark only the first five elements so the vector carries mixed marks.
  # NOTE(review): "native" is not one of the values documented for
  # `Encoding<-` ("latin1", "UTF-8", "bytes", "unknown"); presumably it is
  # treated like "unknown" — confirm.
  Encoding(txt[1:5]) <- "native"
  v3 <- vocab(txt)

  # Only the encoding marks of the resulting terms are compared, not the
  # terms themselves.
  expect_equal(Encoding(v1$term), Encoding(v2$term))
  expect_equal(Encoding(v1$term), Encoding(v3$term))

})
# vspinu/mlvocab documentation built on June 11, 2021, 7:37 a.m.