# tests/testthat/test-colloc.R

logger = lgr::get_logger("text2vec")
logger$set_threshold("fatal")

context("collocations")

preprocessor = function(x) {
  gsub("[^[:alnum:]\\s]", replacement = " ", tolower(x))
}
sample_ind = 1:100
tokens = preprocessor(movie_review$review[sample_ind])
tokens = word_tokenizer(tokens)
it = itoken(tokens, ids = movie_review$id[sample_ind])
system.time(v <- create_vocabulary(it))
v = prune_vocabulary(v, term_count_min = 5)

cc = Collocations$new(collocation_count_min = 5, pmi_min = 5)
cc$fit(it, n_iter = 2)
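# n_iter = 2 runs the phrase learner twice over the corpus; on the second pass
# previously collapsed bigrams should be able to combine with neighbouring tokens
# into longer phrases (this describes the iterative learner in general terms and
# is not asserted anywhere below). collocation_stat holds the surviving candidate
# phrases together with their counts and association scores.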
cc$collocation_stat

it2 = cc$transform(it)
v2 = create_vocabulary(it2)
v2 = prune_vocabulary(v2, term_count_min = 5)

# check what phrases the model has learned
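# A quick way to inspect them (illustrative only, nothing below asserts on these
# objects): terms present in the phrase-aware vocabulary `v2` but absent from the
# plain vocabulary `v` are the collapsed phrases.
learned_phrases = setdiff(v2$term, v$term)
head(learned_phrases)
head(cc$collocation_stat[order(-cc$collocation_stat$n_ij), ])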


test_that("collocations internalfunctions", {
  sep = "_"
  ptr = text2vec:::create_xptr_unordered_set(c("c_te", "d_ivoire", "c_te_d_ivoire"))
  ptr_sw = text2vec:::create_xptr_unordered_set(character(0))
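  # Judging by the assertions below, collapse_collocations_cpp merges adjacent
  # tokens whose `sep`-joined form appears in the phrase set behind `ptr`
  # (collapsing repeatedly, so "c te d ivoire" becomes the single token
  # "c_te_d_ivoire"); `ptr_sw` is a stopword set, empty here. This reading of the
  # internal API is inferred from these tests, not from a documented contract.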
  x = text2vec:::collapse_collocations_cpp(list(c("c", "te", "d", "2", "2")), ptr, ptr_sw, sep)[[1]]
  expect_equal(c("c_te", "d", "2", "2" ), x)
  x = text2vec:::collapse_collocations_cpp(list(c("c", "te", "d", "ivoire", "2")), ptr, ptr_sw, sep)[[1]]
  expect_equal(c("c_te_d_ivoire", "2" ), x)
  x = text2vec:::collapse_collocations_cpp(list(c("c", "te", "d", "ivoire")), ptr, ptr_sw, sep)[[1]]
  expect_equal(c("c_te_d_ivoire"), x)

  expect_equal(length(setdiff(v2$term, v$term)), 44L)
  expect_equal(nrow(cc$collocation_stat), 46L)
  expect_true("jeroen_krabb" %in% v2$term)
  expect_true(all(cc$collocation_stat$n_ij >= 5))
})


test_that("collocations Dunning's LLR", {
  # Data taken from Dunning's original paper:
  # http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962
  tbl = '270.72 110 2442 111 29114 "the swiss"
  263.90 29 13 123 31612 "can be"
  256.84 31 23 139 31584 "previous year"
  167.23 10 0 3 31764 "mineral water"
  157.21 76 104 2476 29121 "at the"
  157.03 16 16 51 31694 "real terms"
  146.80 9 0 5 31763 "natural gas"
  115.02 16 0 865 30896 "owing to"
  104.53 10 9 41 31717 "health insurance"
  100.96 8 2 27 31740 "stiff competition"
  98.72 12 111 14 31640 "is likely"
  95.29 8 5 24 31740 "qualified personnel"
  94.50 10 93 6 31668 "an estimated"
  91.40 12 111 21 31633 "is expected"
  81.55 10 45 35 31687 "1 2"
  76.30 5 13 0 31759 "balance sheet"
  73.35 16 2536 1 29224 "the united"
  68.96 6 2 45 31724 "accident insurance"
  68.61 24 43 1316 30394 "terms of"
  61.61 3 0 0 31774 "natel c"
  60.77 6 92 2 31677 "will probably"
  57.44 4 11 1 31761 "great deal"
  57.44 4 11 1 31761 "government bonds"
  57.14 13 7 1327 30430 "part of"
  53.98 4 1 18 31754 "waste paper"
  53.65 4 13 2 31758 "machine exhibition"
  52.33 7 61 27 31682 "rose slightly"
  52.30 5 9 25 31738 "passenger service"
  49.79 4 61 0 31712 "not yet"
  48.94 9 12 429 31327 "affected by"
  48.85 13 1327 12 30425 "of september"
  48.80 9 4 872 30892 "continue to"
  47.84 4 41 1 31731 "2 nd"
  47.20 8 27 157 31585 "competition from"
  46.38 10 472 20 31275 "a positive"
  45.53 4 18 6 31749 "per 100"
  44.36 7 0 1333 30437 "course of"
  43.93 5 18 33 31721 "generally good"
  43.61 19 50 1321 30387 "level of"
  43.35 20 2532 25 29200 "the stock"
  43.07 6 875 0 30896 "to register"
  43.06 3 1 10 31763 "french speaking"
  41.69 3 29 0 31745 "3 rd"
  41.67 3 1 13 31760 "knitting machines"
  40.68 4 5 40 31728 "25 000"
  39.23 9 5 1331 30432 "because of"
  39.20 5 40 25 31707 "stock markets"
  38.87 2 0 1 31774 "scanner cash"
  38.79 3 0 48 31726 "pent up"
  38.51 3 23 1 31750 "firms surveyed"
  38.46 4 2 98 31673 "restaurant business"
  38.28 3 12 3 31759 "fell back"
  38.14 6 4 432 31335 "climbed by"
  37.20 6 41 70 31660 "total production"
  37.15 2 0 2 31773 "hay crop"
  36.98 3 10 5 31759 "current transactions"'

  contingency_tbl = read.table(text = tbl, header = FALSE, stringsAsFactors = FALSE)

  names(contingency_tbl) = c("LLR", "AB", "A~B", "~AB", "~A~B", "bigram")
  nword = contingency_tbl[["AB"]] + contingency_tbl[["A~B"]] +  contingency_tbl[["~AB"]] + contingency_tbl[["~A~B"]]
  n_i = contingency_tbl[["AB"]] + contingency_tbl[["A~B"]]
  n_j = contingency_tbl[["AB"]] + contingency_tbl[["~AB"]]
  n_ij = contingency_tbl[["AB"]]

  L_func = text2vec:::L_func
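  # Reference formula: Dunning's -2 * log(lambda) for the 2x2 contingency table,
  # written in terms of the binomial log-likelihood. The expression below assumes
  # L_func(p, n, k) = log(p^k * (1 - p)^(n - k)) (an assumption about the internal
  # helper; the actual implementation may guard the p = 0 and p = 1 corner cases
  # so that 0 * log(0) contributes 0):
  #   LLR = -2 * ( L(p, n_i, n_ij) + L(p, N - n_i, n_j - n_ij)
  #              - L(p1, n_i, n_ij) - L(p2, N - n_i, n_j - n_ij) )
  #   where N = total tokens, p = n_j / N, p1 = n_ij / n_i,
  #   p2 = (n_j - n_ij) / (N - n_i).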

  test_llr = -2 * (  L_func(n_j / nword, n_i, n_ij) +
                   L_func(n_j / nword, nword - n_i, n_j - n_ij) -
                   L_func(n_ij / n_i, n_i, n_ij) -
                   L_func((n_j - n_ij) / (nword - n_i), nword - n_i, n_j - n_ij))


  expect_equal(test_llr, contingency_tbl[["LLR"]], tolerance = 1e-4)
})
