library(polmineR)
use("polmineR")
use(pkg = "RcppCWB", corpus = "REUTERS")
testthat::context("cooccurrences")
test_that(
"cooccurrences-method for corpus",
{
y <- cooccurrences("REUTERS", query = "oil", p_attribute = "word")
expect_equal(subset(y, !is.na(ll))[["word"]][1:4], c("prices", "crude", "industry", "recent"))
y <- cooccurrences("REUTERS", query = '"barrel.*"', p_attribute = "word")
expect_equal(subset(y, !is.na(ll))[["word"]][1:5], c("dlrs", "mln", "a", "reserve", "brings"))
# handle more than one p-attribute
p_attrs <- c("word", "pos")
dt <- corpus("GERMAPARLMINI") %>%
cooccurrences(query = "Arbeit", p_attribute = p_attrs) %>%
format()
expect_true(all(p_attrs %in% colnames(dt)))
expect_equal(
cooccurrences("REUTERS", query = "asdfasdf", p_attribute = "word"),
NULL
)
expect_equal(
cooccurrences("REUTERS", query = '"asdfasdfasdfasd.*"', cqp = TRUE),
NULL
)
}
)
test_that(
"cooccurrences-method for partition",
{
P <- partition("REUTERS", places = "saudi-arabia", regex = TRUE)
y <- cooccurrences(P, query = "oil", p_attribute = "word")
expect_equal(subset(y, !is.na(ll))[["word"]][1:5], c("prices", "below", "its", "crude", "market"))
y <- cooccurrences(P, query = '"barrel.*"', cqp = TRUE, p_attribute = "word")
expect_equal(subset(y, is.na(ll))[["word"]][1:5], c("10", "17.52", "18", "1986","3.5"))
expect_equal(
cooccurrences(P, query = "asdfasdf", p_attribute = "word"),
NULL
)
expect_equal(
cooccurrences(P, query = '"asdfasdfasdfasd.*"', cqp = TRUE, p_attribute = "word"),
NULL
)
}
)
test_that(
"Check log-likelihood formula",
{
cooc <- cooccurrences("GERMAPARLMINI", query = "Integration")
cooc_dt <- cooc@stat[!is.na(ll)]
data.table::setorderv(cooc_dt, cols = "ll", order = -1L)
for (i in seq.int(from = 1L, to = 10L)){
o11 <- cooc_dt[["count_coi"]][i]
o12 <- cooc_dt[["count_ref"]][i]
o21 <- cooc@size_coi - o11
o22 <- cooc@size_ref - o12
N <- o21 + o22 + o11 + o12
e11 <- (o11 + o21) * ((o11 + o12) / N)
e12 <- (o12 + o22) * ((o11 + o12) / N)
e21 <- (o11 + o21) * ((o21 + o22) / N)
e22 <- (o12 + o22) * ((o21 + o22) / N)
ll <- 2 * (o11*log(o11/e11) + o12*log(o12/e12) + o21*log(o21/e21) + o22*log(o22/e22) )
expect_identical(round(cooc_dt[["ll"]][i], 3), round(ll, 3))
}
}
)
test_that(
"Identity of cooccurrences and Cooccurrences",
{
testthat::skip_on_cran()
rm <- noise(
terms("REUTERS", p_attribute = "word"),
specialChars = NULL, minNchar = 2L, stopwordsLanguage = "en"
)
stopwords <- unname(unlist(rm))
r <- Cooccurrences("REUTERS", p_attribute = "word", left = 5L, right = 5L, stoplist = stopwords)
stm <- as.simple_triplet_matrix(r)
decode(r)
spm <- as.sparseMatrix(r)
coocs <- list(
c("oil", "prices"),
c("Saudi", "Arabia"),
c("Sheikh", "Ali"),
c("barrel", "dlrs")
)
lapply(
coocs,
function(tokens){
a2b <- as.integer(as.matrix(stm[tokens[1], tokens[2]]))
b2a <- as.integer(as.matrix(stm[tokens[2], tokens[1]]))
expect_equal(a2b, b2a)
expect_equal(spm[tokens[1], tokens[2]], spm[tokens[2], tokens[1]])
a2b_cqp <- count("REUTERS", sprintf('"%s" []{0,4} "%s"', tokens[1], tokens[2]), cqp = TRUE)
b2a_cqp <- count("REUTERS", sprintf('"%s" []{0,4} "%s"', tokens[2], tokens[1]), cqp = TRUE)
expect_equal(a2b_cqp[["count"]] + b2a_cqp[["count"]], a2b)
expect_equal(a2b_cqp[["count"]] + b2a_cqp[["count"]], spm[tokens[1], tokens[2]])
expect_equal(
a2b,
cooccurrences("REUTERS", query = tokens[1])@stat[word == tokens[2]][["count_coi"]]
)
NULL
})
ll(r)
decode(r)
a <- data.table::as.data.table(cooccurrences(r, query = "oil"))
a <- a[!is.na(ll)][!is.nan(ll)]
b <- data.table::as.data.table(cooccurrences("REUTERS", query = "oil"))[!word %in% stopwords]
b <- b[!is.na(ll)][!is.nan(ll)]
library(data.table)
setkeyv(a, cols = "word")
setkeyv(b, cols = "word")
m <- a[b]
expect_equal(m[["count_coi"]], m[["i.count_coi"]])
expect_equal(m[["obs_ref"]], m[["i.count_ref"]])
expect_equal(m[["exp_coi"]], m[["i.exp_coi"]])
expect_equal(m[["exp_ref"]], m[["i.exp_ref"]])
expect_equal(m[["ll"]], m[["i.ll"]])
expect_equal(a[["word"]][1:14], b[["word"]][1:14])
}
)
test_that(
"Cooccurences-method for subcorpus and partition objects",
{
testthat::skip_on_cran()
merkel <- partition(
"GERMAPARLMINI",
speaker = "Merkel",
date = "2009-11-10",
interjection = "speech",
regex = TRUE
)
merkel_cooc <- Cooccurrences(
merkel,
p_attribute = c("word", "pos"),
left = 3L, right = 3L,
verbose = TRUE
)
ll(merkel_cooc)
decode(merkel_cooc)
expect_identical(
unique(merkel_cooc@stat[a_word == "und"][["a_count"]]),
count(merkel, "und")[["count"]]
)
#######
merkel_sc <- corpus("GERMAPARLMINI") %>%
subset(date == "2009-11-10") %>%
subset(grep("Merkel", speaker)) %>%
subset(interjection == "speech")
merkel_cooc_sc <- Cooccurrences(
merkel_sc,
p_attribute = c("word", "pos"),
left = 3L, right = 3L,
verbose = TRUE
)
ll(merkel_cooc_sc)
decode(merkel_cooc_sc)
expect_identical(
unique(merkel_cooc_sc@stat[a_word == "und"][["a_count"]]),
count(merkel_sc, "und", verbose = FALSE)[["count"]]
)
expect_identical(merkel_cooc@stat, merkel_cooc_sc@stat)
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.