# Test setup for the count() method.
library(polmineR)
# Make the demo corpora used below available to the session.
# NOTE(review): presumably use() registers the corpora shipped with the named
# package (here restricted to REUTERS for RcppCWB) -- confirm against the
# polmineR documentation.
use("polmineR")
use(pkg = "RcppCWB", corpus = "REUTERS")
# Label for the tests in this file.
# NOTE(review): context() is deprecated in testthat's third edition; check
# which edition this package uses before removing the call.
testthat::context("count-method")
# Counting a corpus referenced by name: checks the columns of the result,
# their types, the sum of all counts and the count of one known word.
test_that("count", {
  reuters_cnt <- count("REUTERS")
  known_cols <- c("word", "word_id", "count")

  # Every column of the result has to be one of the known columns.
  expect_true(all(colnames(reuters_cnt) %in% known_cols))

  # Column types.
  expect_true(is.integer(reuters_cnt[["count"]]))
  expect_true(is.integer(reuters_cnt[["word_id"]]))
  expect_true(is.character(reuters_cnt[["word"]]))

  # Overall sum of counts and a spot check for a single word.
  count_sum <- sum(reuters_cnt[["count"]])
  expect_equal(count_sum, 4050)
  barrel_row <- reuters_cnt@stat[word == "barrel"]
  expect_equal(barrel_row[["count"]], 15)
})
# Partition defined by a regular expression on `id` that matches any value.
reuters <- partition("REUTERS", list(id = ".*"), regex = TRUE)

# A single query against the partition yields a single count.
test_that("count (one query)", {
  is_cnt <- count(reuters, query = "is")
  expect_equal(is_cnt[["count"]], 25)
})
# Several queries at once, run against the corpus, a subset() of it and a
# partition; also checks that overlapping query matches raise a warning.
test_that("count (multiple queries)", {
  queries <- c("is", "this", "real")
  overlapping <- c('"price.*"', '"prices"')
  reuters_corpus <- corpus("REUTERS")

  # Whole corpus.
  corpus_cnt <- reuters_corpus %>% count(query = queries)
  expect_equal(corpus_cnt[["count"]], c(25, 7, 3))

  # Documents whose `places` value matches "kuwait", selected via subset().
  kuwait_sub <- reuters_corpus %>% subset(grepl("kuwait", places))
  kuwait_sub_cnt <- count(kuwait_sub, query = queries, breakdown = FALSE)
  expect_equal(kuwait_sub_cnt[["count"]], c(3L, 3L, 0L))

  # The same selection expressed as a partition.
  kuwait_part <- partition("REUTERS", places = "kuwait", regex = TRUE)
  kuwait_part_cnt <- count(kuwait_part, query = queries)
  expect_equal(kuwait_part_cnt[["count"]], c(3L, 3L, 0L))

  # issue warning when query matches overlap
  expect_warning(reuters_corpus %>% count(query = overlapping, cqp = TRUE))
  expect_warning(count(kuwait_sub, query = overlapping, cqp = TRUE))
  expect_warning(count(kuwait_part, query = overlapping, cqp = TRUE))
})
# With breakdown = TRUE the result has a `count` column whose values are
# summed here and compared with the expected total.
test_that("count - breakdown", {
  remain_cnt <- count("REUTERS", query = '"remain.*"', breakdown = TRUE)
  remain_total <- sum(remain_cnt[["count"]])
  expect_equal(remain_total, 5)
})
# Counts obtained for a bundle of speeches must add up to the count for the
# whole corpus, and each row must agree with counting the corresponding
# bundle member on its own.
test_that(
  "count over partition_bundle",
  {
    cnt_int_total <- corpus("GERMAPARLMINI") %>% count(query = "Integration")
    sp_bundle <- corpus("GERMAPARLMINI") %>%
      as.speeches(s_attribute_name = "speaker", s_attribute_date = "date")

    # Keep only the bundle members with at least one match.
    cnt_int_pb <- count(sp_bundle, query = "Integration") %>%
      subset(Integration > 0)

    # Fail explicitly when nothing matched, rather than relying on the loop
    # below to stumble over an empty table.
    expect_gt(nrow(cnt_int_pb), 0L)
    expect_equal(sum(cnt_int_pb[["TOTAL"]]), cnt_int_total[["count"]])

    # seq_len() yields an empty sequence for zero rows, whereas 1L:nrow()
    # would iterate over c(1, 0).
    for (i in seq_len(nrow(cnt_int_pb))) {
      member_name <- cnt_int_pb[["partition"]][i]
      member_cnt <- count(sp_bundle[[member_name]], query = "Integration")
      expect_equal(member_cnt[["count"]], cnt_int_pb[["TOTAL"]][i])
    }
  }
)
# Counting phrases over a bundle of speeches: the document-term matrix built
# from phrase counts must agree with counting the equivalent CQP queries.
test_that(
  "count over partition_bundle with phrases",
  {
    obs <- corpus("GERMAPARLMINI") %>% count(p_attribute = "word")

    # Derive phrases from bigrams: score with pmi(), keep those with
    # ngram_count above 5 and then the first 100 rows.
    phrases <- corpus("GERMAPARLMINI") %>%
      ngrams(n = 2L, p_attribute = "word") %>%
      pmi(observed = obs) %>%
      subset(ngram_count > 5L) %>%
      subset(1:100) %>%
      as.phrases()

    speeches <- corpus("GERMAPARLMINI") %>%
      as.speeches(
        s_attribute_name = "speaker",
        s_attribute_date = "date",
        progress = FALSE
      )

    dtm <- count(
      speeches,
      phrases = phrases,
      p_attribute = "word",
      progress = FALSE,
      verbose = TRUE
    ) %>%
      as.DocumentTermMatrix(col = "count", verbose = FALSE)

    # Names are the phrase tokens as they appear as columns of the matrix;
    # values are the CQP queries matching the same two-word sequences.
    queries <- c(
      "erneuerbaren_Energien" = '"erneuerbaren" "Energien"',
      "Vereinten_Nationen" = '"Vereinten" "Nationen"',
      "gesetzlichen_Mindestlohn" = '"gesetzlichen" "Mindestlohn"'
    )

    matches <- count(speeches, query = queries, cqp = TRUE, progress = FALSE) %>%
      subset(TOTAL > 0)

    # Fail explicitly when nothing matched, rather than relying on the loop
    # below to stumble over an empty table.
    expect_gt(nrow(matches), 0L)

    # seq_len() yields an empty sequence for zero rows, whereas 1:nrow()
    # would iterate over c(1, 0).
    for (i in seq_len(nrow(matches))) {
      speech_name <- matches[["partition"]][i]
      dtm_total <- sum(as.vector(dtm[speech_name, names(queries)]))
      expect_equal(matches[["TOTAL"]][i], dtm_total)
    }
  }
)
# Querying a positional attribute through CQP syntax and through the
# `p_attribute` argument has to give identical totals per speech.
test_that(
  "equivalence of using arg p_attribute and CQP syntax for count,partition_bundle",
  {
    gparl_speeches <- as.speeches(
      corpus("GERMAPARLMINI"),
      s_attribute_date = "date",
      s_attribute_name = "speaker"
    )

    via_cqp <- count(gparl_speeches, query = '[pos = "ADJA"]', cqp = TRUE)
    via_p_attr <- count(gparl_speeches, query = "ADJA", p_attribute = "pos")

    expect_identical(via_cqp[["TOTAL"]], via_p_attr[["TOTAL"]])
  }
)
# Placeholder: this test contains no expectations yet. Warnings for
# overlapping query matches are currently checked in
# "count (multiple queries)".
test_that("issue warning if there are overlapping queries", {
})
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.