Nothing
library(polmineR)
# Corpus setup for the tests below. REUTERS is requested from the RcppCWB
# package; GERMAPARLMINI is presumably among the corpora that polmineR itself
# provides -- TODO confirm.
use(pkg = "polmineR")
use(corpus = "REUTERS", pkg = "RcppCWB")

# Context label for the tests in this file.
testthat::context("as.TermDocumentMatrix")
test_that(
  "Generate Term-Document-Matrix from corpus using as.TermDocumentMatrix",
  {
    # NOTE(review): the description mentions as.TermDocumentMatrix, but every
    # call in this test exercises as.DocumentTermMatrix.

    # --- Part 1: shape and one column total of the REUTERS matrix ----------
    reuters_dtm <- as.DocumentTermMatrix(
      "REUTERS",
      p_attribute = "word",
      s_attribute = "id"
    )
    reuters_dim <- dim(reuters_dtm)

    # One row per value of the s-attribute "id".
    expect_equal(length(s_attributes("REUTERS", "id")), reuters_dim[1])

    # One column per entry of the lexicon of the p-attribute "word".
    lexicon_size <- RcppCWB::cl_lexicon_size(
      corpus = "REUTERS",
      p_attribute = "word",
      registry = registry()
    )
    expect_equal(lexicon_size, reuters_dim[2])

    # The total of the "is" column equals the count of "is" for the corpus.
    is_total <- sum(reuters_dtm[, "is"])
    expect_equal(is_total, count("REUTERS", "is")[["count"]])

    # --- Part 2: one full row compared against an independent count --------
    # The "SPD" row of the party-level matrix (restricted to tokens that
    # actually occur) must be identical to a plain count on the subcorpus
    # where party == "SPD".
    party_dtm <- as.DocumentTermMatrix(
      "GERMAPARLMINI",
      p_attribute = "word",
      s_attribute = "party"
    )
    spd_row <- as.matrix(party_dtm)["SPD", ]
    spd_row_dt <- data.table::data.table(
      token = names(spd_row),
      count = unname(spd_row)
    )
    from_matrix <- spd_row_dt[count > 0L]
    data.table::setorderv(from_matrix, "token")

    spd_subcorpus <- corpus("GERMAPARLMINI") %>% subset(party == "SPD")
    spd_count_obj <- count(spd_subcorpus, p_attribute = "word")
    # Drop the id column by reference; only tokens and counts are compared.
    from_count <- spd_count_obj@stat[, "word_id" := NULL]
    data.table::setorderv(from_count, "word")

    # Both tables are sorted by token, so the columns can be compared as is.
    expect_identical(from_count[["count"]], from_matrix[["count"]])
    expect_identical(from_count[["word"]], from_matrix[["token"]])
  }
)
test_that(
  "identity of as.TermDocumentMatrix and as.DocumentTermMatrix",
  {
    # Build both matrix orientations from the same set of speeches and check
    # that one is exactly the transpose of the other.
    speeches <- as.speeches(
      "GERMAPARLMINI",
      s_attribute_name = "speaker",
      s_attribute_date = "date"
    )

    term_by_doc <- as.TermDocumentMatrix(speeches, p_attribute = "word")
    doc_by_term <- as.DocumentTermMatrix(speeches, p_attribute = "word")

    expect_identical(term_by_doc, t(doc_by_term))
  }
)
test_that(
  "Check ways to generate DocumentTermMatrix against each other",
  {
    # Number of elements of `x` that do not occur in `table` (an integer).
    n_missing <- function(x, table) sum(!(x %in% table))

    speaker_bundle <- partition_bundle("GERMAPARLMINI", s_attribute = "speaker")

    # Route 1: count() on the bundle, then conversion to a sparse matrix.
    counted <- count(speaker_bundle, p_attribute = "word", verbose = FALSE)
    via_count <- as.sparseMatrix(counted, col = "count")

    # Route 2: enrich() on the bundle, then conversion to a sparse matrix.
    enriched <- enrich(speaker_bundle, p_attribute = "word")
    via_enrich <- as.sparseMatrix(enriched, col = "count")

    # Route 3: straight from the corpus; transposed so that its orientation
    # matches the other two matrices.
    corpus_dtm <- as.DocumentTermMatrix(
      "GERMAPARLMINI",
      p_attribute = "word",
      s_attribute = "speaker"
    )
    via_corpus <- Matrix::t(as.sparseMatrix(corpus_dtm))

    # Every dimension name of the alternative routes must also be present in
    # the count-based matrix.
    expect_identical(n_missing(rownames(via_corpus), rownames(via_count)), 0L)
    expect_identical(n_missing(rownames(via_enrich), rownames(via_count)), 0L)
    expect_identical(n_missing(colnames(via_enrich), colnames(via_count)), 0L)

    expect_identical(slam::col_sums(via_enrich), slam::col_sums(via_count))

    # Align the rows of the enrich-based matrix with the count-based one
    # before the full comparison.
    via_enrich_aligned <- via_enrich[rownames(via_count), ]
    expect_identical(via_count, via_enrich_aligned)

    # Align the corpus-based matrix in two steps: columns first, then rows.
    via_corpus_cols <- via_corpus[, colnames(via_count)]
    via_corpus_aligned <- via_corpus_cols[rownames(via_count), ]
    expect_identical(via_count, via_corpus_aligned)
  }
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.