# tests/testthat/test_as.DocumentTermMatrix.R

# Attach polmineR so its functions can be called unqualified in the tests below.
library(polmineR)
# Activate the corpora referenced by the tests ("GERMAPARLMINI", "REUTERS");
# presumably the former ships with polmineR and the latter with RcppCWB --
# TODO confirm against the package documentation.
use("polmineR")
use(pkg = "RcppCWB", corpus = "REUTERS")
# NOTE(review): the context label names as.TermDocumentMatrix although most
# tests below call as.DocumentTermMatrix -- confirm which label is intended.
testthat::context("as.TermDocumentMatrix")

test_that(
  "Generate Term-Document-Matrix from corpus using as.TermDocumentMatrix",
  {
    # --- Part 1: dimensions and a spot check on the REUTERS corpus ---------
    reuters_dtm <- as.DocumentTermMatrix(
      "REUTERS",
      p_attribute = "word",
      s_attribute = "id"
    )

    # The number of rows is expected to equal the number of "id" values.
    n_ids <- length(s_attributes("REUTERS", "id"))
    expect_equal(n_ids, dim(reuters_dtm)[1])

    # The number of columns is expected to equal the size of the "word" lexicon.
    lexicon_size <- RcppCWB::cl_lexicon_size(
      corpus = "REUTERS",
      p_attribute = "word",
      registry = registry()
    )
    expect_equal(lexicon_size, dim(reuters_dtm)[2])

    # The column total for "is" has to agree with the corpus-wide count.
    expect_equal(
      sum(reuters_dtm[, "is"]),
      count("REUTERS", "is")[["count"]]
    )

    # --- Part 2: compare the "SPD" row of a party-wise matrix with a plain
    # count performed on the corresponding subcorpus ------------------------
    party_dtm <- as.DocumentTermMatrix(
      "GERMAPARLMINI",
      p_attribute = "word",
      s_attribute = "party"
    )
    spd_row <- as.matrix(party_dtm)["SPD", ]

    # Keep only tokens that actually occur, then sort by token.
    from_dtm <- data.table::data.table(
      token = names(spd_row),
      count = unname(spd_row)
    )
    from_dtm <- from_dtm[count > 0L]
    data.table::setorderv(from_dtm, cols = "token")

    # Reference counts obtained directly from the SPD subcorpus.
    spd_count <- corpus("GERMAPARLMINI") %>%
      subset(party == "SPD") %>%
      count(p_attribute = "word")
    # `:=` drops the column by reference on the table held in the stat slot.
    from_count <- spd_count@stat[, "word_id" := NULL]
    data.table::setorderv(from_count, cols = "word")

    expect_identical(from_count[["count"]], from_dtm[["count"]])
    expect_identical(from_count[["word"]], from_dtm[["token"]])
  }
)


test_that(
  "identity of as.TermDocumentMatrix and as.DocumentTermMatrix",
  {
    # Build the speeches object once and derive both matrix flavours from it.
    speeches <- as.speeches(
      "GERMAPARLMINI",
      s_attribute_name = "speaker",
      s_attribute_date = "date"
    )

    term_by_doc <- as.TermDocumentMatrix(speeches, p_attribute = "word")
    doc_by_term <- as.DocumentTermMatrix(speeches, p_attribute = "word")

    # The term-document matrix must be the exact transpose of the
    # document-term matrix.
    expect_identical(term_by_doc, t(doc_by_term))
  }
)

test_that(
  "Check ways to generate DocumentTermMatrix against each other",
  {
    # Number of entries of `x` that do not occur in `reference`.
    n_missing <- function(x, reference) length(which(!x %in% reference))

    speakers <- partition_bundle("GERMAPARLMINI", s_attribute = "speaker")

    # Route 1: count() on the bundle, then cast to a sparse matrix.
    via_count <- count(speakers, p_attribute = "word", verbose = FALSE) %>%
      as.sparseMatrix(col = "count")

    # Route 2: enrich() the bundle, then cast to a sparse matrix.
    via_enrich <- enrich(speakers, p_attribute = "word") %>%
      as.sparseMatrix(col = "count")

    # Route 3: straight from the corpus; transposed, presumably so that its
    # orientation matches the matrices of routes 1 and 2.
    via_corpus <- as.DocumentTermMatrix(
      "GERMAPARLMINI",
      p_attribute = "word",
      s_attribute = "speaker"
    ) %>%
      as.sparseMatrix() %>%
      Matrix::t()

    # Dimnames of routes 2 and 3 have to be covered by those of route 1.
    expect_identical(n_missing(rownames(via_corpus), rownames(via_count)), 0L)
    expect_identical(n_missing(rownames(via_enrich), rownames(via_count)), 0L)
    expect_identical(n_missing(colnames(via_enrich), colnames(via_count)), 0L)

    expect_identical(slam::col_sums(via_enrich), slam::col_sums(via_count))

    # After aligning the row order, route 2 must reproduce route 1 exactly.
    enrich_aligned <- via_enrich[rownames(via_count), ]
    expect_identical(via_count, enrich_aligned)

    # Route 3 needs its columns aligned first, then its rows.
    corpus_aligned <- via_corpus[, colnames(via_count)][rownames(via_count), ]
    expect_identical(via_count, corpus_aligned)
  }
)

# Try the polmineR package in your browser
#
# Any scripts or data that you put into this service are public.
#
# polmineR documentation built on Nov. 2, 2023, 5:52 p.m.