# tests/testthat/test_as.speeches.R

# Attach polmineR; the tests below call its functions (corpus(), partition(),
# as.speeches(), count(), ...) without a namespace prefix.
library(polmineR)
# Activate the corpora shipped with the polmineR package itself
# (presumably this is where GERMAPARLMINI comes from -- TODO confirm).
use("polmineR")
# Label the tests of this file in the testthat report.
testthat::context("as.speeches")

# Builds the speech bundle for GERMAPARLMINI twice from the corpus object and
# checks the expected number of speeches, that the speech sizes add up to the
# corpus size, and that both bundles agree on names and corpus positions.
test_that(
  "as.speeches",
  {
    # Both bundles are derived from corpus("GERMAPARLMINI"); an unused
    # partition that was constructed here previously has been dropped.
    pb <- as.speeches(
      corpus("GERMAPARLMINI"),
      s_attribute_name = "speaker", s_attribute_date = "date"
    )
    expect_equal(length(pb), 276L)

    scb <- as.speeches(
      corpus("GERMAPARLMINI"),
      s_attribute_name = "speaker", s_attribute_date = "date"
    )
    expect_equal(length(scb), length(pb))
    # The sizes of the individual speeches must sum to the size of the corpus.
    expect_equal(
      sum(unname(unlist(lapply(scb@objects, size)))),
      size("GERMAPARLMINI")
    )
    expect_true(all(names(scb) %in% names(pb)))

    # Bring pb into the order of scb before comparing element by element.
    pb <- pb[names(scb)]
    expect_identical(names(scb), names(pb))
    # Stack the corpus-position matrices of all speeches and compare them.
    expect_identical(
      do.call(rbind, lapply(scb@objects, function(x) x@cpos)),
      do.call(rbind, lapply(pb@objects, function(x) x@cpos))
    )
  }
)

# Speeches picked by date from the bundle of the whole corpus are compared
# with speeches generated from a subcorpus limited to the same two dates.
test_that(
  "as.speeches() same result for partition and corpus-method",
  {
    # Route 1: split the whole corpus, then keep speeches whose names match
    # one of the two dates.
    all_speeches <- as.speeches(
      "GERMAPARLMINI",
      s_attribute_name = "speaker", s_attribute_date = "date"
    )
    wanted <- grep("(2009-10-28|2009-11-10)", names(all_speeches), value = TRUE)
    from_bundle <- all_speeches[wanted]

    # Route 2: restrict the corpus to the two dates first, then split.
    two_days <- subset(
      corpus("GERMAPARLMINI"),
      date %in% c("2009-10-28", "2009-11-10")
    )
    from_subcorpus <- as.speeches(
      two_days,
      s_attribute_name = "speaker", s_attribute_date = "date"
    )

    # Same number of speeches, same total size, and every name of route 1
    # is present in route 2.
    expect_identical(length(from_bundle), length(from_subcorpus))
    expect_identical(
      sum(summary(from_bundle)$size),
      sum(summary(from_subcorpus)$size)
    )
    expect_true(all(names(from_bundle) %in% names(from_subcorpus)))
  }
)

# Compares the term-document matrix obtained via a partition with the one
# obtained via the corpus object. Skipped unconditionally on the first line,
# so nothing below the skip() call is executed.
test_that(
  "tdm for as.speeches",
  {
    skip("knowingly not working")

    # Matrix derived from a partition whose date regex ".*" matches any date.
    all_dates <- partition("GERMAPARLMINI", date = ".*", regex = TRUE)
    speeches_p <- as.speeches(
      all_dates,
      s_attribute_name = "speaker", s_attribute_date = "date"
    )
    counts_p <- count(speeches_p, p_attribute = "word")
    tdm_p <- as.TermDocumentMatrix(counts_p, col = "count")

    # Matrix derived from the corpus object, with the speeches selected by the
    # names of the partition-based bundle.
    whole <- corpus("GERMAPARLMINI")
    speeches_c <- as.speeches(
      whole,
      s_attribute_name = "speaker", s_attribute_date = "date"
    )
    # NOTE(review): `[[` is used with a vector of names here, while the
    # following test uses `[` -- confirm which one is intended.
    selected <- speeches_c[[names(speeches_p)]]
    speeches_c@objects <- selected@objects
    counts_c <- count(speeches_c, p_attribute = "word")
    tdm_c <- as.TermDocumentMatrix(counts_c, col = "count")

    expect_identical(tdm_p, tdm_c)
  }
)

# For a single date, the term-document matrix built from speeches of a
# partition must be identical to the one built from speeches of the
# corresponding subcorpus.
test_that(
  "tdm for as.speeches, but partition/subcorpus",
  {
    # Partition route.
    p <- partition("GERMAPARLMINI", date = "2009-11-11", regex = TRUE)
    pb <- as.speeches(p, s_attribute_name = "speaker", s_attribute_date = "date")
    cnt <- count(pb, p_attribute = "word")
    tdm <- as.TermDocumentMatrix(cnt, col = "count")

    # Subcorpus route: split the subcorpus for the same date. Previously the
    # subcorpus was created but as.speeches() was called on the whole corpus,
    # so the subcorpus path named in the test title was never exercised.
    co <- corpus("GERMAPARLMINI")
    s <- subset(co, date == "2009-11-11")
    sp <- as.speeches(s, s_attribute_name = "speaker", s_attribute_date = "date")
    # Put the speeches into the order of the partition-based bundle so that
    # both matrices list their documents in the same sequence.
    tmp <- sp[names(pb)]
    sp@objects <- tmp@objects
    cnt2 <- count(sp, p_attribute = "word")
    tdm2 <- as.TermDocumentMatrix(cnt2, col = "count")

    expect_identical(tdm, tdm2)
  }
)

# as.speeches() on GERMAPARL2MINI, restricted to regions with
# p_type == "speech": checks the number of speeches and that the speech named
# after Carlo Schmid has the size of the matching subcorpus.
test_that(
  "as.speeches() for nested corpus",
  {
    # Skip unless use("GermaParl2") returns a truthy value.
    skip_if_not(use("GermaParl2"))

    speech_regions <- subset(corpus("GERMAPARL2MINI"), p_type == "speech")
    sp <- as.speeches(
      speech_regions,
      s_attribute_date = "protocol_date",
      s_attribute_name = "speaker_name",
      verbose = FALSE, progress = FALSE
    )
    expect_identical(length(sp), 14L)

    # Reference: filter by speaker first, then by paragraph type.
    by_speaker <- subset(
      corpus("GERMAPARL2MINI"),
      speaker_name == "Carlo Schmid"
    )
    cschmid <- subset(by_speaker, p_type == "speech")

    # Look up the speech by the speaker's name within the bundle names.
    schmid_name <- grep("Carlo Schmid", names(sp), value = TRUE)
    expect_identical(size(sp[[schmid_name]]), size(cschmid))
  }
)

# Try the polmineR package in your browser
#
# Any scripts or data that you put into this service are public.
#
# polmineR documentation built on Nov. 2, 2023, 5:52 p.m.