tests/testthat/test_split.R

library(polmineR)
testthat::context("split")
use("polmineR")


test_that(
  "split up corpus",
  {
    x <- partition("GERMAPARLMINI", date = "2009-11-11", speaker = "Gerda Hasselfeldt")
    y <- split(x, gap = 500)
    expect_identical(is(y)[1], "partition_bundle")
    expect_identical(length(y), 15L)
    expect_identical(size(merge(y)), size(x))
    
    y2 <- merge(y)
    expect_identical(as.integer(merge(y)@cpos), as.integer(x@cpos))
    expect_identical(as.vector(merge(y)@cpos), as.vector(x@cpos))
    
    # check that argument values works as intended
    speakers <- c("Volker Kauder", "Norbert Lammert", "Wolfgang Thierse")
    sb_speakers <- corpus("GERMAPARLMINI") %>% 
      split(s_attribute = "speaker", values = speakers)
    expect_true(all(speakers %in% names(sb_speakers)))
    
    
    # the following tests require that GERMAPARL2MINI is available
    # It is wrapped into the GermaParl2 package, which can be installed as 
    # follows:
    # install.packages(
    #   pkgs = "GermaParl2",
    #   contriburl = "https://polmine.github.io/drat/src/contrib",
    #   type = "source"
    # )
    
    skip_if_not(use("GermaParl2"))
    
    gparl2 <- corpus("GERMAPARL2MINI")
    
    n_sentences <- gparl2 %>% 
      split(s_attribute = "p", values = FALSE, verbose = FALSE) %>% 
      length()
    
    attr_size <- RcppCWB::cl_attribute_size(
      corpus = "GERMAPARL2MINI",
      attribute = "p",
      attribute_type = "s",
      registry = gparl2@registry_dir
    )
    
    expect_identical(n_sentences, attr_size)
    
  }
)

test_that(
  "different order, same result",
  {
    pp1 <- corpus("GERMAPARLMINI") %>%
      subset(protocol_date == "2009-11-10") %>%
      split(s_attribute = "speaker")
    
    pp2 <- corpus("GERMAPARLMINI") %>%
      subset(date == "2009-11-10") %>%
      split(s_attribute = "speaker")
    
    pp3 <- corpus("GERMAPARLMINI") %>%
      split(s_attribute = "speaker") %>%
      .[["Angela Dorothea Merkel"]] %>%
      subset(date == "2009-11-10")
    
    dimnames(pp3@cpos) <- NULL
    
    expect_identical(
      pp1[["Angela Dorothea Merkel"]]@cpos,
      pp2[["Angela Dorothea Merkel"]]@cpos
    )
    
    expect_identical(
      pp1[["Angela Dorothea Merkel"]]@cpos,
      pp3@cpos
    )
    
    skip_if_not(use("GermaParl2"))
    
    gparl2 <- corpus("GERMAPARL2MINI")
    
    renner <- gparl2 %>% 
      subset(speaker_who == "Renner")

    n_subcorpora <- split(renner, s_attribute = "s", verbose = FALSE) |>
      length()
    
    n_sentences <- renner |>
      slot("cpos") %>% 
      RcppCWB::ranges_to_cpos() %>% 
      RcppCWB::cl_cpos2struc(corpus = "GERMAPARL2MINI", s_attribute = "s", cpos = ., registry = gparl2@registry_dir) %>% 
      unique() %>% 
      length()
    
    expect_identical(n_subcorpora, n_sentences)
  }
)

Try the polmineR package in your browser

Any scripts or data that you put into this service are public.

polmineR documentation built on Nov. 2, 2023, 5:52 p.m.