test_stats.R
In rainette: The Reinert Method for Textual Data Clustering

library(testthat)
library(quanteda)
context("stats functions")

mini_corpus <- head(data_corpus_inaugural, n = 10)
mini_corpus <- split_segments(mini_corpus)
dtm <- dfm(tokens(mini_corpus, remove_punct = TRUE), tolower = TRUE)
dtm <- dfm_remove(dtm, stopwords("en"))
dtm <- dfm_trim(dtm, min_termfreq = 3)

res <- rainette(dtm, k = 5, min_segment_size = 5, min_split_members = 3)

test_that("rainette_stats", {

  ## chi2
  groups <- cutree_rainette(res, k = 3)
  stats <- rainette_stats(groups, dtm, n_terms = 10)
  expect_equal(length(stats), max(groups))
  n_stats_pos <- sum(stats[[2]]$sign == "positive")
  expect_equivalent(
    stats[[2]][stats[[2]]$sign == "positive", -ncol(stats[[2]])],
    head(quanteda.textstats::textstat_keyness(dtm, groups == 2), n_stats_pos)
  )
  n_stats_neg <- sum(stats[[3]]$sign == "negative")
  expect_equivalent(
    stats[[3]][stats[[3]]$sign == "negative",  -ncol(stats[[3]])] %>%
      dplyr::arrange(desc(chi2)),
    tail(quanteda.textstats::textstat_keyness(dtm, groups == 3), n_stats_neg)
  )

  ## lr
  groups <- cutree_rainette(res, k = 4)
  stats <- rainette_stats(groups, dtm, measure = "lr", n_terms = 8)
  expect_equal(length(stats), max(groups))
  n_stats_pos <- sum(stats[[1]]$sign == "positive")
  expect_equivalent(
    stats[[1]][stats[[1]]$sign == "positive", -ncol(stats[[1]])],
    head(quanteda.textstats::textstat_keyness(dtm, measure = "lr", groups == 1), n_stats_pos)
  )
  n_stats_neg <- sum(stats[[4]]$sign == "negative")
  expect_equivalent(
    stats[[4]][stats[[4]]$sign == "negative", -ncol(stats[[4]])] %>%
      dplyr::arrange(desc(G2)),
    tail(quanteda.textstats::textstat_keyness(dtm, measure = "lr", groups == 4), n_stats_neg)
  )

  ## max_p
  stats <- rainette_stats(groups, dtm, n_terms = 10, max_p = 1e-09)
  for (i in seq_along(stats)) {
    expect(all(stats[[i]]$p <= 1e-09), "p not less than max_p")
  }

  ## frequency and docprop
  groups <- cutree_rainette(res, k = 3)
  stats <- rainette_stats(groups, dtm, n_terms = 10, measure = "frequency")
  expect_equal(length(stats), max(groups))
  dtm2 <- dfm_subset(dtm, groups == 2)
  expect_equivalent(
    stats[[2]] %>% select(-sign),
    quanteda.textstats::textstat_frequency(dtm2) %>%
      as_tibble() %>%
      mutate(docprop = .data$docfreq / ndoc(dtm2)) %>%
      arrange(desc(frequency)) %>%
      slice(1:10)
  )
  stats <- rainette_stats(groups, dtm, n_terms = 10, measure = "docprop")
  expect_equal(length(stats), max(groups))
  dtm3 <- dfm_subset(dtm, groups == 3)
  expect_equivalent(
    stats[[3]] %>% select(-sign),
    quanteda.textstats::textstat_frequency(dtm3) %>%
      as_tibble() %>%
      mutate(docprop = .data$docfreq / ndoc(dtm3)) %>%
      arrange(desc(docprop)) %>%
      head(10)
  )
})

Any scripts or data that you put into this service are public.

rainette documentation built on March 31, 2023, 6:43 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rainette
The Reinert Method for Textual Data Clustering

tests/testthat/test_stats.R
In rainette: The Reinert Method for Textual Data Clustering

Try the rainette package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rainette The Reinert Method for Textual Data Clustering

tests/testthat/test_stats.R In rainette: The Reinert Method for Textual Data Clustering

Try the rainette package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rainette
The Reinert Method for Textual Data Clustering

tests/testthat/test_stats.R
In rainette: The Reinert Method for Textual Data Clustering