tests/testthat/test_rainette1.R

library(quanteda)
context("simple reinert classification")

m <- matrix(
  c(
    1L, 1L, 1L, 0L, 0L,
    0L, 0L, 1L, 1L, 0L,
    1L, 0L, 0L, 0L, 0L,
    1L, 1L, 1L, 1L, 0L,
    1L, 1L, 1L, 1L, 1L,
    1L, 0L, 0L, 0L, 0L,
    1L, 1L, 1L, 0L, 0L
  ),
  ncol = 5,
  byrow = TRUE,
)
mini_dfm <- as.dfm(m)


## docs_order_by_ca

indices <- rainette:::order_docs(mini_dfm)

test_that("order_docs", {
  skip_if_not_installed("quanteda.textmodels")
  if (requireNamespace("quanteda.textmodels", quietly = TRUE)) {
    ca_values <- quanteda.textmodels::textmodel_ca(mini_dfm)$rowcoord[, 1]
    expect_equal(order(ca_values), indices)
  }
})

## split_tab_by_chisq

tab <- convert(mini_dfm, to = "matrix")
storage.mode(tab) <- "integer"
res_split <- rainette:::cpp_split_tab(m, indices)

test_that("split_tab", {
  manual_res <- data.frame(index = -1, chisq = -1)
  for (i in 1:6) {
    tab1 <- m[indices[1:i], , drop = FALSE]
    tab2 <- m[indices[(i + 1):7], , drop = FALSE]
    expect_warning(chisq <- chisq.test(rbind(colSums(tab1), colSums(tab2)))$statistic)
    manual_res <- rbind(manual_res, c(index = indices[i], chisq = chisq))
  }
  max <- which.max(manual_res$chisq)
  expect_equal(res_split$max_index, manual_res$index[max])
  expect_equal(res_split$max_chisq, manual_res$chisq[max], tolerance = 1e-06)
})

## switch_docs

test_that("switch_docs", {
  ## Wrong max_index et max_chisq
  max_chisq <- 4.5
  max_index <- 1
  res_switch <- rainette:::switch_docs(tab, indices, max_index, max_chisq)
  expect_equal(res_switch$indices1, c(3, 6))
  expect_equal(res_switch$indices2, c(7, 4, 5, 2, 1))
  expect_equal(res_switch$chisq, res_split$max_chisq)
})

## select_features

test_that("select_features", {
  res_switch <- rainette:::switch_docs(tab, indices, res_split$max_index, res_split$max_chisq)
  indices1 <- res_switch$indices1
  indices2 <- res_switch$indices2
  res <- rainette:::select_features(tab, indices1, indices2)
  expect_equal(res$cols1, character(0))
  expect_equal(res$cols2, c("feat2", "feat3", "feat4"))
  indices1 <- c(4, 3, 5, 6, 2)
  indices2 <- c(7, 1)
  res <- rainette:::select_features(tab, indices1, indices2)
  expect_equal(res$cols1, c("feat1", "feat3", "feat4"))
  expect_equal(res$cols2, character(0))
})

## rainette

test_that("rainette on mini_dfm", {
  res <- rainette(mini_dfm, k = 2, min_segment_size = 1, min_split_members = 3)
  expect_is(res, c("rainette", "hclust"))
  expect_equal(res$group, c(2, 2, 1, 2, 2, 1, 2))
  expect_equal(which(res$group == 1), indices[1:which(indices == res_split$max_index)])
  expect_equal(res$height, res_split$max_chisq)
})


mini_corpus <- head(data_corpus_inaugural, n = 2)
mini_corpus <- split_segments(mini_corpus)
dtm <- dfm(tokens(mini_corpus, remove_punct = TRUE), tolower = TRUE)
dtm <- dfm_remove(dtm, stopwords("en"))
dtm <- dfm_wordstem(dtm, language = "english")
dtm <- dfm_trim(dtm, min_termfreq = 3)

test_that("Stopping if tab too small", {
  expect_message(
    res <- rainette(dtm, k = 12, min_segment_size = 10, min_split_members = 3),
    "! Tab to be splitted is not big enough. Stopping after iteration"
  )
  expect_equal(max(res$group), 9)
})

test_that("Stopping if no more group > min_split_members", {
  expect_message(
    res <- rainette(dtm, k = 3, min_segment_size = 10, min_split_members = 12),
    "^! No more group bigger than min_split_members. Stopping after iteration"
  )
  expect_equal(max(res$group), 2)
  expect_message(
    res <- rainette(dtm, k = 3, min_segment_size = 10, min_split_members = 15),
    "! No computed clusters. Returning NULL."
  )
  expect_null(res)
})

test_that("all uce belong to the same group", {
  res <- rainette(dtm, k = 3, min_segment_size = 5)
  dtm_uc <- rainette:::merge_segments(dtm, min_segment_size = 5)
  expect_equal(anyDuplicated(unique(cbind(docvars(dtm_uc, "rainette_uc_id"), res$group))), 0)
  expect_equal(anyDuplicated(unique(cbind(docvars(dtm_uc, "rainette_uce_id"), res$group))), 0)
})

corpus <- head(data_corpus_inaugural, n = 10)
corpus <- split_segments(corpus)
dtm <- dfm(tokens(corpus, remove_punct = TRUE), tolower = TRUE)
dtm <- dfm_remove(dtm, stopwords("en"))
dtm <- dfm_trim(dtm, min_termfreq = 3)

test_that("groups correspondance is ok", {
  res <- rainette(dtm, k = 6, min_segment_size = 5)
  df <- data.frame(res$uce_groups)
  tab <- table(df[, 4], df[, 5])
  expect_equal(sum(tab > 0), 6)
  tab <- table(df[, 3], df[, 4])
  expect_equal(sum(tab > 0), 5)
})


## Special cases

test_that("dfm with 'document' feature is ok", {
  m_document <- m
  colnames(m_document) <- c("document", "éé", "ùù", "feat4", "feat5")
  dfm_document <- as.dfm(m_document)
  res_document <- rainette(dfm_document, k = 2, min_segment_size = 1, min_split_members = 3)
  res <- rainette(mini_dfm, k = 2, min_segment_size = 1, min_split_members = 3)
  expect_equal(res$group, res_document$group)
  expect_equal(res$height, res_document$height)
})


## min_members deprecation

test_that("min_members is deprecated", {
  expect_warning(
    res_deprecated <- rainette(mini_dfm, k = 2, min_segment_size = 1, min_members = 3),
    "min_members is deprecated. Use min_split_members instead."
  )
  res_deprecated$call <- NULL
  res <- rainette(mini_dfm, k = 2, min_segment_size = 1, min_split_members = 3)
  res$call <- NULL
  expect_equal(
    res,
    res_deprecated
  )
})

Try the rainette package in your browser

Any scripts or data that you put into this service are public.

rainette documentation built on March 31, 2023, 6:43 p.m.