Nothing
library(quanteda)
context("simple reinert classification")
m <- matrix(
c(
1L, 1L, 1L, 0L, 0L,
0L, 0L, 1L, 1L, 0L,
1L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 0L, 0L
),
ncol = 5,
byrow = TRUE,
)
mini_dfm <- as.dfm(m)
## docs_order_by_ca
indices <- rainette:::order_docs(mini_dfm)
test_that("order_docs", {
skip_if_not_installed("quanteda.textmodels")
if (requireNamespace("quanteda.textmodels", quietly = TRUE)) {
ca_values <- quanteda.textmodels::textmodel_ca(mini_dfm)$rowcoord[, 1]
expect_equal(order(ca_values), indices)
}
})
## split_tab_by_chisq
tab <- convert(mini_dfm, to = "matrix")
storage.mode(tab) <- "integer"
res_split <- rainette:::cpp_split_tab(m, indices)
test_that("split_tab", {
manual_res <- data.frame(index = -1, chisq = -1)
for (i in 1:6) {
tab1 <- m[indices[1:i], , drop = FALSE]
tab2 <- m[indices[(i + 1):7], , drop = FALSE]
expect_warning(chisq <- chisq.test(rbind(colSums(tab1), colSums(tab2)))$statistic)
manual_res <- rbind(manual_res, c(index = indices[i], chisq = chisq))
}
max <- which.max(manual_res$chisq)
expect_equal(res_split$max_index, manual_res$index[max])
expect_equal(res_split$max_chisq, manual_res$chisq[max], tolerance = 1e-06)
})
## switch_docs
test_that("switch_docs", {
## Wrong max_index et max_chisq
max_chisq <- 4.5
max_index <- 1
res_switch <- rainette:::switch_docs(tab, indices, max_index, max_chisq)
expect_equal(res_switch$indices1, c(3, 6))
expect_equal(res_switch$indices2, c(7, 4, 5, 2, 1))
expect_equal(res_switch$chisq, res_split$max_chisq)
})
## select_features
test_that("select_features", {
res_switch <- rainette:::switch_docs(tab, indices, res_split$max_index, res_split$max_chisq)
indices1 <- res_switch$indices1
indices2 <- res_switch$indices2
res <- rainette:::select_features(tab, indices1, indices2)
expect_equal(res$cols1, character(0))
expect_equal(res$cols2, c("feat2", "feat3", "feat4"))
indices1 <- c(4, 3, 5, 6, 2)
indices2 <- c(7, 1)
res <- rainette:::select_features(tab, indices1, indices2)
expect_equal(res$cols1, c("feat1", "feat3", "feat4"))
expect_equal(res$cols2, character(0))
})
## rainette
test_that("rainette on mini_dfm", {
res <- rainette(mini_dfm, k = 2, min_segment_size = 1, min_split_members = 3)
expect_is(res, c("rainette", "hclust"))
expect_equal(res$group, c(2, 2, 1, 2, 2, 1, 2))
expect_equal(which(res$group == 1), indices[1:which(indices == res_split$max_index)])
expect_equal(res$height, res_split$max_chisq)
})
mini_corpus <- head(data_corpus_inaugural, n = 2)
mini_corpus <- split_segments(mini_corpus)
dtm <- dfm(tokens(mini_corpus, remove_punct = TRUE), tolower = TRUE)
dtm <- dfm_remove(dtm, stopwords("en"))
dtm <- dfm_wordstem(dtm, language = "english")
dtm <- dfm_trim(dtm, min_termfreq = 3)
test_that("Stopping if tab too small", {
expect_message(
res <- rainette(dtm, k = 12, min_segment_size = 10, min_split_members = 3),
"! Tab to be splitted is not big enough. Stopping after iteration"
)
expect_equal(max(res$group), 9)
})
test_that("Stopping if no more group > min_split_members", {
expect_message(
res <- rainette(dtm, k = 3, min_segment_size = 10, min_split_members = 12),
"^! No more group bigger than min_split_members. Stopping after iteration"
)
expect_equal(max(res$group), 2)
expect_message(
res <- rainette(dtm, k = 3, min_segment_size = 10, min_split_members = 15),
"! No computed clusters. Returning NULL."
)
expect_null(res)
})
test_that("all uce belong to the same group", {
res <- rainette(dtm, k = 3, min_segment_size = 5)
dtm_uc <- rainette:::merge_segments(dtm, min_segment_size = 5)
expect_equal(anyDuplicated(unique(cbind(docvars(dtm_uc, "rainette_uc_id"), res$group))), 0)
expect_equal(anyDuplicated(unique(cbind(docvars(dtm_uc, "rainette_uce_id"), res$group))), 0)
})
corpus <- head(data_corpus_inaugural, n = 10)
corpus <- split_segments(corpus)
dtm <- dfm(tokens(corpus, remove_punct = TRUE), tolower = TRUE)
dtm <- dfm_remove(dtm, stopwords("en"))
dtm <- dfm_trim(dtm, min_termfreq = 3)
test_that("groups correspondance is ok", {
res <- rainette(dtm, k = 6, min_segment_size = 5)
df <- data.frame(res$uce_groups)
tab <- table(df[, 4], df[, 5])
expect_equal(sum(tab > 0), 6)
tab <- table(df[, 3], df[, 4])
expect_equal(sum(tab > 0), 5)
})
## Special cases
test_that("dfm with 'document' feature is ok", {
m_document <- m
colnames(m_document) <- c("document", "éé", "ùù", "feat4", "feat5")
dfm_document <- as.dfm(m_document)
res_document <- rainette(dfm_document, k = 2, min_segment_size = 1, min_split_members = 3)
res <- rainette(mini_dfm, k = 2, min_segment_size = 1, min_split_members = 3)
expect_equal(res$group, res_document$group)
expect_equal(res$height, res_document$height)
})
## min_members deprecation
test_that("min_members is deprecated", {
expect_warning(
res_deprecated <- rainette(mini_dfm, k = 2, min_segment_size = 1, min_members = 3),
"min_members is deprecated. Use min_split_members instead."
)
res_deprecated$call <- NULL
res <- rainette(mini_dfm, k = 2, min_segment_size = 1, min_split_members = 3)
res$call <- NULL
expect_equal(
res,
res_deprecated
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.