require(quanteda)
toks <- tokens(data_corpus_moviereviews[1:500],
remove_punct = TRUE,
remove_symbols = TRUE,
remove_number = TRUE)
dfmt <- dfm(toks) %>%
dfm_remove(stopwords(), min_nchar = 2) %>%
dfm_trim(max_docfreq = 0.1, docfreq_type = "prop")
dict <- dictionary(list(romance = c("love*", "couple*"),
sifi = c("alien*", "star", "space")))
set.seed(1234)
lda <- textmodel_lda(dfmt, k = 5)
set.seed(1234)
slda <- textmodel_seededlda(dfmt, dict, residual = TRUE, weight = 0.02,
min_termfreq = 10)
test_that("divergence() is working", {
# in-sample
div1 <- divergence(lda)
expect_equal(div1, 0.33, tolerance = 0.01)
toks_val <- tokens(data_corpus_moviereviews[501:600],
remove_punct = TRUE,
remove_symbols = TRUE,
remove_number = TRUE)
dfmt_val <- dfm(toks_val) %>%
dfm_remove(stopwords(), min_nchar = 2) %>%
dfm_trim(max_docfreq = 0.1, docfreq_type = "prop")
# out-sample
set.seed(1234)
expect_output(
div2 <- divergence(lda, newdata = dfmt_val, max_iter = 100, verbose = TRUE),
"Fitting LDA with 5 topics.*"
)
expect_equal(div2, 0.34, tolerance = 0.01)
})
test_that("perplexity() is working", {
# in-sample
ppl1 <- perplexity(lda)
expect_equal(ppl1, 7742, tolerance = 1)
toks_val <- tokens(data_corpus_moviereviews[501:600],
remove_punct = TRUE,
remove_symbols = TRUE,
remove_number = TRUE)
dfmt_val <- dfm(toks_val) %>%
dfm_remove(stopwords(), min_nchar = 2) %>%
dfm_trim(max_docfreq = 0.1, docfreq_type = "prop")
# out-sample
set.seed(1234)
expect_output(
ppl2 <- perplexity(lda, newdata = dfmt_val, max_iter = 100, verbose = TRUE),
"Fitting LDA with 5 topics.*"
)
expect_equal(ppl2, 7534, tolerance = 1)
})
test_that("regularize is working", {
# LDA
expect_equal(divergence(lda),
0.34, tolerance = 0.01)
expect_gt(divergence(lda, regularize = FALSE),
divergence(lda, regularize = TRUE))
expect_gt(divergence(lda),
divergence(lda, min_size = 0.1))
expect_equal(divergence(slda),
0.29, tolerance = 0.02)
expect_true(divergence(lda, 0.05) != divergence(lda, 0.01))
expect_error(
divergence(lda, regularize = 1),
"The type of regularize must be logical"
)
expect_silent(divergence(lda, select = c("topic1", "topic2")))
expect_error(
divergence(lda, select = 1:3),
"The type of select must be character"
)
expect_error(
divergence(lda, select = c("topic1")),
"The length of select must be between 2 and 5"
)
expect_error(
divergence(lda, select = c("topic1", "topic2", "xxxx")),
"Selected topics must be in the model"
)
# Seeded LDA
expect_gt(divergence(slda, regularize = FALSE),
divergence(slda, regularize = TRUE))
expect_gt(divergence(slda),
divergence(slda, min_size = 0.1))
expect_true(divergence(slda, 0.05) != divergence(slda, 0.01))
expect_error(
divergence(slda, regularize = 1),
"The type of regularize must be logical"
)
expect_silent(divergence(slda, select = c("romance", "sifi")))
expect_error(
divergence(slda, select = 1:3),
"The type of select must be character"
)
expect_error(
divergence(slda, select = c("romance")),
"The length of select must be between 2 and 3"
)
expect_error(
divergence(slda, select = c("romance", "sifi", "xxxx")),
"Selected topics must be in the model"
)
})
test_that("sizes() is working", {
size1 <- sizes(lda)
expect_true(is.numeric(size1))
expect_equal(names(size1), c("topic1", "topic2", "topic3", "topic4", "topic5"))
expect_equal(sum(size1), 1)
size2 <- sizes(slda)
expect_true(is.numeric(size2))
expect_equal(names(size2), c("romance", "sifi", "other"))
expect_equal(sum(size2), 1)
})
test_that("get_threads are working", {
options("seededlda_threads" = "abc")
expect_error(
suppressWarnings(seededlda:::get_threads()),
"seededlda_threads must be an integer"
)
options("seededlda_threads" = NA)
expect_error(
seededlda:::get_threads(),
"seededlda_threads must be an integer"
)
## respect other settings
options("seededlda_threads" = NULL)
Sys.setenv("OMP_THREAD_LIMIT" = 2)
expect_equal(
seededlda:::get_threads(),
ifelse(seededlda::info_tbb()$enabled, 2, 1)
)
Sys.unsetenv("OMP_THREAD_LIMIT")
Sys.setenv("RCPP_PARALLEL_NUM_THREADS" = 3)
expect_equal(
seededlda:::get_threads(),
ifelse(seededlda::info_tbb()$enabled, 3, 1)
)
Sys.unsetenv("RCPP_PARALLEL_NUM_THREADS")
options("seededlda_threads" = NULL)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.