Nothing
context("tests of tidy methods for tidylda")
dtm <- nih_sample_dtm
d1 <- dtm[1:50, ]
d2 <- dtm[51:100, ]
# make sure we have different vocabulary for each data set
d1 <- d1[, Matrix::colSums(d1) > 0]
d2 <- d2[, Matrix::colSums(d2) > 0]
lda <- tidylda(
data = d1,
k = 4,
iterations = 20, burnin = 10,
alpha = 0.1, eta = 0.05,
optimize_alpha = TRUE,
calc_likelihood = TRUE,
calc_r2 = TRUE,
return_data = FALSE,
verbose = FALSE
)
### test tidy methods ----
test_that("tidy.tidylda works as expected", {
# tidy beta
tidy_beta <- tidy(
x = lda,
matrix = "beta"
)
expect_named(tidy_beta, c("topic", "token", "beta"))
expect_type(tidy_beta[[1]], "double")
expect_type(tidy_beta[[2]], "character")
expect_type(tidy_beta[[3]], "double")
# log to tidy beta
tidy_beta_log <- tidy(
x = lda,
matrix = "beta",
log = TRUE
)
expect_named(tidy_beta_log, c("topic", "token", "log_beta"))
expect_type(tidy_beta_log[[1]], "double")
expect_type(tidy_beta_log[[2]], "character")
expect_type(tidy_beta_log[[3]], "double")
expect_equal(sum(colnames(lda$beta) %in% tidy_beta$token), length(colnames(lda$beta)))
# tidy theta
tidy_theta <- tidy(
x = lda,
matrix = "theta"
)
expect_named(tidy_theta, c("document", "topic", "theta"))
expect_type(tidy_theta[[1]], "character")
expect_type(tidy_theta[[2]], "double")
expect_type(tidy_theta[[3]], "double")
expect_equal(sum(colnames(lda$theta) %in% tidy_theta$topic), length(colnames(lda$theta)))
# tidy lambda
tidy_lambda <- tidy(
x = lda,
matrix = "lambda"
)
expect_named(tidy_lambda, c("topic", "token", "lambda"))
expect_type(tidy_lambda[[1]], "double")
expect_type(tidy_lambda[[2]], "character")
expect_type(tidy_lambda[[3]], "double")
expect_equal(sum(colnames(lda$lambda) %in% tidy_lambda$token), length(colnames(lda$lambda)))
})
test_that("tidy throws errors for malformed inputs", {
expect_error(
tidy(
x = lda,
matrix = "WRONG"
)
)
expect_error(
tidy(
x = lda,
matrix = 1
)
)
expect_error(
tidy(
x = lda,
matrix = "beta",
log = "WRONG"
)
)
# matrices the same as above
expect_error(
tidy_theta <- tidy(
x = lda$theta,
matrix = "WRONG"
)
)
expect_error(
tidy_theta <- tidy(
x = lda$theta,
matrix = "theta",
log = "WRONG"
)
)
})
### tests for the glance method ----
test_that("glance.tidylda behaves nicely", {
# well-formed call
g <- glance(lda)
expect_named(g, c(
"num_topics", "num_documents", "num_tokens",
"iterations", "burnin"
))
expect_equal(g$num_topics, nrow(lda$beta))
expect_equal(g$num_documents, nrow(lda$theta))
expect_equal(g$num_tokens, ncol(lda$beta))
expect_equal(g$iterations, lda$call$iterations)
expect_equal(g$burnin, lda$call$burnin)
})
test_that("glance works with updated models", {
l2 <- refit(lda, d2, iterations = 20, verbose = FALSE)
g <- glance(l2)
expect_named(g, c(
"num_topics", "num_documents", "num_tokens",
"iterations", "burnin"
))
expect_equal(g$num_topics, nrow(l2$beta))
expect_equal(g$num_documents, nrow(l2$theta))
expect_equal(g$num_tokens, ncol(l2$beta))
expect_equal(g$iterations, l2$call$iterations)
expect_equal(g$burnin, NA)
})
### tests for the glance method ----
# make a tidy tibble
# note this uses unigrams and bigrams to ensure that there isn't
# 100% overlap in vocabulary between data and model
tidy_docs <-
nih_sample[1:10, ] %>%
dplyr::select(APPLICATION_ID, ABSTRACT_TEXT) %>%
tidytext::unnest_tokens(output = word,
input = ABSTRACT_TEXT,
stopwords = tidytext::stop_words$word,
token = "ngrams",
n_min = 1, n = 2)
test_that("augment.tidylda behaves nicely", {
td <- tidy_docs
# tidy tibble input
a <- augment(
x = lda,
data = td,
type = "class",
document_col = colnames(td)[1],
term_col = colnames(td)[2]
)
# right number of rows?
expect_equal(nrow(a), nrow(tidy_docs))
# correctly identified topics?
expect_equal(
a %>%
dplyr::filter(.[[colnames(td)[2]]] %in% colnames(lda$beta)) %>%
dplyr::select(topic) %>%
dplyr::summarise(na_topic = sum(is.na(topic))) %>%
as.numeric,
0,
label = "augment is.na(topic class)"
)
# sparse dtm input
a <- augment(
x = lda,
data = d1,
type = "prob"
)
expect_false(
a %>%
dplyr::filter(term %in% colnames(lda$beta)) %>%
dplyr::select(-c(document, term)) %>%
colSums() %>%
sum %>%
is.na,
label = "augment is.na(topic probs)"
)
# probably should test other dtm types...
})
test_that("augment.tidylda reacts to malformed inputs correctly", {
expect_error(
augment(
x = "wat",
data = d1,
type = "prob"
)
)
expect_error(
augment(
x = lda,
data = "wat",
type = "prob"
)
)
expect_error(
augment(
x = lda,
data = d1,
type = "wat"
)
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.