Nothing
txt_test <- c(text1 = "The new law included a capital gains tax, and an inheritance tax.",
text2 = "New York City has raised a taxes: an income tax and a sales tax.")
dfmat_test <- dfm(tokens(txt_test, remove_punct = TRUE))
test_that("test STM package converter", {
skip_if_not_installed("stm")
skip_if_not_installed("tm")
stmdfm <- convert(dfmat_test, to = "stm")
stmtp <- stm::textProcessor(txt_test, removestopwords = FALSE, verbose = FALSE,
stem = FALSE, wordLengths = c(1, Inf))
expect_equivalent(stmdfm$documents[1], stmtp$documents[1])
expect_equivalent(stmdfm$documents[2], stmtp$documents[2])
expect_equivalent(stmdfm$vocab, stmtp$vocab)
expect_error(convert(dfmat_test, to = "stm", omit_empty = logical()),
"The length of omit_empty must be 1")
expect_error(convert(dfmat_test, to = "stm", docid_field = c("field1", "field2")),
"The length of docid_field must be 1")
})
test_that("docvars error traps work", {
expect_error(
convert(data_dfm_lbgexample, docvars = "ERROR"),
"docvars must be a data.frame"
)
expect_error(
convert(data_dfm_lbgexample, docvars = data.frame(error = c(1, 2))),
"docvars must have the same number of rows as ndoc\\(x\\)"
)
})
test_that("test STM package converter with metadata", {
skip_if_not_installed("stm")
skip_if_not_installed("tm")
dat <- data.frame(myvar = c("A", "B"))
corp <- corpus(txt_test, docvars = dat)
dfmat <- dfm(tokens(corp, remove_punct = TRUE))
stmdfm <- convert(dfmat, to = "stm")
stmtp <- stm::textProcessor(txt_test, removestopwords = FALSE, verbose = FALSE,
stem = FALSE, wordLengths = c(1, Inf))
expect_equivalent(stmdfm$documents[1], stmtp$documents[1])
expect_equivalent(stmdfm$documents[2], stmtp$documents[2])
expect_equivalent(stmdfm$vocab, stmtp$vocab)
expect_identical(stmdfm$meta, dat)
})
test_that("test STM package converter with metadata w/zero-count document", {
skip_if_not_installed("stm")
skip_if_not_installed("tm")
txt_test2 <- c(text1 = "The new law included a capital gains tax, and an inheritance tax.",
text2 = ";", # this will become empty
text3 = "New York City has raised a taxes: an income tax and a sales tax.")
dat <- data.frame(myvar = c("A", "B", "C"))
corp <- corpus(txt_test2, docvars = dat)
dfmat <- dfm(tokens(corp, remove_punct = TRUE))
expect_true(ntoken(dfmat)[2] == 0)
stmdfm <- suppressWarnings(convert(dfmat, to = "stm"))
stmtp <- stm::textProcessor(txt_test, removestopwords = FALSE, verbose = FALSE,
stem = FALSE, wordLengths = c(1, Inf))
expect_equivalent(stmdfm$documents[1], stmtp$documents[1])
expect_equivalent(stmdfm$documents[2], stmtp$documents[2])
expect_equivalent(stmdfm$vocab, stmtp$vocab)
expect_identical(stmdfm$meta, dat[-2, , drop = FALSE])
})
test_that("test tm package converter", {
skip_if_not_installed("tm")
dtmq <- convert(dfmat_test[, order(featnames(dfmat_test))], to = "tm")
dtmtm <- tm::DocumentTermMatrix(tm::VCorpus(tm::VectorSource(char_tolower(txt_test))),
control = list(removePunctuation = TRUE,
wordLengths = c(1, Inf)))
## FAILS
# expect_equivalent(dtmq, dfmtm)
expect_equivalent(as.matrix(dtmq), as.matrix(dtmtm))
})
test_that("test lda package converter", {
skip_if_not_installed("tm")
expect_identical(convert(dfmat_test, to = "topicmodels"), quanteda:::dfm2dtm(dfmat_test))
})
test_that("test topicmodels package converter", {
skip_if_not_installed("tm")
expect_identical(convert(dfmat_test, to = "lda"), quanteda:::dfm2lda(dfmat_test))
})
test_that("test austin package converter", {
expect_identical(convert(dfmat_test, to = "austin"),
structure(as.matrix(dfmat_test), class = c("wfm", "matrix"),
dimnames = c(list(docs = docnames(dfmat_test),
words = featnames(dfmat_test)))))
})
test_that("test lsa converter", {
skip_if_not_installed("lsa")
require(lsa)
# create some files
td <- tempfile()
dir.create(td)
write(c("cat", "dog", "mouse"), file = paste(td, "D1", sep = "/"))
write(c("hamster", "mouse", "sushi"), file = paste(td, "D2", sep = "/"))
write(c("dog", "monster", "monster"), file = paste(td, "D3", sep = "/"))
# read them, create a document-term matrix
lsamat <- lsa::textmatrix(td)
dfmat <- dfm(tokens(c(D1 = c("cat dog mouse"),
D2 = c("hamster mouse sushi"),
D3 = c("dog monster monster"))))
# guarantee sort order
# was temporarily required when tests broke following release of R 4.0
# dfmat <- dfmat[, sort(featnames(dfmat))]
lsamat2 <- convert(dfmat, to = "lsa")
expect_equivalent(lsamat, lsamat2)
})
test_that("test stm converter: under extreme situations ", {
#zero-count document
dfmat1 <- as.dfm(matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
0, 0, 0, 0,
1, 2, 3, 4), byrow = TRUE, nrow = 4))
expect_warning(
convert(dfmat1, to = "stm"),
"Dropped 4 empty document(s)",
fixed = TRUE
)
#zero-count feature
dfmat2 <- as.dfm(matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
1, 0, 0, 0,
1, 0, 3, 4), byrow = TRUE, nrow = 4))
expect_warning(
convert(dfmat2, to = "stm"),
"Dropped 4 zero-count feature(s)",
fixed = TRUE
)
# when dfm is 0% sparse
stmdfm <- convert(as.dfm(matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)), to = "stm")
expect_equal(length(stmdfm$documents), 3)
})
test_that("lsa converter works under extreme situations", {
skip_if_not_installed("lsa")
require(lsa)
#zero-count document
dfmat1 <- as.dfm(matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
0, 0, 0, 0,
1, 2, 3, 4), byrow = TRUE, nrow = 4))
# lsa handles empty docs with a warning message
expect_warning(lsa1 <- lsa::lsa(convert(dfmat1, to = "lsa")),
"there are singular values which are zero")
expect_equal(class(lsa1), "LSAspace")
#zero-count feature:
dfmat2 <- as.dfm(matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
1, 0, 0, 0,
1, 0, 3, 4), byrow = TRUE, nrow = 4))
expect_warning(lsa2 <- lsa::lsa(convert(dfmat2, to = "lsa")), "there are singular values which are zero")
expect_equal(class(lsa2), "LSAspace")
#when dfm is 0% sparse
lsadfm <- convert(as.dfm(matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)), to = "lsa")
expect_equal(suppressWarnings(class(lsa(lsadfm))), "LSAspace")
})
test_that("topicmodels converter works under extreme situations", {
# skip_on_os("mac")
skip_if_not_installed("topicmodels")
require(topicmodels)
#zero-count document
mydfm <- as.dfm(matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
0, 0, 0, 0,
1, 2, 3, 4), byrow = TRUE, nrow = 4))
motifresult <- LDA(convert(mydfm, to = "topicmodels"), k = 3)
expect_equivalent(class(motifresult), "LDA_VEM")
#zero-count feature:topicmodels takes the input matrix correctly, just it shouldn't return feat2 as topic words
mydfm <- as.dfm(matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
1, 0, 0, 0,
1, 0, 3, 4), byrow = TRUE, nrow = 4))
motifresult <- LDA(convert(mydfm, to = "topicmodels"), k = 3)
expect_equivalent(class(motifresult), "LDA_VEM")
#when dfm is 0% sparse
motifdfm <- convert(as.dfm(matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)), to = "topicmodels")
motifresult <- LDA(motifdfm, 3)
expect_equivalent(class(motifresult), "LDA_VEM")
})
test_that("lda converter works under extreme situations", {
skip_if_not_installed("lda")
require(lda)
#zero-count document
mydfm <- as.dfm(matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
0, 0, 0, 0,
1, 2, 3, 4), byrow = TRUE, nrow = 4))
ldadfm <- convert(mydfm, to = "lda")
ldaresult <- lda.collapsed.gibbs.sampler(ldadfm$documents, 5, ldadfm$vocab, 25, 0.1, 0.1,
compute.log.likelihood = TRUE)
top_words <- top.topic.words(ldaresult$topics, 4, by.score = TRUE)
expect_equal(dim(top_words), c(4, 5))
#zero-count feature: lda takes the input matrix correctly, just it shouldn't return feat2 as topic words
mydfm <- as.dfm(matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
1, 0, 0, 0,
1, 0, 3, 4), byrow = TRUE, nrow = 4))
ldadfm <- convert(mydfm, to = "lda")
ldaresult <- lda.collapsed.gibbs.sampler(ldadfm$documents, 5, ldadfm$vocab, 25, 0.1, 0.1,
compute.log.likelihood = TRUE)
top_words <- top.topic.words(ldaresult$topics, 5, by.score = TRUE)
expect_equal(dim(top_words), c(5, 5))
#when dfm is 0% sparse
motifdfm <- convert(as.dfm(matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)), to = "lda")
ldadfm <- convert(mydfm, to = "lda")
ldaresult <- lda.collapsed.gibbs.sampler(ldadfm$documents, 5, ldadfm$vocab, 25, 0.1, 0.1,
compute.log.likelihood = TRUE)
top_words <- top.topic.words(ldaresult$topics, 5, by.score = TRUE)
expect_equal(dim(top_words), c(5, 5))
})
test_that("tm converter works under extreme situations", {
skip_if_not_installed("tm")
#zero-count document
amatrix <- matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
0, 0, 0, 0,
1, 2, 3, 4), byrow = TRUE, nrow = 4)
tmdfm <- convert(as.dfm(amatrix), to = "tm")
expect_equivalent(as.matrix(tmdfm), amatrix)
#zero-count feature:
bmatrix <- matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
1, 0, 0, 0,
1, 0, 3, 4), byrow = TRUE, nrow = 4)
tmdfm <- convert(as.dfm(bmatrix), to = "tm")
expect_equivalent(as.matrix(tmdfm), bmatrix)
#when dfm is 0% sparse
cmatrix <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)
tmdfm <- convert(as.dfm(cmatrix), to = "tm")
expect_equivalent(as.matrix(tmdfm), cmatrix)
})
test_that("weighted dfm is not convertible to a topic model format (#1091)", {
err_msg <- "cannot convert a non-count dfm to a topic model format"
expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "stm"), err_msg)
expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "topicmodels"), err_msg)
expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "lda"), err_msg)
expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "stm"), err_msg)
expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "stm"), err_msg)
expect_error(convert(dfm_tfidf(dfmat_test), to = "stm"), err_msg)
})
test_that("triplet converter works", {
mt <- dfm(tokens(c("a b c", "c c d")))
expect_identical(convert(mt, to = "tripletlist"),
list(document = c(rep("text1", 3), rep("text2", 2)),
feature = c("a", "b", "c", "c", "d"),
frequency = c(1, 1, 1, 2, 1)
))
})
test_that("omit_empty works as expected (#1600", {
skip_if_not_installed("tm")
dfmat <- as.dfm(matrix(c(1, 0, 2, 0,
0, 0, 1, 2,
0, 0, 0, 0,
1, 2, 3, 4), byrow = TRUE, nrow = 4))
expect_equal(
dim(convert(dfmat, to = "topicmodels", omit_empty = TRUE)),
c(3, 4)
)
expect_equal(
dim(convert(dfmat, to = "topicmodels", omit_empty = FALSE)),
c(4, 4)
)
expect_equal(length(convert(dfmat, to = "lda", omit_empty = TRUE)$documents), 3)
expect_equal(length(convert(dfmat, to = "lda", omit_empty = FALSE)$documents), 4)
expect_error(
quanteda:::dfm2stm(dfmat, omit_empty = FALSE),
"omit_empty = FALSE not implemented for STM format"
)
expect_warning(convert(dfmat, to = "stm", omit_empty = TRUE), "omit_empty not used")
expect_warning(convert(dfmat, to = "tm", omit_empty = TRUE), "omit_empty not used")
expect_warning(convert(dfmat, to = "austin", omit_empty = TRUE), "omit_empty not used")
expect_warning(convert(dfmat, to = "lsa", omit_empty = TRUE), "omit_empty not used")
expect_warning(convert(dfmat, to = "data.frame", omit_empty = TRUE), "omit_empty not used")
expect_warning(convert(dfmat, to = "tripletlist", omit_empty = TRUE), "omit_empty not used")
})
test_that("convert.corpus works", {
corp <- corpus(c(d1 = "Text one.", d2 = "Text two."),
docvars = data.frame(dvar1 = 1:2, dvar2 = c("one", "two"),
stringsAsFactors = FALSE))
expect_identical(
data.frame(doc_id = c("d1", "d2"),
text = c("Text one.", "Text two."),
dvar1 = 1:2, dvar2 = c("one", "two"),
stringsAsFactors = FALSE),
convert(corp, to = "data.frame")
)
expect_warning(
convert(corp, to = "data.frame", nothing = TRUE),
"^nothing argument is not used",
)
expect_warning(
convert(dfm(tokens(corp)), to = "data.frame", nothing = TRUE),
"^nothing argument is not used",
)
expect_identical(
convert(corp, to = "json", pretty = TRUE) |> as.character(),
'[
{
"doc_id": "d1",
"text": "Text one.",
"dvar1": 1,
"dvar2": "one"
},
{
"doc_id": "d2",
"text": "Text two.",
"dvar1": 2,
"dvar2": "two"
}
]'
)
expect_identical(
convert(corp, to = "json") |> as.character(),
'[{"doc_id":"d1","text":"Text one.","dvar1":1,"dvar2":"one"},{"doc_id":"d2","text":"Text two.","dvar1":2,"dvar2":"two"}]'
)
expect_error(convert(corp, to = "json", prett = logical()),
"The length of pretty must be 1")
})
test_that("convert to = data.frame works", {
dfmat <- dfm(tokens(c(d1 = "this is a fine document",
d2 = "this is a fine feature")))
expect_identical(
convert(dfmat, to = "data.frame"),
data.frame(
doc_id = c("d1", "d2"),
this = c(1, 1),
is = c(1, 1),
a = c(1, 1),
fine = c(1, 1),
document = c(1, 0),
feature = c(0, 1), stringsAsFactors = FALSE
)
)
expect_identical(
convert(dfmat, to = "data.frame", docid_field = "__document"),
data.frame(
"__document" = c("d1", "d2"),
this = c(1, 1),
is = c(1, 1),
a = c(1, 1),
fine = c(1, 1),
document = c(1, 0),
feature = c(0, 1),
stringsAsFactors = FALSE, check.names = FALSE
)
)
expect_error(
convert(dfmat, to = "data.frame", docid_field = "document"),
"'document' matches a feature in the dfm; use a different docid_field value"
)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.