tests/testthat/test-convert.R

txt_test <- c(text1 = "The new law included a capital gains tax, and an inheritance tax.",
              text2 = "New York City has raised a taxes: an income tax and a sales tax.")
dfmat_test <- dfm(tokens(txt_test, remove_punct = TRUE))

test_that("test STM package converter", {
    skip_if_not_installed("stm")
    skip_if_not_installed("tm")

    stmdfm <- convert(dfmat_test, to = "stm")
    stmtp <- stm::textProcessor(txt_test, removestopwords = FALSE, verbose = FALSE,
                                stem = FALSE, wordLengths = c(1, Inf))
    expect_equivalent(stmdfm$documents[1], stmtp$documents[1])
    expect_equivalent(stmdfm$documents[2], stmtp$documents[2])
    expect_equivalent(stmdfm$vocab, stmtp$vocab)
    
    expect_error(convert(dfmat_test, to = "stm", omit_empty = logical()),
                 "The length of omit_empty must be 1")
    expect_error(convert(dfmat_test, to = "stm", docid_field = c("field1", "field2")),
                 "The length of docid_field must be 1")
})

test_that("docvars error traps work", {
    expect_error(
        convert(data_dfm_lbgexample, docvars = "ERROR"),
        "docvars must be a data.frame"
    )
    expect_error(
        convert(data_dfm_lbgexample, docvars = data.frame(error = c(1, 2))),
        "docvars must have the same number of rows as ndoc\\(x\\)"
    )
})

test_that("test STM package converter with metadata", {
    skip_if_not_installed("stm")
    skip_if_not_installed("tm")
    dat <- data.frame(myvar = c("A", "B"))
    corp <- corpus(txt_test, docvars = dat)
    dfmat <- dfm(tokens(corp, remove_punct = TRUE))
    stmdfm <- convert(dfmat, to = "stm")
    stmtp <- stm::textProcessor(txt_test, removestopwords = FALSE, verbose = FALSE,
                                stem = FALSE, wordLengths = c(1, Inf))
    expect_equivalent(stmdfm$documents[1], stmtp$documents[1])
    expect_equivalent(stmdfm$documents[2], stmtp$documents[2])
    expect_equivalent(stmdfm$vocab, stmtp$vocab)
    expect_identical(stmdfm$meta, dat)
})

test_that("test STM package converter with metadata w/zero-count document", {
    skip_if_not_installed("stm")
    skip_if_not_installed("tm")
    txt_test2 <- c(text1 = "The new law included a capital gains tax, and an inheritance tax.",
                  text2 = ";",  # this will become empty
                  text3 = "New York City has raised a taxes: an income tax and a sales tax.")
    dat <- data.frame(myvar = c("A", "B", "C"))
    corp <- corpus(txt_test2, docvars = dat)
    dfmat <- dfm(tokens(corp, remove_punct = TRUE))
    expect_true(ntoken(dfmat)[2] == 0)

    stmdfm <- suppressWarnings(convert(dfmat, to = "stm"))
    stmtp <- stm::textProcessor(txt_test, removestopwords = FALSE, verbose = FALSE,
                             stem = FALSE, wordLengths = c(1, Inf))
    expect_equivalent(stmdfm$documents[1], stmtp$documents[1])
    expect_equivalent(stmdfm$documents[2], stmtp$documents[2])
    expect_equivalent(stmdfm$vocab, stmtp$vocab)
    expect_identical(stmdfm$meta, dat[-2, , drop = FALSE])
})

test_that("test tm package converter", {
    skip_if_not_installed("tm")
    dtmq <- convert(dfmat_test[, order(featnames(dfmat_test))], to = "tm")
    dtmtm <- tm::DocumentTermMatrix(tm::VCorpus(tm::VectorSource(char_tolower(txt_test))),
                                    control = list(removePunctuation = TRUE,
                                                   wordLengths = c(1, Inf)))
    ## FAILS
    # expect_equivalent(dtmq, dfmtm)
    expect_equivalent(as.matrix(dtmq), as.matrix(dtmtm))
})

test_that("test lda package converter", {
    skip_if_not_installed("tm")
    expect_identical(convert(dfmat_test, to = "topicmodels"), quanteda:::dfm2dtm(dfmat_test))
})

test_that("test topicmodels package converter", {
    skip_if_not_installed("tm")
    expect_identical(convert(dfmat_test, to = "lda"), quanteda:::dfm2lda(dfmat_test))
})

test_that("test austin package converter", {
    expect_identical(convert(dfmat_test, to = "austin"), 
                     structure(as.matrix(dfmat_test), class = c("wfm", "matrix"),
                               dimnames = c(list(docs = docnames(dfmat_test), 
                                                 words = featnames(dfmat_test)))))
})

test_that("test lsa converter", {
    skip_if_not_installed("lsa")
    require(lsa)
    # create some files
    td <- tempfile()
    dir.create(td)
    write(c("cat", "dog", "mouse"), file = paste(td, "D1", sep = "/"))
    write(c("hamster", "mouse", "sushi"), file = paste(td, "D2", sep = "/"))
    write(c("dog", "monster", "monster"), file = paste(td, "D3", sep = "/"))
    # read them, create a document-term matrix
    lsamat <- lsa::textmatrix(td)

    dfmat <- dfm(tokens(c(D1 = c("cat dog mouse"),
                          D2 = c("hamster mouse sushi"),
                          D3 = c("dog monster monster"))))
    # guarantee sort order
    # was temporarily required when tests broke following release of R 4.0
    # dfmat <- dfmat[, sort(featnames(dfmat))]
    lsamat2 <- convert(dfmat, to = "lsa")
    expect_equivalent(lsamat, lsamat2)
})

test_that("test stm converter: under extreme situations ", {
    #zero-count document
    dfmat1 <- as.dfm(matrix(c(1, 0, 2, 0,
                             0, 0, 1, 2,
                             0, 0, 0, 0,
                             1, 2, 3, 4), byrow = TRUE, nrow = 4))
    expect_warning(
        convert(dfmat1, to = "stm"), 
        "Dropped 4 empty document(s)",
        fixed = TRUE
    )

    #zero-count feature
    dfmat2 <- as.dfm(matrix(c(1, 0, 2, 0,
                             0, 0, 1, 2,
                             1, 0, 0, 0,
                             1, 0, 3, 4), byrow = TRUE, nrow = 4))
    expect_warning(
        convert(dfmat2, to = "stm"), 
        "Dropped 4 zero-count feature(s)",
        fixed = TRUE
    )

    # when dfm is 0% sparse
    stmdfm <- convert(as.dfm(matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)), to = "stm")
    expect_equal(length(stmdfm$documents), 3)
})

test_that("lsa converter works under extreme situations", {
    skip_if_not_installed("lsa")
    require(lsa)
    #zero-count document
    dfmat1 <- as.dfm(matrix(c(1, 0, 2, 0,
                             0, 0, 1, 2,
                             0, 0, 0, 0,
                             1, 2, 3, 4), byrow = TRUE, nrow = 4))
    # lsa handles empty docs with a warning message
    expect_warning(lsa1 <- lsa::lsa(convert(dfmat1, to = "lsa")), 
                   "there are singular values which are zero")
    expect_equal(class(lsa1), "LSAspace")

    #zero-count feature:
    dfmat2 <- as.dfm(matrix(c(1, 0, 2, 0,
                             0, 0, 1, 2,
                             1, 0, 0, 0,
                             1, 0, 3, 4), byrow = TRUE, nrow = 4))
    expect_warning(lsa2 <- lsa::lsa(convert(dfmat2, to = "lsa")), "there are singular values which are zero")
    expect_equal(class(lsa2), "LSAspace")

    #when dfm is 0% sparse
    lsadfm <- convert(as.dfm(matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)), to = "lsa")
    expect_equal(suppressWarnings(class(lsa(lsadfm))), "LSAspace")
})

test_that("topicmodels converter works under extreme situations", {
    # skip_on_os("mac") 
    skip_if_not_installed("topicmodels")
    require(topicmodels)
    #zero-count document
    mydfm <- as.dfm(matrix(c(1, 0, 2, 0,
                             0, 0, 1, 2,
                             0, 0, 0, 0,
                             1, 2, 3, 4), byrow = TRUE, nrow = 4))
    motifresult <- LDA(convert(mydfm, to = "topicmodels"), k = 3)
    expect_equivalent(class(motifresult), "LDA_VEM")

    #zero-count feature:topicmodels takes the input matrix correctly, just it shouldn't return feat2 as topic words
    mydfm <- as.dfm(matrix(c(1, 0, 2, 0,
                             0, 0, 1, 2,
                             1, 0, 0, 0,
                             1, 0, 3, 4), byrow = TRUE, nrow = 4))
    motifresult <- LDA(convert(mydfm, to = "topicmodels"), k = 3)
    expect_equivalent(class(motifresult), "LDA_VEM")

    #when dfm is 0% sparse
    motifdfm <- convert(as.dfm(matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)), to = "topicmodels")
    motifresult <- LDA(motifdfm, 3)
    expect_equivalent(class(motifresult), "LDA_VEM")
})

test_that("lda converter works under extreme situations", {
    skip_if_not_installed("lda")
    require(lda)
    #zero-count document
    mydfm <- as.dfm(matrix(c(1, 0, 2, 0,
                             0, 0, 1, 2,
                             0, 0, 0, 0,
                             1, 2, 3, 4), byrow = TRUE, nrow = 4))
    ldadfm <- convert(mydfm, to = "lda")
    ldaresult <- lda.collapsed.gibbs.sampler(ldadfm$documents, 5, ldadfm$vocab, 25, 0.1, 0.1,
                                             compute.log.likelihood = TRUE)
    top_words <- top.topic.words(ldaresult$topics, 4, by.score = TRUE)
    expect_equal(dim(top_words), c(4, 5))

    #zero-count feature: lda takes the input matrix correctly, just it shouldn't return feat2 as topic words
    mydfm <- as.dfm(matrix(c(1, 0, 2, 0,
                             0, 0, 1, 2,
                             1, 0, 0, 0,
                             1, 0, 3, 4), byrow = TRUE, nrow = 4))
    ldadfm <- convert(mydfm, to = "lda")
    ldaresult <- lda.collapsed.gibbs.sampler(ldadfm$documents, 5, ldadfm$vocab, 25, 0.1, 0.1,
                                             compute.log.likelihood = TRUE)
    top_words <- top.topic.words(ldaresult$topics, 5, by.score = TRUE)
    expect_equal(dim(top_words), c(5, 5))

    #when dfm is 0% sparse
    motifdfm <- convert(as.dfm(matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)), to = "lda")
    ldadfm <- convert(mydfm, to = "lda")
    ldaresult <- lda.collapsed.gibbs.sampler(ldadfm$documents, 5, ldadfm$vocab, 25, 0.1, 0.1,
                                             compute.log.likelihood = TRUE)
    top_words <- top.topic.words(ldaresult$topics, 5, by.score = TRUE)
    expect_equal(dim(top_words), c(5, 5))
})

test_that("tm converter works under extreme situations", {
    skip_if_not_installed("tm")
    #zero-count document
    amatrix <- matrix(c(1, 0, 2, 0,
                        0, 0, 1, 2,
                        0, 0, 0, 0,
                        1, 2, 3, 4), byrow = TRUE, nrow = 4)
    tmdfm <- convert(as.dfm(amatrix), to = "tm")
    expect_equivalent(as.matrix(tmdfm), amatrix)

    #zero-count feature:
    bmatrix <- matrix(c(1, 0, 2, 0,
                        0, 0, 1, 2,
                        1, 0, 0, 0,
                        1, 0, 3, 4), byrow = TRUE, nrow = 4)
    tmdfm <- convert(as.dfm(bmatrix), to = "tm")
    expect_equivalent(as.matrix(tmdfm), bmatrix)

    #when dfm is 0% sparse
    cmatrix <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)
    tmdfm <- convert(as.dfm(cmatrix), to = "tm")
    expect_equivalent(as.matrix(tmdfm), cmatrix)
})

test_that("weighted dfm is not convertible to a topic model format (#1091)", {
    err_msg <- "cannot convert a non-count dfm to a topic model format"

    expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "stm"), err_msg)
    expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "topicmodels"), err_msg)
    expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "lda"), err_msg)
    expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "stm"), err_msg)
    expect_error(convert(dfm_weight(dfmat_test, "prop"), to = "stm"), err_msg)

    expect_error(convert(dfm_tfidf(dfmat_test), to = "stm"), err_msg)
})

test_that("triplet converter works", {
    mt <- dfm(tokens(c("a b c", "c c d")))
    expect_identical(convert(mt, to = "tripletlist"),
                     list(document = c(rep("text1", 3), rep("text2", 2)),
                          feature = c("a", "b", "c", "c", "d"),
                          frequency = c(1, 1, 1, 2, 1)
                     ))

})

test_that("omit_empty works as expected (#1600", {
    skip_if_not_installed("tm")
    dfmat <- as.dfm(matrix(c(1, 0, 2, 0,
                             0, 0, 1, 2,
                             0, 0, 0, 0,
                             1, 2, 3, 4), byrow = TRUE, nrow = 4))
    expect_equal(
        dim(convert(dfmat, to = "topicmodels", omit_empty = TRUE)),
        c(3, 4)
    )
    expect_equal(
        dim(convert(dfmat, to = "topicmodels", omit_empty = FALSE)),
        c(4, 4)
    )

    expect_equal(length(convert(dfmat, to = "lda", omit_empty = TRUE)$documents), 3)
    expect_equal(length(convert(dfmat, to = "lda", omit_empty = FALSE)$documents), 4)

    expect_error(
        quanteda:::dfm2stm(dfmat, omit_empty = FALSE),
        "omit_empty = FALSE not implemented for STM format"
    )

    expect_warning(convert(dfmat, to = "stm", omit_empty = TRUE), "omit_empty not used")
    expect_warning(convert(dfmat, to = "tm", omit_empty = TRUE), "omit_empty not used")
    expect_warning(convert(dfmat, to = "austin", omit_empty = TRUE), "omit_empty not used")
    expect_warning(convert(dfmat, to = "lsa", omit_empty = TRUE), "omit_empty not used")
    expect_warning(convert(dfmat, to = "data.frame", omit_empty = TRUE), "omit_empty not used")
    expect_warning(convert(dfmat, to = "tripletlist", omit_empty = TRUE), "omit_empty not used")
})

test_that("convert.corpus works", {
    corp <- corpus(c(d1 = "Text one.", d2 = "Text two."),
                   docvars = data.frame(dvar1 = 1:2, dvar2 = c("one", "two"),
                                        stringsAsFactors = FALSE))
    expect_identical(
        data.frame(doc_id = c("d1", "d2"),
                   text = c("Text one.", "Text two."),
                   dvar1 = 1:2, dvar2 = c("one", "two"),
                   stringsAsFactors = FALSE),
        convert(corp, to = "data.frame")
    )
    expect_warning(
        convert(corp, to = "data.frame", nothing = TRUE),
        "^nothing argument is not used",
    )
    expect_warning(
        convert(dfm(tokens(corp)), to = "data.frame", nothing = TRUE),
        "^nothing argument is not used",
    )

    expect_identical(
        convert(corp, to = "json", pretty = TRUE) |> as.character(),
        '[
  {
    "doc_id": "d1",
    "text": "Text one.",
    "dvar1": 1,
    "dvar2": "one"
  },
  {
    "doc_id": "d2",
    "text": "Text two.",
    "dvar1": 2,
    "dvar2": "two"
  }
]'
        )
    expect_identical(
        convert(corp, to = "json") |> as.character(),
        '[{"doc_id":"d1","text":"Text one.","dvar1":1,"dvar2":"one"},{"doc_id":"d2","text":"Text two.","dvar1":2,"dvar2":"two"}]'
    )
    expect_error(convert(corp, to = "json", prett = logical()),
                 "The length of pretty must be 1")
})

test_that("convert to = data.frame works", {
    dfmat <- dfm(tokens(c(d1 = "this is a fine document",
                   d2 = "this is a fine feature")))
    expect_identical(
        convert(dfmat, to = "data.frame"),
        data.frame(
            doc_id = c("d1", "d2"),
            this = c(1, 1),
            is = c(1, 1),
            a = c(1, 1),
            fine = c(1, 1),
            document = c(1, 0),
            feature = c(0, 1), stringsAsFactors = FALSE
        )
    )
    expect_identical(
      convert(dfmat, to = "data.frame", docid_field = "__document"),
      data.frame(
        "__document" = c("d1", "d2"),
        this = c(1, 1),
        is = c(1, 1),
        a = c(1, 1),
        fine = c(1, 1),
        document = c(1, 0),
        feature = c(0, 1), 
        stringsAsFactors = FALSE, check.names = FALSE
      )
    )
    expect_error(
        convert(dfmat, to = "data.frame", docid_field = "document"),
        "'document' matches a feature in the dfm; use a different docid_field value"
    )
})
quanteda/quanteda documentation built on April 15, 2024, 7:59 a.m.