# tests/testthat/test-dfm_compress.R

test_that("dfm_compress: simple test", {
    # Stack two case-sensitive dfms, then lowercase the feature names so that
    # the combined object carries repeated document and feature labels for
    # dfm_compress() to merge.
    dfmat_first <- dfm(tokens(c("b A A", "C C a b B")), tolower = FALSE, verbose = FALSE)
    dfmat_second <- dfm(tokens("A C C C C C"), tolower = FALSE, verbose = FALSE)
    dfmat <- rbind(dfmat_first, dfmat_second)
    colnames(dfmat) <- char_tolower(featnames(dfmat))

    # Compressing documents only: feature labels are left as they are.
    expected_by_docs <- matrix(
        c(1, 1,  3, 0,  5, 2,  0, 1,  0, 1),
        nrow = 2,
        dimnames = list(docs = c("text1", "text2"), features = featnames(dfmat))
    )
    expect_equal(as.matrix(dfm_compress(dfmat, margin = "documents")), expected_by_docs)

    # Compressing features only: the three document rows are kept.
    expected_by_feats <- matrix(
        c(1, 2, 0,  2, 1, 1,  0, 2, 5),
        nrow = 3,
        dimnames = list(docs = c("text1", "text2", "text1"),
                        features = c("b", "a", "c"))
    )
    expect_equal(as.matrix(dfm_compress(dfmat, margin = "features")), expected_by_feats)

    # Compressing both margins at once.
    expected_by_both <- matrix(
        c(1, 2,  3, 1,  5, 2),
        nrow = 2,
        dimnames = list(docs = c("text1", "text2"), features = c("b", "a", "c"))
    )
    expect_equal(as.matrix(dfm_compress(dfmat, margin = "both")), expected_by_both)
})

test_that("dfm_compress: no effect if no compression needed", {
    # The dimensions of a lowercased dfm built from the first five inaugural
    # texts must be unchanged by dfm_compress().
    dfmat <- dfm(tokens(data_corpus_inaugural[1:5]), tolower = TRUE, verbose = FALSE)
    dfmat_compressed <- dfm_compress(dfmat)
    expect_equal(dim(dfmat), dim(dfmat_compressed))
})

test_that("dfm_compress: empty features are preserved", {
    # Column "a" is all zeros and the label "b" is used for two columns.
    counts <- matrix(c(0, 0, 0,  2, 1, 5,  0, 1, 0,  1, 1, 0), nrow = 3,
                     dimnames = list(docs = paste0("d", 1:3),
                                     features = c("a", "b", "c", "b")))
    dfmat <- new("dfm", Matrix::Matrix(counts, sparse = TRUE))

    # The first feature must still be "a", with a total count of zero.
    feature_totals <- colSums(dfm_compress(dfmat))
    expect_equal(feature_totals[1], c(a = 0))
})

test_that("dfm_compress: empty documents are preserved", {
    # Row "d3" is all zeros and the label "b" is used for two columns.
    counts <- matrix(c(0, 0, 0,  2, 1, 0,  0, 1, 0,  1, 1, 0), nrow = 3,
                     dimnames = list(docs = paste0("d", 1:3),
                                     features = c("a", "b", "c", "b")))
    dfmat <- new("dfm", Matrix::Matrix(counts, sparse = TRUE))

    # The third document must still be "d3", with a total count of zero.
    document_totals <- rowSums(dfm_compress(dfmat))
    expect_equal(document_totals[3], c(d3 = 0))
})

test_that("dfm_compress preserves docvars (#1506)", {
    txt <- c(d1 = "A A A b c D D",
             d2 = "b b b b D D D")
    corp <- corpus(txt, docvars = data.frame(bool = c(TRUE, FALSE)))
    dfmat <- dfm(tokens(corp))
    # Re-assigning the document names is intended to guarantee that the
    # system docvars checked below are present.
    docnames(dfmat) <- docnames(dfmat)

    docvar_names <- names(dfmat@docvars)
    expect_true("docname_" %in% docvar_names)
    expect_true("docid_" %in% docvar_names)
    expect_true("segid_" %in% docvar_names)

    # Compressing features must leave the docvars slot untouched.
    docvars_compressed <- dfm_compress(dfmat, margin = "features")@docvars
    expect_identical(dfmat@docvars, docvars_compressed)

    # Passing the dfm through dfm() again must leave it untouched as well.
    docvars_redfm <- dfm(dfmat)@docvars
    expect_identical(dfmat@docvars, docvars_redfm)
})

test_that("add test for group_matrix with features and fill = TRUE", {
    dfmat <- dfm(tokens(c("a a b c d", "b c d e")))
    # Relabel the fourth feature as "e" so that label occurs twice and the
    # level "d" of the grouping factor is left without any column.
    colnames(dfmat)[4] <- "e"
    feature_groups <- factor(featnames(dfmat), levels = letters[1:5])
    doc_labels <- paste0("text", 1:2)

    # fill = TRUE: the unused level "d" is expected as an all-zero column.
    grouped_filled <- quanteda:::group_matrix(dfmat, fill = TRUE,
                                              features = feature_groups)
    expect_identical(
        as.matrix(grouped_filled),
        matrix(c(2, 0,  1, 1,  1, 1,  0, 0,  1, 2), nrow = 2,
               dimnames = list(doc_labels, c("a", "b", "c", "d", "e")))
    )

    # fill = FALSE: the unused level "d" is expected to be dropped.
    grouped_unfilled <- quanteda:::group_matrix(dfmat, fill = FALSE,
                                                features = feature_groups)
    expect_identical(
        as.matrix(grouped_unfilled),
        matrix(c(2, 0,  1, 1,  1, 1,  1, 2), nrow = 2,
               dimnames = list(doc_labels, c("a", "b", "c", "e")))
    )
})

# Try the quanteda package in your browser
#
# Any scripts or data that you put into this service are public.
#
# quanteda documentation built on May 31, 2023, 8:28 p.m.