tests/testthat/test-tokens_group.R

test_that("test that tokens_group is working", {
    corp <- corpus(c("a b c d", "e f g h", "A B C", "X Y Z"),
                   docname = c("doc1", "doc1", "doc2", "doc2"),
                   unique_docnames = FALSE)
    toks <- tokens(corp)
    expect_equal(
        as.list(tokens_group(toks, c(1, 1, 2, 2))),
        list("1" = c("a", "b", "c", "d", "e", "f", "g", "h"),
             "2" = c("A", "B", "C", "X", "Y", "Z"))
    )
    
    expect_equal(
        as.list(tokens_group(toks)),
        list("doc1" = c("a", "b", "c", "d", "e", "f", "g", "h"),
             "doc2" = c("A", "B", "C", "X", "Y", "Z"))
    )

    expect_equal(
        as.list(tokens_group(toks, c(2, 1, 2, 1))),
        list("1" = c("e", "f", "g", "h", "X", "Y", "Z"),
             "2" = c("a", "b", "c", "d", "A", "B", "C"))
    )

    expect_equal(
        as.list(tokens_group(toks, c("Z", "A", "Z", "A"))),
        list("A" = c("e", "f", "g", "h", "X", "Y", "Z"),
             "Z" = c("a", "b", "c", "d", "A", "B", "C"))
    )

    group <- factor(c("Z", "A", "Z", "A"), levels = c("A", "B", "Z"))
    expect_equal(
        as.list(tokens_group(toks, group)),
        list("A" = c("e", "f", "g", "h", "X", "Y", "Z"),
             "Z" = c("a", "b", "c", "d", "A", "B", "C"))
    )

    expect_equal(
        as.list(tokens_group(toks, group, fill = TRUE)),
        list("A" = c("e", "f", "g", "h", "X", "Y", "Z"),
             "B" = character(),
             "Z" = c("a", "b", "c", "d", "A", "B", "C"))
    )
})

test_that("tokens_group works with empty documents", {
    toks <- tokens(c(doc1 = "a b c c", doc2 = "b c d", doc3 = ""))
    expect_equivalent(
        as.list(tokens_group(toks, c("doc1", "doc1", "doc2"))),
        list(doc1 = c("a", "b", "c", "c", "b", "c", "d"), doc2 = character())
    )

    expect_equivalent(
        as.list(tokens_group(toks, c(1, 1, 2))),
        list(doc1 = c("a", "b", "c", "c", "b", "c", "d"), doc2 = character())
    )
})

test_that("dfm_group and tokens_group are equivalent", {
    txt <- c("a b c c", "b c d", "a")
    toks <- tokens(txt)

    expect_identical(
        dfm_group(dfm(toks), c("doc1", "doc1", "doc2")),
        dfm(tokens_group(toks, c("doc1", "doc1", "doc2"))))

    expect_identical(
        dfm_group(dfm(toks), c(1, 1, 2)),
        dfm(tokens_group(toks, c(1, 1, 2))))

    expect_identical(
        dfm_group(dfm(toks), c(1, 1, 1)),
        dfm(tokens_group(toks, c(1, 1, 1))))
})

test_that("test tokens_group with wrongly dimensioned groups variables", {
    grpvar <- c("D", "D", "A", "C")
    corp <- corpus(c("a b c c", "b c d", "a", "b d d"),
                   docvars = data.frame(grp = grpvar, stringsAsFactors = FALSE))
    toks <- tokens(corp)
    expect_error(
        tokens_group(toks, groups = c(1, 1, 2, 3, 3), fill = FALSE),
        "groups must have length ndoc(x)", fixed = TRUE
    )
    expect_error(
        tokens_group(toks, groups = c(1, 1, 2, 3, 3), fill = TRUE),
        "groups must have length ndoc(x)", fixed = TRUE
    )
    expect_error(
        tokens_group(toks, groups = c(1, 1, 2, 3, 4), fill = TRUE),
        "groups must have length ndoc(x)", fixed = TRUE
    )
})

test_that("tokens_group works with NA group labels", {
    corp <- corpus(c("Doc 1", "Doc 1b", "Doc2", "Doc 3 with NA", "Doc 4, more NA"),
                   docvars = data.frame(factorvar = c("Yes", "Yes", "No", NA, NA)))
    toks <- tokens(corp) |>
        tokens_group(groups = factorvar)
    expect_identical(
        as.list(toks),
        list(No = "Doc2", Yes = c("Doc", "1", "Doc", "1b"))
    )
})

test_that("element names are correctly reset after tokens_group() - #1949", {
    expect_identical(
        tokens(letters[1:3]) |> 
            tokens_group(groups = c("x", "x", "y")) |>
            names(),
        c("x", "y")
    )
})


test_that("tokens_group save grouping variable (#2037)", {
    corp <- corpus(c("a b c c", "b c d", "a", "b d d"),
                   docvars = data.frame(grp = factor(c("D", "D", "A", "C"), levels = c("A", "B", "C", "D")), 
                                        var1 = c(1, 1, 2, 2),
                                        var2 = c(1, 1, 2, 2), 
                                        var3 = c("x", "x", "y", NA),
                                        var4 = c("x", "y", "y", "x"),
                                        var5 = as.Date(c("2018-01-01", "2018-01-01", "2015-03-01", "2012-12-15")),
                                        var6 = as.Date(c("2018-01-01", "2015-03-01", "2015-03-01", "2012-12-15")),
                                        stringsAsFactors = FALSE))
    
    toks <- tokens(corp)
    grpvar <- factor(c("E", "E", "F", "G"), levels = c("E", "F", "G", "H"))
    toks_grp1 <- tokens_group(toks, grp)
    toks_grp2 <- tokens_group(toks, grpvar)
    toks_grp3 <- tokens_group(toks, var1)
    toks_grp4 <- tokens_group(toks, grp, fill = TRUE)
    toks_grp5 <- tokens_group(toks, grpvar, fill = TRUE)
    toks_grp6 <- tokens_group(toks, var1, fill = TRUE)
    toks_grp7 <- tokens_group(toks, groups = interaction(var1, var3))
    
    expect_equal(
        docvars(toks_grp1, "grp"), 
        factor(c("A", "C", "D"), levels = c("A", "C", "D"))
    )
    expect_equal(docvars(toks_grp1)$var1, c(2, 2, 1))
    expect_null(docvars(toks_grp2)$grpvar)
    expect_equal(docvars(toks_grp2)$var1, c(1, 2, 2))
    expect_equal(docvars(toks_grp3)$var1, c(1, 2))
    expect_equal(
        docvars(toks_grp4, "grp"), 
        factor(c("A", "B", "C", "D"), levels = c("A", "B", "C", "D"))
    )
    expect_equal(docvars(toks_grp4)$var1, c(2, NA, 2, 1))
    expect_null(docvars(toks_grp5)$grpvar)
    expect_equal(docvars(toks_grp5)$var1, c(1, 2, 2, NA))
    expect_equal(docvars(toks_grp6)$var1, c(1, 2))
    expect_equal(
        docvars(toks_grp7, "grp"), 
        factor(c("D", "A"), levels = c("A", "B", "C", "D"))
    )
    expect_equal(docvars(toks_grp7)$var1, c(1, 2))
})

test_that("tokens_group drop document for NA", {
    
    corp <- corpus(c("a b c c", "b c d", "a", "b d d"),
                   docvars = data.frame(grp = factor(c(NA, NA, "A", "C"), levels = c("A", "B", "C", "D")), 
                                        var1 = c(1, 1, 2, 2),
                                        var2 = c("x", "x", "y", NA),
                                        stringsAsFactors = FALSE))
    toks <- tokens(corp)
    expect_equal(attr(tokens_group(toks, grp), "docvars"),
                 data.frame(docname_ = c("A", "C"),
                            docid_ = factor(c("A", "C"), levels = c("A", "C")),
                            segid_ = c(1L, 1L),
                            grp = factor(c("A", "C"), levels = c("A", "C")), 
                            var1 = c(2, 2),
                            var2 = c("y", NA),
                            stringsAsFactors = FALSE))
    
    expect_equal(attr(tokens_group(toks, grp, fill = TRUE), "docvars"),
                 data.frame(docname_ = c("A", "B", "C", "D"),
                            docid_ = factor(c("A", "B", "C", "D"), levels = c("A", "B", "C", "D")),
                            segid_ = c(1L, 1L, 1L, 1L),
                            grp = factor(c("A", "B", "C", "D"), levels = c("A", "B", "C", "D")), 
                            var1 = c(2, NA, 2, NA),
                            var2 = c("y", NA, NA, NA),
                            stringsAsFactors = FALSE))
})
quanteda/quanteda documentation built on April 15, 2024, 7:59 a.m.