# tests/testthat/test-tokens_chunk.R

test_that("tokens_chunk splits documents and carries docvars", {
    # d3 is empty: the expectations below show it yields no chunk, although
    # "d3" is still expected as a level of the docid_ factor.
    txt <- c(d1 = "a b c d", d2 = "e f g", d3 = "")
    corp <- corpus(txt, docvars = data.frame(title = c("title1", "title2", "title3"),
                                             stringsAsFactors = FALSE))
    toks <- tokens(corp)

    # Expected docid_ column: levels always cover all three input documents.
    expected_docid <- function(x) factor(x, levels = c("d1", "d2", "d3"))

    # size larger than every document: nothing is split and the document
    # names are expected to stay unsuffixed.
    toks_chunk1 <- tokens_chunk(toks, size = 5)
    expect_identical(as.list(toks_chunk1),
                     list(d1 = c("a", "b", "c", "d"),
                          d2 = c("e", "f", "g")))
    expect_identical(attr(toks_chunk1, "docvars"),
                     data.frame("docname_" = c("d1", "d2"),
                                "docid_" = expected_docid(c("d1", "d2")),
                                "segid_" = c(1L, 1L),
                                "title" = c("title1", "title2"),
                                stringsAsFactors = FALSE,
                                check.names = FALSE))

    # size = 2: each document is split and the user docvar is repeated for
    # every chunk of its source document.
    toks_chunk2 <- tokens_chunk(toks, size = 2)
    expect_identical(as.list(toks_chunk2),
                     list(d1.1 = c("a", "b"),
                          d1.2 = c("c", "d"),
                          d2.1 = c("e", "f"),
                          d2.2 = c("g")))
    expect_identical(attr(toks_chunk2, "docvars"),
                     data.frame("docname_" = c("d1.1", "d1.2", "d2.1", "d2.2"),
                                "docid_" = expected_docid(c("d1", "d1", "d2", "d2")),
                                "segid_" = c(1L, 2L, 1L, 2L),
                                "title" = c("title1", "title1", "title2", "title2"),
                                stringsAsFactors = FALSE,
                                check.names = FALSE))

    # use_docvars = FALSE: same chunks, but only the system columns remain.
    toks_chunk3 <- tokens_chunk(toks, size = 2, use_docvars = FALSE)
    expect_identical(as.list(toks_chunk3),
                     list(d1.1 = c("a", "b"),
                          d1.2 = c("c", "d"),
                          d2.1 = c("e", "f"),
                          d2.2 = c("g")))
    expect_identical(attr(toks_chunk3, "docvars"),
                     data.frame("docname_" = c("d1.1", "d1.2", "d2.1", "d2.2"),
                                "docid_" = expected_docid(c("d1", "d1", "d2", "d2")),
                                "segid_" = c(1L, 2L, 1L, 2L),
                                stringsAsFactors = FALSE,
                                check.names = FALSE))

    # overlap = 1: consecutive chunks share one token, and a final
    # single-token chunk is expected for each document.
    toks_chunk4 <- tokens_chunk(toks, 2, overlap = 1)
    expect_identical(as.list(toks_chunk4),
                     list(d1.1 = c("a", "b"),
                          d1.2 = c("b", "c"),
                          d1.3 = c("c", "d"),
                          d1.4 = c("d"),
                          d2.1 = c("e", "f"),
                          d2.2 = c("f", "g"),
                          d2.3 = c("g")))
    expect_identical(attr(toks_chunk4, "docvars"),
                     data.frame("docname_" = c("d1.1", "d1.2", "d1.3", "d1.4", "d2.1", "d2.2", "d2.3"),
                                "docid_" = expected_docid(c("d1", "d1", "d1", "d1", "d2", "d2", "d2")),
                                "segid_" = c(1L, 2L, 3L, 4L, 1L, 2L, 3L),
                                "title" = c("title1", "title1", "title1", "title1", "title2",
                                            "title2", "title2"),
                                stringsAsFactors = FALSE,
                                check.names = FALSE))
})

test_that("tokens_chunk raises error for invalid size", {
    toks <- tokens(c(d1 = "a b c d", d2 = "e f g"))

    # Negative size.
    expect_error(
        tokens_chunk(toks, size = -1),
        "The value of size must be between 1 and Inf"
    )
    # Zero size.
    expect_error(
        tokens_chunk(toks, size = 0),
        "The value of size must be between 1 and Inf"
    )
    # size must be a single value, not a vector.
    expect_error(
        tokens_chunk(toks, size = c(1, 3)),
        "The length of size must be 1"
    )
})

test_that("tokens_chunk raises error for invalid overlap", {
    toks <- tokens(c(d1 = "a b c d", d2 = "e f g"))

    # overlap equal to size is rejected.
    expect_error(
        tokens_chunk(toks, size = 2, overlap = 2),
        "The value of overlap must be smaller than size"
    )
    # Negative overlap is rejected.
    expect_error(
        tokens_chunk(toks, size = 2, overlap = -1),
        "The value of overlap must be between 0 and Inf"
    )
})

test_that("tokens_chunk works with default document names", {
    # Unnamed input: the docvars expectation below pins the document ids to
    # text1 and text2, and the chunk names to text1.1, text1.2, ...
    toks <- tokens(c("a b c d e f", "a a b d c"))

    # Chunk once and reuse the result for every expectation.
    toks_chunk <- tokens_chunk(toks, size = 3)

    # expect_s3_class() replaces the deprecated expect_is().
    expect_s3_class(toks_chunk, "tokens")

    # Compare names as well as content, instead of the deprecated
    # expect_equivalent(), so a regression in chunk naming is caught here too.
    expect_identical(
        as.list(toks_chunk),
        list(text1.1 = c("a", "b", "c"),
             text1.2 = c("d", "e", "f"),
             text2.1 = c("a", "a", "b"),
             text2.2 = c("d", "c"))
    )
    expect_identical(
        attr(toks_chunk, "docvars"),
        data.frame("docname_" = c("text1.1", "text1.2", "text2.1", "text2.2"),
                   "docid_" = factor(c("text1", "text1", "text2", "text2"),
                                     levels = c("text1", "text2")),
                   "segid_" = c(1L, 2L, 1L, 2L),
                   stringsAsFactors = FALSE,
                   check.names = FALSE)
    )
})

test_that("tokens_chunk() works with sizes longer than tokens length", {
    # d2 has three tokens, fewer than the chunk size of four.
    toks <- tokens(c(d1 = "a b c d e", d2 = "a b c"))

    chunked <- as.list(tokens_chunk(toks, size = 4))
    expected <- list(
        d1.1 = c("a", "b", "c", "d"),
        d1.2 = "e",
        d2.1 = c("a", "b", "c")
    )

    expect_identical(chunked, expected)
})
# quanteda/quanteda documentation built on April 15, 2024, 7:59 a.m.