tests/testthat/test-docvars.R

test_that("make_docvars() works", {
    docvar1 <- quanteda:::make_docvars(0L, docname = character())
    docvar2 <- quanteda:::make_docvars(3L, docname = c("A", "B", "C"))
    docvar3 <- quanteda:::make_docvars(3L, docname = 1:3)
    docvar4 <- quanteda:::make_docvars(10L)
    docvar5 <- quanteda:::make_docvars(3L, docname = c("A", "B", "B"))

    expect_equal(dim(docvar1), c(0, 3))
    expect_equal(dim(docvar2), c(3, 3))
    expect_equal(dim(docvar3), c(3, 3))
    expect_equal(colnames(docvar1), c("docname_", "docid_", "segid_"))
    expect_equal(colnames(docvar2), c("docname_", "docid_", "segid_"))
    expect_equal(colnames(docvar3), c("docname_", "docid_", "segid_"))

    expect_equal(colnames(docvar4), c("docname_", "docid_", "segid_"))
    expect_equal(docvar4[["docname_"]], paste0("text", 1:10))
    expect_error(quanteda:::make_docvars(n = 2, docname = c("A", "B", "C")))
    expect_equal(docvar5[["docname_"]], c("A.1", "B.1", "B.2"))
    expect_error(quanteda:::make_docvars(n = "3"))
    expect_error(quanteda:::make_docvars(n = 1.4))

    docvar4 <- quanteda:::make_docvars(5L, c("A", "A", "B", "B", "C"))
    expect_equal(docvar4[["docname_"]], c("A.1", "A.2", "B.1", "B.2", "C.1"))
    docvar5 <- quanteda:::make_docvars(5L, c("A", "B", "B", "A", "C"))
    expect_equal(docvar5[["docname_"]], c("A.1", "B.1", "B.2", "A.2", "C.1"))
    docvar6 <- quanteda:::make_docvars(5L, c("A", "A", "B", "B", "C"), unique = FALSE)
    expect_equal(docvar6[["docname_"]], c("A", "A", "B", "B", "C"))
    expect_equal(docvar6[["segid_"]], c(1, 1, 1, 1, 1))

})

test_that("reshape_docvars() works", {
    
    docvar1 <- data.frame("docname_" = c("doc1", "doc2"),
                          "docid_" = factor(c("doc1", "doc2")),
                          "segid_" = c(1, 1), stringsAsFactors = FALSE)
    
    expect_identical(
        quanteda:::reshape_docvars(docvar1, c(1, 2))[["docname_"]],
        c("doc1", "doc2")
    )
    expect_identical(
        quanteda:::reshape_docvars(docvar1, c(1, 1, 2, 2))[["docname_"]],
        c("doc1.1", "doc1.2", "doc2.1", "doc2.2")
    )
    
    docvar2 <- data.frame("docname_" = c("doc1.1", "doc1.2"),
                          "docid_" = factor(c("doc1", "doc1")),
                          "segid_" = c(1, 1))
    expect_identical(
        quanteda:::reshape_docvars(docvar2, c(1, 2))[["docname_"]],
        c("doc1.1", "doc1.2")
    )
    expect_identical(
        quanteda:::reshape_docvars(docvar2, c(1, 1, 2, 2))[["docname_"]],
        c("doc1.1", "doc1.2", "doc1.3", "doc1.4")
    )
    expect_identical(
        quanteda:::reshape_docvars(docvar2, c(1, 1, 1, 1))[["docname_"]],
        c("doc1.1", "doc1.2", "doc1.3", "doc1.4")
    )
})
    

test_that("upgrade_docvars() works", {
    docvar1 <- data.frame()
    docvar2 <- data.frame("var1" = c(100, 200, 300),
                          "var2" = c(TRUE, TRUE, FALSE))
    docvar2$lis <- list(1:3, -5, 3:4)
    docvar3 <- data.frame("var1" = c(100, 200, 300),
                          "var2" = c(TRUE, TRUE, FALSE),
                          row.names = c("A", "B", "C"))
    docvar3$lis <- list(1:3, -5, 3:4)
    docvar4 <- data.frame("docname_" = c("A", "B", "C"),
                          "docid_" = factor(c("A", "B", "C")),
                          "segid_" = rep(1L, 3),
                          "var1" = c(100, 200, 300),
                          "var2" = c(TRUE, TRUE, FALSE),
                          stringsAsFactors = FALSE)
    docvar4$lis <- list(1:3, -5, 3:4)

    expect_identical(
        quanteda:::upgrade_docvars(docvar1, c("A", "B", "C")),
        docvar4[, 1:3]
    )
    expect_identical(
        quanteda:::upgrade_docvars(docvar3),
                     docvar4
    )
    expect_identical(
        quanteda:::upgrade_docvars(docvar2, c("A", "B", "C")),
        docvar4
    )
    expect_identical(
        quanteda:::upgrade_docvars(docvar2, c("A", "B", "C")),
        docvar4
    )
})

test_that("get_docvars() works", {

    data <- data.frame("docname_" = c("A", "B", "C"),
                       "docid_" = factor(c("A", "B", "C")),
                       "segid_" = rep(1L, 3),
                       "var1" = c(100, 200, 300),
                       "var2" = c(TRUE, TRUE, FALSE),
                       stringsAsFactors = FALSE)

    expect_identical(
        quanteda:::select_docvars(data, user = FALSE, system = TRUE),
        data.frame("docname_" = c("A", "B", "C"),
                   "docid_" = factor(c("A", "B", "C")),
                   "segid_" = rep(1L, 3),
                   stringsAsFactors = FALSE)
    )
    expect_error(quanteda:::select_docvars(data, "docid_", user = FALSE, system = FALSE))
    expect_identical(
        quanteda:::select_docvars(data, "docid_", user = FALSE, system = TRUE),
        data.frame("docid_" = factor(c("A", "B", "C")),
                   stringsAsFactors = FALSE)
    )
    expect_identical(
        quanteda:::select_docvars(data, "docid_", user = FALSE, system = TRUE, drop = TRUE),
        factor(c("A", "B", "C"))
    )
    expect_identical(
        quanteda:::select_docvars(data),
        data.frame("var1" = c(100, 200, 300),
                   "var2" = c(TRUE, TRUE, FALSE),
                   stringsAsFactors = FALSE)
    )
    expect_identical(
        quanteda:::select_docvars(data, "var1"),
        data.frame("var1" = c(100, 200, 300), stringsAsFactors = FALSE)
    )
    expect_identical(
        quanteda:::select_docvars(data, "var1", drop = TRUE),
        c(100, 200, 300)
    )
})

test_that("set_docvars() works", {

    data <- data.frame("docname_" = c("A", "B", "C"),
                       "docid_" = c("A", "B", "C"),
                       "docnum_" = 1L:3L,
                       "segid_" = rep(1L, 3),
                       "var1" = c(100, 200, 300),
                       "var2" = c(TRUE, TRUE, FALSE),
                       stringsAsFactors = FALSE)

    quanteda:::set_docvars(data, "var2") <- c(10, 20, 30)
    expect_identical(data[["var2"]], c(10, 20, 30))
    quanteda:::set_docvars(data, "var3") <- c(1000, 2000, 3000)
    expect_identical(data[["var3"]], c(1000, 2000, 3000))
    quanteda:::set_docvars(data) <- data.frame("var1" = c(100, 200, 300),
                                               "var2" = c(TRUE, TRUE, TRUE))
    expect_identical(data[["var1"]], c(100, 200, 300))
    expect_identical(data[["var2"]], c(TRUE, TRUE, TRUE))
    expect_identical(names(data), c("docname_", "docid_", "segid_", "var1", "var2"))
    quanteda:::set_docvars(data) <- NULL
    expect_identical(names(data), c("docname_", "docid_", "segid_"))
})

test_that("docvars of corpus is a data.frame", {
    expect_equal(
        class(docvars(data_corpus_inaugural)),
        "data.frame"
    )
})

test_that("docvars with non-existent field names generate correct error messages", {
    expect_error(
        docvars(data_corpus_inaugural, c("President", "nonexistent")),
        "field\\(s\\) nonexistent not found"
    )

    toks <- tokens(data_corpus_inaugural, include_docvars = TRUE)
    expect_error(
        docvars(toks, c("President", "nonexistent")),
        "field\\(s\\) nonexistent not found"
    )
})

test_that("docvars is working with tokens", {
    corp <- data_corpus_inaugural[1:58]
    toks <- tokens(corp, include_docvars = TRUE)
    expect_equal(docvars(toks), docvars(corp))
    expect_equal(docvars(toks, "President"), docvars(corp, "President"))
    
    # Subset
    toks2 <- toks[docvars(toks, "Year") > 2000]
    expect_equal(ndoc(toks2), nrow(docvars(toks2)))
    
    # Add field to meta-data
    expect_equal(
        docvars(quanteda:::"docvars<-"(toks2, "Type", "Speech"), "Type"),
        rep("Speech", 5)
    )
    
    # Remove meta-data
    expect_output(
        print(docvars(quanteda:::"docvars<-"(toks, field = NULL, NULL))),
        "data frame with 0 columns and 58 rows"
    )
    
    # Add fresh meta-data
    expect_equal(
        docvars(quanteda:::"docvars<-"(toks, field = "ID", 1:58), "ID"),
        1:58
    )
})

test_that("docvars is working with dfm", {
    corp <- data_corpus_inaugural
    toks <- tokens(corp, include_docvars = TRUE)
    thedfm <- dfm(toks)
    
    expect_equal(docvars(toks), docvars(thedfm))
    expect_equal(docvars(toks, "Party"), docvars(corp, "Party"))
    
    thedfm2 <- dfm(tokens(corp))
    expect_equal(docvars(corp), docvars(thedfm2))
    expect_equal(docvars(corp, "Party"), docvars(thedfm2, "Party"))
    
    corp2 <- corpus_subset(corp, Party == "Democratic")
    thedfm3 <- dfm(tokens(corp2))
    expect_equal(docvars(corp2), docvars(thedfm3))
})

test_that("$ returns docvars", {
    corp <- data_corpus_inaugural
    toks <- tokens(corp, include_docvars = TRUE)
    dfmat <- dfm(toks)

    expect_equal(docvars(corp, "Party"), corp$Party)
    expect_equal(docvars(toks, "Party"), toks$Party)
    expect_equal(docvars(dfmat, "Party"), dfmat$Party)
})

test_that("tokens works works with one docvar", {
    docv1 <- data.frame(dvar1 = c("A", "B"))
    mycorpus1 <- corpus(c(d1 = "This is sample document one.",
                          d2 = "Here is the second sample document."),
                        docvars = docv1)
    toks1 <- tokens(mycorpus1, include_docvars = TRUE)
    expect_equivalent(docvars(toks1), docv1)
})


test_that("tokens works works with two docvars", {
    docv2 <- data.frame(dvar1 = c("A", "B"),
                        dvar2 = c(1, 2))
    mycorpus2 <- corpus(c(d1 = "This is sample document one.",
                          d2 = "Here is the second sample document."),
                        docvars = docv2)
    toks2 <- tokens(mycorpus2, include_docvars = TRUE)
    expect_equivalent(docvars(toks2), docv2)
})

test_that("dfm works works with one docvar", {
    docv1 <- data.frame(dvar1 = c("A", "B"))
    mycorpus1 <- corpus(c(d1 = "This is sample document one.",
                          d2 = "Here is the second sample document."),
                        docvars = docv1)
    dfm1 <- suppressWarnings(dfm(tokens(mycorpus1), include_docvars = TRUE))
    expect_equivalent(docvars(dfm1), docv1)
})

test_that("dfm works works with two docvars", {
    docv2 <- data.frame(dvar1 = c("A", "B"),
                        dvar2 = c(1, 2))
    mycorpus2 <- corpus(c(d1 = "This is sample document one.",
                          d2 = "Here is the second sample document."),
                        docvars = docv2)
    dfm2 <- suppressWarnings(dfm(tokens(mycorpus2), include_docvars = TRUE))
    expect_equivalent(docvars(dfm2), docv2)
})

test_that("object always have docvars in the same rows as documents", {
    txt <- data_char_ukimmig2010
    corp1 <- corpus(txt)
    expect_true(nrow(docvars(corp1)) == ndoc(corp1))
    expect_true(all(row.names(docvars(corp1)) == seq_len(ndoc(corp1))))

    corp2 <- corpus_segment(corp1, "\\p{P}", valuetype = "regex")
    expect_true(nrow(docvars(corp2)) == ndoc(corp2))
    expect_true(all(row.names(docvars(corp2)) == seq_len(ndoc(corp2))))

    corp3 <- corpus_reshape(corp1, to = "sentences")
    expect_true(nrow(docvars(corp3)) == ndoc(corp3))
    expect_true(all(row.names(docvars(corp3)) == seq_len(ndoc(corp3))))

    corp4 <- corpus_sample(corp1, size = 5)
    expect_true(nrow(docvars(corp4)) == ndoc(corp4))
    expect_true(all(row.names(docvars(corp4)) == seq_len(ndoc(corp4))))

    toks1 <- tokens(txt)
    expect_true(nrow(docvars(toks1)) == ndoc(toks1))
    expect_true(all(row.names(docvars(toks1)) == seq_len(ndoc(toks1))))

    toks2 <- tokens(corpus(txt))
    expect_true(nrow(docvars(toks2)) == ndoc(toks2))
    expect_true(all(row.names(docvars(toks2)) == seq_len(ndoc(toks2))))

    toks3 <- quanteda:::tokens_group(toks1, rep(c(1, 2, 3), 3))
    expect_true(nrow(docvars(toks3)) == ndoc(toks3))
    expect_true(all(row.names(docvars(toks3)) == seq_len(ndoc(toks3))))

    toks4 <- tokens_select(toks1, stopwords())
    expect_true(nrow(docvars(toks4)) == ndoc(toks4))
    expect_true(all(row.names(docvars(toks4)) == seq_len(ndoc(toks4))))

    dfm1 <- dfm(tokens(txt))
    expect_true(nrow(docvars(dfm1)) == ndoc(dfm1))
    expect_true(all(row.names(docvars(dfm1)) == seq_len(ndoc(dfm1))))

    dfm2 <- dfm(tokens(txt))
    expect_true(nrow(docvars(dfm2)) == ndoc(dfm2))
    expect_true(all(row.names(docvars(dfm2)) == seq_len(ndoc(dfm2))))

    dfm3 <- dfm(tokens(corpus(txt)))
    expect_true(nrow(docvars(dfm3)) == ndoc(dfm3))
    expect_true(all(row.names(docvars(dfm3)) == seq_len(ndoc(dfm3))))

    dfm4 <- dfm_group(dfm1, rep(c(1, 2, 3), 3))
    expect_true(nrow(docvars(dfm4)) == ndoc(dfm4))
    expect_true(all(row.names(docvars(dfm4)) == seq_len(ndoc(dfm4))))

    dfm5 <- dfm(dfm1) |>
        dfm_group(groups = rep(c(1, 2, 3), 3))
    expect_true(nrow(docvars(dfm5)) == ndoc(dfm5))
    expect_true(all(row.names(docvars(dfm5)) == seq_len(ndoc(dfm5))))

    dfm6 <- dfm_subset(dfm1, rep(c(TRUE, TRUE, FALSE), 3))
    expect_true(nrow(docvars(dfm6)) == ndoc(dfm6))
    expect_true(all(row.names(docvars(dfm6)) == seq_len(ndoc(dfm6))))

    dfm7 <- rbind(dfm1, dfm1)
    expect_true(nrow(docvars(dfm7)) == ndoc(dfm7))
    expect_true(all(row.names(docvars(dfm7)) == seq_len(ndoc(dfm7))))

    dfm8 <- suppressWarnings(cbind(dfm1, dfm1))
    expect_true(nrow(docvars(dfm8)) == ndoc(dfm8))
    expect_true(all(row.names(docvars(dfm8)) == seq_len(ndoc(dfm8))))
})

test_that("error when nrow and ndoc mismatch", {

    toks <- tokens(c("a b c", "b c d", "c d e"))
    expect_error(docvars(toks) <- data.frame(var = c(1, 5)))
    expect_silent(docvars(toks) <- data.frame(var = c(1, 5, 6)))
    expect_error(docvars(toks) <- data.frame(var = c(1, 5, 6, 3)))

    mt <- dfm(toks)
    expect_error(docvars(mt) <- data.frame(var = c(1, 5)))
    expect_silent(docvars(mt) <- data.frame(var = c(1, 5, 6)))
    expect_error(docvars(mt) <- data.frame(var = c(1, 5, 6, 3)))

})

test_that("assignment of NULL only drop columns", {
    toks <- tokens(data_corpus_inaugural[1:14])
    docvars(toks) <- NULL
    expect_identical(dim(docvars(toks)), c(14L, 0L))

    mt <- dfm(tokens(data_corpus_inaugural[1:14]))
    docvars(mt) <- NULL
    expect_identical(dim(docvars(mt)), c(14L, 0L))
})

test_that("can assign docvars when value is a dfm (#1417)", {
    mycorp <- corpus(data_char_ukimmig2010)

    thedfm <- dfm(tokens(mycorp))[, "the"]
    docvars(mycorp) <- thedfm
    expect_identical(
        docvars(mycorp),
        data.frame(the = as.vector(thedfm))
    )

    anddfm <- dfm(tokens(mycorp))[, "and"]
    docvars(anddfm) <- anddfm
    expect_identical(
        docvars(anddfm),
        data.frame(and = as.vector(anddfm))
    )

    toks <- tokens(mycorp)
    docvars(toks) <- anddfm
    expect_identical(
        docvars(toks),
        data.frame(and = as.vector(anddfm))
    )
})

test_that("docvar can be renamed (#1603)", {
    corp <- data_corpus_inaugural
    names(docvars(corp))[c(1, 3)] <- c("year", "forename")
    expect_identical(names(docvars(corp)),
                     c("year", "President", "forename", "Party"))

    toks <- tokens(data_corpus_inaugural)
    names(docvars(toks))[c(1, 3)] <- c("year", "forename")
    expect_identical(names(docvars(toks)),
                     c("year", "President", "forename", "Party"))

    dfmat <- dfm(tokens(data_corpus_inaugural))
    names(docvars(dfmat))[c(1, 3)] <- c("year", "forename")
    expect_identical(names(docvars(dfmat)),
                     c("year", "President", "forename", "Party"))
})

test_that("docvar assignment is fully robust including to renaming (#1603)", {
    # assigning a data.frame to blank docars
    corp <- corpus(c("A b c d.", "A a b. B c."))
    docvars(corp) <- data.frame(testdv = 10:11)
    expect_identical(
        docvars(corp),
        data.frame(testdv = 10:11)
    )

    # assigning a vector to blank docars
    corp <- corpus(c("A b c d.", "A a b. B c."))
    expect_error(
        docvars(corp) <- c("x", "y"),
        "you must supply field name(s)", fixed = TRUE
    )

    # assigning an unnamed matrix as docvars
    docvars(corp) <- matrix(c("x", "y"), ncol = 1)
    expect_identical(
        docvars(corp),
        data.frame(V1 = c("x", "y"), stringsAsFactors = FALSE)
    )

    # block assigning a data.frame with missing names
    df <- data.frame(c("x", "y"), c("a", "b"), 11:12, stringsAsFactors = FALSE)
    names(df) <- NULL
    names(df)[2] <- "name2"
    expect_error(docvars(corp) <- df,
                 "data.frame must have column names")

})

test_that("docvars<-.corpus and name uniqueness", {

    # preexisting docvars keep unique names
    corp <- corpus(c("A b c d.", "A a b. B c."))
    docvars(corp) <- data.frame(docvar1 = 1:2)
    docvars(corp)[2] <- 11:12
    docvars(corp)[3] <- c("a", "b")
    expect_identical(
        docvars(corp),
        data.frame(docvar1 = 1:2, V2 = 11:12, V3 = c("a", "b"), stringsAsFactors = FALSE)
    )
})

test_that("docvars<- NULL removes docvars", {
    corp1 <- data_corpus_inaugural
    docvars(corp1)[c(1, 3)] <- NULL
    expect_identical(names(docvars(corp1)), c("President", "Party"))

    corp2 <- data_corpus_inaugural
    docvars(corp2)[c("President", "Party")] <- NULL
    expect_identical(names(docvars(corp2)), c("Year", "FirstName"))

    corp3 <- data_corpus_inaugural
    docvars(corp3, c("President", "Party")) <- NULL
    expect_identical(names(docvars(corp3)), c("Year", "FirstName"))

    toks <- tokens(data_corpus_inaugural)
    toks1 <- toks
    docvars(toks1)[c(1, 3)] <- NULL
    expect_identical(names(docvars(toks1)), c("President", "Party"))

    toks2 <- toks
    docvars(toks2)[c("President", "Party")] <- NULL
    expect_identical(names(docvars(toks2)), c("Year", "FirstName"))

    toks3 <- toks
    docvars(toks3, c("President", "Party")) <- NULL
    expect_identical(names(docvars(toks3)), c("Year", "FirstName"))

    dfmat <- dfm(toks)
    dfmat1 <- dfmat
    docvars(dfmat1)[c(1, 3)] <- NULL
    expect_identical(names(docvars(dfmat1)), c("President", "Party"))

    dfmat2 <- dfmat
    docvars(dfmat2)[c("President", "Party")] <- NULL
    expect_identical(names(docvars(dfmat2)), c("Year", "FirstName"))

    dfmat3 <- dfmat
    docvars(dfmat3, c("President", "Party")) <- NULL
    expect_identical(names(docvars(dfmat3)), c("Year", "FirstName"))
})

test_that("works correctly in edge cases", {
    corp <- corpus(c("A b c d.", "A a b. B c.", "D f. e g.", "H i j."))
    expect_error(docvars(corp) <- 1:4,
                 quanteda:::message_error("docvar_noname"))
    expect_silent(docvars(corp, "var1") <- 1)
    expect_equal(docvars(corp, "var1"), rep(1, 4))
    expect_silent(docvars(corp, "var2") <- 1:4)
    expect_equal(docvars(corp, "var2"), 1:4)
    expect_silent(docvars(corp, "var3") <- 1:2)
    expect_equal(docvars(corp, "var3"), c(1, 2, 1, 2))
    expect_error(docvars(corp, "var4") <- 1:3)
})

test_that("group_docvars() works", {

    docvar <- data.frame("docname_" = c("A", "B", "C"),
                         "docid_" = factor(c("A", "B", "C")),
                         "segid_" = rep(1L, 3),
                         "var1" = c(100, 100, 200),
                         "var2" = c(TRUE, TRUE, FALSE),
                         stringsAsFactors = FALSE)
    
    docvar1 <- quanteda:::group_docvars(docvar, factor(c("X", "X", "Y")))
    expect_equal(
        names(docvar1), 
        c("docname_", "docid_", "segid_", "var1", "var2")
    )
    expect_equal(
        docvar1$var1,
        c(100, 200)
    )
    expect_equal(
        docvar1$var2,
        c(TRUE, FALSE)
    )
    
    docvar2 <- quanteda:::group_docvars(docvar, 
                                        factor(c("X", "X", "Y"), levels = c("X", "Y", "Z")), 
                                        field = "var3")
    expect_equal(
        names(docvar2), 
        c("docname_", "docid_", "segid_", "var1", "var2", "var3")
    )
    expect_equal(
        docvar2$var1,
        c(100, 200, NA)
    )
    expect_equal(
        docvar2$var2,
        c(TRUE, FALSE, NA)
    )
    expect_equal(
        docvar2$var3,
        factor(c("X", "Y", "Z"), levels = c("X", "Y", "Z"))
    )
    
    docvar3 <- quanteda:::group_docvars(docvar, 
                                        factor(c("X", "X", "Y"), levels = c("Z", "Y", "X")), 
                                        field = "var3")
    expect_equal(
        names(docvar3), 
        c("docname_", "docid_", "segid_", "var1", "var2", "var3")
    )
    expect_equal(
        docvar3$var1,
        c(NA, 200, 100)
    )
    expect_equal(
        docvar3$var2,
        c(NA, FALSE, TRUE)
    )
    expect_equal(
        docvar3$var3,
        factor(c("Z", "Y", "X"), levels = c("Z", "Y", "X"))
    )
})

test_that("docid works", {
    corp <- corpus(c(textone = "This is a sentence.  Another sentence.  Yet another.",
                     textwo = "Sentence 1. Sentence 2."))
    corp_sent <- corp |>
        corpus_reshape(to = "sentences")
    
    expect_identical(
        docid(corp_sent),
        factor(c("textone", "textone", "textone", "textwo", "textwo"))
    )
    expect_identical(
        docid(tokens(corp_sent)),
        docid(corp_sent)
    )
    expect_identical(
        docid(dfm(tokens(corp_sent))),
        docid(corp_sent)
    )
    
    # defaults for group functions
    expect_identical(
        docid(corpus_group(corp_sent)),
        factor(docnames(corp))
    )
    expect_identical(
        docid(tokens_group(tokens(corp_sent))),
        factor(docnames(corp))
    )
    expect_identical(
        docid(dfm_group(dfm(tokens(corp_sent)))),
        factor(docnames(corp))
    )
    expect_identical(
        docid(corpus_group(corp_sent, groups = docid(corp_sent))),
        factor(docnames(corp))
    )
    expect_identical(
        docid(tokens_group(tokens(corp_sent), groups = docid(corp_sent))),
        factor(docnames(corp))
    )
    expect_identical(
        docid(dfm_group(dfm(tokens(corp_sent)), groups = docid(corp_sent))),
        factor(docnames(corp))
    )
})

test_that("docvars are combined along with the main objects", {
    
    docvar <- data.frame("var1" = c(100, 100, 200, NA, NA),
                         "var2" = c(NA, NA, NA, TRUE, FALSE))
    
    corp1 <- corpus(data.frame(text = c(d1 = "aa",  d2 = "bb", d3 = "cc"),
                               var1 = c(100, 100, 200)))
    corp2 <- corpus(data.frame(text = c(d4 = "dd", d5 = "ee"),
                               var2 = c(TRUE, FALSE)))
    expect_equal(docvars(c(corp1, corp2)), docvar)
    
    toks1 <- tokens(corp1)
    toks2 <- tokens(corp2)
    expect_equal(docvars(c(toks1, toks2)), docvar)
    
    dfmat1 <- dfm(tokens(corp1))
    dfmat2 <- dfm(tokens(corp2))
    expect_equal(docvars(rbind(dfmat1, dfmat2)), docvar)
})
quanteda/quanteda documentation built on April 15, 2024, 7:59 a.m.