tests/testthat/test-dplyr-single.R

context("test single table dplyr verbs")

test_that("corpus arrange() works", {
    expect_identical(
        docnames(arrange(data_corpus_inaugural[1:3], c(3, 1, 2))),
        c("1793-Washington", "1797-Adams", "1789-Washington")
    )
    expect_identical(
        docnames(arrange(data_corpus_inaugural[1:3], President)),
        c("1797-Adams", "1789-Washington", "1793-Washington")
    )
    expect_identical(
        docnames(arrange(data_corpus_inaugural[1:3], President, desc(Year))),
        c("1797-Adams", "1793-Washington", "1789-Washington")
    )
})

test_that("corpus filter() works", {
    expect_identical(
        docnames(filter(data_corpus_inaugural, President == "Roosevelt")),
        c("1905-Roosevelt", "1933-Roosevelt", "1937-Roosevelt",
          "1941-Roosevelt", "1945-Roosevelt")
    )
    expect_identical(
        docvars(filter(data_corpus_inaugural, President == "Roosevelt")),
        structure(list(Year = c(1905L, 1933L, 1937L, 1941L, 1945L),
                       President = c("Roosevelt", "Roosevelt", "Roosevelt",
                                     "Roosevelt", "Roosevelt"),
                       FirstName = c("Theodore", "Franklin D.", "Franklin D.",
                                     "Franklin D.", "Franklin D."),
                       Party = structure(c(5L, 1L, 1L, 1L, 1L), .Label = c("Democratic", 
                                                                           "Democratic-Republican", "Federalist", "none", "Republican", 
                                                                           "Whig"), class = "factor")),
                  row.names = c(NA, -5L), class = "data.frame")
    )
    expect_identical(
        docnames(filter(data_corpus_inaugural[1:3], c(TRUE, FALSE, TRUE))),
        c("1789-Washington", "1797-Adams")
    )

    x <- c(TRUE, FALSE, TRUE)
    expect_identical(
        docnames(filter(data_corpus_inaugural[1:3], x)),
        c("1789-Washington", "1797-Adams")
    )
})

test_that("corpus mutate() works", {
    expect_identical(
        docvars(mutate(data_corpus_inaugural[1],
                       pyear = paste(President, Year, sep = "_"))),
        structure(list(Year = 1789L, President = "Washington",
                       FirstName = "George",
                       Party = structure(4L, .Label = c("Democratic", "Democratic-Republican", 
                                                        "Federalist", "none", "Republican", "Whig"), class = "factor"),
                       pyear = "Washington_1789"),
                  row.names = c(NA, -1L), class = "data.frame")
    )
    expect_identical(
        docvars(mutate(data_corpus_inaugural[1:3],
                       cent = floor(Year / 100) * 100),
                "cent"),
        c(1700, 1700, 1700)
    )
})

test_that("corpus transmute() works", {
    expect_identical(
        docvars(transmute(data_corpus_inaugural[1],
                       pyear = paste(President, Year, sep = "_"))),
        structure(list(pyear = "Washington_1789"), row.names = c(NA, -1L),
                  class = "data.frame")
    )
})

test_that("corpus select() works", {
    expect_identical(
        names(docvars(select(data_corpus_inaugural[1:2], Party, Year))),
        c("Party", "Year")
    )
    expect_identical(
        names(docvars(rename(data_corpus_inaugural[1:2],
                             LastName = President))),
        c("Year", "LastName", "FirstName", "Party")
    )
})

test_that("pull works", {
    corp <- head(data_corpus_inaugural, 3)
    expect_identical(
        corp %>% pull(President),
        c("Washington", "Washington", "Adams")
    )
    expect_identical(
        corp %>% pull(1),
        c(1789L, 1793L, 1797L)
    )
    expect_identical(
        corp %>% pull(-1),
        structure(c(4L, 4L, 3L),
                  .Label = c("Democratic", "Democratic-Republican",
                             "Federalist", "none", "Republican", "Whig"),
                  class = "factor")
    )

    toks <- tokens(corp)
    expect_identical(
        toks %>% pull(President),
        c("Washington", "Washington", "Adams")
    )

    dfmat <- dfm(toks)
    expect_identical(
        dfmat %>% pull(President),
        c("Washington", "Washington", "Adams")
    )
})

test_that("pull gives informative error with empty docvars", {
    corp <- corpus(letters[1:3])
    expect_error(
        pull(corp),
        "Cannot extract document variables: corpus has no docvars"
    )

    toks <- tokens(corp)
    expect_error(
        pull(toks),
        "Cannot extract document variables: corpus has no docvars"
    )

    dfmat <- dfm(toks)
    expect_error(
        pull(dfmat),
        "Cannot extract document variables: corpus has no docvars"
    )
})

test_that("corpus distinct() works", {
    # Test distinct with .keep_all = FALSE (default behavior)
    corp <- data_corpus_inaugural[c(1, 2, 4)]  # Washington, Washington, Jefferson
    result <- distinct(corp, President)
    expect_identical(
        docnames(result),
        c("1789-Washington", "1801-Jefferson")
    )
    expect_identical(
        names(docvars(result)),
        c("President")
    )

    # Test distinct with .keep_all = TRUE
    result_keep_all <- distinct(corp, President, .keep_all = TRUE)
    expect_identical(
        docnames(result_keep_all),
        c("1789-Washington", "1801-Jefferson")
    )
    expect_identical(
        names(docvars(result_keep_all)),
        c("Year", "President", "FirstName", "Party")
    )

    # Test distinct with multiple variables
    corp2 <- data_corpus_inaugural[1:10]
    result_multi <- distinct(corp2, Party, .keep_all = TRUE)
    expect_true(
        nrow(docvars(result_multi)) < nrow(docvars(corp2))
    )
})

test_that("corpus relocate() works", {
    corp <- data_corpus_inaugural[1:3]

    # Test relocating to front
    result <- relocate(corp, Party)
    expect_identical(
        names(docvars(result)),
        c("Party", "Year", "President", "FirstName")
    )

    # Test relocating to end
    result_after <- relocate(corp, Year, .after = last_col())
    expect_identical(
        names(docvars(result_after)),
        c("President", "FirstName", "Party", "Year")
    )

    # Test relocating before a column
    result_before <- relocate(corp, Party, .before = President)
    expect_identical(
        names(docvars(result_before)),
        c("Year", "Party", "President", "FirstName")
    )

    # Verify document order is preserved
    expect_identical(
        docnames(result),
        docnames(corp)
    )
})

test_that("corpus rename() works", {
    corp <- data_corpus_inaugural[1:3]

    # Test renaming a single variable
    result <- rename(corp, LastName = President)
    expect_identical(
        names(docvars(result)),
        c("Year", "LastName", "FirstName", "Party")
    )
    expect_identical(
        docvars(result)$LastName,
        c("Washington", "Washington", "Adams")
    )

    # Test renaming multiple variables
    result_multi <- rename(corp, LastName = President, Given = FirstName)
    expect_identical(
        names(docvars(result_multi)),
        c("Year", "LastName", "Given", "Party")
    )

    # Verify document order and texts are preserved
    expect_identical(docnames(result), docnames(corp))
    expect_identical(as.character(result), as.character(corp))
})

test_that("corpus rename_with() works", {
    corp <- data_corpus_inaugural[1:3]

    # Test renaming all columns with a function
    result <- rename_with(corp, toupper)
    expect_identical(
        names(docvars(result)),
        c("YEAR", "PRESIDENT", "FIRSTNAME", "PARTY")
    )

    # Test renaming with selection
    result_select <- rename_with(corp, toupper, starts_with("P"))
    expect_identical(
        names(docvars(result_select)),
        c("Year", "PRESIDENT", "FirstName", "PARTY")
    )

    # Test with custom function
    add_prefix <- function(x) paste0("doc_", x)
    result_prefix <- rename_with(corp, add_prefix, all_of(c("Year", "President")))
    expect_identical(
        names(docvars(result_prefix)),
        c("doc_Year", "doc_President", "FirstName", "Party")
    )

    # Verify document order and content are preserved
    expect_identical(
        docnames(result),
        docnames(corp)
    )
    expect_identical(
        docvars(result)$YEAR,
        docvars(corp)$Year
    )
})

test_that("corpus slice() works", {
    corp <- data_corpus_inaugural[1:10]

    # Test basic slicing by position
    result <- slice(corp, 1:3)
    expect_identical(
        docnames(result),
        docnames(corp)[1:3]
    )
    expect_equal(ndoc(result), 3)

    # Test slicing with negative indices
    result_neg <- slice(corp, -(1:5))
    expect_equal(ndoc(result_neg), 5)
    expect_identical(
        docnames(result_neg),
        docnames(corp)[6:10]
    )

    # Test slicing with non-sequential indices
    result_skip <- slice(corp, c(1, 3, 5, 7, 9))
    expect_equal(ndoc(result_skip), 5)
    expect_identical(
        docnames(result_skip)[1],
        docnames(corp)[1]
    )
})

test_that("corpus slice_head() works", {
    corp <- data_corpus_inaugural[1:10]

    # Test with n argument
    result_n <- slice_head(corp, n = 3)
    expect_equal(ndoc(result_n), 3)
    expect_identical(
        docnames(result_n),
        docnames(corp)[1:3]
    )

    # Test with prop argument
    result_prop <- slice_head(corp, prop = 0.3)
    expect_equal(ndoc(result_prop), 3)
    expect_identical(
        docnames(result_prop),
        docnames(corp)[1:3]
    )
})

test_that("corpus slice_tail() works", {
    corp <- data_corpus_inaugural[1:10]

    # Test with n argument
    result_n <- slice_tail(corp, n = 3)
    expect_equal(ndoc(result_n), 3)
    expect_identical(
        docnames(result_n),
        docnames(corp)[8:10]
    )

    # Test with prop argument
    result_prop <- slice_tail(corp, prop = 0.2)
    expect_equal(ndoc(result_prop), 2)
    expect_identical(
        docnames(result_prop),
        docnames(corp)[9:10]
    )
})

test_that("corpus slice_sample() works", {
    corp <- data_corpus_inaugural[1:10]

    # Test with n argument
    set.seed(123)
    result_n <- slice_sample(corp, n = 5)
    expect_equal(ndoc(result_n), 5)
    expect_true(all(docnames(result_n) %in% docnames(corp)))

    # Test with prop argument
    set.seed(456)
    result_prop <- slice_sample(corp, prop = 0.5)
    expect_equal(ndoc(result_prop), 5)
    expect_true(all(docnames(result_prop) %in% docnames(corp)))

    # Test that different seeds give different results
    set.seed(789)
    result_diff <- slice_sample(corp, n = 5)
    expect_false(identical(docnames(result_n), docnames(result_diff)))
})

test_that("corpus slice_min() works", {
    corp <- data_corpus_inaugural[1:10]

    # Test with n argument
    result_n <- slice_min(corp, Year, n = 3)
    expect_equal(ndoc(result_n), 3)
    expect_true(all(docvars(result_n)$Year <= 1797))

    # Test with prop argument
    result_prop <- slice_min(corp, Year, prop = 0.2)
    expect_equal(ndoc(result_prop), 2)

    # Verify ordering (should be in original order, not sorted)
    result_ordered <- slice_min(corp, Year, n = 5)
    expect_true(min(docvars(result_ordered)$Year) == min(docvars(corp)$Year))
})

test_that("corpus slice_max() works", {
    corp <- data_corpus_inaugural[1:10]

    # Test with n argument (top 3 years are 1825, 1821, 1817)
    result_n <- slice_max(corp, Year, n = 3)
    expect_equal(ndoc(result_n), 3)
    expect_true(all(docvars(result_n)$Year >= 1817))

    # Test with prop argument
    result_prop <- slice_max(corp, Year, prop = 0.3)
    expect_equal(ndoc(result_prop), 3)

    # Test with ties
    corp_ties <- data_corpus_inaugural[c(1, 2, 3)]  # Has two 179x years
    result_ties <- slice_max(corp_ties, Year, n = 1, with_ties = TRUE)
    expect_true(ndoc(result_ties) >= 1)
})

test_that("corpus add_count() works", {
    corp <- data_corpus_inaugural[1:10]

    # Test basic add_count by single variable
    result <- add_count(corp, President)
    expect_true("n" %in% names(docvars(result)))
    expect_equal(ndoc(result), ndoc(corp))

    # Check that Washington has count of 2 (appears twice in first 10)
    washington_count <- unique(docvars(result)[docvars(result)$President == "Washington", "n"])
    expect_equal(washington_count, 2)

    # Test add_count with custom name
    result_custom <- add_count(corp, President, name = "pres_count")
    expect_true("pres_count" %in% names(docvars(result_custom)))
    expect_false("n" %in% names(docvars(result_custom)))

    # Test add_count by multiple variables
    result_multi <- add_count(corp, President, Party)
    expect_true("n" %in% names(docvars(result_multi)))

    # Test add_count with sort
    result_sort <- add_count(corp, President, sort = TRUE)
    expect_true("n" %in% names(docvars(result_sort)))
    # First document should have the highest count or tied for highest
    first_count <- docvars(result_sort)$n[1]
    expect_true(first_count >= max(docvars(result_sort)$n) - 1)

    # Verify original docvars are preserved
    expect_true(all(c("Year", "President", "FirstName", "Party") %in% names(docvars(result))))
})

test_that("corpus add_tally() works", {
    corp <- data_corpus_inaugural[1:10]

    # Test basic add_tally (adds total count to each row)
    result <- add_tally(corp)
    expect_true("n" %in% names(docvars(result)))
    expect_equal(ndoc(result), ndoc(corp))

    # All rows should have the same count (total number of documents)
    expect_true(all(docvars(result)$n == 10))

    # Test add_tally with custom name
    result_custom <- add_tally(corp, name = "total")
    expect_true("total" %in% names(docvars(result_custom)))
    expect_false("n" %in% names(docvars(result_custom)))
    expect_true(all(docvars(result_custom)$total == 10))

    # Test add_tally with a subset that has duplicates
    corp_subset <- data_corpus_inaugural[c(1, 1, 2, 2, 2, 3)]
    result_subset <- add_tally(corp_subset)
    expect_equal(ndoc(result_subset), 6)
    expect_true(all(docvars(result_subset)$n == 6))

    # Verify original docvars are preserved
    expect_true(all(c("Year", "President", "FirstName", "Party") %in% names(docvars(result))))
})

test_that("add_count() and add_tally() with wt argument", {
    corp <- data_corpus_inaugural[1:10]

    # Create a corpus with a weight variable
    corp_wt <- mutate(corp, weight = Year - min(Year) + 1)

    # Test add_count with weights
    result_count_wt <- add_count(corp_wt, President, wt = weight)
    expect_true("n" %in% names(docvars(result_count_wt)))

    # Check that weighted count for Washington is sum of weights
    # Washington appears at indices 1 and 2 (years 1789 and 1793)
    # weights are: (1789-1789+1)=1 and (1793-1789+1)=5, sum=6
    washington_weighted <- unique(docvars(result_count_wt)[docvars(result_count_wt)$President == "Washington", "n"])
    expect_equal(washington_weighted, 6)

    # Test add_tally with weights
    result_tally_wt <- add_tally(corp_wt, wt = weight)
    expect_true("n" %in% names(docvars(result_tally_wt)))
    # All rows should have the same weighted total
    expected_total <- sum(docvars(corp_wt)$weight)
    expect_true(all(docvars(result_tally_wt)$n == expected_total))
})

Try the quanteda.tidy package in your browser

Any scripts or data that you put into this service are public.

quanteda.tidy documentation built on Dec. 17, 2025, 5:09 p.m.