# tests/testthat/test-left-join.R

# Label the tests in this file for the test reporter.
# NOTE(review): context() is deprecated under testthat 3rd edition, where the
# label is taken from the file name instead -- confirm which edition this
# package uses before removing the call.
context("test left_join")

test_that("left_join works with regular docvars", {
    # The lookup table carries one extra row (id = 4) that no document uses.
    lookup <- data.frame(id = 1:4, info = c("x", "y", "z", "w"))
    texts <- c(doc1 = "text1", doc2 = "text2", doc3 = "text3")
    doc_meta <- data.frame(id = 1:3, name = c("a", "b", "c"))

    joined <- left_join(corpus(texts, docvars = doc_meta), lookup, by = "id")

    # Document names and their order are expected to be untouched.
    expect_identical(docnames(joined), c("doc1", "doc2", "doc3"))

    # Expected docvars: the original columns followed by the matched `info`.
    expected <- data.frame(
        id = 1:3,
        name = c("a", "b", "c"),
        info = c("x", "y", "z")
    )
    expect_identical(docvars(joined), expected)
})

test_that("left_join handles unmatched rows with NA", {
    # id = 2 has no row in the lookup table; id = 5 has no document.
    lookup <- data.frame(id = c(1, 3, 5), info = c("a", "c", "e"))
    texts <- c(doc1 = "text1", doc2 = "text2", doc3 = "text3")
    unjoined <- corpus(texts, docvars = data.frame(id = 1:3))

    joined <- left_join(unjoined, lookup, by = "id")

    # The unmatched document (id = 2) is expected to get NA for `info`.
    # NOTE(review): expect_equal (not expect_identical) is presumably used
    # because the corpus key is integer while the lookup key is double.
    expected <- data.frame(id = 1:3, info = c("a", NA, "c"))
    expect_equal(docvars(joined), expected)
})

test_that("left_join works with docname in y", {
    # The lookup table is keyed by a `docname` column; "doc4" matches nothing.
    lookup <- data.frame(
        docname = c("doc1", "doc2", "doc4"),
        info = c("a", "b", "d")
    )
    texts <- c(doc1 = "text1", doc2 = "text2", doc3 = "text3")
    unjoined <- corpus(texts, docvars = data.frame(id = 1:3))

    joined <- left_join(unjoined, lookup, by = "docname")
    joined_vars <- docvars(joined)

    expect_identical(docnames(joined), c("doc1", "doc2", "doc3"))

    expected <- data.frame(id = 1:3, info = c("a", "b", NA))
    expect_identical(joined_vars, expected)

    # The join key itself must not leak into the docvars.
    expect_false("docname" %in% names(joined_vars))
})

test_that("left_join works with join_by(docname == other_col)", {
    # This test is gated on dplyr >= 1.1.0 being installed.
    skip_if_not_installed("dplyr", minimum_version = "1.1.0")

    # The lookup key lives in `doc_id`; "doc4" matches no document.
    lookup <- data.frame(
        doc_id = c("doc1", "doc2", "doc4"),
        info = c("a", "b", "d")
    )
    texts <- c(doc1 = "text1", doc2 = "text2", doc3 = "text3")
    unjoined <- corpus(texts, docvars = data.frame(id = 1:3))

    key_spec <- dplyr::join_by(docname == doc_id)
    joined <- left_join(unjoined, lookup, by = key_spec)
    joined_vars <- docvars(joined)

    expect_identical(docnames(joined), c("doc1", "doc2", "doc3"))

    expected <- data.frame(id = 1:3, info = c("a", "b", NA))
    expect_identical(joined_vars, expected)

    # The join key itself must not leak into the docvars.
    expect_false("docname" %in% names(joined_vars))
})

test_that("left_join works with named vector c(docname = other_col)", {
    # The lookup key lives in `doc_id`; "doc4" matches no document.
    lookup <- data.frame(
        doc_id = c("doc1", "doc2", "doc4"),
        info = c("a", "b", "d")
    )
    texts <- c(doc1 = "text1", doc2 = "text2", doc3 = "text3")
    unjoined <- corpus(texts, docvars = data.frame(id = 1:3))

    # Map the corpus-side "docname" key onto the lookup's `doc_id` column.
    key_map <- c("docname" = "doc_id")
    joined <- left_join(unjoined, lookup, by = key_map)
    joined_vars <- docvars(joined)

    expect_identical(docnames(joined), c("doc1", "doc2", "doc3"))

    expected <- data.frame(id = 1:3, info = c("a", "b", NA))
    expect_identical(joined_vars, expected)

    # The join key itself must not leak into the docvars.
    expect_false("docname" %in% names(joined_vars))
})

test_that("left_join uses existing docname docvar when present", {
    # The corpus has a `docname` docvar whose values (doc1..doc3) differ from
    # the actual document names (a, b, c); the join should key on the docvar.
    lookup <- data.frame(
        docname = c("doc1", "doc2", "doc4"),
        info = c("a", "b", "d")
    )
    doc_meta <- data.frame(
        docname = c("doc1", "doc2", "doc3"),
        id = 1:3
    )
    unjoined <- corpus(c(a = "text1", b = "text2", c = "text3"),
                       docvars = doc_meta)

    joined <- left_join(unjoined, lookup, by = "docname")

    # The real document names stay as they were.
    expect_identical(docnames(joined), c("a", "b", "c"))

    # Matches follow the `docname` docvar values, so "doc3" gets NA.
    expected <- data.frame(
        docname = c("doc1", "doc2", "doc3"),
        id = 1:3,
        info = c("a", "b", NA)
    )
    expect_identical(docvars(joined), expected)
})

test_that("left_join works with multiple join columns", {
    # The lookup holds every year/country pair; (2021, "UK") has no document.
    lookup <- data.frame(
        year = c(2020, 2021, 2020, 2021),
        country = c("US", "US", "UK", "UK"),
        value = c(10, 20, 30, 40)
    )
    doc_meta <- data.frame(
        year = c(2020, 2021, 2020),
        country = c("US", "US", "UK")
    )
    texts <- c(doc1 = "text1", doc2 = "text2", doc3 = "text3")

    keys <- c("year", "country")
    joined <- left_join(corpus(texts, docvars = doc_meta), lookup, by = keys)

    # Each document picks up the `value` of its own year/country pair.
    expected <- data.frame(
        year = c(2020, 2021, 2020),
        country = c("US", "US", "UK"),
        value = c(10, 20, 30)
    )
    expect_identical(docvars(joined), expected)
})

test_that("left_join preserves corpus metadata", {
    lookup <- data.frame(id = 1:2, info = c("a", "b"))

    # Attach two corpus-level metadata fields before joining.
    tagged <- corpus(c(doc1 = "text1", doc2 = "text2"),
                     docvars = data.frame(id = 1:2))
    meta(tagged, "source") <- "test"
    meta(tagged, "notes") <- "example"

    joined <- left_join(tagged, lookup, by = "id")

    # Both fields are expected to come through the join unchanged.
    source_after <- meta(joined, "source")
    expect_identical(source_after, "test")
    notes_after <- meta(joined, "notes")
    expect_identical(notes_after, "example")
})

test_that("left_join handles suffix parameter correctly", {
    # Both sides have a non-key column called `value`, forcing a name clash.
    lookup <- data.frame(id = 1:2, value = c("a", "b"))
    doc_meta <- data.frame(id = 1:2, value = c("x", "y"))
    unjoined <- corpus(c(doc1 = "text1", doc2 = "text2"), docvars = doc_meta)

    joined <- left_join(unjoined, lookup, by = "id",
                        suffix = c("_corp", "_df"))

    # The clashing columns are expected to be renamed with the given suffixes.
    joined_names <- names(docvars(joined))
    expect_identical(joined_names, c("id", "value_corp", "value_df"))
})

# Try the quanteda.tidy package in your browser
#
# Any scripts or data that you put into this service are public.
#
# quanteda.tidy documentation built on Dec. 17, 2025, 5:09 p.m.