tests/testthat/test-merge-datasets.R

context("Merge/extend dataset")

with_mock_crunch({
    ds1 <- cachedLoadDataset("test ds")
    ds2 <- cachedLoadDataset("ECON.sav")

    testPayloadNoFilterArg <- paste0(
        '{"function":"adapt",',
        '"args":[{"dataset":"https://app.crunch.io/api/datasets/3/"},',
        '{"variable":"https://app.crunch.io/api/datasets/3/variables/birthyr/"},',
        '{"variable":"https://app.crunch.io/api/datasets/1/variables/birthyr/"}]'
    )
    testPayload <- paste0(testPayloadNoFilterArg, "}")
    genderFilter <- paste0(
        '{"function":"==","args":[',
        '{"variable":"https://app.crunch.io/api/datasets/3/variables/gender/"},{"value":1}]}'
    )
    testPayloadWithFilter <- paste0(
        testPayloadNoFilterArg, ',"filter":',
        genderFilter, "}"
    )
    testSubsetPayloadPart1 <- paste0(
        '{"function":"select","args":[{"map":{',
        '"66ae9881e3524f7db84970d556c34552":',
        '{"variable":"https://app.crunch.io/api/datasets/3/variables/gender/"},',
        '"f78ca47313144b57adfb495893968e70":',
        '{"variable":"https://app.crunch.io/api/datasets/3/variables/birthyr/"}}}],',
        '"frame":'
    )
    testSubsetPayload <- paste0(testSubsetPayloadPart1, testPayload, "}")
    testSubsetPayloadWithFilter <- paste0(
        testSubsetPayloadPart1, testPayload,
        ',"filter":', genderFilter, "}"
    )

    test_that("Correct payload without filtering", {
        expect_warning(
            expect_POST(
                merge(ds1, ds2, by.x = ds1$birthyr, ds2$birthyr),
                "https://app.crunch.io/api/datasets/1/variables/",
                testPayload
            ),
            "Variable birthyr is hidden"
        )
    })

    test_that("Can reference variables by alias", {
        expect_warning(
            expect_POST(
                merge(ds1, ds2, by.x = "birthyr", by.y = "birthyr"),
                "https://app.crunch.io/api/datasets/1/variables/",
                testPayload
            ),
            "Variable birthyr is hidden"
        )
        expect_warning(
            expect_POST(
                merge(ds1, ds2, by = "birthyr"),
                "https://app.crunch.io/api/datasets/1/variables/",
                testPayload
            ),
            "Variable birthyr is hidden"
        )
    })
    test_that("joinDatasets with default copy=TRUE redirects here", {
        expect_warning(
            expect_POST(
                joinDatasets(ds1, ds2, by.x = ds1$birthyr, ds2$birthyr),
                "https://app.crunch.io/api/datasets/1/variables/",
                testPayload
            ),
            "Variable birthyr is hidden"
        )
    })
    test_that("joinDatasets with copy=FALSE is (no longer/not yet) valid", {
        expect_error(
            joinDatasets(ds1, ds2, by.x = ds1$birthyr, ds2$birthyr, copy = FALSE),
            "Virtual joins are not yet supported."
        )
    })

    test_that("merge a subset of variables", {
        expect_warning(
            expect_POST(
                merge(ds1, ds2[c("gender", "birthyr")], by = "birthyr"),
                "https://app.crunch.io/api/datasets/1/variables/",
                testSubsetPayload
            ),
            "Variable birthyr is hidden"
        )
    })

    test_that("filter rows in merge", {
        expect_warning(
            expect_POST(
                merge(ds1, ds2[ds2$gender == "Male", ], by = "birthyr"),
                "https://app.crunch.io/api/datasets/1/variables/",
                testPayloadWithFilter
            ),
            "Variable birthyr is hidden"
        )
    })

    test_that("filter rows and variables in merge", {
        expect_warning(
            expect_POST(
                merge(ds1, ds2[ds2$gender == "Male", c("gender", "birthyr")],
                    by = "birthyr"
                ),
                "https://app.crunch.io/api/datasets/1/variables/",
                testSubsetPayloadWithFilter
            ),
            "Variable birthyr is hidden"
        )
    })

    test_that("Input validation for merge/extend (plus method dispatch)", {
        expect_error(
            extendDataset(1),
            "x must be a Crunch Dataset"
        )
        expect_error(
            merge(ds1, 1, by.x = ds1[[1]]),
            "y must be a Crunch Dataset"
        )
        expect_error(
            merge(ds1, ds2, by.x = 1),
            "by.x must be a Crunch Variable"
        )
        expect_error(
            merge(ds1, ds2, by.x = ds2[[1]]),
            "by.x must be a variable in x"
        )
        expect_error(
            merge(ds1, ds2, by.x = ds1[[1]], by.y = 1),
            "by.y must be a Crunch Variable"
        )
        expect_error(
            merge(ds1, ds2, by.x = ds1[[1]], by.y = ds1[[1]]),
            "by.y must be a variable in y"
        )
        expect_warning(
            expect_error(
                merge(ds1, ds2, by.x = ds1$birthyr, by.y = ds2$birthyr, all = TRUE),
                'Option "all" not supported.'
            ),
            "Variable birthyr is hidden"
        ) ## In ds2
        expect_warning(
            expect_error(
                merge(ds1, ds2, by.x = ds1$birthyr, by.y = ds2$birthyr, all.x = FALSE),
                'Option "all.x=FALSE" not supported.'
            ),
            "Variable birthyr is hidden"
        ) ## In ds2
        expect_warning(
            expect_error(
                merge(ds1, ds2, by.x = ds1$birthyr, by.y = ds2$birthyr, all.y = TRUE),
                'Option "all.y" not supported.'
            ),
            "Variable birthyr is hidden"
        ) ## In ds2
    })

    test_that("Categorical and array variables can't be used as keys", {
        expect_error(
            merge(ds1, ds2, by.x = ds1$gender, by.y = ds2$birthyr),
            "by.x must be type numeric or text"
        )
        expect_error(
            merge(ds1, ds2, by.x = ds1$birthyr, by.y = ds2$gender),
            "by.y must be type numeric or text"
        )
    })

    test_that("Providing != 1 alias gives useful error message", {
        expect_error(
            merge(ds1, ds2),
            "by.x must reference one and only one variable"
        )
        ## Default "by" is intersection of names
        expect_error(
            merge(ds1, ds2, by.x = ds1$birthyr),
            "by.y must reference one and only one variable"
        )
    })

    test_that("An invalid alias gives a useful error message", {
        expect_error(
            merge(ds1, ds2, by.x = "NOTAVARIABLE"),
            "NOTAVARIABLE does not reference a variable in x"
        )
        expect_error(
            merge(ds1, ds2, by.x = ds1$birthyr, by.y = "NOTAVARIABLE"),
            "NOTAVARIABLE does not reference a variable in y"
        )
    })
})

printed_order_apidocs2 <- c(
    "[+] Key Pet Indicators",
    "    All pets owned",
    "    Pet",
    "    Pets by location",
    "[+] Dog Metrics",
    "    Number of dogs",
    "    [+] Number of dogs by type",
    "        Number of dogs -- With papers",
    "        Number of dogs -- Mutts",
    "[+] Details",
    "    Pet name",
    "[+] Dimensions",
    "    Country",
    "    Wave",
    "Person ID",
    "[+] __hidden__",
    "    Weight",
    "    Case ID"
)

printed_order_apidocs2_merge_by_stringid <- c( # nolint
    "    [+] Key Pet Indicators",
    "        All pets owned",
    "        Pet",
    "        Pets by location",
    "    [+] Dog Metrics",
    "        Number of dogs",
    "        [+] Number of dogs by type",
    "            Number of dogs -- With papers",
    "            Number of dogs -- Mutts",
    "    [+] Details",
    "        Pet name",
    "    [+] Dimensions",
    "        Country",
    "        Wave",
    "[+] __hidden__",
    "    [+] Example dataset",
    "        Weight",
    "        Case ID"
)

printed_order_apidocs2_merge_by_caseid <- c( # nolint
    "    [+] Key Pet Indicators",
    "        All pets owned",
    "        Pet",
    "        Pets by location",
    "    [+] Dog Metrics",
    "        Number of dogs",
    "        [+] Number of dogs by type",
    "            Number of dogs -- With papers",
    "            Number of dogs -- Mutts",
    "    [+] Details",
    "        Pet name",
    "    [+] Dimensions",
    "        Country",
    "        Wave",
    "    Person ID",
    "[+] __hidden__",
    "    [+] Example dataset",
    "        Weight"
)

with_test_authentication({
    ds1 <- newDatasetFromFixture("join-apidocs2-to-me")
    with_consent(ds1$allpets_1 <- NULL)
    ds2 <- newDatasetFromFixture("apidocs2")
    test_that("Shape of apidocs2", {
        expect_prints(ordering(ds2),
            paste(printed_order_apidocs2, collapse = "\n"),
            fixed = TRUE
        )
        expect_identical(names(ds2), c(
            "allpets", "q1", "petloc", "ndogs",
            "ndogs_a", "ndogs_b", "q3", "country", "wave", "stringid"
        ))
        expect_identical(dim(ds2), c(20L, 10L))
    })
    test_that("Shape of join-to-me", {
        expect_identical(names(ds1), c("id", "matches", "other_var"))
        expect_identical(dim(ds1), c(14L, 3L))
    })

    test_that("An uncomplicated merge on a text key", {
        ds1 <- merge(ds1, ds2, by.x = ds1$id, by.y = ds2$stringid)
        expect_is(ds1, "CrunchDataset")
        expect_identical(dim(ds1), c(14L, 12L))
        expect_prints(ordering(ds1),
            paste(c(
                "ID",
                "Join matches",
                "Another variable",
                paste0("[+] ", name(ds2)),
                paste0(printed_order_apidocs2_merge_by_stringid) # nolint
            ),
            collapse = "\n"
            ),
            fixed = TRUE
        )
        expect_identical(
            names(ds1),
            c(
                "id", "matches", "other_var", "allpets", "q1", "petloc", "ndogs",
                "ndogs_a", "ndogs_b", "q3", "country", "wave"
            )
        )
        expect_identical(hiddenVariables(ds1, "name"), c("Case ID", "Weight"))
    })

    test_that("Similarly uncomplicated merge, but numeric and hidden key", {
        # When merging datasets during jenkins run tests, the order of the
        # variables is *sometimes* not copied (usually Pet Name/q3 is missing),
        # which makes this test fail. The ordering should be fully copied
        # during the merge operation. Reproduction of the bug on production or
        # alpha did not yield errors with order. Repro. locally is possible
        ds1 <- newDatasetFromFixture("join-apidocs2-to-me")
        with_consent(ds1$allpets_1 <- NULL)
        type(ds1$id) <- "numeric"

        expect_warning(
            ds1 <- merge(ds1, ds2, by.x = ds1$id, by.y = ds2$caseid),
            "Variable caseid is hidden"
        )
        expect_is(ds1, "CrunchDataset")
        expect_identical(dim(ds1), c(14L, 13L))
        expect_prints(ordering(ds1),
            paste(c(
                "ID",
                "Join matches",
                "Another variable",
                paste0("[+] ", name(ds2)),
                paste0(printed_order_apidocs2_merge_by_caseid) # nolint
            ),
            collapse = "\n"
            ),
            fixed = TRUE
        )
        expect_identical(
            names(ds1),
            c(
                "id", "matches", "other_var", "allpets", "q1", "petloc", "ndogs",
                "ndogs_a", "ndogs_b", "q3", "country", "wave", "stringid"
            )
        )
        expect_identical(hiddenVariables(ds1, "name"), "Weight")
    })

    ## More tests to write:
    ## 1) check for handling of the conflicted alias
    ## 2) weight_variables?
    ## 3) apply exclusion filter on either dataset

    test_that("Can select variables to join", {
        ds1 <- newDatasetFromFixture("join-apidocs2-to-me")
        with_consent(ds1$allpets_1 <- NULL)
        ds1 <- merge(ds1, ds2[c("stringid", "q1", "petloc")],
            by.x = "id", by.y = "stringid"
        )
        expect_identical(
            names(ds1),
            c("id", "matches", "other_var", "q1", "petloc")
        )
    })

    test_that("Can select variables and rows to join", {
        ds1 <- newDatasetFromFixture("join-apidocs2-to-me")
        with_consent(ds1$allpets_1 <- NULL)
        ds1 <- merge(ds1, ds2[ds2$stringid == "43805958", c("stringid", "q1", "petloc")],
            by.x = "id", by.y = "stringid"
        )
        expect_prints(ordering(ds1),
            paste(c(
                "ID",
                "Join matches",
                "Another variable",
                paste0("[+] ", name(ds2)),
                paste("    ",
                    c(
                        "[+] Key Pet Indicators",
                        "    Pet",
                        "    Pets by location"
                    ),
                    sep = "", collapse = "\n"
                )
            ),
            collapse = "\n"
            ),
            fixed = TRUE
        )
        expect_identical(
            names(ds1),
            c("id", "matches", "other_var", "q1", "petloc")
        )
        expect_equal(sum(table(ds1$q1)), 1)
    })

    test_that("Can select rows to join", {
        # When merging datasets during jenkins run tests, the order of the
        # variables is *sometimes* not copied (usually Pet Name/q3 is missing),
        # which makes this test fail. The ordering should be fully copied
        # during the merge operation. Reproduction of the bug on production or
        # alpha did not yield errors with order. Repro. locally is possible
        ds1 <- newDatasetFromFixture("join-apidocs2-to-me")
        with_consent(ds1$allpets_1 <- NULL)
        ds1 <- merge(ds1, ds2[ds2$stringid == "43805958", ],
            by.x = "id", by.y = "stringid"
        )
        expect_prints(ordering(ds1),
            paste(c(
                "ID",
                "Join matches",
                "Another variable",
                paste0("[+] ", name(ds2)),
                paste0(printed_order_apidocs2_merge_by_stringid) # nolint
            ),
            collapse = "\n"
            ),
            fixed = TRUE
        )
        expect_identical(
            names(ds1),
            c(
                "id", "matches", "other_var", "allpets", "q1", "petloc", "ndogs",
                "ndogs_a", "ndogs_b", "q3", "country", "wave"
            )
        )
        expect_equal(sum(table(ds1$q1)), 1)
    })
})

Try the crunch package in your browser

Any scripts or data that you put into this service are public.

crunch documentation built on Aug. 31, 2023, 1:07 a.m.