test-derive-array.R
In crunch: Crunch.io Data Tools

context("Deriving array variables")

with_mock_crunch({
    ds <- cachedLoadDataset("test ds")
    test_that("deriveArray works with MR", {
        expect_POST(
            ds$derived_mr <- deriveArray(list(ds$gender),
                selections = list("Female"),
                name = "derivedMR",
                numeric = FALSE
            ),
            "https://app.crunch.io/api/datasets/1/variables/",
            '{"derivation":{"function":"select_categories","args":',
            '[{"function":"array","args":[{"function":"make_frame","args":',
            '[{"map":{"0001":{"variable":"https://app.crunch.io/api/datasets',
            '/1/variables/gender/"}}},{"value":["0001"]}]}],"kwargs":{"numeric":{"value":false}}},', # nolint
            '{"value":["Female"]}]},"name":"derivedMR","alias":"derived_mr"}'
        )
    })
})


with_test_authentication({
    ds <- newDatasetFromFixture("apidocs")

    ## Reference values:
    plh.values <- as.vector(ds$petloc$petloc_home)
    plw.values <- as.vector(ds$petloc$petloc_work)
    q1.values <- as.vector(ds$q1)
    plh.counts <- table(ds$petloc$petloc_home)
    # petloc_home
    #  Cat  Dog Bird
    #    5    3    3
    plw.counts <- table(ds$petloc$petloc_work)
    # petloc_work
    #  Cat  Dog Bird
    #    6    4    6

    vd <- deriveArray(list(ds$q1, ds$petloc$petloc_home), name = "Derived pets")
    test_that("deriveArray returns a VarDef", {
        expect_is(vd, "VariableDefinition")
    })
    ds$derivedarray <- vd
    test_that("Sending a derived array vardef creates a derived array", {
        expect_true(is.CA(ds$derivedarray))
        expect_identical(names(subvariables(ds$derivedarray)), c("Pet", "Home"))
        expect_identical_temp_nodata(
            names(categories(ds$derivedarray)),
            c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data")
        )
        expect_identical(as.vector(ds$derivedarray[[1]]), q1.values)
        expect_identical(as.vector(ds$derivedarray[[2]]), as.vector(ds$petloc$petloc_home))
    })

    ds$derivedmr <- deriveArray(
        list(ds$q1, ds$petloc$petloc_home),
        selections = "Dog", name = "Derived pets MR"
    ) # cat dog has id 2
    test_that("deriveArray can also derive MRs", {
        expect_true(is.MR(ds$derivedmr))
        expect_identical(names(subvariables(ds$derivedmr)), c("Pet", "Home"))
        expect_identical_temp_nodata(
            names(categories(ds$derivedmr)),
            c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data")
        )
        expect_identical(as.vector(ds$derivedmr[[1]]), q1.values)
        expect_true(is.selected(categories(ds$derivedmr)[[2]]))
    })

    test_that("Can edit metadata of the derived array, and parents are unaffected", {
        aliases(subvariables(ds$derivedarray)) <- c("dsub1", "dsub2")
        expect_identical(
            aliases(subvariables(ds$derivedarray)),
            c("dsub1", "dsub2")
        )
        names(categories(ds$derivedarray))[1:2] <- c("one", "two")
        expect_identical_temp_nodata(
            names(categories(ds$derivedarray)),
            c("one", "two", "Bird", "Skipped", "Not Asked", "No Data")
        )
        ds <- refresh(ds)
        expect_identical(
            aliases(subvariables(ds$derivedarray)),
            c("dsub1", "dsub2")
        )
        expect_identical_temp_nodata(
            names(categories(ds$derivedarray)),
            c("one", "two", "Bird", "Skipped", "Not Asked", "No Data")
        )
        expect_equivalent(
            table(ds$derivedarray$dsub2),
            structure(c(5, 3, 3),
                .Dim = 3L,
                .Dimnames = list(dsub2 = c("one", "two", "Bird")),
                class = "table"
            )
        )
        ## Parent unaffected
        expect_true(is.Categorical(ds$q1))
        expect_identical_temp_nodata(
            names(categories(ds$q1)),
            c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data")
        )
    })

    ## Deep copy array, rename it, etc.
    ds$petloc2 <- copy(ds$petloc, deep = TRUE, name = "Other pet loc")
    aliases(subvariables(ds$petloc2)) <- c("pl2_a", "pl2_b")

    ## Make a "flipped" version of that and the original
    ds$petloc_a <- deriveArray(list(ds$petloc2$pl2_a, ds$petloc$petloc_home),
        name = "Pet Location: Home", subreferences = list(
            list(name = "Copy", alias = "pla_copy"),
            list(name = "Original", alias = "pla_orig")
        )
    )
    ds$petloc_b <- deriveArray(list(ds$petloc2$pl2_b, ds$petloc$petloc_work),
        name = "Pet Location: Work", subreferences = list(
            list(name = "Copy", alias = "plb_copy"),
            list(name = "Original", alias = "plb_orig")
        )
    )

    test_that("Deriving arrays with subreferences specified", {
        expect_identical(
            aliases(subvariables(ds$petloc_a)),
            c("pla_copy", "pla_orig")
        )
        expect_identical(
            names(subvariables(ds$petloc_a)),
            c("Copy", "Original")
        )
    })

    ## Make derived array of derived array
    ds$metapetloc <- deriveArray(list(
        ds$petloc$petloc_home, ds$petloc_a$pla_copy,
        ds$petloc_b$plb_copy, ds$derivedarray$dsub2
    ),
    name = "Derived from derived and real"
    )
    test_that("Derived arrays of derived array subvariables with different categories", {
        expect_identical(
            names(subvariables(ds$metapetloc)),
            c("Home", "Copy", "Copy", "Home")
        )
        expect_identical_temp_nodata(
            names(categories(ds$metapetloc)),
            c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data", "one", "two")
        )

        t1 <- table(ds$metapetloc[[1]])
        expect_identical(names(t1), c("Cat", "Dog", "Bird", "one", "two"))
        expect_equal(as.numeric(t1), c(5, 3, 3, 0, 0))
        expect_identical(as.numeric(table(ds$metapetloc[[2]])), c(5, 3, 3, 0, 0))
        expect_identical(as.numeric(table(ds$metapetloc[[3]])), c(6, 4, 6, 0, 0))
        ## Note the shift: this variable has categories "one" and "two"
        expect_identical(as.numeric(table(ds$metapetloc[[4]])), c(0, 0, 3, 5, 3))
    })

    ds$metapet_combined <- combine(ds$metapetloc,
        name = "Metapet combined",
        combinations = list(
            list(name = "Cat", categories = c("Cat", "one")),
            list(name = "Dog", categories = c("Dog", "two"))
        )
    )
    test_that("Combine categories of derived array", {
        expect_identical(
            names(subvariables(ds$metapet_combined)),
            c("Home", "Copy", "Copy", "Home")
        )
        expect_identical_temp_nodata(
            names(categories(ds$metapet_combined)),
            c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data")
        )

        t1 <- table(ds$metapet_combined[[1]])
        expect_identical(names(t1), c("Cat", "Dog", "Bird"))
        expect_equal(as.numeric(t1), c(5, 3, 3))
        expect_identical(as.numeric(table(ds$metapet_combined[[2]])), c(5, 3, 3))
        expect_identical(as.numeric(table(ds$metapet_combined[[3]])), c(6, 4, 6))
        ## Now these are combined back together
        expect_identical(as.numeric(table(ds$metapet_combined[[4]])), c(5, 3, 3))
    })
    ## Append to that dataset, make sure all arrays update appropriately
    part2 <- newDatasetFromFixture("apidocs")
    ds <- appendDataset(ds, part2)
    test_that("When appending, derived arrays get data (or missing, as appropriate)", {
        ## No Data is added to the categories here
        # Replace this `expect_true(identical(new) || identical(old))`
        # construction with `expect_identical(new)` once the "default values"
        # ticket https://www.pivotaltracker.com/story/show/164939686 is released.
        expect_true(
            identical(
                names(categories(ds$metapetloc)),
                c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "one", "two", "No Data")
            ) || identical(
                names(categories(ds$metapetloc)),
                c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data", "one", "two")
            )
        )
        ## metapetloc: subvar 1 is derived from petloc$petloc_home
        expect_identical(
            as.numeric(table(ds$metapetloc[[1]])),
            2 * c(5, 3, 3, 0, 0)
        )
        ## subvar 2 is from petloc_a$pla_1, which points to petloc2, a deep copy
        expect_identical(
            as.numeric(table(ds$metapetloc[[2]])),
            c(5, 3, 3, 0, 0)
        )
        ## likewise for subvar 3
        expect_identical(
            as.numeric(table(ds$metapetloc[[3]])),
            c(6, 4, 6, 0, 0)
        )
        ## 4 comes from derivedarray$dsub2, which comes from petloc$petloc_home,
        ## but with different category names
        # This test expectation is long-standing but wrong:
        # expect_identical(
        #     as.numeric(table(ds$metapetloc[[4]])),
        #     c(5, 3, 6, 5, 3)
        # )
        # It should instead expect these numbers:
        # expect_identical(
        #     as.numeric(table(ds$metapetloc[[4]])),
        #     c(0, 0, 6, 10, 6)
        # )
        # TODO: Once the fix (behind the Crunch API) ships,
        # delete the wrong expectation and uncomment the right one.
    })
    test_that("Combined categories on top of derived array also gets the right data", {
        expect_identical(
            names(categories(ds$metapet_combined)),
            c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data")
        )
        expect_identical(
            as.numeric(table(ds$metapet_combined[[1]])),
            2 * c(5, 3, 3)
        )
        expect_identical(
            as.numeric(table(ds$metapet_combined[[2]])),
            c(5, 3, 3)
        )
        expect_identical(
            as.numeric(table(ds$metapet_combined[[3]])),
            c(6, 4, 6)
        )
        expect_identical(
            as.numeric(table(ds$metapet_combined[[4]])),
            2 * c(5, 3, 3)
        )
    })
    test_that("The array we edited in the beginning still has its edits after appending", {
        expect_identical(
            aliases(subvariables(ds$derivedarray)),
            c("dsub1", "dsub2")
        )
        expect_identical_temp_nodata(
            names(categories(ds$derivedarray)),
            c("one", "two", "Bird", "Skipped", "Not Asked", "No Data")
        )
        ## This is the data we should see in metapetloc[[4]]. It's correct here.
        expect_equivalent(
            table(ds$derivedarray$dsub2),
            structure(c(10, 6, 6),
                .Dim = 3L,
                .Dimnames = list(dsub2 = c("one", "two", "Bird")),
                class = "table"
            )
        )
    })

    ## Append a dataset where the categories in one of the base variables don't quite match
    part3 <- newDatasetFromFixture("apidocs")
    names(categories(part3$petloc))[2] <- "Beaver"
    aliases(subvariables(part3$petloc))[2] <- "petloc_school"
    names(subvariables(part3$petloc))[2] <- "School"
    ds <- appendDataset(ds, part3)

    test_that("petloc is updated appropriately", {
        expect_identical(
            aliases(subvariables(ds$petloc)),
            c("petloc_home", "petloc_work", "petloc_school")
        )
        ## No Data comes in after appending because petloc_work and petloc_school have data gaps
        # Replace this `expect_true(identical(new) || identical(old))`
        # construction with `expect_identical(new)` once the "default values"
        # ticket https://www.pivotaltracker.com/story/show/164939686 is released.
        expect_true(
            identical(
                names(categories(ds$petloc)),
                c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "Beaver", "No Data")
            ) || identical(
                names(categories(ds$petloc)),
                c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data", "Beaver")
            )
        )
    })

    # skip("Derivations of derivations need some work in zz9")
    # test_that("derivedarray gets updated appropriately", {
    #     expect_identical(aliases(subvariables(ds$derivedarray)),
    #         c("dsub1", "dsub2"))
    #     ## No Data and Beaver come in from petloc
    #     expect_identical(names(categories(ds$derivedarray)),
    #         c("one", "two", "Bird", "Skipped", "Not Asked", "Beaver", "No Data"))
    #     # print(table(ds$derivedarray))
    #     # expect_equivalent(table(ds$derivedarray),
    #     #     structure(c(10, 6, 6), .Dim=3L,
    #     #     .Dimnames=list(dsub2=c("one", "two", "Bird")),
    #     #     class="table"))
    # })
    # test_that("petloc_a gets updated appropriately", {
    #     ## petloc_a has Copy and Original, which is petloc_home
    #     expect_identical(aliases(subvariables(ds$petloc_a)),
    #         c("pla_copy", "pla_orig"))
    #     expect_identical(names(subvariables(ds$petloc_a)),
    #         c("Copy", "Original"))
    #     expect_identical(names(categories(ds$petloc_a)),
    #         c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data", "Beaver"))
    #     ## table(that)
    # })
    # test_that("petloc_b gets updated appropriately", {
    #     ## petloc_a has Copy and Original, which is petloc_work. But there is
    #     ## no petloc_work in the new batch
    #     expect_identical(aliases(subvariables(ds$petloc_b)),
    #         c("plb_copy", "plb_orig"))
    #     expect_identical(names(subvariables(ds$petloc_b)),
    #         c("Copy", "Original"))
    #     expect_identical(names(categories(ds$petloc_b)),
    #         c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data", "Beaver"))
    #     ## table(that). all NA for the new batch
    # })
    # test_that("metapetloc gets updated appropriately", {
    #     # petloc$petloc_home, petloc_a$pla_copy, petloc_b$plb_copy, derivedarray$dsub2
    #     expect_identical(names(categories(ds$metapetloc)),
    #         c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "one", "two", "No Data", "Beaver"))
    #     ## metapetloc: subvar 1 is derived from petloc$petloc_home.
    #     ## Beaver instead of Dog in new wave
    #     expect_identical(as.numeric(table(ds$metapetloc[[1]])),
    #         c(15, 6, 9, 0, 0, 3))
    #     ## subvar 2 is from petloc_a$pla_1, which points to petloc2, a deep copy
    #     expect_identical(as.numeric(table(ds$metapetloc[[2]])),
    #         c(5, 3, 3, 0, 0, 0))
    #     ## likewise for subvar 3
    #     expect_identical(as.numeric(table(ds$metapetloc[[3]])),
    #         c(6, 4, 6, 0, 0, 0))
    #     ## 4 comes from derivedarray$dsub2, which comes from petloc$petloc_home,
    #     ## but with different category names

    #     ## This assertion fails. The new data comes in as Cat Dog and not one two
    #     # print(table(ds$metapetloc[[4]]))
    #     expect_identical(as.numeric(table(ds$metapetloc[[4]])),
    #         c(0, 0, 9, 10, 6, 3))
    # })
    # test_that("metapet_combined gets updated appropriately", {
    #     expect_identical(names(categories(ds$metapet_combined)),
    #         c("Cat", "Dog", "Bird", "Skipped", "Not Asked", "No Data", "Beaver"))
    #     expect_identical(as.numeric(table(ds$metapet_combined[[1]])),
    #         c(15, 6, 9, 3))
    #     expect_identical(as.numeric(table(ds$metapet_combined[[2]])),
    #         c(5, 3, 3))
    #     expect_identical(as.numeric(table(ds$metapet_combined[[3]])),
    #         c(6, 4, 6))
    #     expect_identical(as.numeric(table(ds$metapet_combined[[4]])),
    #         c(15, 6, 9, 3))
    # })

    ## TODO: more things (filter entities, drop rows, etc., edit values of base
    ## variables; edit values of derived array?)
})