rcrunch: Crunch.io Data Tools

context("Getting values to make local R objects")



with_mock_crunch({
    ds <- cachedLoadDataset("Vegetables example")
    ds_dup <- loadDataset(paste0(envOrOption("crunch.api"), "datasets/dup"))
    test_that("setup", {
        expect_identical(nrow(ds), 210L)
        expect_identical(ncol(ds), 12L)

        expect_identical(
            names(ds),
            c(
                "wave", "age", "healthy_eater", "enjoy_mr", "veg_enjoy_ca",
                "ratings_numa", "funnel_aware_mr", "funnel_consider_mr", "funnel_buy_mr",
                "weight", "last_vegetable", "last_vegetable_date"

            )
        )
    })

    test_that("as.vector on Numeric Variable", {
        expect_true(is.numeric(as.vector(ds$age)))
    })

    test_that("as.vector on Categorical Variable", {
        expect_true(is.factor(as.vector(ds$healthy_eater)))
        expect_true(all(
            levels(as.vector(ds$healthy_eater)) %in% names(categories(ds$healthy_eater))
        ))
    })

    test_that("as.vector on Categorical Array", {
        expect_true(is.CA(ds$veg_enjoy_ca))
        vec <- as.vector(ds$veg_enjoy_ca)
        expect_true(is.data.frame(vec))
        expect_identical(nrow(vec), 210L)

        # if veg_filling doesn't have correct factor levels there may have been a mistake
        lvls <- c("Strongly Disagree", "Disagree", "Neither", "Agree", "Strongly Agree")
        expected <- data.frame(
            veg_enjoy_ca_healthy = structure(c(3, 1, 1, 1, 5), .Label = lvls, class = "factor"),
            veg_enjoy_ca_tasty = structure(c(5, 1, 1, 1, 1), .Label = lvls, class = "factor"),
            veg_enjoy_ca_filling = structure(c(1, 1, 1, 3, 4), .Label = lvls, class = "factor"),
            veg_enjoy_ca_env = structure(c(3, 2, NA, 3, 4), .Label = lvls, class = "factor")
        )
        expect_identical(vec[1:5, ], expected)
    })

    test_that("as.vector on Multiple Response", {
        expect_true(is.MR(ds$enjoy_mr))
        vec <- as.vector(ds$enjoy_mr)
        expect_true(is.data.frame(vec))
        ## Check that getting subvar by $ from the as.vector of the array
        ## is the same as just getting the subvar as.vector directly
        expect_identical(
            vec$enjoy_mr_savory,
            as.vector(ds$enjoy_mr$enjoy_mr_savory)
        )

        lvls <- c("Yes", "No")
        expected <- data.frame(
            enjoy_mr_savory = structure(c(1, 2, 1, 2, 1), .Label = lvls, class = "factor"),
            enjoy_mr_spicy = structure(c(2, 1, 2, 1, 2), .Label = lvls, class = "factor"),
            enjoy_mr_sweet = structure(c(2, 1, 1, 1, 1), .Label = lvls, class = "factor")
        )
        expect_identical(vec[1:5, ], expected)
    })

    test_that("as.vector on Multiple Response with mode", {
        expected <- data.frame(
            enjoy_mr_savory = c(1, 2, 1, 2, 1),
            enjoy_mr_spicy = c(2, 1, 2, 1, 2),
            enjoy_mr_sweet = c(2, 1, 1, 1, 1)
        )

        vec <- as.vector(ds$enjoy_mr, mode = "id")
        expect_identical(vec[1:5, ], expected)
        expect_identical(
            vec$enjoy_mr_savory,
            as.vector(ds$enjoy_mr$enjoy_mr_savory, mode = "id")
        )
    })

    test_that("as.data.frame on CrunchDataset yields CrunchDataFrame", {
        expect_false(is.data.frame(as.data.frame(ds)))
        expect_is(as.data.frame(ds), "CrunchDataFrame")
        expect_identical(dim(as.data.frame(ds)), c(210L, length(allVariables(ds))))
        expect_identical(names(as.data.frame(ds)), aliases(allVariables(ds)))
        expect_identical(as.data.frame(ds)$age, as.vector(ds$age))
        expect_identical(
            evalq(healthy_eater, as.data.frame(ds)),
            as.vector(ds$healthy_eater)
        )
    })

    test_that("as.data.frame(force = TRUE) generates a POST", {
        expect_POST(
            as.data.frame(ds, force = TRUE, include.hidden = FALSE),
            "https://app.crunch.io/api/datasets/veg/export/csv/",
            '{"filter":null,"options":{"header_field":"qualified_alias",',
            '"missing_values":"","use_category_ids":true}}'
        )
    })

    test_that("csvColInfo works in simple case with no duplicate aliases", {
        col_info <- csvColInfo(
            ds[, c("wave", "age", "enjoy_mr", "last_vegetable", "last_vegetable_date")]
        )

        expect_s4_class(attr(col_info, "meta"), "VariableCatalog")
        attr(col_info, "meta") <- NULL

        expected <- data.frame(
            stringsAsFactors = FALSE,
            orig_alias = c("wave","age",
                      "last_vegetable","last_vegetable_date","enjoy_mr_savory",
                      "enjoy_mr_spicy","enjoy_mr_sweet"),
            parent_alias = c(NA, NA, NA, NA, "enjoy_mr", "enjoy_mr", "enjoy_mr"),
            qualified_alias = c("wave","age",
                                      "last_vegetable","last_vegetable_date",
                                      "enjoy_mr[enjoy_mr_savory]","enjoy_mr[enjoy_mr_spicy]",
                                      "enjoy_mr[enjoy_mr_sweet]"),
            cond_qualified_alias = c("wave","age",
                                "last_vegetable","last_vegetable_date","enjoy_mr_savory",
                                "enjoy_mr_spicy","enjoy_mr_sweet"),
            var_type = c("categorical",
                         "numeric","text","datetime","categorical",
                         "categorical","categorical")
        )

        expect_equivalent(col_info, expected)
    })

    test_that("csvColInfo works when there are duplicate aliases", {
        col_info <- csvColInfo(ds_dup)

        expect_s4_class(attr(col_info, "meta"), "VariableCatalog")
        attr(col_info, "meta") <- NULL

        expected <- data.frame(
            stringsAsFactors = FALSE,
            orig_alias = c("x1","x2","y1",
                      "y2","z","x1","x2_derived","y1","z"),
            parent_alias = c(NA, NA, NA, NA, NA, "x", "x", "y", "y"),
            qualified_alias = c("x1","x2","y1",
                                      "y2","z","x[x1]","x[x2_derived]","y[y1]","y[z]"),
            cond_qualified_alias = c("x1","x2","y1",
                                "y2","z","x[x1]","x2_derived","y[y1]","y[z]"),
            var_type = c("numeric","numeric",
                         "categorical","categorical","categorical",
                         "numeric","numeric","categorical","categorical")
        )
        expect_equivalent(col_info, expected)
    })

    test_that("csvToDataFrame produces the correct data frame", {
        csv_df <- read.csv(
            datasetFixturePath("veg.csv"),
            stringsAsFactors = FALSE, check.names = FALSE#, na.strings = ""
        )
        expected <- readRDS(datasetFixturePath("veg_df.rds"))
        vars <- c(
            "wave", "age", "healthy_eater", "enjoy_mr", "veg_enjoy_ca", "ratings_numa",
            "last_vegetable", "last_vegetable_date"
        )
        cdf <- as.data.frame(ds[, vars])
        # test local CDF variables
        cdf$newvar <- expected$newvar <- c(1:209, NA)
        col_info <- csvColInfo(ds[, vars])
        actual <- csvToDataFrame(csv_df, cdf, col_info)
        expect_equal(actual, expected)
    })

    test_that("csvToDataFrame respects include.hidden", {
        # mock the include.hidden=FALSE by removing variables from csv_df
        col_info <- csvColInfo(ds)
        csv_df <- read.csv(datasetFixturePath("veg-no-hidden.csv"), stringsAsFactors = FALSE, check.names = FALSE)
        expected <- readRDS(datasetFixturePath("veg_hidden_df.rds"))
        cdf <- as.data.frame(ds, include.hidden = FALSE)
        actual <- csvToDataFrame(csv_df, cdf, col_info)
        expect_equal(actual, expected)
    })

    test_that("csvToDataFrame handles duplicate aliases", {
        csv_df <- read.csv(datasetFixturePath("dup.csv"), stringsAsFactors = FALSE, check.names = FALSE)
        col_info <- csvColInfo(ds_dup)

        expected_flat_df <- data.frame(
            v1 = 1:3,
            v2 = 2:4,
            v3 = factor(letters[1:3], levels = letters[1:5]),
            v4 = factor(letters[2:4], levels = letters[1:5]),
            v5 = factor(letters[11:13], levels = letters[11:15]),
            v6 = 1:3,
            v7 = 2:4,
            v8 = factor(letters[1:3], levels = letters[1:5]),
            v9 = factor(letters[2:4], levels = letters[1:5])
        )

        df_cond_qualified <- csvToDataFrame(csv_df, ds_dup, col_info)
        cond_qualified_names <- c("x1", "x2", "y1", "y2", "z", "x[x1]", "x2_derived", "y[y1]", "y[z]")
        expect_equal(names(df_cond_qualified), cond_qualified_names)
        expect_equivalent(
            setNames(df_cond_qualified, paste0("v", seq_len(ncol(df_cond_qualified)))),
            expected_flat_df
        )

        df_qualified <- csvToDataFrame(csv_df, ds_dup, col_info, "qualified_alias")
        qualified_names <- c("x1", "x2", "y1", "y2", "z", "x[x1]", "x[x2_derived]", "y[y1]", "y[z]")
        expect_equal(names(df_qualified), qualified_names)
        expect_equivalent(
            setNames(df_qualified, paste0("v", seq_len(ncol(df_qualified)))),
            expected_flat_df
        )

        expected_packed_df <- setNames(expected_flat_df[, 1:5], c("x1", "x2", "y1", "y2", "z"))
        expected_packed_df$v6 <- setNames(expected_flat_df[, 6:7], c("x1", "x2_derived"))
        expected_packed_df$v7 <- setNames(expected_flat_df[, 8:9], c("y1", "z"))
        df_packed <- csvToDataFrame(csv_df, ds_dup, col_info, "packed")
        expect_equivalent(df_packed, expected_packed_df)
    })

    test_that("as.data.frame when a variable has an apostrophe in its alias", {
        t2 <- forceVariableCatalog(ds)

        t2@variables@index[[2]]$alias <- "Quote 'unquote' alias"
        expect_is(as.data.frame(t2), "CrunchDataFrame")
    })

    test_that("write.csv(ds)", {
        expect_POST(
            write.csv(ds, file = ""),
            "https://app.crunch.io/api/datasets/veg/export/csv/",
            '{"filter":null,"options":{"use_category_ids":false}}'
        )
    })

    test_that("as.data.frame() works with hidden variables", {
        new_ds <- cachedLoadDataset("test ds")
        new_ds$gender@tuple[["discarded"]] <- TRUE
        new_ds_df <- as.data.frame(new_ds, include.hidden = TRUE)
        expect_equal(
            names(new_ds_df),
            aliases(allVariables(new_ds))
        )
        expect_equal(ncol(new_ds_df), 7)
    })

    test_that(".crunchPageSize", {
        expect_identical(.crunchPageSize(ds$age), 100000L)
        expect_identical(.crunchPageSize(ds$healthy_eater), 200000L)
        expect_identical(.crunchPageSize(ds$last_vegetable), 5000L)
        expect_identical(.crunchPageSize(ds$enjoy_mr), 200000L %/% 3L)
        expect_identical(.crunchPageSize(ds$veg_enjoy_ca), 200000L %/% 4L)
        expect_identical(.crunchPageSize(ds$last_vegetable_date), 100000L)
        expect_identical(.crunchPageSize(2016 - ds$birthyr), 50000L)
    })


    test.df <- as.data.frame(ds)


    test_that("model.frame thus works on CrunchDataset", {
        expect_identical(
            model.frame(age ~ healthy_eater, data = test.df),
            model.frame(age ~ healthy_eater, data = ds)
        )
    })


    test_that("so lm() should work too", {
        test.lm <- lm(age ~ healthy_eater, data = ds)
        expected <- lm(age ~ healthy_eater, data = test.df)
        expect_is(test.lm, "lm")
        expect_identical(names(test.lm), names(expected))
        for (i in setdiff(names(expected), "call")) {
            expect_identical(test.lm[[i]], expected[[i]])
        }
    })
})

with_test_authentication({
    ds <- newDataset(df)

    # change the alias of v6 to be something that includes spaces/punctuation
    alias(ds$v6) <- "vee six !"

    test_that("Check the types of our imported data", {
        expect_true(is.Numeric(ds[["v1"]]))
        expect_true(is.Text(ds[["v2"]]))
        expect_true(is.Numeric(ds[["v3"]]))
        expect_true(is.Categorical(ds[["v4"]]))
        expect_true(is.Datetime(ds$v5))
    })
    test_that("as.vector on Numeric", {
        expect_true(is.numeric(as.vector(ds$v1)))
        expect_identical(sum(is.na(as.vector(ds$v1))), 5L)
        expect_equivalent(as.vector(ds$v1), df$v1)
    })
    test_that("as.vector on Text", {
        expect_true(is.character(as.vector(ds$v2)))
        expect_identical(sum(is.na(as.vector(ds$v2))), 5L)
        expect_equivalent(as.vector(ds$v2), df$v2)
    })
    test_that("as.vector on a different Numeric", {
        expect_true(is.numeric(as.vector(ds$v3)))
        expect_equivalent(as.vector(ds$v3), df$v3)
    })
    ## Test on a version with missings
    ds$v4b <- df$v4
    ds$v4b[3:5] <- "No Data"
    values(categories(ds$v4b)[1:2]) <- c(5, 3)
    test_that("as.vector on a Categorical", {
        expect_true(is.factor(as.vector(ds$v4b)))
        expect_equivalent(as.vector(ds$v4b)[-(3:5)], df$v4[-(3:5)]) # nolint
        expect_true(all(is.na(as.vector(ds$v4b)[3:5])))
    })
    test_that("as.vector with mode specified on Categorical", {
        expect_identical(
            as.vector(ds$v4b, mode = "id"),
            c(1, 2, -1, -1, -1, 2, rep(1:2, 7))
        )
        expect_identical(
            as.vector(ds$v4b, mode = "numeric"),
            c(5, 3, NA, NA, NA, 3, rep(c(5, 3), 7))
        )
    })
    ## Delete v4b for the later tests (should just make a fresh ds)
    with(consent(), ds$v4b <- NULL)

    test_that("as.vector on Datetime", {
        expect_is(as.vector(ds$v5), "Date")
        expect_equivalent(as.vector(ds$v5), df$v5)
    })

    test_that("as.data.frame with API", {
        expect_false(is.data.frame(as.data.frame(ds)))
        expect_is(as.data.frame(ds), "CrunchDataFrame")
        expect_identical(dim(as.data.frame(ds)), dim(df))
        expect_identical(names(as.data.frame(ds)), aliases(variables(ds)))

        # check that all of the values are the same
        expect_identical(as.data.frame(ds)$v1, as.vector(ds$v1))
        expect_identical(as.data.frame(ds)$v2, as.vector(ds$v2))
        expect_identical(as.data.frame(ds)$v3, as.vector(ds$v3))
        expect_identical(as.data.frame(ds)$v4, as.vector(ds$v4))
        expect_identical(as.data.frame(ds)$v5, as.vector(ds$v5))
        expect_identical(as.data.frame(ds)$`vee six !`, as.vector(ds$`vee six !`))
    })

    test_that("as.data.frame(force) with API", {
        skip_on_local_backend("Vagrant host doesn't serve files correctly")
        expect_true(is.data.frame(as.data.frame(as.data.frame(ds))))

        df <- as.data.frame(ds, force = TRUE)
        expect_true(is.data.frame(df))

        # check that all of the values are the same
        expect_identical(df$v1, as.vector(ds$v1))
        # we only compare the non-na (1:15) in the text variable, becuase the NA
        # values come down as "No Data". This should probably be fixed to be NAs
        # at some point
        expect_identical(df$v2[1:15], as.vector(ds$v2)[1:15])
        expect_identical(df$v3, as.vector(ds$v3))
        expect_identical(df$v4, as.vector(ds$v4))
        expect_identical(df$v5, as.vector(ds$v5))
        expect_identical(df$`vee six !`, as.vector(ds$`vee six !`))
    })

    ds$hidden_var <- 1:20
    ds <- hideVariables(ds, "hidden_var")

    test_that("as.data.frame(force) retrieves hidden variables", {
        skip_on_local_backend("Vagrant host doesn't serve files correctly")
        expect_equal(hiddenVariables(ds), "hidden_var")

        df <- as.data.frame(ds, force = TRUE, include.hidden = FALSE)
        expect_equal(names(df), c("v1", "v2", "v3", "v4", "v5", "vee six !"))

        df <- as.data.frame(ds, force = TRUE, include.hidden = TRUE)
        expect_equal(names(df), c("v1", "v2", "v3", "v4", "v5", "vee six !", "hidden_var"))

        df <- as.data.frame(ds[, c("v1", "hidden_var")], force = TRUE)
        expect_equal(names(df), c("v1", "hidden_var"))
    })

    test_that("Multiple response variables in as.data.frame(force=TRUE)", {
        skip_on_local_backend("Vagrant host doesn't serve files correctly")
        mrds <- mrdf.setup(newDataset(mrdf, name = "test-mrdfmr"), selections = "1.0")
        mrds$MR2 <- deriveArray(
            list(
                VarDef(mrds$MR$mr_1, name = "dup mr_1", alias = "mr_1"),
                VarDef(mrds$MR$mr_2, name = "dup v4", alias = "v4")
            ),
            name = "MR 2", numeric = FALSE
        )
        expect_message(
            mrds_df <- as.data.frame(mrds, force = TRUE),
            paste0(
                "Some column names are qualified because there were duplicate aliases ",
                "in dataset:\nmr_1 -> MR[mr_1], mr_1 -> MR2[mr_1], v4 -> MR2[v4]"
            ),
            fixed = TRUE
        )

        expect_equal(ncol(mrds_df), 6)
        expect_equal(names(mrds_df), c("MR[mr_1]", "mr_2", "mr_3", "v4", "MR2[mr_1]", "MR2[v4]"))
        expect_equal(mrds_df[["MR[mr_1]"]], as.vector(mrds$MR$mr_1))
        expect_equal(mrds_df$mr_2, as.vector(mrds$MR$mr_2))
        expect_equal(mrds_df$mr_3, as.vector(mrds$MR$mr_3))
        expect_equal(mrds_df$v4, as.vector(mrds$v4))
        expect_equal(mrds_df[["MR2[mr_1]"]], as.vector(mrds$MR2$mr_1))
        expect_equal(mrds_df[["MR2[v4]"]], as.vector(mrds$MR2$v4))
    })

    v2 <- ds$v2
    # Force variable catalog so that it'll get stale
    ds <- forceVariableCatalog(ds)
    with_consent(delete(v2))
    test_that("CrunchDataFrame lazily fetches columns", {
        expect_true("v2" %in% names(ds)) ## ds is stale
        expect_is(as.data.frame(ds), "CrunchDataFrame")
        ## This should error because it will try to get values for v2
        expect_error(as.data.frame(ds, force = TRUE))
    })

    ds <- forceVariableCatalog(ds)
    uncached({
        with_mock(`crunch::.crunchPageSize` = function(x) 5L, {
            with(temp.option(httpcache.log = ""), {
                avlog <- capture.output(v1 <- as.vector(ds$v1))
            })
            test_that("getValues can be paginated", {
                logdf <- loadLogfile(textConnection(avlog))
                ## GET entity to get /values/ URL, then GET /values/ 4x
                ## to get data, then a 5th GET /values/ that returns 0
                ## values, which breaks the pagination loop
                expect_identical(logdf$verb, rep("GET", 6))
                expect_identical(grep("values", logdf$url), 2:6)
            })
            test_that("getValues returns the same result when paginated", {
                expect_equivalent(v1, df$v1)
            })
        })
    })

    test_that("model.frame thus works on CrunchDataset over API", {
        ## would like this to be "identical" instead of "equivalent"
        expect_equivalent(
            model.frame(v1 ~ v3, data = ds),
            model.frame(v1 ~ v3, data = df)
        )
    })

    test_that("so lm() should work too over the API", {
        test.lm <- lm(v1 ~ v3, data = ds)
        expected <- lm(v1 ~ v3, data = df)
        expect_is(test.lm, "lm")
        expect_identical(names(test.lm), names(expected))
        ## would like this to be "identical" instead of "equivalent"
        for (i in setdiff(names(expected), "call")) {
            expect_equivalent(test.lm[[i]], expected[[i]])
        }
    })
})