# rcrunch: Crunch.io Data Tools

# Label the tests in this file for testthat's reporters.
context("Add a variable to a dataset")

test_that("toVariable parses R numerics", {
    # An integer vector with name/alias metadata should come back as a
    # "numeric" definition carrying the values and metadata unchanged.
    payload <- list(
        values = 2L:4L,
        type = "numeric",
        name = "Numbers!",
        alias = "num"
    )

    # Strict comparison: contents and class attribute must both match.
    expect_identical(
        toVariable(2L:4L, name = "Numbers!", alias = "num"),
        structure(payload, class = "VariableDefinition")
    )
    # Loose comparison: the same contents once attributes are ignored.
    expect_equivalent(
        toVariable(2L:4L, name = "Numbers!", alias = "num"),
        payload
    )
})
test_that("toVariable parses R characters", {
    # A bare character vector maps to a "text" definition with no metadata.
    text_def <- structure(
        list(values = c("a", "b", "c"), type = "text"),
        class = "VariableDefinition"
    )
    expect_identical(toVariable(letters[1:3]), text_def)
})
test_that("toVariable parses factors", {
    # Three B/C pairs followed by one missing value.
    input <- as.factor(c(rep(LETTERS[2:3], 3), NA))

    # One category per factor level, then the "No Data" category that the
    # NA entry is expected to be coded to (-1).
    level_categories <- list(
        list(id = 1L, name = "B", numeric_value = 1L, missing = FALSE),
        list(id = 2L, name = "C", numeric_value = 2L, missing = FALSE),
        list(id = -1L, name = "No Data", numeric_value = NULL, missing = TRUE)
    )

    expect_identical(
        toVariable(input),
        VarDef(
            values = c(1L, 2L, 1L, 2L, 1L, 2L, -1L),
            type = "categorical",
            categories = level_categories
        )
    )
})

test_that("toVariable parses logical", {
    # Logical vectors are expected to become categoricals coded
    # TRUE -> 1, FALSE -> 0, NA -> -1, with only "True" flagged as selected.
    truth_categories <- list(
        list(id = 1L, name = "True", numeric_value = 1L, missing = FALSE, selected = TRUE),
        list(id = 0L, name = "False", numeric_value = 0L, missing = FALSE),
        list(id = -1L, name = "No Data", numeric_value = NULL, missing = TRUE)
    )

    expect_equivalent(
        toVariable(c(TRUE, FALSE, FALSE, NA, TRUE)),
        VarDef(
            values = c(1L, 0L, 0L, -1L, 1L),
            type = "categorical",
            categories = truth_categories
        )
    )
})
test_that("toVariable parses AsIses", {
    # Build the bare definition (values + type only) that the wrapped vector
    # is expected to produce, i.e. the I() wrapper should make no difference.
    bare_def <- function(values, type) {
        structure(
            list(values = values, type = type),
            class = "VariableDefinition"
        )
    }

    # Integer wrapped in I() -> numeric
    expect_identical(toVariable(I(1:5)), bare_def(1L:5L, "numeric"))
    # Character wrapped in I() -> text
    expect_identical(
        toVariable(I(letters[1:3])),
        bare_def(c("a", "b", "c"), "text")
    )
})

test_that("toVariable parses haven::labelled", {
    # haven is only reached through `haven::` in this test, so skip cleanly
    # instead of erroring when the package is not installed.
    skip_if_not_installed("haven")

    # Expected definition whenever the underlying values are the letters A-C:
    # one non-missing category per distinct value plus the "No Data" category.
    letters_expected <- list(
        values = rep(1:3, 3), type = "categorical", categories = list(
            list(id = 1L, name = "A", numeric_value = 1L, missing = FALSE),
            list(id = 2L, name = "B", numeric_value = 2L, missing = FALSE),
            list(id = 3L, name = "C", numeric_value = 3L, missing = FALSE),
            list(id = -1L, name = "No Data", numeric_value = NULL, missing = TRUE)
        )
    )
    # Same shape, but categories are named after the numeric values.
    numbers_expected <- list(
        values = rep(1:3, 3), type = "categorical", categories = list(
            list(id = 1L, name = "1", numeric_value = 1L, missing = FALSE),
            list(id = 2L, name = "2", numeric_value = 2L, missing = FALSE),
            list(id = 3L, name = "3", numeric_value = 3L, missing = FALSE),
            list(id = -1L, name = "No Data", numeric_value = NULL, missing = TRUE)
        )
    )

    # every value has a label
    labelled <- haven::labelled(
        rep(LETTERS[1:3], 3),
        structure(LETTERS[1:3], names = LETTERS[1:3])
    )
    expect_equivalent(toVariable(labelled), letters_expected)

    # backwards compatibility with old class names
    class(labelled) <- "labelled"
    expect_equivalent(toVariable(labelled), letters_expected)

    # even if only some values are labelled, the values are still used
    labelled <- haven::labelled(
        rep(LETTERS[1:3], 3),
        structure(LETTERS[2], names = LETTERS[2])
    )
    expect_equivalent(toVariable(labelled), letters_expected)

    # If the values are numeric, we still get a categorical
    labelled <- haven::labelled(
        rep(1:3, 3),
        structure(2,
            names = LETTERS[2]
        )
    )
    expect_equivalent(toVariable(labelled), numbers_expected)
})

test_that("toVariable parses haven::labelled_spss", {
    # haven is only reached through `haven::` in this test, so skip cleanly
    # instead of erroring when the package is not installed.
    skip_if_not_installed("haven")

    # Expected definition for letter values where "A" is declared as a
    # user-missing value (na_values): its category is flagged missing.
    letters_expected <- list(
        values = rep(1:3, 3), type = "categorical", categories = list(
            list(id = 1L, name = "A", numeric_value = 1L, missing = TRUE),
            list(id = 2L, name = "B", numeric_value = 2L, missing = FALSE),
            list(id = 3L, name = "C", numeric_value = 3L, missing = FALSE),
            list(id = -1L, name = "No Data", numeric_value = NULL, missing = TRUE)
        )
    )
    # Same shape for numeric values where 1 is the user-missing value.
    numbers_expected <- list(
        values = rep(1:3, 3), type = "categorical", categories = list(
            list(id = 1L, name = "1", numeric_value = 1L, missing = TRUE),
            list(id = 2L, name = "2", numeric_value = 2L, missing = FALSE),
            list(id = 3L, name = "3", numeric_value = 3L, missing = FALSE),
            list(id = -1L, name = "No Data", numeric_value = NULL, missing = TRUE)
        )
    )

    labelled <- haven::labelled_spss(rep(LETTERS[1:3], 3),
        structure(LETTERS[2:3],
            names = LETTERS[2:3]
        ),
        na_values = LETTERS[1]
    )
    expect_equivalent(toVariable(labelled), letters_expected)

    # backwards compatibility with old class names
    class(labelled) <- c("labelled_spss", "labelled")

    # hack to make the is.na method act like the old one if we have a newer
    # haven
    # NOTE(review): `<<-` assigns outside this test's environment and nothing
    # here removes the method afterwards, so it can remain visible to later
    # tests -- confirm that is acceptable.
    if (packageVersion("haven") > "1.1.2") {
        is.na.labelled_spss <<- haven:::is.na.haven_labelled_spss
    }
    expect_equivalent(toVariable(labelled), letters_expected)

    # even if only some values are labelled, the values are still used
    labelled <- haven::labelled_spss(rep(LETTERS[1:3], 3),
        structure(LETTERS[2],
            names = LETTERS[2]
        ),
        na_values = LETTERS[1]
    )
    expect_equivalent(toVariable(labelled), letters_expected)

    # If the values are numeric, we still get a categorical
    labelled <- haven::labelled_spss(rep(1:3, 3),
        structure(2,
            names = LETTERS[2]
        ),
        na_values = 1
    )
    expect_equivalent(toVariable(labelled), numbers_expected)
})

test_that("toVariable handles duplicate factor levels", {
    ## `factor()` itself refuses duplicate labels from R 3.4.0 onwards (they
    ## were deprecated before that), but such an object can still be built by
    ## hand with `structure()`, and older R versions may produce one, so the
    ## translation has to cope with it.
    dup_factor <- structure(
        1:4,
        .Label = c("a", "b", "b", "c"),
        class = "factor"
    )

    ## The second "b" is expected to be renamed so category names stay unique.
    disambiguated <- list(
        values = 1:4,
        type = "categorical",
        categories = list(
            list(id = 1L, name = "a", numeric_value = 1L, missing = FALSE),
            list(id = 2L, name = "b", numeric_value = 2L, missing = FALSE),
            list(id = 3L, name = "b  (1)", numeric_value = 3L, missing = FALSE),
            list(id = 4L, name = "c", numeric_value = 4L, missing = FALSE),
            list(id = -1L, name = "No Data", numeric_value = NULL, missing = TRUE)
        )
    )

    ## Check both the translated value and the warning raised while making it.
    expect_warning(
        expect_equivalent(toVariable(dup_factor), disambiguated),
        "Duplicate factor levels given: disambiguating them in translation to Categorical type"
    )
})
test_that("categoriesFromLevels parses levels correctly", {
    # Each level becomes a non-missing category numbered in level order,
    # followed by the package's `.no.data` category.
    species_levels <- levels(iris$Species)
    species_categories <- list(
        list(id = 1L, name = "setosa", numeric_value = 1L, missing = FALSE),
        list(id = 2L, name = "versicolor", numeric_value = 2L, missing = FALSE),
        list(id = 3L, name = "virginica", numeric_value = 3L, missing = FALSE),
        .no.data
    )
    expect_identical(categoriesFromLevels(species_levels), species_categories)
})
test_that("toVariable parses R Date class", {
    # Dates are expected to be sent as ISO date strings with day resolution.
    day_strings <- c("2014-12-16", "2014-12-17")
    expect_equivalent(
        toVariable(as.Date(day_strings)),
        list(values = day_strings, type = "datetime", resolution = "D")
    )
})

test_that("toVariable handles POSIX datetimes (and timezones)", {
    epoch_seconds <- 1454238117.123

    # The same instant, expressed in the requested timezone.
    instant_in <- function(tz) {
        as.POSIXct(epoch_seconds, origin = "1970-01-01", tz = tz)
    }
    # Expected definition for one millisecond-resolution timestamp string.
    ms_datetime <- function(stamp) {
        list(values = stamp, type = "datetime", resolution = "ms")
    }

    expect_equivalent(
        toVariable(instant_in("UTC")),
        ms_datetime("2016-01-31T11:01:57.123")
    )

    # Times travel between R and crunch's database as UTC, but they are
    # treated as local wall-clock times: a value that reads 5AM in central
    # time is stored as 5AM UTC rather than being shifted.
    expect_equivalent(
        toVariable(instant_in("America/Chicago")),
        ms_datetime("2016-01-31T05:01:57.123")
    )

    expect_equivalent(
        toVariable(instant_in("America/New_York")),
        ms_datetime("2016-01-31T06:01:57.123")
    )
})

test_that("POSTNewVariable rejects invalid categories", {
    # Both definitions end with the same "No Data" category.
    no_data <- list(id = -1L, name = "No Data", numeric_value = NULL, missing = TRUE)

    # Category id -1 is used twice ("B" and "No Data").
    clashing_ids <- list(
        type = "categorical", name = "bad ids",
        categories = list(
            list(id = -1L, name = "B", numeric_value = 1L, missing = FALSE),
            list(id = 2L, name = "C", numeric_value = 2L, missing = FALSE),
            no_data
        )
    )
    expect_error(
        POSTNewVariable("", clashing_ids),
        "Invalid category ids: must be unique"
    )

    # Category name "Name 1" is used twice.
    clashing_names <- list(
        type = "categorical", name = "bad names",
        categories = list(
            list(id = 1L, name = "Name 1", numeric_value = 1L, missing = FALSE),
            list(id = 2L, name = "Name 1", numeric_value = 2L, missing = FALSE),
            no_data
        )
    )
    expect_error(
        POSTNewVariable("", clashing_names),
        "Invalid category names: must be unique"
    )
})

test_that("checkVarDefErrors errors correctly", {
    # log() fails on character input, so wrapping it in try() gives a cheap
    # way to build a list that mixes "try-error" results with real values.
    attempt_log <- function(x) try(log(x), silent = TRUE)

    # First two entries fail, third succeeds: the failing positions are
    # expected to be reported in the error message.
    some_failed <- lapply(list("a", "b", 29), attempt_log)
    expect_error(
        checkVarDefErrors(some_failed),
        "The following variable definitions errored on upload: 1, 2"
    )

    # Nothing failed: no error and no output.
    none_failed <- lapply(list(29, 23, 24), attempt_log)
    expect_silent(checkVarDefErrors(none_failed))
})

# Tests that run against the mocked API: they check client-side validation
# and the exact request that assigning a new variable would send.
with_mock_crunch({
    # Shared by every test in this block; the error messages below expect it
    # to have 25 rows.
    ds <- cachedLoadDataset("test ds")
    test_that("assignment restrictions", {
        # Numeric (positional) indexing is not allowed when assigning.
        expect_error(
            ds[[2]] <- 1:25,
            "Only character \\(name\\) indexing supported"
        )
    })
    test_that("Input length validation", {
        # Replacement vectors whose length does not match the dataset's row
        # count are rejected with an R-style message.
        expect_error(
            ds$newvar <- 1:13,
            "replacement has 13 rows, data has 25"
        )
        expect_error(
            ds$newvar <- rep(6, 11),
            "replacement has 11 rows, data has 25"
        )
    })

    test_that("Adding a variable with all the same values gets sent more concisely", {
        # 25 copies of 5 should be collapsed to the single value 5 in the
        # request body.
        expect_POST(
            ds$newvar <- rep(5, 25),
            "https://app.crunch.io/api/datasets/1/variables/",
            '{"values":5,"type":"numeric","name":"newvar","alias":"newvar"}'
        )
    })

    test_that("Adding a variable with all missing doesn't send 'values'", {
        # A single NA and a full-length vector of NAs should both produce a
        # request body with no "values" member.
        expect_POST(
            ds$newvar <- NA_real_,
            "https://app.crunch.io/api/datasets/1/variables/",
            '{"type":"numeric","name":"newvar","alias":"newvar"}'
        )
        expect_POST(
            ds$newvar <- rep(NA_real_, 25),
            "https://app.crunch.io/api/datasets/1/variables/",
            '{"type":"numeric","name":"newvar","alias":"newvar"}'
        )
    })
})

# Integration tests: these create datasets with newDataset() and add
# variables to them, then read the results back. The tests share `ds` and
# depend on the order in which they run.
with_test_authentication({
    # `df` is not defined in this file -- presumably a shared test fixture;
    # the length checks below expect it to have 20 rows.
    ds <- newDataset(df)
    test_that("addVariable creates a new remote numeric variable", {
        ds <- addVariables(
            ds,
            VariableDefinition(df$v3, name = "New var", alias = "newVar")
        )
        expect_true("newVar" %in% names(ds))
        nv <- ds$newVar
        expect_true(is.Numeric(nv))
        expect_true(is.Numeric(ds[["v3"]]))
        # The new variable should round-trip to the same values as v3.
        expect_identical(as.vector(nv), as.vector(ds$v3))
    })
    test_that("addVariable creates text variables from character", {
        ds <- addVariables(
            ds,
            VariableDefinition(df$v2, name = "New var2", alias = "newVar2")
        )
        expect_true("newVar2" %in% names(ds))
        nv <- ds$newVar2
        expect_true(is.Text(nv))
        # Only the first 15 values are compared (see the note below).
        expect_identical(
            as.vector(nv)[1:15],
            as.vector(ds$v2)[1:15]
        )
        ## note that NAs aren't getting caught in the CSV importer
        ## anymore, but they're right in the addVariable method
    })
    test_that("addVariable creates categorical from factor", {
        ds <- addVariables(
            ds,
            VariableDefinition(df$v4, name = "New var3", alias = "newVar3")
        )
        expect_true("newVar3" %in% names(ds))
        nv <- ds$newVar3
        expect_true(is.Categorical(nv))
        expect_identical(as.vector(nv), as.vector(ds$v4))
    })
    test_that("addVariable creates datetime from Date", {
        ds <- addVariables(
            ds,
            VariableDefinition(df$v5, name = "New var4", alias = "newVar4")
        )
        expect_true("newVar4" %in% names(ds))
        nv <- ds$newVar4
        expect_true(is.Datetime(nv))
        expect_identical(as.vector(nv), as.vector(ds$v5))
    })
    test_that("addVariable creates datetime from POSIXct", {
        # Always skipped for now; the code below the skip() does not run.
        skip("Can't support POSIXt until the app supports timezones")
        ds <- addVariables(ds, VariableDefinition(as.POSIXct(df$v5),
            name = "New var 5", alias = "newVar5"
        ))
        expect_true("newVar5" %in% names(ds))
        nv <- ds$newVar5
        expect_true(is.Datetime(nv))
        expect_identical(as.vector(nv), as.vector(ds$v5))
    })
    test_that("[[<- adds variables", {
        # 20:1 has mean 10.5, which is checked after the round trip.
        ds$newvariable <- 20:1
        expect_true(is.Numeric(ds$newvariable))
        expect_identical(mean(ds$newvariable), 10.5)
    })
    test_that("Variable lengths must match, in an R way", {
        # A length-7 replacement is rejected ...
        expect_error(
            ds[["not valid"]] <- 1:7,
            "replacement has 7 rows, data has 20"
        )
        # ... but a length-1 value is accepted and fills all 20 rows.
        ds$ok <- 1
        expect_identical(as.vector(ds$ok), rep(1, 20))
    })

    test_that("Adding text variables (debugging)", {
        # Uses its own 1024-row dataset rather than the shared `ds`.
        ds <- newDataset(data.frame(x = 1:1024))
        # Single values assigned to a whole column: two strings, two factors.
        ds$a_text_var <- "12345 Some text that is definitely >4 characters"
        ds$a_factor <- factor("Different text")
        ds$the_name <- name(ds)
        ds$another <- factor(rep(c(NA, "Longer text"), 512))
        expect_true(is.Text(ds$a_text_var))
        expect_true(is.Categorical(ds$a_factor))
        expect_true(is.Text(ds$the_name))
        expect_true(is.Categorical(ds$another))
        # Each single-valued variable should tabulate to one cell of 1024.
        expect_equal(
            as.array(crtabs(~a_text_var, data = ds)),
            array(1024L,
                dim = 1L,
                dimnames = list(a_text_var = "12345 Some text that is definitely >4 characters")
            )
        )
        expect_equal(
            as.array(crtabs(~a_factor, data = ds)),
            array(1024L,
                dim = 1L,
                dimnames = list(a_factor = "Different text")
            )
        )
        expect_equal(
            as.array(crtabs(~the_name, data = ds)),
            array(1024L,
                dim = 1L,
                dimnames = list(the_name = name(ds))
            )
        )
        # Half the rows are NA, so with useNA = "always" the table splits
        # 512 / 512 between the level and "No Data".
        expect_equal(
            as.array(crtabs(~another, data = ds, useNA = "always")),
            array(c(512L, 512L),
                dim = 2L,
                dimnames = list(another = c("Longer text", "No Data"))
            )
        )
        expect_identical(
            head(as.vector(ds$a_text_var), 1),
            "12345 Some text that is definitely >4 characters"
        )
    })

    # Fresh dataset for the round-trip tests below.
    ds <- newDataset(df)
    test_that("Categorical to R and back", {
        # Pull the categorical down as a factor, then upload it again.
        v4 <- as.vector(ds$v4)
        expect_identical(levels(v4), c("B", "C"))
        ds$v4a <- v4
        expect_equivalent(as.vector(ds$v4), as.vector(ds$v4a))
    })
    # Exclude the rows where v3 == 10; the next test expects 19 rows to
    # remain visible.
    exclusion(ds) <- ds$v3 == 10
    test_that("Categorical to R and back with an exclusion", {
        v4b <- as.vector(ds$v4)
        expect_identical(levels(v4b), c("B", "C"))
        expect_length(v4b, 19)
        ds$v4b <- v4b
        # Compared against v4a, the variable referenced from the previous test.
        expect_equivalent(as.vector(ds$v4b), as.vector(ds$v4a))
    })
})