test_that("aggregate_data() requires a source_data param", {
  # Calling with no arguments at all must raise the classed
  # "missing parameter" error.
  expect_error(
    aggregate_data(),
    class = "invalid_param_missing"
  )
})
test_that("aggregate_data() requires source_data param to be a source_data object", {
  # A plain data.frame does not carry the expected source_data class,
  # so it must be rejected with the classed "wrong type" error.
  not_source_data <- data.frame("Fieldname" = 123)

  expect_error(
    aggregate_data(source_data = not_source_data),
    class = "invalid_param_type"
  )
})
test_that("aggregate_data() requires aggregation_timeunit param to be one of day/week/month/quarter/year", {
  # Minimal stand-in carrying the source_data class, so that only the
  # aggregation_timeunit argument is at fault.
  stub_source_data <- structure(
    list(data_fields = NA),
    class = "daiquiri_source_data"
  )

  expect_error(
    aggregate_data(
      source_data = stub_source_data,
      aggregation_timeunit = "hello"
    ),
    class = "invalid_param_type"
  )
})
test_that("aggregate_data() creates aggregate object correctly", {
  # End-to-end check: read the complete test dataset, prepare it with one
  # field of each type (the *_err columns are ignored), aggregate by week,
  # then verify the shape of the resulting object.
  df <- read_data(test_path("testdata", "completetestset.csv"))
  source_data <-
    prepare_data(
      df,
      field_types = field_types(
        col_timepoint_err = ft_ignore(),
        col_timepoint = ft_timepoint(),
        col_date_time_err = ft_ignore(),
        col_date_time = ft_datetime(),
        col_date_only_err = ft_ignore(),
        col_date_only = ft_datetime(includes_time = FALSE),
        col_date_uk_err = ft_ignore(),
        col_date_uk = ft_datetime(includes_time = FALSE, format = "%d/%m/%Y"),
        col_id_num_err = ft_ignore(),
        col_id_num = ft_uniqueidentifier(),
        col_id_string_err = ft_ignore(),
        col_id_string = ft_uniqueidentifier(),
        col_numeric_clean_err = ft_ignore(),
        col_numeric_clean = ft_numeric(),
        col_numeric_dirty_err = ft_ignore(),
        col_numeric_dirty = ft_numeric(),
        col_categorical_small_err = ft_ignore(),
        col_categorical_small = ft_categorical(aggregate_by_each_category = TRUE),
        col_categorical_large_err = ft_ignore(),
        col_categorical_large = ft_categorical(),
        col_freetext_err = ft_ignore(),
        col_freetext = ft_freetext(),
        col_simple_err = ft_ignore(),
        col_simple = ft_simple(),
        col_numeric_missing_err = ft_ignore(),
        col_numeric_missing = ft_numeric()
      ),
      dataset_description = "completetestset",
      show_progress = FALSE
    )
  aggregated_data <-
    aggregate_data(source_data,
      aggregation_timeunit = "week",
      show_progress = FALSE
    )

  expect_s3_class(aggregated_data, "daiquiri_aggregated_data")

  # check the basic structure is ok
  expect_equal(aggregated_data$timepoint_field_name, "col_timepoint")
  expect_equal(aggregated_data$aggregation_timeunit, "week")
  # every imported column gets an aggregated field, plus the two
  # calculated pseudo-fields
  expect_setequal(
    names(aggregated_data$aggregated_fields),
    c(
      names(source_data$cols_imported_indexes),
      "[DUPLICATES]",
      "[ALL_FIELDS_COMBINED]"
    )
  )

  # every field is expected to have the same number of rows (43), while the
  # number of columns differs per field
  expect_equal(nrow(aggregated_data$aggregated_fields$col_timepoint$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_timepoint$values), 4)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_date_time$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_date_time$values), 10)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_date_only$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_date_only$values), 8)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_id_num$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_id_num$values), 7)
  # row count is checked on col_numeric_clean itself (previously this line
  # re-checked col_timepoint, leaving col_numeric_clean's rows unverified)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_numeric_clean$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_numeric_clean$values), 10)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_categorical_small$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_categorical_small$values), 19)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_categorical_large$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_categorical_large$values), 5)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_freetext$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_freetext$values), 4)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_simple$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_simple$values), 4)
  # NOTE(review): only the row count is checked for col_numeric_missing
  expect_equal(nrow(aggregated_data$aggregated_fields$col_numeric_missing$values), 43)
  expect_equal(nrow(aggregated_data$aggregated_fields[["[DUPLICATES]"]]$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields[["[DUPLICATES]"]]$values), 3)
  expect_equal(nrow(aggregated_data$aggregated_fields[["[ALL_FIELDS_COMBINED]"]]$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields[["[ALL_FIELDS_COMBINED]"]]$values), 6)
})
test_that("export_aggregated_data() requires a aggregated_data param", {
  # Only save_directory is supplied; the missing aggregated_data argument
  # must trigger the classed "missing parameter" error.
  expect_error(
    export_aggregated_data(save_directory = tempdir()),
    class = "invalid_param_missing"
  )
})
test_that("export_aggregated_data() requires a save_directory param", {
  # Minimal stand-in carrying the aggregated_data class, so that only the
  # absent save_directory argument is at fault.
  stub_aggregated_data <- structure(
    list(data_fields = NA),
    class = "daiquiri_aggregated_data"
  )

  expect_error(
    export_aggregated_data(aggregated_data = stub_aggregated_data),
    class = "invalid_param_missing"
  )
})
test_that("export_aggregated_data() requires aggregated_data param to be an aggregated_data object", {
  # A plain data.frame does not carry the aggregated_data class and must be
  # rejected with the classed "wrong type" error.
  not_aggregated_data <- data.frame("Fieldname" = 123)

  expect_error(
    export_aggregated_data(
      aggregated_data = not_aggregated_data,
      save_directory = tempdir()
    ),
    class = "invalid_param_type"
  )
})
test_that("export_aggregated_data() generates csv files", {
  # Build a tiny dataset: one timepoint column, one simple column and one
  # ignored column, aggregated by day.
  source_data <-
    prepare_data(
      data.frame(
        col1 = rep("2022-01-01", 5),
        col2 = rep(1, 5),
        col3 = 1:5
      ),
      field_types = field_types(
        col1 = ft_timepoint(),
        col2 = ft_simple(),
        col3 = ft_ignore()
      ),
      dataset_description = "exporttestset",
      show_progress = FALSE
    )
  aggregated_data <- aggregate_data(
    source_data,
    aggregation_timeunit = "day",
    show_progress = FALSE
  )

  export_dir <- tempdir()
  export_aggregated_data(
    aggregated_data,
    save_directory = export_dir,
    save_file_prefix = "test_"
  )

  # one csv is expected per non-ignored field, plus the two calculated fields
  exported_paths <- file.path(
    export_dir,
    c(
      "test_col1.csv",
      "test_col2.csv",
      "test_[ALL_FIELDS_COMBINED].csv",
      "test_[DUPLICATES].csv"
    )
  )
  for (exported_path in exported_paths) {
    expect_snapshot_file(exported_path, compare = compare_file_text)
  }

  # the ignored field must not be exported
  expect_false(file.exists(file.path(export_dir, "test_col3.csv")))

  # clean up
  file.remove(exported_paths)
})
test_that("aggregate_combined_fields() removes NAs when rowsumming", {
  # Daily data with a two-day gap (the 14th and 15th are absent): one numeric
  # field that is blank on the first two days and one numeric field that is
  # blank throughout.
  df <-
    data.table::data.table(
      "col_timepoint" = paste0("2022-01-", 10 + c(seq(1, 3), seq(6, 21))),
      "col_numeric" = c("", "", rep(1, 17)),
      "col_numeric_missing" = ""
    )
  source_data <-
    prepare_data(
      df,
      field_types = field_types(
        col_timepoint = ft_timepoint(),
        col_numeric = ft_numeric(),
        col_numeric_missing = ft_numeric()
      ),
      dataset_description = "blankplottest",
      override_column_names = FALSE,
      na = c("", "NULL"),
      show_progress = FALSE
    )
  aggregated_data <-
    aggregate_data(source_data,
      aggregation_timeunit = "day",
      show_progress = FALSE
    )

  # The combined missing_n should be NA only for the two timepoints that have
  # no records at all; everywhere else it must be a real value.
  # expect_identical() is used (rather than expect_true(identical())) so that
  # a failure reports which positions differ.
  expect_identical(
    is.na(aggregated_data$aggregated_fields$`[ALL_FIELDS_COMBINED]`$values$missing_n),
    c(rep(FALSE, 3), rep(TRUE, 2), rep(FALSE, 16))
  )
})
test_that("aggregated_data object prints to console ok", {
  # Prepare a tiny dataset (timepoint + simple + ignored column) ...
  prepared <- prepare_data(
    df = data.frame(
      col1 = rep("2022-01-01", 5),
      col2 = rep(1, 5),
      col3 = 1:5
    ),
    field_types = field_types(
      col1 = ft_timepoint(),
      col2 = ft_simple(),
      col3 = ft_ignore()
    ),
    dataset_description = "completetestset",
    show_progress = FALSE
  )

  # ... aggregate it by day ...
  aggregated_data <- aggregate_data(
    prepared,
    aggregation_timeunit = "day",
    show_progress = FALSE
  )

  # ... and compare the printed summary against the stored snapshot.
  expect_snapshot_output(print(aggregated_data))
})
test_that("aggregated values contain NAs instead of Infs or NaNs", {
  # Daily data with a gap in the timepoints and with blank values on the
  # first two days for every non-timepoint field.
  df <-
    data.table::data.table(
      "col_timepoint" = paste0("2022-01-", 10 + c(seq(1, 3), seq(6, 21))),
      "col_numeric" = c("", "", seq(from = 2, to = 3, length.out = 17)),
      "col_datetime" = c("", "", rep("2022-01-01 00:00:00", 17)),
      "col_uniqueidentifier" = c("", "", seq(1, 17))
    )
  source_data <- prepare_data(
    df,
    field_types = field_types(
      col_timepoint = ft_timepoint(),
      col_numeric = ft_numeric(),
      col_datetime = ft_datetime(includes_time = TRUE),
      col_uniqueidentifier = ft_uniqueidentifier()
    ),
    override_column_names = FALSE,
    na = c("", "NULL"),
    show_progress = FALSE
  )
  aggregated_data <- aggregate_data(
    source_data,
    aggregation_timeunit = "day",
    show_progress = FALSE
  )

  # shorthand for the per-field tables of aggregated values
  numeric_values <- aggregated_data$aggregated_fields$col_numeric$values
  datetime_values <- aggregated_data$aggregated_fields$col_datetime$values
  id_values <- aggregated_data$aggregated_fields$col_uniqueidentifier$values

  # numeric field
  expect_false(any(is.infinite(numeric_values$min)))
  expect_false(any(is.infinite(numeric_values$max)))
  expect_false(any(is.nan(numeric_values$mean)))

  # datetime field
  expect_false(any(is.infinite(datetime_values$min)))
  expect_false(any(is.infinite(datetime_values$max)))
  expect_false(any(is.nan(datetime_values$midnight_perc)))

  # unique identifier field
  expect_false(any(is.infinite(id_values$min_length)))
  expect_false(any(is.infinite(id_values$max_length)))
  expect_false(any(is.nan(id_values$mean_length)))
})
test_that("agg_fun_subcat_value() retrieves value correctly", {
  # For this column name the helper is expected to return the trailing
  # "thisvalue" part.
  subcat_value <- agg_fun_subcat_value(agg_fun_colname = "subcat_n_16_thisvalue")

  expect_equal(subcat_value, "thisvalue")
})
test_that("aggregate_data() stratifies aggregations correctly", {
  # TODO: test it works when there are special chars in the strata names

  # Four days of data (with a gap on 2022-01-03), five records per day.
  # The strata column holds two blanks followed by SITE1/SITE2 values.
  df <-
    data.table::data.table(
      "col_timepoint" = c(
        rep("2022-01-01", 5),
        rep("2022-01-02", 5),
        rep("2022-01-04", 5),
        rep("2022-01-05", 5)
      ),
      "col_numeric" = seq(from = 2, to = 3, length.out = 20),
      "col_datetime" = c(paste0("2022-01-", 10 + seq(1, 9)), rep("", 11)),
      "col_uniqueidentifier" = seq(1, 20),
      "col_categorical" = c(rep(c("a", "b"), 8), rep("a", 4)),
      "col_simple" = c(rep("", 10), rep("a", 10)),
      "col_stratify" = c("", "", rep("SITE1", 6), rep(c("SITE1", "SITE2"), 6))
    )
  source_data <- prepare_data(
    df,
    field_types = field_types(
      col_timepoint = ft_timepoint(),
      col_numeric = ft_numeric(),
      col_datetime = ft_datetime(includes_time = FALSE),
      col_uniqueidentifier = ft_uniqueidentifier(),
      col_categorical = ft_categorical(),
      col_simple = ft_simple(),
      col_stratify = ft_strata()
    ),
    na = c("", "NULL"),
    show_progress = FALSE
  )
  aggregated_data <- aggregate_data(
    source_data = source_data,
    aggregation_timeunit = "day",
    show_progress = FALSE
  )

  overall_fields <- aggregated_data$aggregated_fields
  stratified_fields <- aggregated_data$aggregated_fields_stratified

  # check the basic structure is ok: same set of fields in both lists
  expect_setequal(
    names(stratified_fields),
    names(overall_fields)
  )

  # the stratification categories should include NA
  expect_equal(
    unique(stratified_fields$col_numeric$values$col_stratify),
    c(NA, "SITE1", "SITE2")
  )

  # check the stratified values correspond to the overall value:
  # summing n across strata for each day must give the overall n
  stratified_n_per_day <-
    stratified_fields$col_numeric$values[, list(n = sum(n)), by = list(col_timepoint_byday)][[2]]
  expect_equal(
    overall_fields$col_numeric$values$n,
    stratified_n_per_day
  )

  # TODO: more tests needed for other agg_funs
  # (when all subvalues are NA the total should be NA, otherwise ignore the NAs)
  # expect_equal(
  #   aggregated_data$aggregated_fields$col_numeric$values$missing_n,
  #   aggregated_data$aggregated_fields_stratified$col_numeric$values[, list(missing_n = sum(missing_n, na.rm = TRUE)), by = list(col_timepoint_byday)][[2]]
  # )
})
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.