daiquiri: Data Quality Reporting for Temporal Datasets

test_that("aggregate_data() requires a source_data param", {
  expect_error(aggregate_data(),
    class = "invalid_param_missing"
  )
})

test_that("aggregate_data() requires source_data param to be a source_data object", {
  expect_error(aggregate_data(source_data = data.frame("Fieldname" = 123)),
    class = "invalid_param_type"
  )
})

test_that("aggregate_data() requires aggregation_timeunit param to be one of day/week/month/quarter/year", {
  expect_error(
    aggregate_data(
      source_data = structure(list(data_fields = NA), class = "daiquiri_source_data"),
      aggregation_timeunit = "hello"
    ),
    class = "invalid_param_type"
  )
})

test_that("aggregate_data() creates aggregate object correctly", {
  df <- read_data(test_path("testdata", "completetestset.csv"))
  source_data <-
    prepare_data(
      df,
      field_types = field_types(
        col_timepoint_err = ft_ignore(),
        col_timepoint = ft_timepoint(),
        col_date_time_err = ft_ignore(),
        col_date_time = ft_datetime(),
        col_date_only_err = ft_ignore(),
        col_date_only = ft_datetime(includes_time = FALSE),
        col_date_uk_err = ft_ignore(),
        col_date_uk = ft_datetime(includes_time = FALSE, format = "%d/%m/%Y"),
        col_id_num_err = ft_ignore(),
        col_id_num = ft_uniqueidentifier(),
        col_id_string_err = ft_ignore(),
        col_id_string = ft_uniqueidentifier(),
        col_numeric_clean_err = ft_ignore(),
        col_numeric_clean = ft_numeric(),
        col_numeric_dirty_err = ft_ignore(),
        col_numeric_dirty = ft_numeric(),
        col_categorical_small_err = ft_ignore(),
        col_categorical_small = ft_categorical(aggregate_by_each_category = TRUE),
        col_categorical_large_err = ft_ignore(),
        col_categorical_large = ft_categorical(),
        col_freetext_err = ft_ignore(),
        col_freetext = ft_freetext(),
        col_simple_err = ft_ignore(),
        col_simple = ft_simple(),
        col_numeric_missing_err = ft_ignore(),
        col_numeric_missing = ft_numeric()
      ),
      dataset_description = "completetestset",
      show_progress = FALSE
    )
  aggregated_data <-
    aggregate_data(source_data,
      aggregation_timeunit = "week",
      show_progress = FALSE
    )

  expect_s3_class(aggregated_data, "daiquiri_aggregated_data")

  # check the basic structure is ok
  expect_equal(aggregated_data$timepoint_field_name, "col_timepoint")
  expect_equal(aggregated_data$aggregation_timeunit, "week")
  expect_setequal(
    names(aggregated_data$aggregated_fields),
    c(
      names(source_data$cols_imported_indexes),
      "[DUPLICATES]",
      "[ALL_FIELDS_COMBINED]"
    )
  )
  expect_equal(nrow(aggregated_data$aggregated_fields$col_timepoint$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_timepoint$values), 4)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_date_time$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_date_time$values), 10)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_date_only$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_date_only$values), 8)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_id_num$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_id_num$values), 7)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_timepoint$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_numeric_clean$values), 10)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_categorical_small$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_categorical_small$values), 19)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_categorical_large$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_categorical_large$values), 5)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_freetext$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_freetext$values), 4)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_simple$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields$col_simple$values), 4)
  expect_equal(nrow(aggregated_data$aggregated_fields$col_numeric_missing$values), 43)
  expect_equal(nrow(aggregated_data$aggregated_fields[["[DUPLICATES]"]]$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields[["[DUPLICATES]"]]$values), 3)
  expect_equal(nrow(aggregated_data$aggregated_fields[["[ALL_FIELDS_COMBINED]"]]$values), 43)
  expect_equal(ncol(aggregated_data$aggregated_fields[["[ALL_FIELDS_COMBINED]"]]$values), 6)
})


test_that("export_aggregated_data() requires a aggregated_data param", {
  expect_error(export_aggregated_data(save_directory = tempdir()),
    class = "invalid_param_missing"
  )
})

test_that("export_aggregated_data() requires a save_directory param", {
  expect_error(
    export_aggregated_data(
      aggregated_data = structure(list(data_fields = NA), class = "daiquiri_aggregated_data")
    ),
    class = "invalid_param_missing"
  )
})

test_that("export_aggregated_data() requires aggregated_data param to be an aggregated_data object", {
  expect_error(
    export_aggregated_data(
      aggregated_data = data.frame("Fieldname" = 123),
      save_directory = tempdir()
    ),
    class = "invalid_param_type"
  )
})

test_that("export_aggregated_data() generates csv files", {
  source_data <-
    prepare_data(
      data.frame(
        col1 = rep("2022-01-01", 5),
        col2 = rep(1, 5),
        col3 = 1:5
      ),
      field_types = field_types(
        col1 = ft_timepoint(),
        col2 = ft_simple(),
        col3 = ft_ignore()
      ),
      dataset_description = "exporttestset",
      show_progress = FALSE
    )
  aggregated_data <- aggregate_data(source_data,
    aggregation_timeunit = "day",
    show_progress = FALSE
  )
  export_aggregated_data(aggregated_data,
    save_directory = tempdir(),
    save_file_prefix = "test_"
  )

  expect_snapshot_file(file.path(tempdir(), "test_col1.csv"), compare = compare_file_text)
  expect_snapshot_file(file.path(tempdir(), "test_col2.csv"), compare = compare_file_text)
  expect_snapshot_file(file.path(tempdir(), "test_[ALL_FIELDS_COMBINED].csv"), compare = compare_file_text)
  expect_snapshot_file(file.path(tempdir(), "test_[DUPLICATES].csv"), compare = compare_file_text)

  expect_false(file.exists(file.path(tempdir(), "test_col3.csv")))

  # clean up
  file.remove(file.path(tempdir(), "test_col1.csv"))
  file.remove(file.path(tempdir(), "test_col2.csv"))
  file.remove(file.path(tempdir(), "test_[ALL_FIELDS_COMBINED].csv"))
  file.remove(file.path(tempdir(), "test_[DUPLICATES].csv"))
})


test_that("aggregate_combined_fields() removes NAs when rowsumming", {
  df <-
    data.table::data.table(
      "col_timepoint" = paste0("2022-01-", 10 + c(seq(1, 3), seq(6, 21))),
      "col_numeric" = c("", "", rep(1, 17)),
      "col_numeric_missing" = ""
    )
  source_data <-
    prepare_data(
      df,
      field_types = field_types(
        col_timepoint = ft_timepoint(),
        col_numeric = ft_numeric(),
        col_numeric_missing = ft_numeric()
      ),
      dataset_description = "blankplottest",
      override_column_names = FALSE,
      na = c("", "NULL"),
      show_progress = FALSE
    )
  aggregated_data <-
    aggregate_data(source_data,
      aggregation_timeunit = "day",
      show_progress = FALSE
    )

  expect_true(identical(
    is.na(aggregated_data$aggregated_fields$`[ALL_FIELDS_COMBINED]`$values$missing_n),
    c(rep(FALSE, 3), rep(TRUE, 2), rep(FALSE, 16))
  ))
})


test_that("aggregated_data object prints to console ok", {
  source_data <-
    prepare_data(
      df = data.frame(
        col1 = rep("2022-01-01", 5),
        col2 = rep(1, 5),
        col3 = 1:5
      ),
      field_types = field_types(
        col1 = ft_timepoint(),
        col2 = ft_simple(),
        col3 = ft_ignore()
      ),
      dataset_description = "completetestset",
      show_progress = FALSE
    )
  aggregated_data <-
    aggregate_data(source_data,
      aggregation_timeunit = "day",
      show_progress = FALSE
    )

  expect_snapshot_output(print(aggregated_data))
})


test_that("aggregated values contain NAs instead of Infs or NaNs", {
  df <-
    data.table::data.table(
      "col_timepoint" = paste0("2022-01-", 10 + c(seq(1, 3), seq(6, 21))),
      "col_numeric" = c("", "", seq(
        from = 2,
        to = 3,
        length.out = 17
      )),
      "col_datetime" = c("", "", rep("2022-01-01 00:00:00", 17)),
      "col_uniqueidentifier" = c("", "", seq(1, 17))
    )
  source_data <-
    prepare_data(
      df,
      field_types = field_types(
        col_timepoint = ft_timepoint(),
        col_numeric = ft_numeric(),
        col_datetime = ft_datetime(includes_time = TRUE),
        col_uniqueidentifier = ft_uniqueidentifier()
      ),
      override_column_names = FALSE,
      na = c("", "NULL"),
      show_progress = FALSE
    )
  aggregated_data <-
    aggregate_data(source_data,
      aggregation_timeunit = "day",
      show_progress = FALSE
    )

  expect_false(any(is.infinite(aggregated_data$aggregated_fields$col_numeric$values$min)))
  expect_false(any(is.infinite(aggregated_data$aggregated_fields$col_numeric$values$max)))
  expect_false(any(is.nan(aggregated_data$aggregated_fields$col_numeric$values$mean)))

  expect_false(any(is.infinite(aggregated_data$aggregated_fields$col_datetime$values$min)))
  expect_false(any(is.infinite(aggregated_data$aggregated_fields$col_datetime$values$max)))
  expect_false(any(is.nan(aggregated_data$aggregated_fields$col_datetime$values$midnight_perc)))

  expect_false(any(is.infinite(aggregated_data$aggregated_fields$col_uniqueidentifier$values$min_length)))
  expect_false(any(is.infinite(aggregated_data$aggregated_fields$col_uniqueidentifier$values$max_length)))
  expect_false(any(is.nan(aggregated_data$aggregated_fields$col_uniqueidentifier$values$mean_length)))
})

test_that("agg_fun_subcat_value() retrieves value correctly", {
  expect_equal(agg_fun_subcat_value(agg_fun_colname = "subcat_n_16_thisvalue"),
               "thisvalue"
  )
})

test_that("aggregate_data() stratifies aggregations correctly", {
  #TODO: TEST IT WORKS WHEN THERE ARE SPECIAL CHARS IN THE STRATA NAMES
  df <-
    data.table::data.table(
      "col_timepoint" = c(rep("2022-01-01", 5), rep("2022-01-02", 5), rep("2022-01-04", 5), rep("2022-01-05", 5)),
      "col_numeric" = seq(
        from = 2,
        to = 3,
        length.out = 20
      ),
      "col_datetime" = c(paste0("2022-01-", 10 + c(seq(1, 9))), rep("", 11)),
      "col_uniqueidentifier" = c(seq(1, 20)),
      "col_categorical" = c(rep(c("a", "b"), 8), rep("a", 4)),
      "col_simple" = c(rep("", 10), rep("a", 10)),
      "col_stratify" = c("", "", rep("SITE1", 6), rep(c("SITE1", "SITE2"), 6))
    )
  source_data <-
    prepare_data(
      df,
      field_types = field_types(
        col_timepoint = ft_timepoint(),
        col_numeric = ft_numeric(),
        col_datetime = ft_datetime(includes_time = FALSE),
        col_uniqueidentifier = ft_uniqueidentifier(),
        col_categorical = ft_categorical(),
        col_simple = ft_simple(),
        col_stratify = ft_strata()
      ),
      na = c("", "NULL"),
      show_progress = FALSE
    )
  aggregated_data <-
    aggregate_data(
      source_data = source_data,
      aggregation_timeunit = "day",
      show_progress = FALSE
    )

  # check the basic structure is ok
  expect_setequal(
    names(aggregated_data$aggregated_fields_stratified),
    names(aggregated_data$aggregated_fields)
  )

  # the stratification categories should include NA
  expect_equal(unique(aggregated_data$aggregated_fields_stratified$col_numeric$values$col_stratify),
               c(NA, "SITE1", "SITE2")
  )

  # check the stratified values correspond to the overall value
  expect_equal(
    aggregated_data$aggregated_fields$col_numeric$values$n,
    aggregated_data$aggregated_fields_stratified$col_numeric$values[, list(n = sum(n)), by = list(col_timepoint_byday)][[2]]
    )
  # TODO: more tests needed for other agg_funs
  # (when all subvalues are NA the total should be NA, otherwise ignore the NAs)
  # expect_equal(
  #   aggregated_data$aggregated_fields$col_numeric$values$missing_n,
  #   aggregated_data$aggregated_fields_stratified$col_numeric$values[, list(missing_n = sum(missing_n, na.rm = TRUE)), by = list(col_timepoint_byday)][[2]]
  #   )


})
phuongquan/daiquiri documentation built on April 5, 2024, 10:36 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
phuongquan/daiquiri
Data Quality Reporting for Temporal Datasets

tests/testthat/test-aggregate_data.R
In phuongquan/daiquiri: Data Quality Reporting for Temporal Datasets

R Package Documentation

Browse R Packages

We want your feedback!

phuongquan/daiquiri Data Quality Reporting for Temporal Datasets

tests/testthat/test-aggregate_data.R In phuongquan/daiquiri: Data Quality Reporting for Temporal Datasets

R Package Documentation

Browse R Packages

We want your feedback!

phuongquan/daiquiri
Data Quality Reporting for Temporal Datasets

tests/testthat/test-aggregate_data.R
In phuongquan/daiquiri: Data Quality Reporting for Temporal Datasets