# tests/testthat/test-source_data.R
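# Tests for the source_data functions: validate_column_names(), prepare_data(),
# remove_rows(), identify_duplicate_rows() and data_field_count()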

test_that("validate_column_names() checks that column names in data and field_types can match exactly", {
  expect_silent(validate_column_names(c("nonsense", "set", "of"),
    c("nonsense", "set", "of"),
    check_length_only = FALSE
  ))
})

test_that("validate_column_names() checks that column names in data and field_types can match in different order", {
  expect_silent(validate_column_names(c("nonsense", "set", "of"),
    c("nonsense", "of", "set"),
    check_length_only = FALSE
  ))
})

test_that("validate_column_names() checks that column names in data and field_types must match in length if check_length_only = TRUE", {
  expect_silent(validate_column_names(c("nonsense", "set"),
    c("nonsense", "names"),
    check_length_only = TRUE
  ))

  expect_error(
    validate_column_names(c("nonsense", "set", "of"),
      c("nonsense", "set"),
      check_length_only = TRUE
    ),
    class = "invalid_column_names"
  )
})

test_that("validate_column_names() checks that duplicate column names in data not allowed", {
  expect_error(
    validate_column_names(
      c("nonsense", "set", "of", "nonsense", "names"),
      c("nonsense", "set", "of", "names"),
      check_length_only = FALSE
    ),
    class = "invalid_column_names"
  )
})

test_that("validate_column_names() checks that column names in data but not in field_types is not allowed", {
  expect_error(
    validate_column_names(
      c("nonsense", "set", "of", "stuff", "names"),
      c("nonsense", "set", "of", "stuff"),
      check_length_only = FALSE
    ),
    class = "invalid_column_names"
  )
})

test_that("validate_column_names() checks that column names in field_types but not in data is not allowed", {
  expect_error(
    validate_column_names(
      c("nonsense", "set", "of"),
      c("nonsense", "set", "of", "stuff"),
      check_length_only = FALSE
    ),
    class = "invalid_column_names"
  )
})

test_that("validate_column_names() checks that cannot have override_column_names and .default_field_type", {
  expect_error(
    validate_column_names(
      c("nonsense", "set", "of", "stuff"),
      c("nonsense", "set", "of", ".default_field_type"),
      check_length_only = TRUE
    ),
    class = "invalid_column_names"
  )
})

test_that("validate_column_names() allows unnecessary .default_field_type", {
  expect_silent(
    validate_column_names(
      c("nonsense", "set", "of"),
      c("nonsense", "set", "of", ".default_field_type"),
      check_length_only = FALSE
    )
  )
})

test_that("validate_column_names() allows specnames to be subset of sourcenames if .default_field_type present", {
  expect_silent(
    validate_column_names(
      c("nonsense", "set", "of"),
      c("nonsense", ".default_field_type"),
      check_length_only = FALSE
    )
  )
})


test_that("prepare_data() requires a df param", {
  expect_error(prepare_data(field_types = field_types(Col_tp = ft_timepoint())),
    class = "invalid_param_missing"
  )
})

test_that("prepare_data() requires a field_types param", {
  expect_error(prepare_data(df = data.frame("Fieldname" = 123)),
    class = "invalid_param_missing"
  )
})

test_that("prepare_data() requires df param to be a data frame", {
  expect_error(
    prepare_data(c("Fieldname", 123),
      field_types = field_types(Col_tp = ft_timepoint())
    ),
    class = "invalid_param_type"
  )
})

test_that("prepare_data() requires field_types param to be a field_types object", {
  expect_error(
    prepare_data(
      df = data.frame("Fieldname" = 123),
      field_types = TRUE
    ),
    class = "invalid_param_type"
  )
})

test_that("prepare_data() checks that at least one valid timepoint value is present", {
  expect_error(
    prepare_data(
      df = data.frame(
        col1 = rep("01/01/2022", 5),
        col2 = rep(1, 5),
        col3 = 1:5
      ),
      field_types = field_types(
        col1 = ft_timepoint(),
        col2 = ft_simple(),
        col3 = ft_ignore()
      ),
      show_progress = FALSE
    ),
    class = "invalid_param_type"
  )
})


test_that("prepare_data() creates source_data object correctly", {
  df <- read_data(test_path("testdata", "completetestset.csv"))
  source_data <- prepare_data(
    df,
    field_types = field_types(
      col_timepoint_err = ft_ignore(),
      col_timepoint = ft_timepoint(),
      col_date_time_err = ft_ignore(),
      col_date_time = ft_datetime(),
      col_date_only_err = ft_ignore(),
      col_date_only = ft_datetime(includes_time = FALSE),
      col_date_uk_err = ft_ignore(),
      col_date_uk = ft_datetime(includes_time = FALSE, format = "%d/%m/%Y"),
      col_id_num_err = ft_ignore(),
      col_id_num = ft_uniqueidentifier(),
      col_id_string_err = ft_ignore(),
      col_id_string = ft_uniqueidentifier(),
      col_numeric_clean_err = ft_ignore(),
      col_numeric_clean = ft_numeric(),
      col_numeric_dirty_err = ft_ignore(),
      col_numeric_dirty = ft_numeric(),
      col_categorical_small_err = ft_ignore(),
      col_categorical_small = ft_categorical(aggregate_by_each_category = TRUE),
      col_categorical_large_err = ft_ignore(),
      col_categorical_large = ft_categorical(),
      col_freetext_err = ft_ignore(),
      col_freetext = ft_freetext(),
      col_simple_err = ft_ignore(),
      col_simple = ft_simple(),
      col_numeric_missing_err = ft_ignore(),
      col_numeric_missing = ft_numeric()
    ),
    dataset_description = "completetestset",
    show_progress = FALSE
  )

  expect_s3_class(source_data, "daiquiri_source_data")

  expect_equal(source_data$timepoint_field_name, "col_timepoint")
  expect_equal(source_data$timepoint_missing_n, 6)
  expect_equal(source_data$rows_source_n, 900)
  expect_equal(source_data$rows_imported_n, 890)
  expect_equal(source_data$rows_duplicates_n, 4)
  expect_equal(source_data$cols_source_n, 26)
  expect_equal(source_data$cols_imported_n, 13)
  expect_equal(source_data$dataset_description, "completetestset")

  expect_snapshot(source_data$validation_warnings)
})


test_that("prepare_data() gets strata info correctly", {
  df <-
    data.table::data.table(
      "col_timepoint" = c(rep("2022-01-01", 5), rep("2022-01-02", 5), rep("2022-01-04", 5), rep("2022-01-05", 5)),
      "col_numeric" = seq(
        from = 2,
        to = 3,
        length.out = 20
      ),
      "col_datetime" = c(paste0("2022-01-", 10 + c(seq(1, 9))), rep("", 11)),
      "col_uniqueidentifier" = c(seq(1, 20)),
      "col_categorical" = c(rep(c("a", "b"), 8), rep("a", 4)),
      "col_simple" = c(rep("", 10), rep("a", 10)),
      "col_stratify" = c("", "", rep("SITE2", 6), rep(c("SITE1", "SITE2"), 6))
    )
  source_data <-
    prepare_data(
      df,
      field_types = field_types(
        col_timepoint = ft_timepoint(),
        col_numeric = ft_numeric(),
        col_datetime = ft_datetime(includes_time = FALSE),
        col_uniqueidentifier = ft_uniqueidentifier(),
        col_categorical = ft_categorical(),
        col_simple = ft_simple(),
        col_stratify = ft_strata()
      ),
      na = c("", "NULL"),
      show_progress = FALSE
    )

  expect_equal(source_data$strata_field_name, "col_stratify")

  # the strata labels should be alphabetical with NA last
  expect_equal(source_data$strata_labels,
               c("SITE1", "SITE2", NA))

})


test_that("prepare_data() ignores nonchar columns (since readr::type_convert fails to skip nonchar cols)", {
  source_data <- prepare_data(
    df = data.frame(
      col1 = rep("2022-01-01", 5),
      col2 = rep(1, 5),
      col3 = 1:5
    ),
    field_types = field_types(
      col1 = ft_timepoint(),
      col2 = ft_simple(),
      col3 = ft_ignore()
    ),
    dataset_description = "ignore nonchar columns",
    show_progress = FALSE
  )

  expect_equal(source_data$cols_imported_n, 2)
})


test_that("prepare_data() generates a validation warning when nonchar columns are provided", {
  source_data <- prepare_data(
    df = data.frame(
      col1 = rep("2022-01-01", 5),
      col2 = 1:5
    ),
    field_types = field_types(
      col1 = ft_timepoint(),
      col2 = ft_numeric()
    ),
    dataset_description = "nonchar columns",
    show_progress = FALSE
  )

  expect_equal(source_data$validation_warnings$field_name, "col2")
  expect_match(source_data$validation_warnings$message, "instead of character")
})


test_that("prepare_data() matches nonchar validation warnings to column names correctly when fieldtypes are supplied in different order to df columns", {
  source_data <- prepare_data(
    df = data.frame(
      col1 = rep("2022-01-01", 5),
      col2 = 1:5,
      col3 = 1:5,
      col4 = 1:5
    ),
    field_types = field_types(
      col2 = ft_numeric(),
      col1 = ft_timepoint(),
      col3 = ft_ignore(),
      col4 = ft_numeric()
    ),
    dataset_description = "nonchar columns",
    show_progress = FALSE
  )

  expect_equal(source_data$validation_warnings$field_name, c("col2", "col4"))
  expect_match(source_data$validation_warnings$message, "instead of character")
})


test_that("prepare_data() matches type_convert() validation warnings to column names correctly when fieldtypes are supplied in different order to df columns", {
  source_data <- prepare_data(
    df = data.frame(
      col_tp = rep("2022-01-01", 5),
      col_ign = as.character(1:5),
      col_num = c("some text", 1:4),
      col_date = rep("2022-01-01", 5)
    ),
    field_types = field_types(
      col_num = ft_numeric(),
      col_tp = ft_timepoint(),
      col_ign = ft_ignore(),
      col_date = ft_datetime()
    ),
    dataset_description = "nonchar columns",
    show_progress = FALSE
  )

  expect_equal(source_data$validation_warnings$field_name, "col_num")
  expect_match(source_data$validation_warnings$message, "expected a double")
})


test_that("prepare_data() overrides column names correctly", {
  source_data <- prepare_data(
    df = data.frame(
      col1 = rep("2022-01-01", 5),
      col2 = rep("1", 5)
    ),
    field_types = field_types(
      cola = ft_timepoint(),
      colb = ft_simple()
    ),
    override_column_names = TRUE,
    dataset_description = "override colnames",
    show_progress = FALSE
  )

  expect_equal(names(source_data$data_fields)[1:2], c("cola", "colb"))
})


test_that("prepare_data() can accept a data.table with nonchar cols without error", {
  source_data <- prepare_data(
    df = data.table::data.table(
      col1 = rep("2022-01-01", 5),
      col2 = rep(1, 5)
    ),
    field_types = field_types(
      col1 = ft_timepoint(),
      col2 = ft_simple()
    ),
    override_column_names = TRUE,
    dataset_description = "pass in nonchar data.table",
    show_progress = FALSE
  )

  expect_equal(source_data$cols_imported_n, 2)
})


test_that("prepare_data() gets dataset_description from call if NULL (default) passed in", {
  dfobj <- data.frame(
    col1 = rep("2022-01-01", 5),
    col2 = rep(1, 5),
    col3 = 1:5
  )
  source_data <- prepare_data(
    df = dfobj,
    field_types = field_types(
      col1 = ft_timepoint(),
      col2 = ft_simple(),
      col3 = ft_ignore()
    ),
    show_progress = FALSE
  )

  expect_equal(source_data$dataset_description, "dfobj")

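  # when the df is constructed inline, the deparsed call is used as the description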
  source_data <- prepare_data(
    df = data.frame(
      col1 = rep("2022-01-01", 5),
      col2 = rep(1, 5),
      col3 = 1:5
    ),
    field_types = field_types(
      col1 = ft_timepoint(),
      col2 = ft_simple(),
      col3 = ft_ignore()
    ),
    show_progress = FALSE
  )

  expect_equal(
    source_data$dataset_description,
    "data.frame(col1 = rep(\"2022-01-01\", 5), col2 = rep(1, 5), col3 = 1:5)"
  )
})


test_that("source_data object prints to console ok", {
  df <- read_data(test_path("testdata", "completetestset.csv"))
  source_data <- prepare_data(
    df,
    field_types = field_types(
      col_timepoint_err = ft_ignore(),
      col_timepoint = ft_timepoint(),
      col_date_time_err = ft_ignore(),
      col_date_time = ft_datetime(),
      col_date_only_err = ft_ignore(),
      col_date_only = ft_datetime(includes_time = FALSE),
      col_date_uk_err = ft_ignore(),
      col_date_uk = ft_datetime(includes_time = FALSE, format = "%d/%m/%Y"),
      col_id_num_err = ft_ignore(),
      col_id_num = ft_uniqueidentifier(),
      col_id_string_err = ft_ignore(),
      col_id_string = ft_uniqueidentifier(),
      col_numeric_clean_err = ft_ignore(),
      col_numeric_clean = ft_numeric(),
      col_numeric_dirty_err = ft_ignore(),
      col_numeric_dirty = ft_numeric(),
      col_categorical_small_err = ft_ignore(),
      col_categorical_small = ft_categorical(aggregate_by_each_category = TRUE),
      col_categorical_large_err = ft_ignore(),
      col_categorical_large = ft_categorical(),
      col_freetext_err = ft_ignore(),
      col_freetext = ft_freetext(),
      col_simple_err = ft_ignore(),
      col_simple = ft_simple(),
      col_numeric_missing_err = ft_ignore(),
      col_numeric_missing = ft_numeric()
    ),
    dataset_description = "completetestset",
    show_progress = FALSE
  )

  expect_snapshot_output(print(source_data))
})


test_that("source_data object prints to console ok when there is a strata field", {
  df <-
    data.table::data.table(
      "col_timepoint" = c(rep("2022-01-01", 5), rep("2022-01-02", 5), rep("2022-01-04", 5), rep("2022-01-05", 5)),
      "col_numeric" = seq(
        from = 2,
        to = 3,
        length.out = 20
      ),
      "col_datetime" = c(paste0("2022-01-", 10 + c(seq(1, 9))), rep("", 11)),
      "col_uniqueidentifier" = c(seq(1, 20)),
      "col_categorical" = c(rep(c("a", "b"), 8), rep("a", 4)),
      "col_simple" = c(rep("", 10), rep("a", 10)),
      "col_stratify" = c("", "", rep("SITE2", 6), rep(c("SITE1", "SITE2"), 6))
    )
  source_data <-
    prepare_data(
      df,
      field_types = field_types(
        col_timepoint = ft_timepoint(),
        col_numeric = ft_numeric(),
        col_datetime = ft_datetime(includes_time = FALSE),
        col_uniqueidentifier = ft_uniqueidentifier(),
        col_categorical = ft_categorical(),
        col_simple = ft_simple(),
        col_stratify = ft_strata()
      ),
      na = c("", "NULL"),
      show_progress = FALSE
    )

  expect_snapshot_output(print(source_data))
})


test_that("remove_rows() removes specified rows", {
  numrows <- 2000000
  testdt <- data.table::data.table(
    col1 = rep("2022-01-01", numrows),
    col2 = seq(1, numrows),
    col3 = rep("XXXXXXXXXXX", numrows)
  )
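  # include the first and last rows among those removed, as edge cases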
  rowstoremove <- c(1, 4, 5, 6, numrows)
  row_indicator <- rep(FALSE, numrows)
  row_indicator[rowstoremove] <- TRUE
  postdt <- remove_rows(data.table::copy(testdt), row_indicator = row_indicator)

  # correct no. of rows removed
  expect_equal(nrow(postdt), numrows - length(rowstoremove))
  # correct rows removed
  expect_false(any(postdt$col2 %in% rowstoremove))
  # all columns still present
  expect_equal(names(testdt), names(postdt))
  # the contents of the first non-deleted row is the same before and after
  expect_equal(unlist(testdt[which(!row_indicator)[1]]), unlist(postdt[1]))
})

test_that("identify_duplicate_rows() identifies all exactly duplicated rows", {
  # unbatched: small enough to be processed in a single pass
  numrows <- 200000
  testdt <- data.table::data.table(
    col1 = rep("2022-01-01", numrows),
    col2 = seq(1, numrows),
    col3 = rep("XXXXXXXXXXX", numrows)
  )
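  # make rows 2, 5-7 and the last row exact copies of earlier rows;
  # only these later occurrences should be flagged as duplicates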
  duplicaterows <- c(2, 5, 6, 7, numrows)
  duplicatevals <- c(1, 4, 4, 4, numrows - 1)
  testdt$col2[duplicaterows] <- duplicatevals

  result <- identify_duplicate_rows(testdt, "col2", show_progress = FALSE)

  expect_true(all(result[duplicaterows]))
  expect_true(all(!result[-duplicaterows]))

  # batched: large enough to exercise the batched processing path
  numrows <- 20000000
  testdt <- data.table::data.table(
    col1 = rep("2022-01-01", numrows),
    col2 = seq(1, numrows),
    col3 = rep("XXXXXXXXXXX", numrows)
  )
  duplicaterows <- c(2, 5, 6, 7, numrows)
  duplicatevals <- c(1, 4, 4, 4, numrows - 1)
  testdt$col2[duplicaterows] <- duplicatevals

  result <- identify_duplicate_rows(testdt, "col2", show_progress = FALSE)

  expect_true(all(result[duplicaterows]))
  expect_true(all(!result[-duplicaterows]))
})

test_that("data_field_count() calculates number of values correctly", {

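  # alternating timepoint / value pairs; prepare_data_testhelper() (defined in
  # the package's test helpers) builds a source_data object from them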
  testvalues <- c("2021-06-01", "1",
                  "2021-06-02", "",
                  "2021-06-02", "",
                  "2021-06-03", "",
                  "2021-06-03", "1",
                  "2021-06-03", "",
                  "2021-06-05", "",
                  "2021-06-05", "",
                  "2021-06-05", "",
                  "2021-06-05", ""
                  )
  source_data <-
    prepare_data_testhelper(
      testvalues = testvalues,
      field_type = ft_numeric(),
      add_uid_field = TRUE)

  expect_equal(data_field_count(source_data$data_fields[["col_values"]]), 2)
})

test_that("data_field_count() returns 0 when all values are missing", {

  testvalues <- c("2021-06-01", "",
                  "2021-06-02", "",
                  "2021-06-02", "",
                  "2021-06-03", "",
                  "2021-06-03", "",
                  "2021-06-03", "",
                  "2021-06-05", "",
                  "2021-06-05", "",
                  "2021-06-05", "",
                  "2021-06-05", ""
                  )
  source_data <-
    prepare_data_testhelper(
      testvalues = testvalues,
      field_type = ft_numeric(),
      add_uid_field = TRUE)

  expect_equal(data_field_count(source_data$data_fields[["col_values"]]), 0)
})

test_that("column-specific missing value strings get set to NA", {

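  # each column includes its column-specific na string(s), and most also include
  # the dataset-level na string (""), so both should be converted to NA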
  df <-
    data.table::data.table(
      "col_timepoint" = c(paste0("2022-01-", 10 + c(seq(1, 9))), "1900-01-01"),
      "col_numeric" = c("", "0", seq(
        from = 2,
        to = 3,
        length.out = 8
      )),
      "col_datetime" = c("", "1800-01-01", "1900-01-01", rep("2022-01-01 00:00:00", 7)),
      "col_uniqueidentifier" = c("", "999", "0", seq(1, 7)),
      "col_categorical" = c("", "Unknown", seq(1, 8)),
      "col_freetext" = c("", ".", "Unknown", "Not known", seq(1, 6)),
      "col_simple" = c("", "Unk", seq(1, 8)),
      "col_none" = c("", "Unk", seq(1, 8))
    )

  source_data <-
    prepare_data(
      df = df,
      field_types = field_types(
        col_timepoint = ft_timepoint(na = "1900-01-01", includes_time = FALSE),
        col_numeric = ft_numeric(na = "0"),
        col_datetime = ft_datetime(na = "1800-01-01"),
        col_uniqueidentifier = ft_uniqueidentifier(na = "999"),
        col_categorical = ft_categorical(na = "Unknown"),
        col_freetext = ft_freetext(na = c(".", "Unknown", "Not known")),
        col_simple = ft_simple(na = "Unk"),
        col_none = ft_simple()
      ),
      na = "",
      show_progress = FALSE)

  expect_equal(nrow(source_data$data_fields[["col_timepoint"]]$values), 9)
  expect_equal(source_data$timepoint_missing_n, 1)

  expect_true(is.na(source_data$data_fields[["col_numeric"]]$values[1][[1]]))
  expect_true(is.na(source_data$data_fields[["col_numeric"]]$values[2][[1]]))
  expect_false(is.na(source_data$data_fields[["col_numeric"]]$values[3][[1]]))

  expect_true(is.na(source_data$data_fields[["col_datetime"]]$values[1][[1]]))
  expect_true(is.na(source_data$data_fields[["col_datetime"]]$values[2][[1]]))
  expect_false(is.na(source_data$data_fields[["col_datetime"]]$values[3][[1]]))
  expect_false(is.na(source_data$data_fields[["col_datetime"]]$values[4][[1]]))

  expect_true(is.na(source_data$data_fields[["col_uniqueidentifier"]]$values[1][[1]]))
  expect_true(is.na(source_data$data_fields[["col_uniqueidentifier"]]$values[2][[1]]))
  expect_false(is.na(source_data$data_fields[["col_uniqueidentifier"]]$values[3][[1]]))
  expect_false(is.na(source_data$data_fields[["col_uniqueidentifier"]]$values[4][[1]]))

  expect_true(is.na(source_data$data_fields[["col_categorical"]]$values[1][[1]]))
  expect_true(is.na(source_data$data_fields[["col_categorical"]]$values[2][[1]]))
  expect_false(is.na(source_data$data_fields[["col_categorical"]]$values[3][[1]]))

  expect_true(is.na(source_data$data_fields[["col_freetext"]]$values[1][[1]]))
  expect_true(is.na(source_data$data_fields[["col_freetext"]]$values[2][[1]]))
  expect_true(is.na(source_data$data_fields[["col_freetext"]]$values[3][[1]]))
  expect_true(is.na(source_data$data_fields[["col_freetext"]]$values[4][[1]]))
  expect_false(is.na(source_data$data_fields[["col_freetext"]]$values[5][[1]]))

  expect_true(is.na(source_data$data_fields[["col_simple"]]$values[1][[1]]))
  expect_true(is.na(source_data$data_fields[["col_simple"]]$values[2][[1]]))
  expect_false(is.na(source_data$data_fields[["col_simple"]]$values[3][[1]]))

  expect_true(is.na(source_data$data_fields[["col_none"]]$values[1][[1]]))
  expect_false(is.na(source_data$data_fields[["col_none"]]$values[2][[1]]))
})

test_that("prepare_data() creates source_data object correctly when using field_types_advanced()", {
  source_data <- prepare_data(
    df = data.frame(
      col1 = rep("2022-01-01", 5),
      col2 = rep("1", 5),
      col3 = rep("hello", 5),
      col4 = rep("1", 5),
      col5 = 1:5
    ),
    field_types = field_types_advanced(
      col1 = ft_timepoint(),
      col4 = ft_numeric(),
      .default_field_type = ft_simple()
    ),
    show_progress = FALSE
  )

  expect_s3_class(source_data, "daiquiri_source_data")

  expect_equal(source_data$timepoint_field_name, "col1")
  expect_equal(source_data$rows_source_n, 5)
  expect_equal(source_data$rows_imported_n, 5)
  expect_equal(source_data$cols_source_n, 5)
  expect_equal(source_data$cols_imported_n, 5)
  expect_equal(source_data$data_fields[["col1"]]$field_type$type, "timepoint")
  expect_equal(source_data$data_fields[["col2"]]$field_type$type, "simple")
  expect_equal(source_data$data_fields[["col3"]]$field_type$type, "simple")
  expect_equal(source_data$data_fields[["col4"]]$field_type$type, "numeric")
  expect_equal(source_data$data_fields[["col5"]]$field_type$type, "simple")
})