test_that("validate_column_names() checks that column names in data and field_types can match exactly", {
expect_silent(validate_column_names(c("nonsense", "set", "of"),
c("nonsense", "set", "of"),
check_length_only = FALSE
))
})
test_that("validate_column_names() checks that column names in data and field_types can match in different order", {
expect_silent(validate_column_names(c("nonsense", "set", "of"),
c("nonsense", "of", "set"),
check_length_only = FALSE
))
})
test_that("validate_column_names() checks that column names in data and field_types must match in length if check_length_only = TRUE", {
expect_silent(validate_column_names(c("nonsense", "set"),
c("nonsense", "names"),
check_length_only = TRUE
))
expect_error(
validate_column_names(c("nonsense", "set", "of"),
c("nonsense", "set"),
check_length_only = TRUE
),
class = "invalid_column_names"
)
})
test_that("validate_column_names() checks that duplicate column names in data not allowed", {
expect_error(
validate_column_names(
c("nonsense", "set", "of", "nonsense", "names"),
c("nonsense", "set", "of", "names"),
check_length_only = FALSE
),
class = "invalid_column_names"
)
})
test_that("validate_column_names() checks that column names in data but not in field_types is not allowed", {
expect_error(
validate_column_names(
c("nonsense", "set", "of", "stuff", "names"),
c("nonsense", "set", "of", "stuff"),
check_length_only = FALSE
),
class = "invalid_column_names"
)
})
test_that("validate_column_names() checks that column names in field_types but not in data is not allowed", {
expect_error(
validate_column_names(
c("nonsense", "set", "of"),
c("nonsense", "set", "of", "stuff"),
check_length_only = FALSE
),
class = "invalid_column_names"
)
})
test_that("validate_column_names() checks that cannot have override_column_names and .default_field_type", {
expect_error(
validate_column_names(
c("nonsense", "set", "of", "stuff"),
c("nonsense", "set", "of", ".default_field_type"),
check_length_only = TRUE
),
class = "invalid_column_names"
)
})
test_that("validate_column_names() allows unnecessary .default_field_type", {
expect_silent(
validate_column_names(
c("nonsense", "set", "of"),
c("nonsense", "set", "of", ".default_field_type"),
check_length_only = FALSE
)
)
})
test_that("validate_column_names() allows specnames to be subset of sourcenames if .default_field_type present", {
expect_silent(
validate_column_names(
c("nonsense", "set", "of"),
c("nonsense", ".default_field_type"),
check_length_only = FALSE
)
)
})
test_that("prepare_data() requires a df param", {
expect_error(prepare_data(field_types = field_types(Col_tp = ft_timepoint())),
class = "invalid_param_missing"
)
})
test_that("prepare_data() requires a field_types param", {
expect_error(prepare_data(df = data.frame("Fieldname" = 123)),
class = "invalid_param_missing"
)
})
test_that("prepare_data() requires df param to be a data frame", {
expect_error(
prepare_data(c("Fieldname", 123),
field_types = field_types(Col_tp = ft_timepoint())
),
class = "invalid_param_type"
)
})
test_that("prepare_data() requires field_types param to be a field_types object", {
expect_error(
prepare_data(
df = data.frame("Fieldname" = 123),
field_types = TRUE
),
class = "invalid_param_type"
)
})
test_that("prepare_data() checks that at least one valid timepoint value is present", {
expect_error(
prepare_data(
df = data.frame(
col1 = rep("01/01/2022", 5),
col2 = rep(1, 5),
col3 = 1:5
),
field_types = field_types(
col1 = ft_timepoint(),
col2 = ft_simple(),
col3 = ft_ignore()
),
show_progress = FALSE
),
class = "invalid_param_type"
)
})
test_that("prepare_data() creates source_data object correctly", {
df <- read_data(test_path("testdata", "completetestset.csv"))
source_data <- prepare_data(
df,
field_types = field_types(
col_timepoint_err = ft_ignore(),
col_timepoint = ft_timepoint(),
col_date_time_err = ft_ignore(),
col_date_time = ft_datetime(),
col_date_only_err = ft_ignore(),
col_date_only = ft_datetime(includes_time = FALSE),
col_date_uk_err = ft_ignore(),
col_date_uk = ft_datetime(includes_time = FALSE, format = "%d/%m/%Y"),
col_id_num_err = ft_ignore(),
col_id_num = ft_uniqueidentifier(),
col_id_string_err = ft_ignore(),
col_id_string = ft_uniqueidentifier(),
col_numeric_clean_err = ft_ignore(),
col_numeric_clean = ft_numeric(),
col_numeric_dirty_err = ft_ignore(),
col_numeric_dirty = ft_numeric(),
col_categorical_small_err = ft_ignore(),
col_categorical_small = ft_categorical(aggregate_by_each_category = TRUE),
col_categorical_large_err = ft_ignore(),
col_categorical_large = ft_categorical(),
col_freetext_err = ft_ignore(),
col_freetext = ft_freetext(),
col_simple_err = ft_ignore(),
col_simple = ft_simple(),
col_numeric_missing_err = ft_ignore(),
col_numeric_missing = ft_numeric()
),
dataset_description = "completetestset",
show_progress = FALSE
)
expect_s3_class(source_data, "daiquiri_source_data")
expect_equal(source_data$timepoint_field_name, "col_timepoint")
expect_equal(source_data$timepoint_missing_n, 6)
expect_equal(source_data$rows_source_n, 900)
expect_equal(source_data$rows_imported_n, 890)
expect_equal(source_data$rows_duplicates_n, 4)
expect_equal(source_data$cols_source_n, 26)
expect_equal(source_data$cols_imported_n, 13)
expect_equal(source_data$dataset_description, "completetestset")
expect_snapshot(source_data$validation_warnings)
})
test_that("prepare_data() gets strata info correctly", {
df <-
data.table::data.table(
"col_timepoint" = c(rep("2022-01-01", 5), rep("2022-01-02", 5), rep("2022-01-04", 5), rep("2022-01-05", 5)),
"col_numeric" = seq(
from = 2,
to = 3,
length.out = 20
),
"col_datetime" = c(paste0("2022-01-", 10 + c(seq(1, 9))), rep("", 11)),
"col_uniqueidentifier" = c(seq(1, 20)),
"col_categorical" = c(rep(c("a", "b"), 8), rep("a", 4)),
"col_simple" = c(rep("", 10), rep("a", 10)),
"col_stratify" = c("", "", rep("SITE2", 6), rep(c("SITE1", "SITE2"), 6))
)
source_data <-
prepare_data(
df,
field_types = field_types(
col_timepoint = ft_timepoint(),
col_numeric = ft_numeric(),
col_datetime = ft_datetime(includes_time = FALSE),
col_uniqueidentifier = ft_uniqueidentifier(),
col_categorical = ft_categorical(),
col_simple = ft_simple(),
col_stratify = ft_strata()
),
na = c("", "NULL"),
show_progress = FALSE
)
expect_equal(source_data$strata_field_name, "col_stratify")
# the strata labels should be alphabetical with NA last
expect_equal(source_data$strata_labels,
c("SITE1", "SITE2", NA))
})
test_that("prepare_data() ignores nonchar columns (since readr::type_convert fails to skip nonchar cols)", {
source_data <- prepare_data(
df = data.frame(
col1 = rep("2022-01-01", 5),
col2 = rep(1, 5),
col3 = 1:5
),
field_types = field_types(
col1 = ft_timepoint(),
col2 = ft_simple(),
col3 = ft_ignore()
),
dataset_description = "ignore nonchar columns",
show_progress = FALSE
)
expect_equal(source_data$cols_imported_n, 2)
})
test_that("prepare_data() generates a validation warning when nonchar columns are provided", {
source_data <- prepare_data(
df = data.frame(
col1 = rep("2022-01-01", 5),
col2 = 1:5
),
field_types = field_types(
col1 = ft_timepoint(),
col2 = ft_numeric()
),
dataset_description = "nonchar columns",
show_progress = FALSE
)
expect_equal(source_data$validation_warnings$field_name, "col2")
expect_match(source_data$validation_warnings$message, "instead of character")
})
test_that("prepare_data() matches nonchar validation warnings to column names correctly when fieldtypes are supplied in different order to df columns", {
source_data <- prepare_data(
df = data.frame(
col1 = rep("2022-01-01", 5),
col2 = 1:5,
col3 = 1:5,
col4 = 1:5
),
field_types = field_types(
col2 = ft_numeric(),
col1 = ft_timepoint(),
col3 = ft_ignore(),
col4 = ft_numeric()
),
dataset_description = "nonchar columns",
show_progress = FALSE
)
expect_equal(source_data$validation_warnings$field_name, c("col2", "col4"))
expect_match(source_data$validation_warnings$message, "instead of character")
})
test_that("prepare_data() matches type_convert() validation warnings to column names correctly when fieldtypes are supplied in different order to df columns", {
source_data <- prepare_data(
df = data.frame(
col_tp = rep("2022-01-01", 5),
col_ign = as.character(1:5),
col_num = c("some text", 1:4),
col_date = rep("2022-01-01", 5)
),
field_types = field_types(
col_num = ft_numeric(),
col_tp = ft_timepoint(),
col_ign = ft_ignore(),
col_date = ft_datetime()
),
dataset_description = "nonchar columns",
show_progress = FALSE
)
expect_equal(source_data$validation_warnings$field_name, "col_num")
expect_match(source_data$validation_warnings$message, "expected a double")
})
test_that("prepare_data() overrides column names correctly", {
source_data <- prepare_data(
df = data.frame(
col1 = rep("2022-01-01", 5),
col2 = rep("1", 5)
),
field_types = field_types(
cola = ft_timepoint(),
colb = ft_simple()
),
override_column_names = TRUE,
dataset_description = "override colnames",
show_progress = FALSE
)
expect_equal(names(source_data$data_fields)[1:2], c("cola", "colb"))
})
test_that("prepare_data() can accept a data.table with nonchar cols without error", {
source_data <- prepare_data(
df = data.table::data.table(
col1 = rep("2022-01-01", 5),
col2 = rep(1, 5)
),
field_types = field_types(
col1 = ft_timepoint(),
col2 = ft_simple()
),
override_column_names = TRUE,
dataset_description = "pass in nonchar data.table",
show_progress = FALSE
)
expect_equal(source_data$cols_imported_n, 2)
})
test_that("prepare_data() gets dataset_description from call if NULL (default) passed in", {
dfobj <- data.frame(
col1 = rep("2022-01-01", 5),
col2 = rep(1, 5),
col3 = 1:5
)
source_data <- prepare_data(
df = dfobj,
field_types = field_types(
col1 = ft_timepoint(),
col2 = ft_simple(),
col3 = ft_ignore()
),
show_progress = FALSE
)
expect_equal(source_data$dataset_description, "dfobj")
source_data <- prepare_data(
df = data.frame(
col1 = rep("2022-01-01", 5),
col2 = rep(1, 5),
col3 = 1:5
),
field_types = field_types(
col1 = ft_timepoint(),
col2 = ft_simple(),
col3 = ft_ignore()
),
show_progress = FALSE
)
expect_equal(
source_data$dataset_description,
"data.frame(col1 = rep(\"2022-01-01\", 5), col2 = rep(1, 5), col3 = 1:5)"
)
})
test_that("source_data object prints to console ok", {
df <- read_data(test_path("testdata", "completetestset.csv"))
source_data <- prepare_data(
df,
field_types = field_types(
col_timepoint_err = ft_ignore(),
col_timepoint = ft_timepoint(),
col_date_time_err = ft_ignore(),
col_date_time = ft_datetime(),
col_date_only_err = ft_ignore(),
col_date_only = ft_datetime(includes_time = FALSE),
col_date_uk_err = ft_ignore(),
col_date_uk = ft_datetime(includes_time = FALSE, format = "%d/%m/%Y"),
col_id_num_err = ft_ignore(),
col_id_num = ft_uniqueidentifier(),
col_id_string_err = ft_ignore(),
col_id_string = ft_uniqueidentifier(),
col_numeric_clean_err = ft_ignore(),
col_numeric_clean = ft_numeric(),
col_numeric_dirty_err = ft_ignore(),
col_numeric_dirty = ft_numeric(),
col_categorical_small_err = ft_ignore(),
col_categorical_small = ft_categorical(aggregate_by_each_category = TRUE),
col_categorical_large_err = ft_ignore(),
col_categorical_large = ft_categorical(),
col_freetext_err = ft_ignore(),
col_freetext = ft_freetext(),
col_simple_err = ft_ignore(),
col_simple = ft_simple(),
col_numeric_missing_err = ft_ignore(),
col_numeric_missing = ft_numeric()
),
dataset_description = "completetestset",
show_progress = FALSE
)
expect_snapshot_output(print(source_data))
})
test_that("source_data object prints to console ok when there is a strata field", {
df <-
data.table::data.table(
"col_timepoint" = c(rep("2022-01-01", 5), rep("2022-01-02", 5), rep("2022-01-04", 5), rep("2022-01-05", 5)),
"col_numeric" = seq(
from = 2,
to = 3,
length.out = 20
),
"col_datetime" = c(paste0("2022-01-", 10 + c(seq(1, 9))), rep("", 11)),
"col_uniqueidentifier" = c(seq(1, 20)),
"col_categorical" = c(rep(c("a", "b"), 8), rep("a", 4)),
"col_simple" = c(rep("", 10), rep("a", 10)),
"col_stratify" = c("", "", rep("SITE2", 6), rep(c("SITE1", "SITE2"), 6))
)
source_data <-
prepare_data(
df,
field_types = field_types(
col_timepoint = ft_timepoint(),
col_numeric = ft_numeric(),
col_datetime = ft_datetime(includes_time = FALSE),
col_uniqueidentifier = ft_uniqueidentifier(),
col_categorical = ft_categorical(),
col_simple = ft_simple(),
col_stratify = ft_strata()
),
na = c("", "NULL"),
show_progress = FALSE
)
expect_snapshot_output(print(source_data))
})
test_that("remove_rows() removes specified rows", {
numrows <- 2000000
testdt <- data.table::data.table(
col1 = rep("2022-01-01", numrows),
col2 = seq(1, numrows),
col3 = rep("XXXXXXXXXXX", numrows)
)
rowstoremove <- c(1, 4, 5, 6, numrows)
row_indicator <- rep(FALSE, numrows)
row_indicator[rowstoremove] <- TRUE
postdt <- remove_rows(data.table::copy(testdt), row_indicator = row_indicator)
# correct no. of rows removed
expect_equal(nrow(postdt), numrows - length(rowstoremove))
# correct rows removed
expect_false(any(postdt$col2 %in% rowstoremove))
# all columns still present
expect_equal(names(testdt), names(postdt))
# the contents of the first non-deleted row is the same before and after
expect_equal(unlist(testdt[which(!row_indicator)[1]]), unlist(postdt[1]))
})
test_that("identify_duplicate_rows() identifies all exactly duplicated rows", {
# unbatched
numrows <- 200000
testdt <- data.table::data.table(
col1 = rep("2022-01-01", numrows),
col2 = seq(1, numrows),
col3 = rep("XXXXXXXXXXX", numrows)
)
duplicaterows <- c(2, 5, 6, 7, numrows)
duplicatevals <- c(1, 4, 4, 4, numrows - 1)
testdt$col2[duplicaterows] <- duplicatevals
result <- identify_duplicate_rows(testdt, "col2", show_progress = FALSE)
expect_true(all(result[duplicaterows]))
expect_true(all(!result[-duplicaterows]))
# batched
numrows <- 20000000
testdt <- data.table::data.table(
col1 = rep("2022-01-01", numrows),
col2 = seq(1, numrows),
col3 = rep("XXXXXXXXXXX", numrows)
)
duplicaterows <- c(2, 5, 6, 7, numrows)
duplicatevals <- c(1, 4, 4, 4, numrows - 1)
testdt$col2[duplicaterows] <- duplicatevals
result <- identify_duplicate_rows(testdt, "col2", show_progress = FALSE)
expect_true(all(result[duplicaterows]))
expect_true(all(!result[-duplicaterows]))
})
test_that("data_field_count() calculates number of values correctly", {
testvalues <- c("2021-06-01", "1",
"2021-06-02", "",
"2021-06-02", "",
"2021-06-03", "",
"2021-06-03", "1",
"2021-06-03", "",
"2021-06-05", "",
"2021-06-05", "",
"2021-06-05", "",
"2021-06-05", ""
)
source_data <-
prepare_data_testhelper(
testvalues = testvalues,
field_type = ft_numeric(),
add_uid_field = TRUE)
expect_equal(data_field_count(source_data$data_fields[["col_values"]]), 2)
})
test_that("data_field_count() returns 0 when all values are missing", {
testvalues <- c("2021-06-01", "",
"2021-06-02", "",
"2021-06-02", "",
"2021-06-03", "",
"2021-06-03", "",
"2021-06-03", "",
"2021-06-05", "",
"2021-06-05", "",
"2021-06-05", "",
"2021-06-05", ""
)
source_data <-
prepare_data_testhelper(
testvalues = testvalues,
field_type = ft_numeric(),
add_uid_field = TRUE)
expect_equal(data_field_count(source_data$data_fields[["col_values"]]), 0)
})
test_that("column-specific missing value strings get set to NA", {
df <-
data.table::data.table(
"col_timepoint" = c(paste0("2022-01-", 10 + c(seq(1, 9))), "1900-01-01"),
"col_numeric" = c("", "0", seq(
from = 2,
to = 3,
length.out = 8
)),
"col_datetime" = c("", "1800-01-01", "1900-01-01", rep("2022-01-01 00:00:00", 7)),
"col_uniqueidentifier" = c("", "999", "0", seq(1, 7)),
"col_categorical" = c("", "Unknown", seq(1, 8)),
"col_freetext" = c("", ".", "Unknown", "Not known", seq(1, 6)),
"col_simple" = c("", "Unk", seq(1, 8)),
"col_none" = c("", "Unk", seq(1, 8))
)
source_data <-
prepare_data(
df = df,
field_types = field_types(
col_timepoint = ft_timepoint(na = "1900-01-01", includes_time = FALSE),
col_numeric = ft_numeric(na = "0"),
col_datetime = ft_datetime(na = "1800-01-01"),
col_uniqueidentifier = ft_uniqueidentifier(na = "999"),
col_categorical = ft_categorical(na = "Unknown"),
col_freetext = ft_freetext(na = c(".", "Unknown", "Not known")),
col_simple = ft_simple(na = "Unk"),
col_none = ft_simple()
),
na = "",
show_progress = FALSE)
expect_equal(nrow(source_data$data_fields[["col_timepoint"]]$values), 9)
expect_equal(source_data$timepoint_missing_n, 1)
expect_true(is.na(source_data$data_fields[["col_numeric"]]$values[1][[1]]))
expect_true(is.na(source_data$data_fields[["col_numeric"]]$values[2][[1]]))
expect_false(is.na(source_data$data_fields[["col_numeric"]]$values[3][[1]]))
expect_true(is.na(source_data$data_fields[["col_datetime"]]$values[1][[1]]))
expect_true(is.na(source_data$data_fields[["col_datetime"]]$values[2][[1]]))
expect_false(is.na(source_data$data_fields[["col_datetime"]]$values[3][[1]]))
expect_false(is.na(source_data$data_fields[["col_datetime"]]$values[4][[1]]))
expect_true(is.na(source_data$data_fields[["col_uniqueidentifier"]]$values[1][[1]]))
expect_true(is.na(source_data$data_fields[["col_uniqueidentifier"]]$values[2][[1]]))
expect_false(is.na(source_data$data_fields[["col_uniqueidentifier"]]$values[3][[1]]))
expect_false(is.na(source_data$data_fields[["col_uniqueidentifier"]]$values[4][[1]]))
expect_true(is.na(source_data$data_fields[["col_categorical"]]$values[1][[1]]))
expect_true(is.na(source_data$data_fields[["col_categorical"]]$values[2][[1]]))
expect_false(is.na(source_data$data_fields[["col_categorical"]]$values[3][[1]]))
expect_true(is.na(source_data$data_fields[["col_freetext"]]$values[1][[1]]))
expect_true(is.na(source_data$data_fields[["col_freetext"]]$values[2][[1]]))
expect_true(is.na(source_data$data_fields[["col_freetext"]]$values[3][[1]]))
expect_true(is.na(source_data$data_fields[["col_freetext"]]$values[4][[1]]))
expect_false(is.na(source_data$data_fields[["col_freetext"]]$values[5][[1]]))
expect_true(is.na(source_data$data_fields[["col_simple"]]$values[1][[1]]))
expect_true(is.na(source_data$data_fields[["col_simple"]]$values[2][[1]]))
expect_false(is.na(source_data$data_fields[["col_simple"]]$values[3][[1]]))
expect_true(is.na(source_data$data_fields[["col_none"]]$values[1][[1]]))
expect_false(is.na(source_data$data_fields[["col_none"]]$values[2][[1]]))
})
test_that("prepare_data() creates source_data object correctly when using field_types_advanced()", {
source_data <- prepare_data(
df = data.frame(
col1 = rep("2022-01-01", 5),
col2 = rep("1", 5),
col3 = rep("hello", 5),
col4 = rep("1", 5),
col5 = 1:5
),
field_types = field_types_advanced(
col1 = ft_timepoint(),
col4 = ft_numeric(),
.default_field_type = ft_simple()
),
show_progress = FALSE
)
expect_s3_class(source_data, "daiquiri_source_data")
expect_equal(source_data$timepoint_field_name, "col1")
expect_equal(source_data$rows_source_n, 5)
expect_equal(source_data$rows_imported_n, 5)
expect_equal(source_data$cols_source_n, 5)
expect_equal(source_data$cols_imported_n, 5)
expect_equal(source_data$data_fields[["col1"]]$field_type$type, "timepoint")
expect_equal(source_data$data_fields[["col2"]]$field_type$type, "simple")
expect_equal(source_data$data_fields[["col3"]]$field_type$type, "simple")
expect_equal(source_data$data_fields[["col4"]]$field_type$type, "numeric")
expect_equal(source_data$data_fields[["col5"]]$field_type$type, "simple")
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.