tests/testthat/test-te_datastore_duckdb.R

test_that("save_to_duckdb works as expected", {
  temp_dir <- withr::local_tempdir(pattern = "duckdb_dir", tempdir(TRUE))
  ddb <- save_to_duckdb(temp_dir)
  expect_equal(ddb@N, 0)
  expect_class(ddb@con, "duckdb_connection")
  expect_equal(ddb@table, "trial_data")
  expect_equal(
    normalizePath(dirname(ddb@path)),
    normalizePath(temp_dir)
  )
  DBI::dbDisconnect(ddb@con)
})

test_that("read_expanded_data can read te_datastore_duckdb data", {
  temp_dir <- withr::local_tempdir(pattern = "duckdb_dir", tempdir(TRUE))
  data <- data_censored
  data$id <- as.factor(data$id)

  trial_to_expand <- trial_sequence("ITT") |>
    set_data(data = data) |>
    set_outcome_model(adjustment_terms = ~age) |>
    set_expansion_options(output = save_to_duckdb(temp_dir), chunk_size = 500)

  expanded_duckdb_data <- expand_trials(trial_to_expand)@expansion@datastore


  # check if no columns get added or removed by read_expanded_data
  expect_equal(ncol(read_expanded_data(expanded_duckdb_data)), 8)

  # check if omitting period reads in all data
  expect_equal(nrow(read_expanded_data(expanded_duckdb_data)), 1558)

  # check if period argument subsets data correctly
  expect_equal(nrow(read_expanded_data(expanded_duckdb_data, period = 1)), 327)
  expect_equal(nrow(read_expanded_data(expanded_duckdb_data, period = c(5, 8, 12))), 63)

  # check if method throws an error when using a character as period
  expect_error(read_expanded_data(expanded_duckdb_data, period = "1"), "period")

  # check if no new NAs are introduced
  expect_equal(sum(is.na.data.frame(read_expanded_data(expanded_duckdb_data))), 0)

  # check if factor variables are kept
  expect_factor(read_expanded_data(expanded_duckdb_data)$id)

  # check if subset_condition subsets data correctly
  expect_equal(
    nrow(read_expanded_data(expanded_duckdb_data, subset_condition = "followup_time <= 5 | age < 40")),
    945
  )

  DBI::dbDisconnect(expanded_duckdb_data@con)
})


test_that("sample_controls works with trial_sequence objects containing te_datastore_duckdb objects", {
  skip_if_not_installed("duckdb", minimum_version = "1.2.0")

  trial_itt_dir <- withr::local_tempdir("trial_itt", tempdir(TRUE))

  trial_itt <- trial_sequence(estimand = "ITT") |>
    set_data(data = data_censored) |>
    set_outcome_model(adjustment_terms = ~ x1 + x2)

  trial_itt_duckdb <- set_expansion_options(
    trial_itt,
    output = save_to_duckdb(file.path(trial_itt_dir, "trial_duckdb")),
    chunk_size = 500
  ) |>
    expand_trials()


  # sample_controls works without additional arguments
  sc_01 <- sample_controls(trial_itt_duckdb, p_control = 0.01, seed = 1221)
  expect_equal(
    sort(sc_01@outcome_data@data$id),
    c(
      7L, 9L, 14L, 15L, 21L, 21L, 29L, 32L, 38L, 38L, 39L, 49L, 49L,
      54L, 54L, 54L, 54L, 54L, 59L, 61L, 65L, 65L, 68L, 68L, 71L, 71L,
      74L, 76L, 83L, 83L, 89L, 98L, 99L
    )
  )

  # sample_controls works with p_control
  sc_02 <- sample_controls(trial_itt_duckdb, p_control = 0.5, seed = 5678)
  expect_equal(sc_02@outcome_data@n_rows, 761)

  # sample_controls works with p_control = 0
  sc_03 <- sample_controls(trial_itt_duckdb, p_control = 0)
  expect_equal(sc_03@outcome_data@n_rows, 14)

  # cases are kept
  expect_equal(sum(sc_01@outcome_data@data$outcome), 14)
  expect_equal(sum(sc_02@outcome_data@data$outcome), 14)
  expect_equal(sum(sc_03@outcome_data@data$outcome), 14)

  # all columns are kept and sample_weight column is added
  expect_equal(
    colnames(sc_01@outcome_data@data),
    c(colnames(read_expanded_data(trial_itt_duckdb@expansion@datastore)), "sample_weight")
  )

  # sample_weight calculated correctly
  expect_equal(sort(sc_01@outcome_data@data$sample_weight), c(rep(1, 14), rep(100, 19)))
  expect_equal(sort(sc_02@outcome_data@data$sample_weight), c(rep(1, 14), rep(2, 747)))

  # sample_controls subsets data correctly
  sc_04 <- sample_controls(
    trial_itt_duckdb,
    period = 1:10,
    subset_condition = "x2 <= 1 & treatment == 1 & (id %in% 40:90 | followup_time %in% c(2, 3, 4, 5, 6))",
    p_control = 0.2,
    seed = 2332
  )
  expect_equal(
    sort(sc_04@outcome_data@data$id),
    c(
      16L, 20L, 44L, 44L, 44L, 44L, 44L, 49L, 53L, 53L, 54L, 54L,
      59L, 59L, 59L, 59L, 59L, 59L, 59L, 60L, 60L, 60L, 74L, 74L, 74L,
      95L, 95L, 95L
    )
  )
  expect_true(all(sc_04@outcome_data@data$x2 <= 1))
  expect_true(all(sc_04@outcome_data@data$treatment == 1))
  expect_true(all(sc_04@outcome_data@data$id %in% 40:90 | sc_04@outcome_data@data$followup_time %in% 2:6))
  expect_true(all(sc_04@outcome_data@data$period %in% 1:10))

  # sample_controls returns the correct classes
  expect_class(sc_04, "trial_sequence_ITT")
  expect_class(sc_04@outcome_data, "te_outcome_data")
  expect_class(sc_04@outcome_data@data, "data.table")

  DBI::dbDisconnect(trial_itt_duckdb@expansion@datastore@con)
})


test_that("load_expanded_data works with trial_sequence objects containing te_datastore_duckdb objects", {
  skip_if_not_installed("duckdb", minimum_version = "1.2.0")

  trial_itt_dir <- withr::local_tempdir("trial_itt", tempdir(TRUE))

  trial_itt <- trial_sequence(estimand = "ITT") |>
    set_data(data = data_censored) |>
    set_outcome_model(adjustment_terms = ~ x1 + x2)

  trial_itt_duckdb <- set_expansion_options(
    trial_itt,
    output = save_to_duckdb(file.path(trial_itt_dir, "trial_duckdb")),
    chunk_size = 500
  ) |>
    expand_trials()

  # load_expanded_data works without additional arguments
  sc_00 <- load_expanded_data(trial_itt_duckdb)
  expect_equal(sc_00@outcome_data@n_rows, 1558)

  # load_expanded_data works with p_control
  sc_01 <- load_expanded_data(trial_itt_duckdb, p_control = 0.01, seed = 1221)
  expect_equal(
    sort(sc_01@outcome_data@data$id),
    c(
      7L, 9L, 14L, 15L, 21L, 21L, 29L, 32L, 38L, 38L, 39L, 49L, 49L,
      54L, 54L, 54L, 54L, 54L, 59L, 61L, 65L, 65L, 68L, 68L, 71L, 71L,
      74L, 76L, 83L, 83L, 89L, 98L, 99L
    )
  )

  # load_expanded_data works with p_control
  sc_02 <- load_expanded_data(trial_itt_duckdb, p_control = 0.5, seed = 5678)
  expect_equal(sc_02@outcome_data@n_rows, 761)

  # load_expanded_data works with p_control = 0
  sc_03 <- load_expanded_data(trial_itt_duckdb, p_control = 0)
  expect_equal(sc_03@outcome_data@n_rows, 14)

  # cases are kept
  expect_equal(sum(sc_00@outcome_data@data$outcome), 14)
  expect_equal(sum(sc_01@outcome_data@data$outcome), 14)
  expect_equal(sum(sc_02@outcome_data@data$outcome), 14)
  expect_equal(sum(sc_03@outcome_data@data$outcome), 14)

  # all columns are kept and sample_weight column is added
  expect_equal(
    colnames(sc_01@outcome_data@data),
    c(colnames(read_expanded_data(trial_itt_duckdb@expansion@datastore)), "sample_weight")
  )

  # sample_weight calculated correctly
  expect_equal(sort(sc_00@outcome_data@data$sample_weight), rep(1, 1558))
  expect_equal(sort(sc_01@outcome_data@data$sample_weight), c(rep(1, 14), rep(100, 19)))
  expect_equal(sort(sc_02@outcome_data@data$sample_weight), c(rep(1, 14), rep(2, 747)))

  # load_expanded_data subsets data correctly
  sc_04 <- load_expanded_data(
    trial_itt_duckdb,
    period = 1:10,
    subset_condition = "x2 <= 1 & treatment == 1 & (id %in% 40:90 | followup_time %in% c(2, 3, 4, 5, 6))",
    p_control = 0.2,
    seed = 2332
  )
  expect_equal(
    sort(sc_04@outcome_data@data$id),
    c(
      16L, 20L, 44L, 44L, 44L, 44L, 44L, 49L, 53L, 53L, 54L, 54L,
      59L, 59L, 59L, 59L, 59L, 59L, 59L, 60L, 60L, 60L, 74L, 74L, 74L,
      95L, 95L, 95L
    )
  )
  expect_true(all(sc_04@outcome_data@data$x2 <= 1))
  expect_true(all(sc_04@outcome_data@data$treatment == 1))
  expect_true(all(sc_04@outcome_data@data$id %in% 40:90 | sc_04@outcome_data@data$followup_time %in% 2:6))
  expect_true(all(sc_04@outcome_data@data$period %in% 1:10))

  # load_expanded_data subsets data correctly without p_control
  sc_05 <- load_expanded_data(
    trial_itt_duckdb,
    period = 1:10,
    subset_condition = "followup_time <= 20 & treatment == 1",
  )
  expect_equal(
    sort(sc_05@outcome_data@data$id),
    c(
      2, 2, 2, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 27, 27, 27,
      27, 27, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 38, 38,
      44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
      44, 44, 44, 44, 44, 44, 44, 44, 47, 47, 47, 49, 49, 49, 50, 50, 50, 50, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54,
      54, 54, 54, 54, 54, 54, 54, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
      59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 65, 65, 65, 65, 70, 70, 70, 71, 71, 71, 71, 71, 71,
      71, 71, 73, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 83, 83, 83, 95, 95, 95, 95, 95, 95,
      95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
      95, 95, 96, 96, 96, 96, 96, 96
    )
  )

  # load_expanded_data returns the correct classes
  expect_class(sc_04, "trial_sequence_ITT")
  expect_class(sc_04@outcome_data, "te_outcome_data")
  expect_class(sc_04@outcome_data@data, "data.table")

  DBI::dbDisconnect(trial_itt_duckdb@expansion@datastore@con)
})


test_that("translate_to_sql works as intended", {
  input_string <- "a == 1 & b >= 0 & c < 10 & d != -1 & (e %in% 1:10 | f %in% c(4, 7, 9)"
  output_string <- translate_to_sql(input_string)
  expect_equal(
    output_string,
    "a = 1 AND b >= 0 AND c < 10 AND d != -1 AND (e IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10) OR f IN (4, 7, 9)"
  )
})

test_that("translate_to_sql catches the error and provides a message", {
  input_string <- "(e%in%1:10|f%in%c(4, 7, 9)"
  expect_warning(expect_error(
    translate_to_sql(input_string),
    "Error translating"
  ))
})
CAM-Roche/RandomisedTrialsEmulation documentation built on April 14, 2025, 7:44 a.m.