tests/testthat/test-tb_subset.R

test_that("you can subset on drecno, age, meddra_id", {

  wd_in <- tempdir()

  wd_in <- paste0(wd_in, "/", "tb_subset_t1") # avoid windows #1224
  # never rewrite on the same tempfile in ANY test.

  dir.create(path = wd_in)

  mini_data <- rlang::list2(
    demo =
      data.table(
        UMCReportId = c(1, 2, 3, 4),
        AgeGroup = c(1, 2, 7, 9)
      ),
       drug =
         data.table(
           UMCReportId = c(1, 2, 3, 4),
           Drug_Id = c("d1", "d2", "d3", "d4"),
           DrecNo = c(133138448, 133138448, 111841511, 111841511),
           MedicinalProd_Id = c(25027716, 97354576, 104264760, 37484408)
         ),
       adr  =
         data.table(
           UMCReportId = c(1, 2, 3, 4),
           Adr_Id = c("a1", "a2", "a3", "a4"),
           MedDRA_Id = c(110049083, 146319904, 146319904, 72535511)
         ),
    link =
      data.table(
        Drug_Id = c("d1", "d2", "d3", "d4"),
        Adr_Id = c("a1", "a2", "a3", "a4")
      ),
    srce =
      data.table(
        UMCReportId = c(1, 2, 3, 4)
      ),
    ind  =
      data.table(
        Drug_Id = c("d1", "d2", "d3", "d4")
      ),
    out  =
      data.table(
        UMCReportId = c(1, 2, 3, 4)
      ),
    followup =
      data.table(
        UMCReportId = c(1, 2, 3, 4)
      ),
    suspdup =
      data.table(
        UMCReportId = c(3),
        SuspectedduplicateReportId = c(4)
      )
  )

     purrr::iwalk(
       mini_data,
       function(dataset, name)
         arrow::write_parquet(
           dataset |>
             arrow::as_arrow_table(),
           sink = paste0(wd_in, "/", name, ".parquet")
         )

     )

  # ---- drecno

  sv_selection_drecno <-
    list(ipi = c(133138448, 133138448))

  expect_snapshot(
    tb_subset(
      wd_in = paste0(wd_in, "/"),
      wd_out = paste0(wd_in, "/", "subset_drecno", "/"),
      subset_var = "drecno",
      sv_selection = sv_selection_drecno
    ),
    transform =
      function(chr_line)
        stringr::str_replace(
          chr_line,
          "(?>=\\d{1,3}\\%\\s| ).*(?= \\|)",
          " percent, seconds"
        )
  )


  drug_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_drecno", "/"), "drug")

  demo_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_drecno", "/"), "demo")

  expect_equal(
    sort(unique(demo_sub$UMCReportId)),
    sort(unique(drug_sub$UMCReportId))
    )

  para_test <-
    drug_sub %>%
    dplyr::group_by(UMCReportId) %>%
    dplyr::summarise(
      has_para = max(DrecNo %in% sv_selection_drecno$ipi)
    )

  expect_equal(
    all(para_test$has_para == 1),
    TRUE
  )

  # --- drecno several items

  sv_selection_drecno_item <-
    list(
      ipi = c(133138448),
      nivo = c(111841511)
    )

  expect_snapshot(tb_subset(
    wd_in = paste0(wd_in, "/"),
    wd_out = paste0(wd_in, "/", "subset_drecno_item", "/"),
    subset_var = "drecno",
    sv_selection = sv_selection_drecno_item
  ),
  transform =
    function(chr_line)
      stringr::str_replace(
        chr_line,
        "(?>=\\d{1,3}\\%\\s| ).*(?= \\|)",
        " percent, seconds"
      ))

  demo_sub_item <-
    dt_parquet(paste0(wd_in, "/", "subset_drecno_item", "/"), "demo")

  drug_sub_item <-
    dt_parquet(paste0(wd_in, "/", "subset_drecno_item", "/"), "drug")

  expect_equal(
    nrow(demo_sub_item),
    3 # one duplicate
  )

  # ---- age

  expect_snapshot(tb_subset(
    wd_in = paste0(wd_in, "/"),
    wd_out = paste0(wd_in, "/", "subset_age", "/"),
    subset_var = "age",
    sv_selection = c(7, 8)
  ),
  transform =
    function(chr_line)
      stringr::str_replace(
        chr_line,
        "(?>=\\d{1,3}\\%\\s| ).*(?= \\|)",
        " percent, seconds"
      ))

  drug_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_age", "/"), "drug")

  demo_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_age", "/"), "demo")

  expect_equal(
    sort(unique(demo_sub$UMCReportId)),
    sort(unique(drug_sub$UMCReportId))
  )

  age_test <-
    demo_sub %>%
    dplyr::mutate(
      good_age = AgeGroup %in% c(7, 8)
    )

  expect_equal(
    all(age_test$good_age == 1),
    TRUE
  )

  # ---- adr

  sv_selection_mid <-
    list(colitis = c(146319904, 72535511))

  wd_out <- paste0(wd_in, "/", "subset_meddraid", "/")

  expect_snapshot(tb_subset(
    wd_in = paste0(wd_in, "/"),
    wd_out = paste0(wd_in, "/", "subset_meddraid", "/"),
    subset_var = "meddra_id",
    sv_selection = sv_selection_mid
  ),
  transform =
    function(chr_line)
      stringr::str_replace(
        chr_line,
        "(?>=\\d{1,3}\\%\\s| ).*(?= \\|)",
        " percent, seconds"
      ))

  adr_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_meddraid", "/"), "adr")

  demo_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_meddraid", "/"), "demo")

  expect_equal(
    sort(unique(demo_sub$UMCReportId)),
    sort(unique(adr_sub$UMCReportId))
  )

  colitis_test <-
    adr_sub %>%
    dplyr::group_by(UMCReportId) %>%
    dplyr::summarise(
      has_colitis = max(MedDRA_Id %in% sv_selection_mid$colitis)
    )

  expect_equal(
    all(colitis_test$has_colitis == 1),
    TRUE
  )

  unlink(wd_in, recursive = TRUE)
  # attempt to clean note on debian
  if(dir.exists(wd_in) & Sys.info()[["sysname"]] != "Windows")
    file.remove(wd_in)

})

test_that("you can keep suspdup", {

  wd_in <- tempdir()

  wd_in <- paste0(wd_in, "/", "sub") # avoid windows #1224

  dir.create(path = wd_in)

  mini_data <- rlang::list2(
    demo =
      data.table(
        UMCReportId = c(1, 2, 3, 4),
        AgeGroup = c(1, 7, 7, 8)
      ),
    drug =
      data.table(
        UMCReportId = c(1, 2, 3, 4),
        Drug_Id = c("d1", "d2", "d3", "d4"),
        DrecNo = c(133138448, 133138448, 111841511, 111841511),
        MedicinalProd_Id = c(25027716, 97354576, 104264760, 37484408)
      ),
    adr  =
      data.table(
        UMCReportId = c(1, 2, 3, 4),
        Adr_Id = c("a1", "a2", "a3", "a4"),
        MedDRA_Id = c(110049083, 31672047, 146319904, 72535511)
      ),
    link =
      data.table(
        Drug_Id = c("d1", "d2", "d3", "d4"),
        Adr_Id = c("a1", "a2", "a3", "a4")
      ),
    srce =
      data.table(
        UMCReportId = c(1, 2, 3, 4)
      ),
    ind  =
      data.table(
        Drug_Id = c("d1", "d2", "d3", "d4")
      ),
    out  =
      data.table(
        UMCReportId = c(1, 2, 3, 4)
      ),
    followup =
      data.table(
        UMCReportId = c(1, 2, 3, 4)
      ),
    suspdup =
      data.table(
        UMCReportId = c(3),
        SuspectedduplicateReportId = c(4)
      )
  )

  purrr::iwalk(
    mini_data,
    function(dataset, name)
      arrow::write_parquet(
        dataset |>
          arrow::as_arrow_table(),
        sink = paste0(wd_in, "/", name, ".parquet")
      )

  )

  # ---- age

  expect_snapshot(tb_subset(
    wd_in = paste0(wd_in, "/"),
    wd_out = paste0(wd_in, "/", "subset_age_suspdup", "/"),
    subset_var = "age",
    sv_selection = c(7, 8),
    rm_suspdup = FALSE
  ),
  transform =
    function(chr_line)
      stringr::str_replace(
        chr_line,
        "(?>=\\d{1,3}\\%\\s| ).*(?= \\|)",
        " percent, seconds"
      ))

  expect_snapshot(tb_subset(
    wd_in = paste0(wd_in, "/"),
    wd_out = paste0(wd_in, "/", "subset_age", "/"),
    subset_var = "age",
    sv_selection = c(7, 8),
    rm_suspdup = TRUE
  ),
  transform =
    function(chr_line)
      stringr::str_replace(
        chr_line,
        "(?>=\\d{1,3}\\%\\s| ).*(?= \\|)",
        " percent, seconds"
      ))

  drug_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_age_suspdup", "/"), "drug")

  demo_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_age_suspdup", "/"), "demo")

  demo_nosuspdup <-
    dt_parquet(paste0(wd_in, "/", "subset_age", "/"), "demo")

  expect_equal(
    sort(unique(demo_sub$UMCReportId)),
    sort(unique(drug_sub$UMCReportId))
  )

  age_test <-
    demo_sub %>%
    dplyr::mutate(
      good_age = AgeGroup %in% c(7, 8)
    )

  expect_equal(
    all(age_test$good_age == 1),
    TRUE
  )

  expect_equal(
    nrow(demo_sub),
    nrow(demo_nosuspdup) + 1
  )

  unlink(wd_in, recursive = TRUE)
  if(dir.exists(wd_in) & Sys.info()[["sysname"]] != "Windows")
    file.remove(wd_in)


})


test_that("alternative syntaxes work", {

  wd_in <- tempdir()

  wd_in <- paste0(wd_in, "/", "sub2") # avoid windows #1224

  dir.create(path = wd_in)

  mini_data <- rlang::list2(
    demo =
      data.table(
        UMCReportId = c(1, 2, 3, 4),
        AgeGroup = c(1, 7, 7, 8)
      ),
    drug =
      data.table(
        UMCReportId = c(1, 2, 3, 4),
        Drug_Id = c("d1", "d2", "d3", "d4"),
        DrecNo = c(133138448, 133138448, 111841511, 111841511),
        MedicinalProd_Id = c(25027716, 97354576, 104264760, 37484408)
      ),
    adr  =
      data.table(
        UMCReportId = c(1, 2, 3, 4),
        Adr_Id = c("a1", "a2", "a3", "a4"),
        MedDRA_Id = c(110049083, 31672047, 146319904, 72535511)
      ),
    link =
      data.table(
        Drug_Id = c("d1", "d2", "d3", "d4"),
        Adr_Id = c("a1", "a2", "a3", "a4")
      ),
    srce =
      data.table(
        UMCReportId = c(1, 2, 3, 4)
      ),
    ind  =
      data.table(
        Drug_Id = c("d1", "d2", "d3", "d4")
      ),
    out  =
      data.table(
        UMCReportId = c(1, 2, 3, 4)
      ),
    followup =
      data.table(
        UMCReportId = c(1, 2, 3, 4)
      ),
    suspdup =
      data.table(
        UMCReportId = c(3),
        SuspectedduplicateReportId = c(4)
      )
  )

  purrr::iwalk(
    mini_data,
    function(dataset, name)
      arrow::write_parquet(
        dataset |>
          arrow::as_arrow_table(),
        sink = paste0(wd_in, "/", name, ".parquet")
      )

  )

  # no end slashes at wd_in and wd_out
  expect_snapshot(
    tb_subset(
    wd_in = wd_in,
    wd_out = paste0(wd_in, "/", "subset_age"),
    subset_var = "age",
    sv_selection = c(7, 8)
  ),
  transform =
    function(chr_line)
      stringr::str_replace(
        chr_line,
        "(?>=\\d{1,3}\\%\\s| ).*(?= \\|)",
        " percent, seconds"
      ))

  drug_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_age", "/"), "drug")

  demo_sub <-
    dt_parquet(paste0(wd_in, "/", "subset_age", "/"), "demo")

  expect_equal(
    sort(unique(demo_sub$UMCReportId)),
    sort(unique(drug_sub$UMCReportId))
  )

  age_test <-
    demo_sub %>%
    dplyr::mutate(
      good_age = AgeGroup %in% c(7, 8)
    )

  expect_equal(
    all(age_test$good_age == 1),
    TRUE
  )


  unlink(wd_in, recursive = TRUE)
  if(dir.exists(wd_in) & Sys.info()[["sysname"]] != "Windows")
    file.remove(here::here(wd_in))
})

test_that("error if no wd_in", {
  cnd <- rlang::catch_cnd(
    error_dir_exists("wd_in", "/wrong_path/")
  )

  expect_s3_class(cnd, "no_dir")
  expect_equal(cnd$dir, "wd_in")
  expect_equal(cnd$wrong_dir, "/wrong_path/")

  path_wrong <- "/wrong_path/"
  expect_error(
    check_dir_exists(path_wrong),
    class = "no_dir",
    regexp = path_wrong
  )

  # silent if dir exists

  expect_silent(
    check_dir_exists(tempdir())
  )

  expect_snapshot(error = TRUE,
    tb_subset(
      wd_in = "that_dir_doesnt_exist",
      sv_selection = list(a = c(1, 2))
    ),
    cnd_class = TRUE
  )
})

cli::test_that_cli("printing error no dir", {
  expect_snapshot(error = TRUE,
                  error_dir_exists("wd_in", "/wrong_path/")
  )
})

unlink(tempdir(), recursive = TRUE)

Try the vigicaen package in your browser

Any scripts or data that you put into this service are public.

vigicaen documentation built on April 3, 2025, 8:55 p.m.