tests/testthat/test-create_analytic_cohort.R

# Tests - No GENIE Access Required ---------------------------------------------
test_that("No specifications- runs with no error", {
  expect_error(create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data
  ), NA)
})

test_that("pull data synapse object is missing", {
  expect_error(create_analytic_cohort())
})

test_that("Institution- argument check", {
  expect_error(msk <- create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    institution = "MSK"
  ), NA)

  expect_equal("MSK", unique(msk[[1]]$institution))

  expect_error(dfci <- create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    institution = "DFCI"
  ), NA)

  expect_equal("DFCI", unique(dfci[[1]]$institution))

  expect_error(vicc <- create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    institution = "VICC"
  ), NA)

  expect_equal("VICC", unique(vicc[[1]]$institution))

  expect_error(uhn <- create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    institution = "UHN"
  ), NA)

  expect_equal("UHN", unique(uhn[[1]]$institution))

  expect_error(create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    institution = "non-existant"
  ), "Select from*")
})


test_that("stage_dx- argument check", {

  expect_error(create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    stage_dx = "Stage I"
  ), NA)

  expect_error(create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    stage_dx = "Stage II"
  ), NA)

  expect_error(create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    stage_dx = "Stage III"
  ), NA)

  expect_error(create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    stage_dx = "Stage IV"
  ), NA)

  expect_error(create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    stage_dx = "staGe IV"
  ), NA)

  expect_error(create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    stage_dx = c("Stage I", "Stage IV")
  ), NA)

  expect_error(create_analytic_cohort(
    data_synapse = genieBPC::nsclc_test_data,
    stage_dx = "none"
  ), "*")
})




# Tests - Requiring GENIE Access -----------------------------------------------

# pull data for each cohort
# return to avoid having to re-run pull_data_synapse for
# each test
# pull data for each cohort
# return to avoid having to re-run pull_data_synapse for
# each test
testthat::expect_true(if (.is_connected_to_genie()) {
  # data frame of each release to use for pmap
  data_releases <- synapse_tables %>%
    distinct(cohort, version) %>%
    # define expected number of dataframes based on whether TM and RT data were released
    mutate(expected_n_dfs = case_when(
      # no TM or RT
      cohort == "NSCLC" ~ 11,
      # TM, no RT
      cohort %in% c("CRC", "BrCa") ~ 12,
      # RT, no TM
      cohort == "BLADDER" ~ 12,
      # TM and RT
      cohort %in% c("PANC", "Prostate") ~ 13
    ),
    expected_n_dfs_with_summary = expected_n_dfs + 4)

  # for each data release, pull data into the R environment
  data_releases_pull_data <- pmap(
    data_releases %>%
      select(cohort, version),
    pull_data_synapse
  )

  # name the items in the list
  names(data_releases_pull_data) <- paste0(
    data_releases$cohort, "_",
    data_releases$version
  )

  length(data_releases_pull_data) > 0
} else {0 == 0})

testthat::expect_true(if (.is_connected_to_genie()) {
  # for each data release, run create analytic cohort
  # get first object from each item in the list
  # then run create analytic cohort
  data_releases_create_cohort <- map(data_releases_pull_data, 1) %>%
    map(., create_analytic_cohort)

  # name the items in the list
  names(data_releases_create_cohort) <- paste0(
    data_releases$cohort, "_",
    data_releases$version
  )

  # create analytic cohort with return summary = TRUE
  data_releases_create_cohort_with_summary <- map(data_releases_pull_data, 1) %>%
    map(., create_analytic_cohort, return_summary = TRUE)

  # apply same names to list with summaries
  names(data_releases_create_cohort_with_summary) <- paste0(
    data_releases$cohort, "_",
    data_releases$version
  )

  length(data_releases_create_cohort_with_summary) > 0
} else {0 == 0})

# will update once we merge in PR to allow multiple cohorts in create_analytic_cohort
test_that("multiple cohorts- argument check", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data[1:2]
  ))
})

test_that("non-existent data_synapse", {
  # a non-existent data_synapse is specified
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$TEST_NONEXIST
  ))
})

test_that("correct number of objects returned from create cohort", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # check that number of items returned is correct
  # data releases with RT and TM
  actual_length <- map_df(data_releases_create_cohort, length) %>%
    pivot_longer(
      cols = everything(),
      names_to = "data_release",
      values_to = "length"
    )

  # compare to expected length
  expect_equal(data_releases$expected_n_dfs, actual_length$length)


  # check thta number of items is also correct when a summary is returned
  actual_length_with_summary <- map_df(data_releases_create_cohort_with_summary, length) %>%
    pivot_longer(
      cols = everything(),
      names_to = "data_release",
      values_to = "length"
    )

  # compare to expected length when return_summary = TRUE
  # +4 for the additional tables returned when return_summary = TRUE
  expect_equal(data_releases$expected_n_dfs_with_summary, actual_length_with_summary$length)

  # check that class is correct
  map2(
    map(data_releases_create_cohort, class),
    rep("list", nrow(data_releases)),
    expect_equal
  )
})


test_that("correct cohort returned from create cohort", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # for each data frame returned with a cohort, get the cohort variable
  # remove genomic data frames since we don't expect them to have a cohort variable
  data_releases_create_cohort_no_genomic <- map(
    data_releases_create_cohort,
    ~ within(
      .x,
      rm(
        cohort_cna,
        cohort_fusions,
        cohort_mutations_extended
      )
    )
  )

  # for each dataframe returned for a data release, get the cohort variable
  cohort_returned <- map_depth(data_releases_create_cohort_no_genomic, select, "cohort",
    .depth = 2
  ) %>%
    map(., bind_rows, .id = "df") %>%
    map(., distinct) %>%
    bind_rows(., .id = "data_release") %>%
    separate(data_release,
      into = c("cohort_expected", "data_release"),
      sep = "_"
    )

  expect_equal(cohort_returned$cohort_expected, cohort_returned$cohort)
})

test_that("check first index cancer default", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # no diagnosis criteria specified
  # expect that the first index cancer is returned without any other
  # incl criteria

  # for each index cancer dataset, pick the first index cancer
  data_releases_create_cohort_ca_dx_index <- map_depth(data_releases_pull_data,
    .depth = 2,
    pluck,
    "ca_dx_index"
  ) %>%
    map_depth(., .depth = 2, group_by, record_id) %>%
    map_depth(., .depth = 2, slice_min, ca_seq) %>%
    map_depth(., .depth = 2, ungroup) %>%
    map(., 1)

  # expect the default from create cohort to match the first index cancer
  # check that the first index cancer diagnosis is returned by create analytic cohort
  map2(
    data_releases_create_cohort_ca_dx_index,
    map(data_releases_create_cohort, "cohort_ca_dx"),
    expect_equal
  )
})

test_that("index_ca_seq", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # not really cohort specific, all cohorts will have index_ca_seq
  # for now, only test on lung
  # first and second index cancer is specified
  # if patient only has 1 index cancer, it should be returned
  # if patient has 2+ index cancers, the first two should be returned
  test_1a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    index_ca_seq = c(1, 2),
    return_summary = TRUE
  )

  test_1b <- data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
    group_by(cohort, record_id) %>%
    arrange(cohort, record_id, ca_seq) %>%
    mutate(index_ca_seq = 1:n()) %>%
    ungroup() %>%
    filter(index_ca_seq %in% c(1, 2)) %>%
    select(-index_ca_seq)

  expect_equal(test_1a$cohort_ca_dx, test_1b)


  # an index cancer # that doesn't exist in the data is specified
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    index_ca_seq = 100
  ))

  ## index cancer #s in cohort_ngs match those in cohort_ca_dx
  test2a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`CRC_v2.0-public`$CRC_v2.0,
    index_ca_seq = c(1, 2)
  )

  expect_equal(
    test2a$cohort_ca_dx %>%
      select(record_id, ca_seq) %>%
      arrange(record_id, ca_seq),
    test2a$cohort_ngs %>%
      distinct(record_id, ca_seq) %>%
      arrange(record_id, ca_seq)
  )
})

test_that("institution", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # institution is specified and correct institution is returned
  # institution will be available across data releases,
  # don't need to test on each
  test_1a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    institution = "dfci"
  )

  test_1b <- data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
    group_by(cohort, record_id) %>%
    slice(which.min(ca_seq)) %>%
    ungroup() %>%
    filter(institution == "DFCI")

  expect_equal(test_1a$cohort_ca_dx, test_1b)


  # multiple institutions specified
  test_2a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`BrCa_v1.1-consortium`$BrCa_v1.1,
    institution = c("dfci", "msk")
  )

  test_2b <- data_releases_pull_data$`BrCa_v1.1-consortium`$BrCa_v1.1$ca_dx_index %>%
    group_by(cohort, record_id) %>%
    slice(which.min(ca_seq)) %>%
    ungroup() %>%
    filter(institution %in% c("MSK", "DFCI"))

  expect_equal(test_2a$cohort_ca_dx, test_2b)

  # a non-existent institution is specified
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    institution = "uDFCI"
  ))

  # UHN didn't participate in CRC
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`CRC_v2.0-public`$CRC_v2.0,
    institution = "UHN"
  ))
})

test_that("stage_dx", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # stage dx is specified and correct stage is returned
  # not cohort specific, all cohorts will have stage
  # test only on one cohort for now
  test_1a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    stage_dx = "stage ii"
  )

  test_1b <- data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
    group_by(cohort, record_id) %>%
    slice(which.min(ca_seq)) %>%
    ungroup() %>%
    filter(stage_dx == "Stage II")

  expect_equal(test_1a$cohort_ca_dx, test_1b)

  # multiple stage values are specified
  test_2a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    stage_dx = c("Stage I", "stage ii")
  )

  test_2b <- data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
    group_by(cohort, record_id) %>%
    slice(which.min(ca_seq)) %>%
    ungroup() %>%
    filter(stage_dx %in% c("Stage I", "Stage II"))

  expect_equal(test_2a$cohort_ca_dx, test_2b)

  # non-existent stage is specified
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    stage_dx = "3A"
  ))
})

test_that("histology", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # no histology is specified, all are returned
  test0b <- data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
    group_by(cohort, record_id) %>%
    slice(which.min(ca_seq)) %>%
    ungroup()

  expect_equal(
    data_releases_create_cohort$`NSCLC_v2.0-public`$cohort_ca_dx,
    test0b
  )

  # histology is specified and correct histology is returned
  test_1a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    histology = "adenocarcinoma"
  )

  test_1b <- data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
    group_by(cohort, record_id) %>%
    slice(which.min(ca_seq)) %>%
    ungroup() %>%
    filter(ca_hist_adeno_squamous == "Adenocarcinoma")

  expect_equal(test_1a$cohort_ca_dx, test_1b)

  # repeat for BrCa, which has specific histologies available
  test_1c <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`BrCa_v1.1-consortium`$BrCa_v1.1,
    histology = "invasive ductal carcinoma"
  )

  test_1d <- data_releases_pull_data$`BrCa_v1.1-consortium`$BrCa_v1.1$ca_dx_index %>%
    group_by(cohort, record_id) %>%
    slice(which.min(ca_seq)) %>%
    ungroup() %>%
    filter(ca_hist_brca == "Invasive ductal carcinoma")

  expect_equal(test_1c$cohort_ca_dx, test_1d)

  # multiple histologies are specified and returned
  test_2a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    histology = c("adenocarcinoma", "squamous cell")
  )

  test_2b <- data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
    group_by(cohort, record_id) %>%
    slice(which.min(ca_seq)) %>%
    ungroup() %>%
    filter(ca_hist_adeno_squamous %in% c("Adenocarcinoma", "Squamous cell"))

  expect_equal(test_2a$cohort_ca_dx, test_2b)

  # a non-existent histology is specified
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    histology = "squamous_adeno"
  ))

  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`BrCa_v1.2-consortium`$BrCa_v1.2,
    histology = "squamous_adeno"
  ))
})

test_that("no regimen specified", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # all regimens are returned
  # should match all regimens given for a patients first index cancer
  test_1b <- inner_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      ungroup() %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c("cohort", "record_id", "ca_seq")
  )

  expect_equal(
    data_releases_create_cohort$`NSCLC_v2.0-public`$cohort_ca_drugs,
    test_1b
  )
})

test_that("drug regimen specified, order not specified", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # one drug regimen specified, but order not specified
  test_1a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium")
  )

  # expect all times that drug was received (for the first index ca)
  # to be returned
  test_1b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      ungroup() %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c(
      "cohort", "record_id", "ca_seq"
    ),
    multiple = "all"
  ) %>%
    filter(regimen_drugs == c("Carboplatin, Pemetrexed Disodium"))

  expect_equal(test_1a$cohort_ca_drugs, test_1b)

  # one drug regimen specified with drugs out of ABC order and in mixed case
  # regimen order not specified
  test_2a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Pemetrexed DISODIUM, carboplatin")
  )

  # expect all times that drug was received (for the first index ca)
  # to be returned
  # same as above

  expect_equal(test_2a$cohort_ca_drugs, test_1b)

  # multiple drug regimens specified, but order not specified
  test_3a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium", "Nivolumab")
  )

  # expect all times that drug was received (for the first index ca)
  # to be returned
  test_3b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      ungroup() %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c(
      "cohort", "record_id", "ca_seq"
    ),
    multiple = "all"
  ) %>%
    filter(regimen_drugs %in% c(
      "Carboplatin, Pemetrexed Disodium",
      "Nivolumab"
    ))

  expect_equal(test_3a$cohort_ca_drugs, test_3b)

  # multiple drug regimens specified, regimen_type = containing
  test_4a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin", "Nivolumab"),
    regimen_type = "containING"
  )

  # expect all times that drug was received (for the first index ca)
  # to be returned
  test_4b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      ungroup() %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c(
      "cohort", "record_id", "ca_seq"
    ),
    multiple = "all"
  ) %>%
    filter(grepl("Carboplatin", regimen_drugs) |
      grepl("Nivolumab", regimen_drugs))

  expect_equal(test_4a$cohort_ca_drugs, test_4b)
})

test_that("drug regimen specified, order specified to be within cancer", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # regimen of a certain number but drug name not specified
  # all patients whose first drug after diagnosis was carbo pem
  test_0a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_order = 1,
    regimen_order_type = "within cancer"
  )

  # compare to data
  test_0b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c("cohort", "record_id", "ca_seq"),
    multiple = "all"
  ) %>%
    group_by(record_id) %>%
    slice(which.min(regimen_number)) %>%
    ungroup()

  expect_equal(test_0a$cohort_ca_drugs, test_0b)

  # all patients whose first drug after diagnosis was carbo pem
  test_1a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium"),
    regimen_type = "Exact",
    regimen_order = 1,
    regimen_order_type = "within cancer"
  )

  # compare to data
  test_1b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c("cohort", "record_id", "ca_seq"),
    multiple = "all"
  ) %>%
    group_by(record_id) %>%
    slice(which.min(regimen_number)) %>%
    ungroup() %>%
    filter(regimen_drugs == "Carboplatin, Pemetrexed Disodium")

  expect_equal(test_1a$cohort_ca_drugs, test_1b)

  # second regimen after diagnosis was carbo pem
  test_2a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium"),
    regimen_type = "Exact",
    regimen_order = 2,
    regimen_order_type = "within cancer"
  )

  # compare to data
  test_2b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      ungroup() %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c(
      "cohort", "record_id", "ca_seq"
    ),
    multiple = "all"
  ) %>%
    group_by(record_id) %>%
    mutate(new_reg_number = 1:n()) %>%
    ungroup() %>%
    filter(regimen_drugs == "Carboplatin, Pemetrexed Disodium") %>%
    filter(new_reg_number == 2) %>%
    select(-new_reg_number)

  expect_equal(test_2a$cohort_ca_drugs, test_2b)

  # first AND/OR second regimen after diagnosis was carbo pem
  test_3a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium"),
    regimen_type = "Exact",
    regimen_order = c(1, 2),
    regimen_order_type = "within cancer"
  )

  # compare to data
  test_3b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      ungroup() %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c(
      "cohort", "record_id", "ca_seq"
    ),
    multiple = "all"
  ) %>%
    group_by(record_id) %>%
    mutate(new_reg_number = 1:n()) %>%
    ungroup() %>%
    filter(regimen_drugs == "Carboplatin, Pemetrexed Disodium") %>%
    filter(new_reg_number %in% c(1, 2)) %>%
    select(-new_reg_number)

  expect_equal(test_3a$cohort_ca_drugs, test_3b)

  # first AND/OR second regimen after diagnosis was carbo pem
  # regimen_type = containing rather than default of exact
  test_4a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium"),
    regimen_type = "containing",
    regimen_order = c(1, 2),
    regimen_order_type = "within cancer"
  )

  test_4b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      ungroup() %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c(
      "cohort", "record_id", "ca_seq"
    ),
    multiple = "all"
  ) %>%
    group_by(record_id) %>%
    mutate(new_reg_number = 1:n()) %>%
    ungroup() %>%
    filter(grepl("Carboplatin, Pemetrexed Disodium", regimen_drugs)) %>%
    filter(new_reg_number %in% c(1, 2)) %>%
    select(-new_reg_number)

  expect_equal(test_4a$cohort_ca_drugs, test_4b)
})


test_that("exact drug regimen specified,
          order specified to be within regimen", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # single regimen specified, want first time that regimen
  # was given for all cancers
  test_1a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium"),
    regimen_order = c(1),
    regimen_order_type = "within REGimen"
  )

  test_1b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c(
      "cohort", "record_id", "ca_seq"
    ),
    multiple = "all"
  ) %>%
    group_by(record_id, regimen_drugs) %>%
    mutate(new_reg_number = 1:n()) %>%
    ungroup() %>%
    filter(regimen_drugs == "Carboplatin, Pemetrexed Disodium") %>%
    filter(new_reg_number %in% c(1)) %>%
    select(-new_reg_number)

  expect_equal(test_1a$cohort_ca_drugs, test_1b)

  # multiple regimens specified, want first time each given
  test_2a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium", "Nivolumab"),
    regimen_order = c(1),
    regimen_order_type = "within REGimen"
  )

  test_2b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c(
      "cohort", "record_id", "ca_seq"
    ),
    multiple = "all"
  ) %>%
    group_by(record_id, regimen_drugs) %>%
    mutate(new_reg_number = 1:n()) %>%
    ungroup() %>%
    filter(regimen_drugs %in% c(
      "Carboplatin, Pemetrexed Disodium",
      "Nivolumab"
    )) %>%
    filter(new_reg_number %in% c(1)) %>%
    select(-new_reg_number)

  expect_equal(test_2a$cohort_ca_drugs, test_2b)

  # multiple regimens specified
  # first and/or second time they were received
  # multiple regimens specified, want first time each given
  test_3a <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium", "Nivolumab"),
    regimen_order = c(1, 2),
    regimen_order_type = "within REGimen"
  )

  test_3b <- left_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      select(cohort, record_id, ca_seq),
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    by = c(
      "cohort", "record_id", "ca_seq"
    ),
    multiple = "all"
  ) %>%
    group_by(record_id, regimen_drugs) %>%
    mutate(new_reg_number = 1:n()) %>%
    ungroup() %>%
    filter(regimen_drugs %in% c(
      "Carboplatin, Pemetrexed Disodium",
      "Nivolumab"
    )) %>%
    filter(new_reg_number %in% c(1, 2)) %>%
    select(-new_reg_number)

  expect_equal(test_3a$cohort_ca_drugs, test_3b)
})

test_that("containing drug regimen specified,
          order specified to be within regimen", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # specify regimen type to be containing (default is exact,
  # which is what is implemented in the above)
  test_1c <- create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = c("Carboplatin, Pemetrexed Disodium"),
    regimen_type = "containing",
    regimen_order = c(1),
    regimen_order_type = "within REGimen"
  )

  # order containing
  ordered_containing_regs <- data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs %>%
    filter(grepl("Carboplatin, Pemetrexed Disodium", regimen_drugs)) %>%
    distinct(cohort, record_id, regimen_number, regimen_drugs) %>%
    group_by(cohort, record_id) %>%
    mutate(order_within_containing_regimen = 1:n()) %>%
    ungroup() %>%
    filter(order_within_containing_regimen %in% c(1)) %>%
    select(
      cohort, record_id, regimen_number,
      order_within_containing_regimen
    )

  # merge containing order onto the regimen data
  # only keep regimens of interest
  ca_drugs_with_containing_order <- inner_join(data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_drugs,
    ordered_containing_regs,
    by = c(
      "cohort", "record_id",
      "regimen_number"
    ),
    multiple = "all"
  )

  # merge cohort with patients who received drug regimens of interest
  # in order specified
  test_1d <- inner_join(
    data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0$ca_dx_index %>%
      group_by(record_id) %>%
      slice(which.min(ca_seq)) %>%
      ungroup() %>%
      select(cohort, record_id, ca_seq),
    ca_drugs_with_containing_order,
    by = c(
      "cohort", "record_id", "ca_seq"
    )
  ) %>%
    arrange(cohort, record_id, ca_seq) %>%
    select(
      cohort, record_id, institution,
      regimen_number, ca_seq, everything()
    ) %>%
    as.data.frame()

  expect_equal(
    test_1c$cohort_ca_drugs %>%
      arrange(cohort, record_id, ca_seq) %>%
      select(cohort, record_id, ca_seq, regimen_number,
             everything()),
    test_1d %>%
      arrange(cohort, record_id, ca_seq) %>%
      select(cohort, record_id, ca_seq, regimen_number,
             everything())
  )
})

test_that("regimen_type", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # only testing on a single cancer cohort since not cohort-specific
  # invalid value provided for regimen_type
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_type = "exact_containing"
  ))

  # if regimen_type is specified, regimen_drugs must also be specified
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`CRC_v2.0-public`$CRC_v2.0,
    regimen_type = "exact"
  ))
})

test_that("regimen_order", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # character value provided for regimen_order
  # only testing on a single cancer cohort since not cohort-specific
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`BrCa_v1.1-consortium`$BrCa_v1.1,
    regimen_order = "C"
  ))
})

test_that("regimen_order_type", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  # only testing on a single cancer cohort since not cohort-specific
  # invalid value provided for regimen_order_type
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`BrCa_v1.1-consortium`$BrCa_v1.1,
    regimen_order = 1,
    regimen_order_type =
      "within_btwn_cancer"
  ))

  # regimen_order is specified but regimen_order_type is not
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`BrCa_v1.1-consortium`$BrCa_v1.1,
    regimen_order = 1
  ))

  # regimen_order_type is specified but regimen_order is not
  expect_error(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`BrCa_v1.1-consortium`$BrCa_v1.1,
    regimen_order_type =
      "within cancer"
  ))
})

test_that("No patients met criteria", {
  # exit if user doesn't have a synapse log in or access to data.
  testthat::skip_if_not(.is_connected_to_genie())

  expect_message(create_analytic_cohort(
    data_synapse = data_releases_pull_data$`NSCLC_v2.0-public`$NSCLC_v2.0,
    regimen_drugs = "Carboplatin, Pemetrexed",
    regimen_order = 1000,
    regimen_order_type = "within cancer"
  ))
})
AxelitoMartin/GenieBPC documentation built on April 20, 2024, 6:38 a.m.