tests/testthat/test-pull_data_synapse.R

test_that("Missing cohort parameter", {
  # a call with no arguments at all (so no cohort) is expected to error
  expect_error(
    pull_data_synapse()
  )
})

# Pull every data release once, at file level, so the tests below can reuse
# the result instead of re-running pull_data_synapse() in each test.
# Objects created here and read by later tests: `data_releases`, `test_list`
# and `actual_length`. Nothing is pulled when there is no GENIE connection;
# the tests that need these objects skip themselves in that case.
if (.is_connected_to_genie()) {
  # one row per data release; cohort and version are the arguments handed to
  # pull_data_synapse() through pmap()
  data_releases <- synapse_tables %>%
    distinct(cohort, version) %>%
    # expected number of data frames per release, which depends on whether
    # TM (`tumor_marker`) and RT (`ca_radtx`) data were released.
    # case_when() has no fallback here, so a cohort not listed below gets NA.
    mutate(expected_n_dfs = case_when(
      # no TM or RT
      cohort == "NSCLC" ~ 11,
      # TM, no RT
      cohort %in% c("CRC", "BrCa") ~ 12,
      # RT, no TM
      cohort == "BLADDER" ~ 12,
      # TM and RT
      cohort %in% c("PANC", "Prostate") ~ 13
    ))

  # for each data release, pull data into the R environment
  test_list <- pmap(
    data_releases %>%
      select(cohort, version),
    pull_data_synapse
  )

  # name the items in the list "<cohort>_<version>"
  names(test_list) <- paste0(
    data_releases$cohort, "_",
    data_releases$version
  )

  # actual number of data frames returned for each data release, reshaped to
  # one row per release (columns `data_release` and `length`)
  actual_length <- map_depth(test_list, .depth = 2, length) %>%
    bind_rows() %>%
    pivot_longer(
      cols = everything(),
      names_to = "data_release",
      values_to = "length",
      values_drop_na = TRUE
    )

  # `actual_length` is a data frame, so length() would count its columns and
  # could never fail; count rows to confirm at least one release was measured
  testthat::expect_true(nrow(actual_length) > 0)
}

test_that("Test class and length of list for public data", {
  skip_if_not(.is_connected_to_genie())

  # number of data frames returned per release vs. the expected count
  expect_equal(
    data_releases$expected_n_dfs,
    actual_length$length
  )

  # every data release should come back as a list, so the expected classes
  # are "list" repeated once per data release
  n_releases <- nrow(data_releases)
  returned_classes <- unname(map_chr(test_list, class))
  expect_equal(returned_classes, rep("list", n_releases))
})

test_that("test `cohort` argument specification", {
  # cohort deliberately misspecified: lower case instead of capital letters
  # NOTE(review): the pattern "*" does not pin down any particular message;
  # confirm whether a specific error text was intended here
  expect_error(
    pull_data_synapse(cohort = "nsclc", version = "v2.2-consortium"),
    "*"
  )
})

test_that("test `version` argument specification", {
  skip_if_not(.is_connected_to_genie())

  # version left unspecified (NULL)
  expect_error(
    pull_data_synapse(cohort = "NSCLC", version = NULL),
    "Version needs to be specified.*"
  )

  # version string that is not formatted correctly
  expect_error(
    pull_data_synapse(cohort = "NSCLC", version = "1.1"),
    "*"
  )

  # two versions supplied for a single cohort
  expect_error(
    pull_data_synapse(
      cohort = "NSCLC",
      version = c("v2.2-consortium", "v2.0-public")
    ),
    "*You have selected"
  )

  # version and cohort that do not go together
  expect_error(
    pull_data_synapse(cohort = "BrCa", version = c("v2.1-consortium")),
    "You have selected a version that is not available for this cohort*"
  )
})

test_that("correct release returned", {
  # skip when the user has no synapse log in or no access to the data
  testthat::skip_if_not(.is_connected_to_genie())

  # NSCLC v1.1-consortium is left out because not all data releases had a
  # release_version variable
  releases_with_version <- within(
    test_list,
    rm(`NSCLC_v1.1-consortium`)
  )

  # drop the genomic data frames, which are not expected to carry a
  # release_version variable
  clinical_only <- map_depth(
    releases_with_version,
    .depth = 2,
    ~ within(.x, rm(cna, fusions, mutations_extended))
  )

  # from every remaining data frame keep only release_version (if present)
  version_columns <- map_depth(
    clinical_only,
    .depth = 3,
    select,
    any_of("release_version")
  )

  # reduce to the distinct (data frame, release_version) pairs of each data
  # release, then keep the rows whose release_version does not appear in the
  # name of the data release it was pulled for
  mismatched_releases <- version_columns %>%
    map_depth(.depth = 2, enframe) %>%
    map_depth(.depth = 2, unnest, cols = value) %>%
    map_depth(.depth = 2, distinct) %>%
    map(1) %>%
    # release_version is sometimes character and sometimes numeric, so make
    # it character; "pharma" is rewritten as "consortium" before comparing
    map(
      ~ mutate(
        .x,
        release_version_character = str_replace(
          string = as.character(release_version),
          pattern = "pharma",
          replacement = "consortium"
        )
      )
    ) %>%
    map(~ select(.x, -release_version)) %>%
    bind_rows(.id = "data_release") %>%
    mutate(
      release_matches = str_detect(
        string = data_release,
        pattern = str_trim(release_version_character)
      )
    ) %>%
    filter(release_matches == FALSE)

  # no mismatching rows should remain
  expect_equal(nrow(mismatched_releases), 0)
})

# Compares the number of rows and columns of every data frame in every data
# release held in `test_list` against a hard-coded table of expected values.
# Relies on the file-level objects `test_list` and `data_releases`.
test_that("Number of columns and rows for each data release", {
  skip_if_not(.is_connected_to_genie())

  # get number of columns for each dataframe returned
  # (length() of a data frame is its column count); the result is one
  # long table per data release with columns `df` and `ncol`
  # NOTE(review): `.id` is not a formal argument of map(); confirm that
  # passing it alongside the position extractor `1` has no effect
  col_lengths <- map_depth(test_list, .depth = 3, length) %>%
    map_depth(., .depth = 2, bind_rows) %>%
    map(., 1, .id = "test") %>%
    map(., pivot_longer, cols = everything(), names_to = "df", values_to = "ncol") # %>%
  # map_df(., bind_rows, .id = "data_release")

  # get nrow for each dataframe returned
  # same reshaping as above, giving columns `df` and `nrow` per data release
  row_lengths <- map_depth(test_list, .depth = 3, nrow) %>%
    map_depth(., .depth = 2, bind_rows) %>%
    map(., 1, .id = "test") %>%
    map(., pivot_longer, cols = everything(), names_to = "df", values_to = "nrow")
  # map_df(., bind_rows, .id = "data_release")

  # hard coded table of expected number of rows and columns
  # requires update for each data release
  # within a release, rows must be listed in the same order as the data
  # frames are returned, because values are compared by position below
  expected_length <- tibble::tribble(
    ~data_release, ~df, ~expected_nrow, ~expected_ncol,
    "NSCLC_v1.1-consortium", "pt_char", 1849, 33,
    "NSCLC_v1.1-consortium", "ca_dx_index", 1874, 110,
    "NSCLC_v1.1-consortium", "ca_dx_non_index", 810, 83,
    "NSCLC_v1.1-consortium", "ca_drugs", 4032, 114,
    "NSCLC_v1.1-consortium", "prissmm_imaging", 35113, 42,
    "NSCLC_v1.1-consortium", "prissmm_pathology", 8329, 195,
    "NSCLC_v1.1-consortium", "prissmm_md", 24950, 11,
    "NSCLC_v1.1-consortium", "cpt", 2026, 19,
    "NSCLC_v1.1-consortium", "mutations_extended", 17574, 54,
    "NSCLC_v1.1-consortium", "fusions", 821, 9,
    "NSCLC_v1.1-consortium", "cna", 930, 1782,
    "NSCLC_v2.2-consortium", "pt_char", 1832, 35,
    "NSCLC_v2.2-consortium", "ca_dx_index", 1858, 152,
    "NSCLC_v2.2-consortium", "ca_dx_non_index", 791, 97,
    "NSCLC_v2.2-consortium", "ca_drugs", 4012, 101,
    "NSCLC_v2.2-consortium", "prissmm_imaging", 34926, 43,
    "NSCLC_v2.2-consortium", "prissmm_pathology", 8277, 196,
    "NSCLC_v2.2-consortium", "prissmm_md", 24804, 12,
    "NSCLC_v2.2-consortium", "cpt", 2002, 26,
    "NSCLC_v2.2-consortium", "mutations_extended", 17430, 64,
    "NSCLC_v2.2-consortium", "fusions", 815, 9,
    "NSCLC_v2.2-consortium", "cna", 965, 1764,
    "CRC_v1.3-consortium", "pt_char", 1476, 40,
    "CRC_v1.3-consortium", "ca_dx_index", 1485, 152,
    "CRC_v1.3-consortium", "ca_dx_non_index", 328, 97,
    "CRC_v1.3-consortium", "ca_drugs", 5401, 102,
    "CRC_v1.3-consortium", "prissmm_imaging", 26091, 43,
    "CRC_v1.3-consortium", "prissmm_pathology", 7112, 341,
    "CRC_v1.3-consortium", "prissmm_md", 27954, 12,
    "CRC_v1.3-consortium", "tumor_marker", 24219, 14,
    "CRC_v1.3-consortium", "cpt", 1551, 26,
    "CRC_v1.3-consortium", "mutations_extended", 22903, 64,
    "CRC_v1.3-consortium", "fusions", 395, 9,
    "CRC_v1.3-consortium", "cna", 965, 1479,
    "BrCa_v1.1-consortium", "pt_char", 1130, 40,
    "BrCa_v1.1-consortium", "ca_dx_index", 1141, 159,
    "BrCa_v1.1-consortium", "ca_dx_non_index", 194, 103,
    "BrCa_v1.1-consortium", "ca_drugs", 6906, 102,
    "BrCa_v1.1-consortium", "prissmm_imaging", 26763, 43,
    "BrCa_v1.1-consortium", "prissmm_pathology", 7223, 371,
    "BrCa_v1.1-consortium", "prissmm_md", 28293, 12,
    "BrCa_v1.1-consortium", "tumor_marker", 9744, 14,
    "BrCa_v1.1-consortium", "cpt", 1234, 27,
    "BrCa_v1.1-consortium", "mutations_extended", 6633, 54,
    "BrCa_v1.1-consortium", "fusions", 610, 9,
    "BrCa_v1.1-consortium", "cna", 930, 1222,
    "PANC_v1.1-consortium", "pt_char", 1109, 52,
    "PANC_v1.1-consortium", "ca_dx_index", 1110, 141,
    "PANC_v1.1-consortium", "ca_dx_non_index", 279, 108,
    "PANC_v1.1-consortium", "ca_drugs", 3153, 102,
    "PANC_v1.1-consortium", "prissmm_imaging", 14521, 42,
    "PANC_v1.1-consortium", "prissmm_pathology", 3532, 289,
    "PANC_v1.1-consortium", "ca_radtx", 527, 83,
    "PANC_v1.1-consortium", "prissmm_md", 15870, 13,
    "PANC_v1.1-consortium", "tumor_marker", 17720, 15,
    "PANC_v1.1-consortium", "cpt", 1130, 29,
    "PANC_v1.1-consortium", "mutations_extended", 6572, 64,
    "PANC_v1.1-consortium", "fusions", 330, 9,
    "PANC_v1.1-consortium", "cna", 965, 1059,
    "Prostate_v1.1-consortium", "pt_char", 1116, 53,
    "Prostate_v1.1-consortium", "ca_dx_index", 1116, 145,
    "Prostate_v1.1-consortium", "ca_dx_non_index", 180, 107,
    "Prostate_v1.1-consortium", "ca_drugs", 5588, 102,
    "Prostate_v1.1-consortium", "prissmm_imaging", 22496, 42,
    "Prostate_v1.1-consortium", "prissmm_pathology", 5121, 234,
    "Prostate_v1.1-consortium", "ca_radtx", 1953, 81,
    "Prostate_v1.1-consortium", "prissmm_md", 21733, 15,
    "Prostate_v1.1-consortium", "tumor_marker", 41411, 15,
    "Prostate_v1.1-consortium", "cpt", 1227, 29,
    "Prostate_v1.1-consortium", "mutations_extended", 6194, 64,
    "Prostate_v1.1-consortium", "fusions", 1148, 9,
    "Prostate_v1.1-consortium", "cna", 965, 1168,
    "NSCLC_v2.0-public", "pt_char", 1846, 36,
    "NSCLC_v2.0-public", "ca_dx_index", 1869, 141,
    "NSCLC_v2.0-public", "ca_dx_non_index", 797, 86,
    "NSCLC_v2.0-public", "ca_drugs", 4032, 102,
    "NSCLC_v2.0-public", "prissmm_imaging", 35101, 42,
    "NSCLC_v2.0-public", "prissmm_pathology", 8342, 196,
    "NSCLC_v2.0-public", "prissmm_md", 24909, 12,
    "NSCLC_v2.0-public", "cpt", 2015, 28,
    "NSCLC_v2.0-public", "mutations_extended", 17472, 64,
    "NSCLC_v2.0-public", "fusions", 819, 9,
    "NSCLC_v2.0-public", "cna", 964, 1779,
    "BrCa_v1.2-consortium", "pt_char", 1129, 40,
    "BrCa_v1.2-consortium", "ca_dx_index", 1140, 159,
    "BrCa_v1.2-consortium", "ca_dx_non_index", 194, 94,
    "BrCa_v1.2-consortium", "ca_drugs", 6896, 110,
    "BrCa_v1.2-consortium", "prissmm_imaging", 26747, 43,
    "BrCa_v1.2-consortium", "prissmm_pathology", 7210, 356,
    "BrCa_v1.2-consortium", "prissmm_md", 28264, 12,
    "BrCa_v1.2-consortium", "tumor_marker", 9744, 14,
    "BrCa_v1.2-consortium", "cpt", 1233, 27,
    "BrCa_v1.2-consortium", "mutations_extended", 6646, 64,
    "BrCa_v1.2-consortium", "fusions", 611, 9,
    "BrCa_v1.2-consortium", "cna", 965, 1222,
    "CRC_v2.0-public", "pt_char", 1485, 51,
    "CRC_v2.0-public", "ca_dx_index", 1494, 142,
    "CRC_v2.0-public", "ca_dx_non_index", 336, 106,
    "CRC_v2.0-public", "ca_drugs", 5417, 103,
    "CRC_v2.0-public", "prissmm_imaging", 26260, 42,
    "CRC_v2.0-public", "prissmm_pathology", 7156, 320,
    "CRC_v2.0-public", "prissmm_md", 28164, 12,
    "CRC_v2.0-public", "tumor_marker", 24462, 14,
    "CRC_v2.0-public", "cpt", 1559, 29,
    "CRC_v2.0-public", "mutations_extended", 23225, 64,
    "CRC_v2.0-public", "fusions", 403, 9,
    "CRC_v2.0-public", "cna", 965, 1488,
    "BLADDER_v1.1-consortium", "pt_char", 716, 39,
    "BLADDER_v1.1-consortium", "ca_dx_index", 716, 143,
    "BLADDER_v1.1-consortium", "ca_dx_non_index", 523, 111,
    "BLADDER_v1.1-consortium", "ca_drugs", 2269, 103,
    "BLADDER_v1.1-consortium", "prissmm_imaging", 13563, 42,
    "BLADDER_v1.1-consortium", "prissmm_pathology", 7944, 389,
    "BLADDER_v1.1-consortium", "ca_radtx", 533, 81,
    "BLADDER_v1.1-consortium", "prissmm_md", 10367, 15,
    "BLADDER_v1.1-consortium", "cpt", 748, 29,
    "BLADDER_v1.1-consortium", "mutations_extended", 11000, 64,
    "BLADDER_v1.1-consortium", "fusions", 242, 9,
    "BLADDER_v1.1-consortium", "cna", 965, 698,
    "BLADDER_v1.2-consortium", "pt_char", 716, 39,
    "BLADDER_v1.2-consortium", "ca_dx_index", 716, 143,
    "BLADDER_v1.2-consortium", "ca_dx_non_index", 523, 111,
    "BLADDER_v1.2-consortium", "ca_drugs", 2269, 103,
    "BLADDER_v1.2-consortium", "prissmm_imaging", 13563, 42,
    "BLADDER_v1.2-consortium", "prissmm_pathology", 7944, 374,
    "BLADDER_v1.2-consortium", "ca_radtx", 533, 81,
    "BLADDER_v1.2-consortium", "prissmm_md", 10367, 15,
    "BLADDER_v1.2-consortium", "cpt", 748, 29,
    "BLADDER_v1.2-consortium", "mutations_extended", 12994, 64,
    "BLADDER_v1.2-consortium", "fusions", 242, 9,
    "BLADDER_v1.2-consortium", "cna", 999, 674,
    "PANC_v1.2-consortium", "pt_char", 1109, 52,
    "PANC_v1.2-consortium", "ca_dx_index", 1110, 141,
    "PANC_v1.2-consortium", "ca_dx_non_index", 279, 108,
    "PANC_v1.2-consortium", "ca_drugs", 3153, 102,
    "PANC_v1.2-consortium", "prissmm_imaging", 14521, 42,
    "PANC_v1.2-consortium", "prissmm_pathology", 3532, 289,
    "PANC_v1.2-consortium", "ca_radtx", 527, 82,
    "PANC_v1.2-consortium", "prissmm_md", 15870, 13,
    "PANC_v1.2-consortium", "tumor_marker", 17720, 15,
    "PANC_v1.2-consortium", "cpt", 1130, 29,
    "PANC_v1.2-consortium", "mutations_extended", 6567, 64,
    "PANC_v1.2-consortium", "fusions", 330, 9,
    "PANC_v1.2-consortium", "cna", 965, 1058,
    "Prostate_v1.2-consortium", "pt_char", 1116, 53,
    "Prostate_v1.2-consortium", "ca_dx_index", 1116, 145,
    "Prostate_v1.2-consortium", "ca_dx_non_index", 180, 107,
    "Prostate_v1.2-consortium", "ca_drugs", 5588, 102,
    "Prostate_v1.2-consortium", "prissmm_imaging", 22496, 42,
    "Prostate_v1.2-consortium", "prissmm_pathology", 5121, 234,
    "Prostate_v1.2-consortium", "ca_radtx", 1953, 80,
    "Prostate_v1.2-consortium", "prissmm_md", 21733, 15,
    "Prostate_v1.2-consortium", "tumor_marker", 41411, 15,
    "Prostate_v1.2-consortium", "cpt", 1227, 29,
    "Prostate_v1.2-consortium", "mutations_extended", 6193, 64,
    "Prostate_v1.2-consortium", "fusions", 1148, 9,
    "Prostate_v1.2-consortium", "cna", 965, 1167
  ) %>%
    # factor levels follow the order of `data_releases`, so split() returns
    # one table per release in the same order in which `test_list` was built
    mutate(data_release_factor = factor(data_release,
                                        levels = paste0(data_releases$cohort,
                                                        "_",
                                                        data_releases$version))) %>%
    # separate(data_release, into = c("cohort", "data_release"), sep = "_") %>%
    split(~data_release_factor)

  # expect the correct number of columns
  # (releases are paired by position, not by name)
  map2(
    map(col_lengths, pull, "ncol"),
    map(expected_length, pull, expected_ncol),
    expect_equal
  )

  # expect the correct number of rows
  # (releases are paired by position, not by name)
  map2(
    map(row_lengths, pull, "nrow"),
    map(expected_length, pull, expected_nrow),
    expect_equal
  )
})

test_that("Test NA conversion", {
  skip_if_not(.is_connected_to_genie())

  # blank character values ("") should have been turned into NA, so count the
  # "" cells that remain in every data frame of every data release
  blank_counts <- test_list %>%
    # per data frame: count "" in each column, then total across columns
    map_depth(.depth = 3, ~ sum(colSums(.x == "", na.rm = TRUE))) %>%
    # per release: gather the data frame totals into a single row
    map_depth(.depth = 2, bind_rows) %>%
    map(1) %>%
    bind_rows(.id = "release") %>%
    # reshape to 1 row/release/df
    pivot_longer(
      cols = -release,
      names_to = "df",
      values_to = "n_blanks"
    )

  # keep only the data frames that still contain blanks; none are expected
  any_blank_cols <- filter(blank_counts, n_blanks != 0)

  expect_equal(nrow(any_blank_cols), 0)
})
AxelitoMartin/GenieBPC documentation built on April 20, 2024, 6:38 a.m.