# Chunk defaults applied to every code chunk of this vignette.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

Setup

library(tidyverse)
library(rlang)
library(rubix)
library(OncoRegimenFinder)
library(fantasia)
# Connection handle that is passed as `conn` to the pg13 read/query calls in
# this document and handed to fantasia::dcOMOP() at the very end.
conn <- fantasia::connectOMOP()

Summary

The results of 1) the OncoRegimenFinder algorithm are compared to 2) the abstracted Esophagus REDCap cohort treatment data.

1. Esophagus Cohort OncoRegimenFinder

Source Tables:

  1. Esophagus Cohort Table
  2. OncoRegimenFinder Regimen Ingredient Table

The Esophagus Cohort table patelm9.esophagus_cohort:

# Read the patelm9.esophagus_cohort table and preview its first rows.
esophagus_cohort <- pg13::readTable(
  conn = conn,
  schema = "patelm9",
  tableName = "esophagus_cohort"
)
kableExtra::kable(head(esophagus_cohort))

OncoRegimenFinder Regimen Ingredient Table patelm9.oncoregimenfinder_regimen_ingredients:

# Preview of the regimen ingredient table: the query is built with
# n = 20 and n_type = "random", then the head of the result is rendered.
regimen_ingredient <- pg13::query(
  conn = conn,
  pg13::buildQuery(
    schema = "patelm9",
    tableName = "oncoregimenfinder_regimen_ingredients",
    n = 20,
    n_type = "random"
  )
)
kableExtra::kable(head(regimen_ingredient))

SQL

The following SQL is used to join the 2 Source Tables:

# LEFT join of the cohort table to the regimen ingredient table on person_id.
# The cohort's person_id is aliased to esophagus_person_id so it can be told
# apart from the person_id column that comes with the ingredient table's `.*`.
sql <- pg13::buildJoinQuery(
  fields = c(
    "\npatelm9.esophagus_cohort.person_id AS esophagus_person_id\n",
    "patelm9.esophagus_cohort.ptid\n",
    "patelm9.oncoregimenfinder_regimen_ingredients.*\n"
  ),
  schema = "patelm9",
  tableName = "esophagus_cohort",
  column = "person_id",
  joinType = "LEFT",
  joinOnSchema = "patelm9",
  joinOnTableName = "oncoregimenfinder_regimen_ingredients",
  joinOnColumn = "person_id"
)

# Show the generated statement, then run it.
cat(sql)
esophagus_oncoregimenfinder_results <- pg13::query(
  conn = conn,
  sql_statement = sql
)

Output:

# Convert the query result to a tibble, cast every column to character and
# pass it through rubix::normalize_all_to_na(), one step at a time.
esophagus_oncoregimenfinder_results <-
  tibble::as_tibble(esophagus_oncoregimenfinder_results)
esophagus_oncoregimenfinder_results <-
  dplyr::mutate_all(esophagus_oncoregimenfinder_results, as.character)
esophagus_oncoregimenfinder_results <-
  rubix::normalize_all_to_na(esophagus_oncoregimenfinder_results)

kableExtra::kable(head(esophagus_oncoregimenfinder_results))

OncoRegimenFinder Results Summary

Missingness

# Per-variable summary of the joined results, without any column whose name
# contains "DISTINCT_VALUES".
data_summary <- dplyr::select(
  rubix::summarize_variables(esophagus_oncoregimenfinder_results),
  -contains("DISTINCT_VALUES")
)

Analysis of the OncoRegimenFinder results shows that, out of the r data_summary[data_summary$Variable == "esophagus_person_id", "DISTINCT_COUNT"] unique OMOP Person Ids in the Esophagus Cohort overall, only r data_summary[data_summary$Variable == "person_id", "DISTINCT_COUNT"] Person Ids were returned by OncoRegimenFinder.

Full Metrics:

# Render the full summary metrics as a table.
kableExtra::kable(data_summary)

REDCap Data

# Process the REDCap export together with its parsed data dictionary, keyed
# on the ptid identifier; the processed data frame is kept as redcapData.
input <- fantasia::processRedcapData(
  data_csv_file = "~/GitHub/MSK-patelm9/escritoire/Esophagus/data_with_phi/STGIEsophagogastricS_DATA_2020-07-28_1354.csv",
  parsed_metadata_csv_file = "~/GitHub/MSK-patelm9/escritoire/Esophagus/data_without_phi/STGIEsophagogastricSandbox_DataDictionary_2020-06-24_Parsed.csv",
  identifier_variables = "ptid"
)

redcapData <- input$ProcessedData

The REDCap Data is exported as a tabular file and the main patient identifier and all variables associated with oncology treatments are selected.

# Patient identifier column.
identifier_vars <- "ptid"

# Candidate REDCap treatment variables.
treatment_vars <- c(
  "neoadj_regimen",
  "neoadj_tx_additional_drugs",
  "neoadj_tx_addtl_drugs_2",
  "crt_name",
  "neoadjcrt_tx_additional_drugs",
  "neoadjcrt_tx_addtl_drugs_2",
  "adj_regimen",
  "adj_tx_additional_drugs",
  "adj_tx_addtl_drugs_2",
  "adj_rt_concurrent",
  "adjcrt_tx_additional_drugs",
  "adjcrt_tx_addtl_drugs_2",
  "induction_regimen",
  "periop_neo_regimen",
  "periop_adj_regimen",
  "tx_name",
  "advanced_tx_additional_drugs",
  "advanced_tx_addtl_drugs_2",
  "tx_maintenance_drugs",
  "advanced_tx_addtl_drugsm1_1",
  "advanced_tx_addtl_drugsm1_2",
  "tx_maintenance2_drugs",
  "advanced_tx_addtl_drugsm2_1",
  "advanced_tx_addtl_drugsm2_2",
  "palliative_rt_concurrent",
  "neoadj_tx_base_regimen",
  "neoadjcrt_tx_base_regimen",
  "adj_tx_base_regimen",
  "adjcrt_tx_base_regimen",
  "advanced_tx_base_regimen",
  "regimen_name_m1",
  "regimen_name_m2",
  "hipec_drugs"
)

# qa keeps the candidates that are NOT columns of the processed data;
# treatment_vars is then narrowed to the candidates that are.
qa <- treatment_vars[!(treatment_vars %in% colnames(input$ProcessedData))]
treatment_vars <- treatment_vars[treatment_vars %in% colnames(input$ProcessedData)]

For each of the Variables, a copy of the REDCap Data is selected for the Patient Identifier (ptid) and the Variable, as well as any associated Start and Stop Dates and placed in a list object.

# One data frame per treatment variable: rows where that variable is not NA,
# restricted to the identifier, the variable itself and every column whose
# name contains "start" or "stop", minus columns that are entirely NA.
output <-
  rubix::map_names_set(
    treatment_vars,
    function(treatment_var) {
      variable_data <- tibble::as_tibble(redcapData)
      variable_data <- rubix::normalize_all_to_na(variable_data)
      variable_data <- dplyr::filter_at(
        variable_data,
        vars(all_of(treatment_var)),
        any_vars(!is.na(.))
      )
      variable_data <- dplyr::select(
        variable_data,
        all_of(identifier_vars),
        all_of(treatment_var),
        contains("start"),
        contains("stop")
      )
      rubix::deselect_if_all_na(variable_data)
    }
  ) %>%
  # Drop variables that have no populated rows at all.
  purrr::keep(~ nrow(.) > 0) %>%
  # A data frame with only the identifier and the variable (2 columns) has no
  # date column associated with it, so only frames with more than 2 columns
  # are kept.
  purrr::keep(~ ncol(.) > 2)

The output is a list with at most as many elements as treatment_vars, where each position of the list is occupied by a data frame of the Patient Identifier, the treatment_var, and all the possible date data associated with that treatment. Unlike other REDCap projects where there is a single set of start and end dates associated with the treatment, the Esophagus REDCap took it an extra step by including valuable clinical context information in their data collection. For example, the Variable regimen_name_m2 has the following possible dates: tx_start-tx_stop, tx_pre_maintenance_start-tx_pre_maintenance_stop, and tx_pre_maintenance2_start-tx_pre_maintenance2_stop.

# Show the list element built for the regimen_name_m2 variable as an example.
print(output$regimen_name_m2)

For this particular project, r output %>% purrr::keep(~ncol(.)>4) %>% length() Variables have more than 1 set of start-stop dates for treatments due to contextual differences.

In order to manage the one-to-many relationship between a treatment and the contextual start and stop dates, all of the date Variables are pivoted and the Context is separated from the DateType, which indicates whether the Date represents the start or the stop of a treatment.

# Pivot every start/stop column of each per-variable data frame into long
# form, then stack the frames with the variable name in a VARIABLE column.
#
# names_pattern splits a date column name into:
#   Context  - everything before the last "_start..." / "_stop..." suffix
#   DateType - the suffix itself, beginning with the literal word "start" or
#              "stop" (e.g. "start", "stop_date")
# The pattern uses a real alternation. The previous "[start|stop]" was a
# character class matching any single one of those letters, so a name such
# as "x_start_time" was split at "_time" instead of at "_start".
output2 <-
  output %>%
  purrr::map(function(x) {
    tidyr::pivot_longer(
      x,
      cols = c(contains("start"), contains("stop")),
      names_to = c("Context", "DateType"),
      names_pattern = "^(.*)_(start.*|stop.*)$",
      values_to = "Date"
    )
  }) %>%
  # Column 2 is the treatment variable itself; give it the common name "PV"
  # so that the frames can be row-bound.
  purrr::map(function(x) {
    dplyr::rename_at(x, vars(2), ~paste("PV"))
  }) %>%
  dplyr::bind_rows(.id = "VARIABLE")

print(output2)

Lastly, the data is massaged into the Patient Events Table Format, thereby successfully isolating the start-stop date concept pairs to a single observation level. Rows where both the Event Start and End Date are NA or blank are filtered out in the step that follows.

# Collapse the DateType values into the two event-date names and spread them
# into EVENT_START_DATE / EVENT_END_DATE columns, one row per combination of
# the remaining columns.
output3 <-
  output2 %>%
  dplyr::mutate(DateType = forcats::as_factor(DateType)) %>%
  dplyr::mutate(DateType = forcats::fct_collapse(DateType,
                                                 EVENT_START_DATE = c("start", "start_date"),
                                                 EVENT_END_DATE = c("stop", "stop_date"))) %>%
  # id_cols must exclude BOTH pivoting columns. The previous
  # c(!DateType, !Date) was the union of two complements, i.e. every column,
  # including DateType and Date themselves.
  tidyr::pivot_wider(id_cols = !c(DateType, Date),
                     names_from = DateType,
                     values_from = Date,
                     # Several dates for the same id/DateType are de-duplicated
                     # and joined with newlines into a single cell.
                     values_fn = list(Date = function(x) paste(unique(x), collapse = "\n")))

print(output3)

The REDCap Data is then joined with the Esophagus Cohort Data on the ptid Variable to be able to map the data back to the OMOP Person Id. The Event Date columns are copied to additional NORMALIZE columns. These columns will also be added to the OncoRegimenFinder results in order to perform a union and to compare the accuracy between the 2 data sources.

# Attach the cohort's person_id to the REDCap events (natural full join on
# the shared columns), keep only the event columns plus person_id, drop rows
# where both event dates are NA, and copy the dates into NORMALIZE columns.
output4 <- dplyr::full_join(output3, esophagus_cohort)
output4 <- dplyr::select(output4, all_of(colnames(output3)), person_id)
output4 <- rubix::normalize_all_to_na(output4)
output4 <- dplyr::filter(
  output4,
  !is.na(EVENT_START_DATE) | !is.na(EVENT_END_DATE)
)
output4 <- dplyr::mutate(
  output4,
  NORMALIZE_START_DATE = EVENT_START_DATE,
  NORMALIZE_END_DATE = EVENT_END_DATE
)

print(output4)

To prepare the REDCap Data for the join with the OncoRegimenFinder results for a side-by-side comparison, the last step is grouping the REDCap Data by Person Id.

# One data frame of REDCap events per person_id.
output5 <- split(x = output4, f = output4$person_id)

The OncoRegimenFinder results are also grouped by Person Id, allowing for specific joins at the person level after copying the Date data into their corresponding NORMALIZE columns.

# Copy the ingredient dates into the NORMALIZE columns and cast person_id to
# integer.
esophagus_results2 <- dplyr::mutate(
  esophagus_oncoregimenfinder_results,
  NORMALIZE_START_DATE = ingredient_start_date,
  NORMALIZE_END_DATE = ingredient_end_date
)
esophagus_results2 <- dplyr::mutate(
  esophagus_results2,
  person_id = as.integer(person_id)
)
print(esophagus_results2)

# Split on the cohort-side id (esophagus_person_id), not on person_id.
esophagus_results3 <- split(
  x = esophagus_results2,
  f = esophagus_results2$esophagus_person_id
)

The data from both sources are then blended, with a DATA_SOURCE column to demarcate whether the source was OMOP or REDCap.

# Blend both sources per person: the result is a list keyed by person id,
# each element holding that person's REDCap and OMOP rows stacked, with a
# DATA_SOURCE column ("REDCAP" / "OMOP") recording where each row came from.
#
# purrr::transpose() takes its template from the first component only, so
# without `.names` any person present in the OMOP split but absent from the
# REDCap split would be dropped silently. The union of both id sets keeps
# every person; a source with no rows for a person contributes NULL, which
# dplyr::bind_rows() ignores.
final_output <-
  list(REDCAP = output5,
       OMOP = esophagus_results3) %>%
  purrr::transpose(.names = union(names(output5),
                                  names(esophagus_results3))) %>%
  purrr::map(dplyr::bind_rows, .id = "DATA_SOURCE")

Notably Incorrect

# Show the blended REDCap/OMOP records for two specific person ids.
print(final_output$`1558617`)
print(final_output$`1557156`)
# Hand the connection back to fantasia; presumably this disconnects and, with
# remove = TRUE, also removes the `conn` object -- TODO confirm against the
# fantasia documentation.
fantasia::dcOMOP(conn = conn,
                 remove = TRUE)


meerapatelmd/OncoRegimenFinder documentation built on Jan. 1, 2021, 9:25 a.m.