# Chunk options applied to every code chunk in this document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

# Packages attached for the analysis below.
library(tidyverse)
library(rlang)
library(rubix)
library(OncoRegimenFinder)
library(fantasia)

# Connection object handed to every pg13 call below as `conn`
# (presumably an OMOP database connection -- confirm against fantasia).
conn <- fantasia::connectOMOP()
The results of 1) the OncoRegimenFinder algorithm are compared to 2) the abstracted Esophagus REDCap cohort treatment data.
The Esophagus Cohort table `patelm9.esophagus_cohort`:
# Read the cohort table `patelm9.esophagus_cohort` over the open connection.
esophagus_cohort <- pg13::readTable(
  conn = conn,
  schema = "patelm9",
  tableName = "esophagus_cohort"
)

# Show the first rows as a formatted table.
kableExtra::kable(head(esophagus_cohort))
OncoRegimenFinder Regimen Ingredient Table `patelm9.oncoregimenfinder_regimen_ingredients`:
# Build the query against `patelm9.oncoregimenfinder_regimen_ingredients`.
# NOTE(review): `n = 20, n_type = "random"` presumably limits the result to
# 20 randomly chosen rows -- confirm against pg13::buildQuery.
regimen_sample_sql <- pg13::buildQuery(
  schema = "patelm9",
  tableName = "oncoregimenfinder_regimen_ingredients",
  n = 20,
  n_type = "random"
)

# Run the query and show the first rows as a formatted table.
regimen_ingredient <- pg13::query(conn = conn, regimen_sample_sql)
kableExtra::kable(head(regimen_ingredient))
The following SQL is used to join the 2 Source Tables:
# Columns returned by the join: the cohort's person_id (aliased so it does
# not collide with the regimen table's person_id), the REDCap `ptid`, and
# every column of the regimen-ingredient table.
join_fields <- c(
  "\npatelm9.esophagus_cohort.person_id AS esophagus_person_id\n",
  "patelm9.esophagus_cohort.ptid\n",
  "patelm9.oncoregimenfinder_regimen_ingredients.*\n"
)

# LEFT join of the cohort onto the OncoRegimenFinder results by person_id.
sql <- pg13::buildJoinQuery(
  fields = join_fields,
  schema = "patelm9",
  tableName = "esophagus_cohort",
  column = "person_id",
  joinType = "LEFT",
  joinOnSchema = "patelm9",
  joinOnTableName = "oncoregimenfinder_regimen_ingredients",
  joinOnColumn = "person_id"
)

# Echo the generated SQL into the rendered document, then execute it.
cat(sql)

esophagus_oncoregimenfinder_results <- pg13::query(
  conn = conn,
  sql_statement = sql
)
Output:
# Coerce every column of the joined result to character, then pass it
# through rubix::normalize_all_to_na() (presumably turns blank/placeholder
# values into NA -- confirm against rubix).
esophagus_oncoregimenfinder_results <-
  esophagus_oncoregimenfinder_results %>%
  tibble::as_tibble() %>%
  dplyr::mutate_all(as.character) %>%
  rubix::normalize_all_to_na()

kableExtra::kable(head(esophagus_oncoregimenfinder_results))

# Per-variable summary, without the columns whose names contain
# "DISTINCT_VALUES".
data_summary <-
  esophagus_oncoregimenfinder_results %>%
  rubix::summarize_variables() %>%
  dplyr::select(-contains("DISTINCT_VALUES"))
Analysis of the OncoRegimenFinder results shows that, of the `r data_summary[data_summary$Variable == "esophagus_person_id", "DISTINCT_COUNT"]` unique OMOP Person Ids in the Esophagus Cohort overall, only `r data_summary[data_summary$Variable == "person_id", "DISTINCT_COUNT"]` Person Ids were returned by OncoRegimenFinder.
Full Metrics:
# Render the full summary metrics as a formatted table.
kableExtra::kable(data_summary)
# Locations of the REDCap data export and of its parsed data dictionary.
redcap_export_csv <- "~/GitHub/MSK-patelm9/escritoire/Esophagus/data_with_phi/STGIEsophagogastricS_DATA_2020-07-28_1354.csv"
redcap_dictionary_csv <- "~/GitHub/MSK-patelm9/escritoire/Esophagus/data_without_phi/STGIEsophagogastricSandbox_DataDictionary_2020-06-24_Parsed.csv"

# Process the export against its dictionary, with `ptid` as the identifier.
input <- fantasia::processRedcapData(
  data_csv_file = redcap_export_csv,
  parsed_metadata_csv_file = redcap_dictionary_csv,
  identifier_variables = "ptid"
)

# Only the processed data element is used from here on.
redcapData <- input[["ProcessedData"]]
The REDCap Data is exported as a tabular file and the main patient identifier and all variables associated with oncology treatments are selected.
# Column holding the REDCap patient identifier.
identifier_vars <- "ptid"

# Candidate REDCap variables that describe oncology treatments.
treatment_vars <- c(
  "neoadj_regimen",
  "neoadj_tx_additional_drugs",
  "neoadj_tx_addtl_drugs_2",
  "crt_name",
  "neoadjcrt_tx_additional_drugs",
  "neoadjcrt_tx_addtl_drugs_2",
  "adj_regimen",
  "adj_tx_additional_drugs",
  "adj_tx_addtl_drugs_2",
  "adj_rt_concurrent",
  "adjcrt_tx_additional_drugs",
  "adjcrt_tx_addtl_drugs_2",
  "induction_regimen",
  "periop_neo_regimen",
  "periop_adj_regimen",
  "tx_name",
  "advanced_tx_additional_drugs",
  "advanced_tx_addtl_drugs_2",
  "tx_maintenance_drugs",
  "advanced_tx_addtl_drugsm1_1",
  "advanced_tx_addtl_drugsm1_2",
  "tx_maintenance2_drugs",
  "advanced_tx_addtl_drugsm2_1",
  "advanced_tx_addtl_drugsm2_2",
  "palliative_rt_concurrent",
  "neoadj_tx_base_regimen",
  "neoadjcrt_tx_base_regimen",
  "adj_tx_base_regimen",
  "adjcrt_tx_base_regimen",
  "advanced_tx_base_regimen",
  "regimen_name_m1",
  "regimen_name_m2",
  "hipec_drugs"
)

# Flag which candidates actually exist as columns in the processed data.
is_in_redcap <- treatment_vars %in% colnames(input$ProcessedData)

# `qa` keeps the candidates that are missing from the data for review;
# `treatment_vars` is narrowed to the ones that are present.
qa <- treatment_vars[!is_in_redcap]
treatment_vars <- treatment_vars[is_in_redcap]
For each Variable, a copy of the REDCap Data is reduced to the Patient Identifier (`ptid`), the Variable itself, and any associated Start and Stop Dates, and is placed in a list object.
# Normalise the REDCap data once up front: this step does not depend on the
# treatment variable, so there is no need to repeat it for every variable.
redcap_tbl <- redcapData %>%
  tibble::as_tibble() %>%
  rubix::normalize_all_to_na()

# Build a named list with one data frame per treatment variable. Each data
# frame keeps the rows where that variable is not NA, and only the
# identifier, the variable itself and every column whose name contains
# "start" or "stop"; rubix::deselect_if_all_na() is then applied
# (presumably drops columns that are entirely NA -- confirm against rubix).
output <- treatment_vars %>%
  rubix::map_names_set(function(x) {
    redcap_tbl %>%
      dplyr::filter_at(vars(all_of(x)), any_vars(!is.na(.))) %>%
      dplyr::select(
        all_of(identifier_vars),
        all_of(x),
        contains("start"),
        contains("stop")
      ) %>%
      rubix::deselect_if_all_na()
  }) %>%
  # Drop variables that have no rows at all.
  purrr::keep(~nrow(.) > 0) %>%
  # Drop variables left with 2 or fewer columns (identifier + variable only):
  # that means no start/stop date column is associated with the variable.
  purrr::keep(~ncol(.) > 2)
The output is a list at most as long as `treatment_vars`, where each position of the list is occupied by a data frame holding the Patient Identifier, the treatment variable, and all the possible date data associated with that treatment. Unlike other REDCap projects where there is a single set of start and end dates associated with the treatment, the Esophagus REDCap took it an extra step by including valuable clinical context information in their data collection. For example, the Variable `regimen_name_m2` has the following possible dates: `tx_start`-`tx_stop`, `tx_pre_maintenance_start`-`tx_pre_maintenance_stop`, and `tx_pre_maintenance2_start`-`tx_pre_maintenance2_stop`.
# Show the per-variable data frame for `regimen_name_m2` as an example.
print(output[["regimen_name_m2"]])
For this particular project, `r output %>% purrr::keep(~ncol(.)>4) %>% length()` Variables have more than 1 set of start-stop dates for treatments due to contextual differences.
In order to manage the one-to-many relationship between a treatment and the contextual start and stop dates, all of the date Variables are pivoted and the context is separated from the `DateType`, which indicates whether the `Date` represents the start or the stop of a treatment.
# Reshape every per-variable data frame to long format and stack them.
#
# For each data frame:
#   * every column whose name contains "start" or "stop" is pivoted into
#     rows; its name is split at the underscore preceding "start"/"stop"
#     into `Context` (the prefix) and `DateType` ("start", "stop", or e.g.
#     "start_date"), and its value goes to `Date`;
#   * the second column (the treatment variable itself) is renamed to "PV"
#     so that all data frames share the same column names.
# The list names are carried into the `VARIABLE` column by bind_rows().
output2 <- output %>%
  purrr::map(function(x) {
    x %>%
      tidyr::pivot_longer(
        cols = c(contains("start"), contains("stop")),
        names_to = c("Context", "DateType"),
        # `(?:start|stop)` is a true alternation. The previous
        # `[start|stop]` was a character class matching any single one of
        # the letters s/t/a/r/o/p (or "|"), so a name such as
        # "x_start_time" was split as "x_start" / "time".
        names_pattern = "(^.*)_((?:start|stop).*$)",
        values_to = "Date"
      ) %>%
      dplyr::rename_at(vars(2), ~"PV")
  }) %>%
  dplyr::bind_rows(.id = "VARIABLE")

print(output2)
Lastly, the data is massaged into the Patient Events Table Format, thereby successfully isolating the start-stop date concept pairs to a single observation level. At this stage any rows where both the Event Start and End Date are NA or blank are filtered out.
# Spread the long start/stop rows back out so that each
# VARIABLE / ptid / PV / Context combination is a single row with an
# EVENT_START_DATE and an EVENT_END_DATE column.
output3 <- output2 %>%
  dplyr::mutate(DateType = forcats::as_factor(DateType)) %>%
  # Collapse the date-type suffixes into the two event date concepts.
  dplyr::mutate(
    DateType = forcats::fct_collapse(
      DateType,
      EVENT_START_DATE = c("start", "start_date"),
      EVENT_END_DATE = c("stop", "stop_date")
    )
  ) %>%
  tidyr::pivot_wider(
    # `id_cols` is deliberately left at its default (every column other than
    # `names_from` / `values_from`). The previous
    # `id_cols = c(!DateType, !Date)` was the union of two complements and
    # therefore selected every column, including DateType and Date.
    names_from = DateType,
    values_from = Date,
    # Several dates can land in the same cell; keep the distinct ones,
    # newline-separated.
    values_fn = list(Date = function(x) paste(unique(x), collapse = "\n"))
  )

print(output3)
The REDCap Data is then joined with the Esophagus Cohort Data on the `ptid` Variable to be able to map the data back to the OMOP Person Id. The Event Date columns are copied to additional `NORMALIZE` columns. These columns will also be added to the OncoRegimenFinder results in order to perform a union and to compare the accuracy between the 2 data sources.
# Attach the OMOP person_id to the REDCap events and keep only rows that
# carry at least one event date.
output4 <- output3 %>%
  # Join explicitly on `ptid` instead of relying on an implicit natural
  # join over whatever column names the two tables happen to share.
  dplyr::full_join(esophagus_cohort, by = "ptid") %>%
  # Keep the REDCap event columns plus the cohort's person_id.
  dplyr::select(all_of(colnames(output3)), person_id) %>%
  rubix::normalize_all_to_na() %>%
  # Drop rows where both the start and the end date are NA.
  dplyr::filter_at(
    vars(c(EVENT_START_DATE, EVENT_END_DATE)),
    any_vars(!is.na(.))
  ) %>%
  # Copy the event dates into the NORMALIZE_* columns that are shared with
  # the OncoRegimenFinder results.
  dplyr::mutate(
    NORMALIZE_START_DATE = EVENT_START_DATE,
    NORMALIZE_END_DATE = EVENT_END_DATE
  )

print(output4)
To prepare the REDCap Data for the join with the OncoRegimenFinder results for a side-by-side comparison, the last step is grouping the REDCap Data by Person Id.
# One data frame of REDCap events per person, named by person_id.
output5 <- split(output4, f = output4[["person_id"]])
The OncoRegimenFinder results are also grouped by Person Id, after copying the Date data into the corresponding `NORMALIZE` columns, allowing for specific joins at the person level.
# Copy the ingredient dates into the shared NORMALIZE_* columns and turn
# person_id (coerced to character earlier in this document) into an integer.
esophagus_results2 <- dplyr::mutate(
  esophagus_oncoregimenfinder_results,
  NORMALIZE_START_DATE = ingredient_start_date,
  NORMALIZE_END_DATE = ingredient_end_date,
  person_id = as.integer(person_id)
)

print(esophagus_results2)

# One data frame of OncoRegimenFinder rows per cohort person, named by the
# cohort-side person id.
esophagus_results3 <- split(
  esophagus_results2,
  f = esophagus_results2[["esophagus_person_id"]]
)
The data from both sources are then blended, with a `DATA_SOURCE` column to demarcate whether the source was OMOP or REDCap.
# Regroup the two per-person lists so that each person holds a
# REDCAP / OMOP pair.
# NOTE(review): purrr::transpose() appears to take its names from the first
# element (REDCAP) by default, so persons present only in OMOP would be
# dropped here -- confirm this is intended.
sources_by_person <- purrr::transpose(
  list(REDCAP = output5, OMOP = esophagus_results3)
)

# Stack each person's pair into one data frame, recording the origin of
# every row in DATA_SOURCE.
final_output <- purrr::map(
  sources_by_person,
  dplyr::bind_rows,
  .id = "DATA_SOURCE"
)
# Show the blended REDCap/OMOP data for two example persons.
print(final_output[["1558617"]])
print(final_output[["1557156"]])

# Release the connection opened at the top of the document.
fantasia::dcOMOP(conn = conn, remove = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.