# Global knitr chunk options applied to every chunk in this vignette.
# Each option sits on its own line so the trailing comments cannot swallow
# the options that follow them; `tidy = TRUE` is listed once (it was passed
# twice with the same value).
knitr::opts_chunk$set(
  comment = "##",
  tidy = TRUE, # tidy code; "styler" to use styler::style_text() to reformat code
  tidy.opts = list(blank = FALSE, width.cutoff = 60),
  echo = TRUE,
  eval = TRUE,
  cache = TRUE,
  cache.path = "cache/",
  child = NULL, # file/s to knit and then include
  collapse = FALSE, # collapse all output into a single block
  error = TRUE, # display error messages in doc; FALSE stops render when an error is thrown
  fig.align = "center", # left, right, center, or default
  fig.width = 7, # inches
  fig.height = 7, # inches
  include = TRUE, # include chunk?
  message = FALSE, # display code messages?
  warning = FALSE, # include warnings?
  # "asis": passthrough results
  # "hide": do not display results
  # "hold": put all results below all code
  results = "markup"
)
Last updated on: `r Sys.Date()`
# Attach the packages used throughout this vignette.
library(mOMOP)
library(chariot)
library(tidyverse)

# Connection object handed to the chariot query/join helpers below via
# their `conn` argument.
conn <- chariot::connectAthena()
This vignette takes a look at the Value Sets in the mCode Data Dictionary to create the standard library of Oncology concepts, available for loading in this package.
The mCode valuesets are retrieved from the publicly available data dictionary.
# Fetch the mCode value sets, then auto-print them in the rendered output.
value_sets <- get_value_sets()

value_sets
To map the value sets to OMOP concepts, they are split by `Code System`, the equivalent of the `vocabulary_id` field in OMOP's Concept table, for joining.
As of `r Sys.Date()`, there are 9 `Code Systems` used in mCode.
# Split the value sets on the `Code System` column; the names of the
# result are printed to list the code systems that were found.
value_sets_by_vocab <-
  value_sets %>%
  rubix::split_by(col = `Code System`)

names(value_sets_by_vocab)
In mCode, Specimen representation is based on HL7's Code System while in OMOP,
the Specimen domain has its own subset of concepts. Furthermore, mCode's
Specimen representation maps to other OMOP Concept Ids. The Specimen
representation in this package therefore has 2 parts:
Specimen representation is derived from the valuesets.
# Pull the HL7 v2-0487 element out of the split value sets and run its
# column names through rubix::format_colnames().
specimen_library <-
  value_sets_by_vocab$`http://terminology.hl7.org/CodeSystem/v2-0487` %>%
  rubix::format_colnames()

head(specimen_library)
The specimen description is joined on the Concept Synonym Names in the Concept Synonym table to retrieve the Concept Id the mCode Specimen maps to.
# Join the specimen descriptions to concept synonym names
# (case-insensitively), drop the synonym bookkeeping columns, and keep the
# matched id under the name `match_concept_id`.
specimen_omop_library1 <-
  join_on_concept_synonym_name(
    data = specimen_library,
    column = "code_description",
    case_insensitive = TRUE,
    conn = conn
  ) %>%
  select(-c(concept_synonym_name, language_concept_id)) %>%
  rename(match_concept_id = concept_id)

head(specimen_omop_library1)
The complete Concept representation is taken from the Concept Id.
# Join on `match_concept_id` to bring in the concept columns for each
# matched id.
specimen_omop_library2 <-
  join_on_concept_id(
    data = specimen_omop_library1,
    column = "match_concept_id",
    conn = conn
  )

head(specimen_omop_library2)
To maintain a one-to-one representation of the original grain of information, the OMOP mappings are pivoted on the OMOP Domain to show how each mCode Specimen maps within each domain.
# Collapse the concept columns into a single `concept` column (keeping
# `domain_id`), then spread that column into one column per domain.
#
# `id_cols` must not include the `names_from` column: `!concept` alone
# also selects `domain_id`, which recent tidyr releases reject with
# "`id_cols` can't select a column already selected by `names_from`"
# (older releases silently dropped it). Excluding both columns explicitly
# yields the same identifier set on every version.
specimen_omop_library3 <-
  specimen_omop_library2 %>%
  merge_strip(into = "concept", domain_id) %>%
  pivot_wider(
    id_cols = !c(concept, domain_id),
    names_from = domain_id,
    values_from = concept
  ) %>%
  select(-match_concept_id) %>%
  distinct()

head(specimen_omop_library3)
To gather a complete representation of Specimens, any missing Specimen concepts in OMOP are added to the library.
# Query every concept whose domain and concept class are both 'Specimen',
# merge its concept columns into a single `Specimen` column, and drop the
# accompanying `Specimen_id` column.
omop_specimen <-
  chariot::queryAthena(
    "SELECT * FROM omop_vocabulary.concept WHERE domain_id = 'Specimen' AND concept_class_id = 'Specimen';",
    conn = conn
  ) %>%
  merge_strip(into = "Specimen") %>%
  select(-Specimen_id)

head(omop_specimen)

# Full join on `Specimen` so rows present on only one side are retained.
specimen_omop_library <-
  specimen_omop_library3 %>%
  full_join(omop_specimen, by = "Specimen") %>%
  distinct()

head(specimen_omop_library)
This file is written to the `data-raw/` folder for distribution if it does not already exist.
# Write the specimen library under <working directory>/data-raw, but only
# when the file is not already there (an existing file is never
# overwritten).
file <- file.path(getwd(), "data-raw", "specimen.csv")

if (!file.exists(file)) {
  write_csv(
    x = specimen_omop_library,
    file = file
  )
}
mCode uses the AJCC TNM Staging system, which correlates with NCIt concepts in the OMOP vocabulary.
# Value sets whose code system is http://cancerstaging.org, with column
# names run through rubix::format_colnames().
cancer_staging_library <-
  value_sets_by_vocab$`http://cancerstaging.org` %>%
  rubix::format_colnames()

head(cancer_staging_library)
As it is represented in the OMOP Vocabulary, NCIt is not separated by Tumor, Node, and Metastasis like it is in mCode.
# All NCIt concepts in the 'AJCC Category' concept class.
ncit_omop_library <-
  chariot::queryAthena(
    "SELECT * FROM omop_vocabulary.concept WHERE vocabulary_id = 'NCIt' AND concept_class_id = 'AJCC Category';",
    conn = conn
  )

head(ncit_omop_library)
NCIt `AJCC Category` concepts are therefore grouped based on pattern matching with the Concept Code.
# Tag each NCIt AJCC Category concept with the mCode value set it belongs
# to, based on the prefix of its concept code: a leading "c" or "p"
# followed by M, N or T. Codes matching none of the patterns get NA.
#
# The column is created directly by mutate(); pre-filling it with "" was a
# dead store that mutate() overwrote in full.
ncit_omop_library2 <-
  ncit_omop_library %>%
  mutate(
    value_set_name = case_when(
      grepl("^[cp]{1}M", concept_code) ~ "TNMDistantMetastasesCategoryVS",
      grepl("^[cp]{1}N", concept_code) ~ "TNMRegionalNodesCategoryVS",
      grepl("^[cp]{1}T", concept_code) ~ "TNMPrimaryTumorCategoryVS"
    )
  )

head(ncit_omop_library2)
A correlate to mCode's `TNMStageGroupVS` value set is not present in the OMOP Vocabularies, but since it can be derived from the TNM categories, it is skipped.
# Attach the tagged NCIt concepts to the mCode staging value sets by value
# set name; left join, so value sets with no tagged concepts are kept.
cancer_staging_omop_library <-
  cancer_staging_library %>%
  left_join(
    ncit_omop_library2,
    by = "value_set_name"
  )

head(cancer_staging_omop_library)
This data is written to `cancer_staging.csv` in the `data-raw/` folder for distribution if it does not already exist.
# Write the cancer staging library under <working directory>/data-raw,
# skipping the write when the file already exists.
file <- file.path(getwd(), "data-raw", "cancer_staging.csv")

if (!file.exists(file)) {
  write_csv(
    x = cancer_staging_omop_library,
    file = file
  )
}
HGVS, HGNC, and ClinVar are collapsed into a single `Genomics` category.
mCode uses HGVS (http://varnomen.hgvs.org), but these codes are not in the OMOP Vocabulary and cannot be derived from the website. Therefore, it is skipped for now.
# Value sets whose code system is http://varnomen.hgvs.org; shown for
# reference only, nothing below maps them.
hgvs_library <-
  value_sets_by_vocab$`http://varnomen.hgvs.org` %>%
  rubix::format_colnames()

head(hgvs_library)
Gene names in mCode are presumed to be derived from HGNC, so the entire HGNC subset of the OMOP Vocabularies is included in the library.
# Value sets whose code system is http://www.genenames.org/geneId.
hgnc_library <-
  value_sets_by_vocab$`http://www.genenames.org/geneId` %>%
  rubix::format_colnames()

head(hgnc_library)

# Every concept in the HGNC vocabulary.
hgnc_omop_library1 <-
  chariot::queryAthena(
    "SELECT * FROM omop_vocabulary.concept WHERE vocabulary_id = 'HGNC';",
    conn = conn
  )

head(hgnc_omop_library1)
The OMOP HGNC concepts are joined to the mCode data dictionary set.
# Give every HGNC concept the constant value set name "HGNCVS" so it can
# serve as the join key.
hgnc_omop_library2 <-
  hgnc_omop_library1 %>%
  mutate(value_set_name = "HGNCVS")

head(hgnc_omop_library2)

# Left join the mCode HGNC value sets to the labelled concepts on
# `value_set_name`.
hgnc_omop_library <-
  hgnc_library %>%
  left_join(
    hgnc_omop_library2,
    by = "value_set_name"
  )

head(hgnc_omop_library)
Like HGNC, the entire ClinVar subset of the OMOP Vocabularies is included in the library.
# Value sets whose code system is ClinVar.
clinvar_library <-
  value_sets_by_vocab$ClinVar %>%
  rubix::format_colnames()

head(clinvar_library)

# Every concept in the ClinVar vocabulary.
clinvar_omop_library1 <-
  chariot::queryAthena(
    "SELECT * FROM omop_vocabulary.concept WHERE vocabulary_id = 'ClinVar';",
    conn = conn
  )

head(clinvar_omop_library1)

# Give every ClinVar concept the constant value set name "ClinVarVS" ...
clinvar_omop_library2 <-
  clinvar_omop_library1 %>%
  mutate(value_set_name = "ClinVarVS")

head(clinvar_omop_library2)

# ... and left join the mCode ClinVar value sets to them on that name.
clinvar_omop_library <-
  clinvar_library %>%
  left_join(
    clinvar_omop_library2,
    by = "value_set_name"
  )

head(clinvar_omop_library)
The final genomics library file is written if it does not already exist.
# Stack the HGNC and ClinVar libraries into one genomics table.
genomics_omop_library <-
  bind_rows(
    hgnc_omop_library,
    clinvar_omop_library
  )

# Write it under <working directory>/data-raw unless the file already
# exists.
file <- file.path(getwd(), "data-raw", "genomics.csv")

if (!file.exists(file)) {
  write_csv(
    x = genomics_omop_library,
    file = file
  )
}
Like Specimen, the Units of Measure representation is limited and requires incorporation of the OMOP `UCUM` vocabulary value set.
# Value sets whose code system is http://unitsofmeasure.org.
uom_library <-
  value_sets_by_vocab$`http://unitsofmeasure.org` %>%
  rubix::format_colnames()

head(uom_library)

# Every concept in the UCUM vocabulary.
ucum_omop_library1 <-
  chariot::queryAthena(
    "SELECT * FROM omop_vocabulary.concept WHERE vocabulary_id = 'UCUM';",
    conn = conn
  )

head(ucum_omop_library1)

# The UCUM concepts are appended as extra rows below the mCode units
# (row-bind, not a join).
uom_omop_library <-
  uom_library %>%
  bind_rows(ucum_omop_library1)

head(uom_omop_library)
Write the data to a file if it does not already exist.
# Write the units-of-measure library under <working directory>/data-raw,
# leaving any existing file untouched.
file <- file.path(getwd(), "data-raw", "unitsofmeasurement.csv")

if (!file.exists(file)) {
  write_csv(
    x = uom_omop_library,
    file = file
  )
}
The SNOMED subset of the library contains:
To derive all concepts, the code in the `logical_definition` field is extracted based on the scenarios listed above.
# Build the SNOMED CT working table:
#   * `descendants_of`: digits captured after "includes codes descending
#     from" in `logical_definition`
#   * `ancestors_of`:   digits captured after "excludes codes descending
#     from" in `logical_definition`
#   * `all_codes`:      first non-missing of `code`, `ancestors_of`,
#     `descendants_of`
# `remove = FALSE` keeps `logical_definition` in place so both extractions
# can read it. All columns are cast to character before extraction and
# trimmed of surrounding whitespace at the end.
snomed_library <-
  value_sets_by_vocab$`SNOMED CT` %>%
  rubix::format_colnames() %>%
  distinct() %>%
  mutate_all(as.character) %>%
  extract(
    col = logical_definition,
    into = "descendants_of",
    regex = "includes codes descending from ([0-9]{1,})[^0-9]{1}.*$",
    remove = FALSE
  ) %>%
  extract(
    col = logical_definition,
    into = "ancestors_of",
    regex = "excludes codes descending from ([0-9]{1,})[^0-9]{1}.*$",
    remove = FALSE
  ) %>%
  mutate(all_codes = coalesce(code, ancestors_of, descendants_of)) %>%
  mutate_all(trimws)

head(snomed_library)
OMOP Concept table is joined to the mCode's SNOMED library by code.
# Look up `all_codes` among concepts whose vocabulary_id is 'SNOMED'.
# NOTE(review): unlike the earlier join_on_* calls, no `conn` is passed
# here - confirm that chariot's default connection handling is intended.
snomed_omop_library1 <-
  chariot::join_on_concept_code(
    kind = "LEFT",
    data = snomed_library,
    column = "all_codes",
    where_in_concept_field = "vocabulary_id",
    where_in_concept_field_value = "SNOMED"
  )

# Re-attach the lookup result to the full SNOMED table on every shared
# column, then remove duplicate rows.
snomed_omop_library1 <-
  snomed_library %>%
  left_join(
    snomed_omop_library1,
    by = c(
      "value_set_name",
      "ancestors_of",
      "descendants_of",
      "all_codes",
      "code_system",
      "logical_definition",
      "code",
      "code_description"
    )
  ) %>%
  distinct()

head(snomed_omop_library1)
The resultset is then split into 3 based on whether the descendants (a) or ancestors (b) need to be derived, or if the concept is explicitly stated by code (c).
The descendants are derived for those mCode concepts that included descendants.
# Subset (a): rows for which a "descendants of" code was extracted.
snomed_omop_library2a <-
  snomed_omop_library1 %>%
  filter(!is.na(descendants_of))

# Join for descendants using `concept_id` as the ancestor id, keep the
# original mCode columns plus the `descendant_*` columns, and strip the
# "descendant_" prefix from their names.
snomed_omop_library2a2 <-
  join_for_descendants(
    kind = "LEFT",
    data = snomed_omop_library2a,
    ancestor_id_column = "concept_id"
  ) %>%
  select(
    all_of(colnames(snomed_library)),
    starts_with("descendant_")
  ) %>%
  rename_all(~ str_remove_all(., pattern = "^descendant_"))

head(snomed_omop_library2a2)
The ancestors are derived for those mCode concepts that excluded descendants.
# Subset (b): rows for which an "ancestors of" code was extracted.
snomed_omop_library2b <-
  snomed_omop_library1 %>%
  filter(!is.na(ancestors_of))

# Join for ancestors using `concept_id` as the descendant id, drop rows
# whose minimum level of separation is 0, keep the original mCode columns
# plus the `ancestor_*` columns, and strip the "ancestor_" prefix.
snomed_omop_library2b2 <-
  join_for_ancestors(
    kind = "LEFT",
    data = snomed_omop_library2b,
    descendant_id_column = "concept_id"
  ) %>%
  filter(min_levels_of_separation != 0) %>%
  select(
    all_of(colnames(snomed_library)),
    starts_with("ancestor_")
  ) %>%
  rename_all(~ str_remove_all(., pattern = "^ancestor_"))

head(snomed_omop_library2b2)
The concepts that are explicitly stated do not require any additional derivation.
# Subset (c): rows with neither a descendants nor an ancestors code, i.e.
# everything not captured by subsets (a) and (b).
snomed_omop_library2c <-
  snomed_omop_library1 %>%
  filter(is.na(descendants_of) & is.na(ancestors_of))

head(snomed_omop_library2c)
All 3 subsets are recombined before writing to file.
# Stack the three SNOMED subsets (a, b, c) back into one table.
snomed_omop_library <-
  bind_rows(
    snomed_omop_library2a2,
    snomed_omop_library2b2,
    snomed_omop_library2c
  )

head(snomed_omop_library)

# Write it under <working directory>/data-raw unless the file already
# exists.
file <- file.path(getwd(), "data-raw", "snomed.csv")

if (!file.exists(file)) {
  write_csv(
    x = snomed_omop_library,
    file = file
  )
}
# LOINC value sets: every column cast to character, trimmed of surrounding
# whitespace, then de-duplicated.
loinc_library <-
  value_sets_by_vocab$LOINC %>%
  rubix::format_colnames() %>%
  mutate_all(as.character) %>%
  mutate_all(trimws) %>%
  distinct()

head(loinc_library)

# Look up `code` among concepts whose vocabulary_id is 'LOINC'.
# NOTE(review): no `conn` is passed to this chariot call - confirm the
# default connection handling is intended.
loinc_omop_library <-
  chariot::join_on_concept_code(
    kind = "LEFT",
    data = loinc_library,
    column = "code",
    where_in_concept_field = "vocabulary_id",
    where_in_concept_field_value = "LOINC"
  )

# Re-attach the lookup result to the full LOINC table on every shared
# column, then remove duplicate rows.
loinc_omop_library <-
  loinc_library %>%
  left_join(
    loinc_omop_library,
    by = c(
      "value_set_name",
      "code_system",
      "logical_definition",
      "code",
      "code_description"
    )
  ) %>%
  distinct()

head(loinc_omop_library)

# Write it under <working directory>/data-raw unless the file already
# exists.
file <- file.path(getwd(), "data-raw", "loinc.csv")

if (!file.exists(file)) {
  write_csv(
    x = loinc_omop_library,
    file = file
  )
}
# ICD-10-CM value sets: every column cast to character and trimmed. The
# regex then inserts a "." after the third character of any code that has
# an upper-case letter or digit in the fourth position; codes whose fourth
# character is already "." do not match and are left unchanged.
icd10cm_library <-
  value_sets_by_vocab$`ICD-10-CM` %>%
  rubix::format_colnames() %>%
  mutate_all(as.character) %>%
  mutate_all(trimws) %>%
  mutate(
    code = str_replace_all(
      string = code,
      pattern = "(^[A-Z]{1}[0-9A-Z]{2})([0-9A-Z]{1}.*$)",
      replacement = "\\1.\\2"
    )
  ) %>%
  distinct()

head(icd10cm_library)

# Look up the dotted `code` among concepts whose vocabulary_id is
# 'ICD10CM'.
# NOTE(review): no `conn` is passed to this chariot call - confirm the
# default connection handling is intended.
icd10cm_omop_library <-
  chariot::join_on_concept_code(
    kind = "LEFT",
    data = icd10cm_library,
    column = "code",
    where_in_concept_field = "vocabulary_id",
    where_in_concept_field_value = "ICD10CM"
  )

# Re-attach the lookup result to the full ICD-10-CM table on every shared
# column, then remove duplicate rows.
icd10cm_omop_library <-
  icd10cm_library %>%
  left_join(
    icd10cm_omop_library,
    by = c(
      "value_set_name",
      "code_system",
      "logical_definition",
      "code",
      "code_description"
    )
  ) %>%
  distinct()

head(icd10cm_omop_library)

# Write it under <working directory>/data-raw unless the file already
# exists.
file <- file.path(getwd(), "data-raw", "icd10cm.csv")

if (!file.exists(file)) {
  write_csv(
    x = icd10cm_omop_library,
    file = file
  )
}
# Hand `conn` to chariot::dcAthena() - presumably this closes the
# connection created by chariot::connectAthena(); confirm against the
# chariot documentation.
chariot::dcAthena(conn = conn)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.