# Global knitr chunk defaults for the vignette. Each option is set once; the
# second, identical `tidy = TRUE` entry that used to follow `message` has been
# removed because it only duplicated the one below.
knitr::opts_chunk$set(
  comment = "##",
  tidy = TRUE, # `styler` to use styler::style_text() to reformat code
  tidy.opts = list(blank = FALSE, width.cutoff = 60),
  echo = TRUE,
  eval = TRUE,
  cache = TRUE,
  cache.path = "cache/",
  child = NULL, # file/s to knit and then include
  collapse = FALSE, # collapse all output into a single block
  error = TRUE, # display error messages in doc. FALSE stops render when error is thrown
  fig.align = "center", # left, right, center, or default
  fig.width = 7, # inches
  fig.height = 7, # inches
  include = TRUE, # include chunk?
  message = FALSE, # display code messages?
  warning = FALSE, # include warnings?
  results = "markup"
  # "asis": passthrough results
  # "hide": do not display results
  # "hold": put all results below all code
)

Last updated on: `r Sys.Date()`

# Packages used throughout the vignette.
library(mOMOP)
library(chariot)
library(tidyverse)
# Open the connection that is passed as `conn` to the chariot query/join calls
# below; it is closed with chariot::dcAthena() at the end of the vignette.
conn <- chariot::connectAthena()

This vignette takes a look at the Value Sets in the mCode Data Dictionary to create the standard library of Oncology concepts, available for loading in this package.

The mCode valuesets are retrieved from the publicly available data dictionary.

# Retrieve the value sets and print them for inspection.
# NOTE(review): get_value_sets() is presumably provided by mOMOP -- confirm.
value_sets <- get_value_sets()
value_sets

To map the value sets to OMOP concepts, they are split by Code System, the equivalent of the `vocabulary_id` field in OMOP's Concept table, for joining. As of `r Sys.Date()`, there are 9 Code Systems used in mCode.

# Split the value sets on the `Code System` column so each code system can be
# mapped on its own, then list the names of the resulting pieces.
value_sets_by_vocab <- rubix::split_by(value_sets, col = `Code System`)
names(value_sets_by_vocab)

Specimen

In mCode, Specimen representation is based on HL7's Code System, while in OMOP the Specimen domain has its own subset of concepts. Furthermore, mCode's
Specimen values also map to OMOP concepts outside the Specimen domain. The Specimen representation in this package therefore has 2 parts:

  1. A map between mCode and OMOP Concepts outside OMOP's Specimen domain
  2. OMOP's Specimen domain concepts

Map from mCode to OMOP

Specimen representation is derived from the valuesets.

# Take the HL7 v2-0487 code system subset of the value sets and pass it through
# rubix::format_colnames(); later chunks refer to its columns by names such as
# `code_description`.
specimen_library <- rubix::format_colnames(
  value_sets_by_vocab$`http://terminology.hl7.org/CodeSystem/v2-0487`
)
head(specimen_library)

The specimen description is joined on the Concept Synonym Names in the Concept Synonym table to retrieve the Concept Id the mCode Specimen maps to.

# Match each specimen description against concept synonym names
# (case-insensitively), drop the synonym bookkeeping columns, and keep the
# matched id under the name `match_concept_id`.
specimen_omop_library1 <-
  join_on_concept_synonym_name(
    conn = conn,
    data = specimen_library,
    column = "code_description",
    case_insensitive = TRUE
  ) %>%
  select(-c(concept_synonym_name, language_concept_id)) %>%
  rename(match_concept_id = concept_id)
head(specimen_omop_library1)

The complete Concept representation is taken from the Concept Id.

# Join on `match_concept_id` to bring in the concept fields for every match.
specimen_omop_library2 <-
  join_on_concept_id(
    conn = conn,
    data = specimen_omop_library1,
    column = "match_concept_id"
  )
head(specimen_omop_library2)

To maintain a one-to-one representation of the original grain of information, the OMOP mappings are pivoted on the OMOP Domain to show how each mCode Specimen maps within each domain.

# Pivot the matched concepts so that each OMOP domain becomes its own column.
# NOTE(review): merge_strip() presumably collapses the concept fields into the
# single `concept` column while leaving `domain_id` untouched -- confirm
# against chariot.
specimen_omop_library3 <-
  specimen_omop_library2 %>%
  merge_strip(into = "concept",
              domain_id) %>%
  # `domain_id` is the names_from column, so it must not also be selected as an
  # id column: recent tidyr releases raise an error for that overlap, while
  # older ones silently dropped it. Excluding it explicitly works in both.
  pivot_wider(id_cols = !c(concept, domain_id),
              names_from = domain_id,
              values_from = concept) %>%
  # The match id was only needed to key the pivot; drop it and de-duplicate.
  select(-match_concept_id) %>%
  distinct()
head(specimen_omop_library3)

To gather a complete representation of Specimens, any missing Specimen concepts in OMOP are added to the library.

# All OMOP concepts whose domain and concept class are both 'Specimen'.
# NOTE(review): merge_strip(into = "Specimen") appears to yield `Specimen` and
# `Specimen_id` columns (the latter is dropped here) -- confirm against chariot.
omop_specimen <-
  chariot::queryAthena(
    conn = conn,
    "SELECT *
                     FROM omop_vocabulary.concept 
                     WHERE domain_id = 'Specimen' AND concept_class_id = 'Specimen';"
  ) %>%
  merge_strip(into = "Specimen") %>%
  select(-Specimen_id)
head(omop_specimen)
# Full join on `Specimen` so OMOP specimens with no mCode counterpart are kept.
specimen_omop_library <-
  distinct(
    full_join(specimen_omop_library3, omop_specimen, by = "Specimen")
  )
head(specimen_omop_library)

This file is written to the data-raw/ folder for distribution if it does not already exist.

# Write data-raw/specimen.csv under the working directory, but only when it is
# absent, so an existing file is left untouched.
file <- file.path(getwd(), "data-raw", "specimen.csv")
if (!file.exists(file)) {
  write_csv(specimen_omop_library, file = file)
}

Cancer Staging

mCode uses the AJCC TNM Staging system, which correlates with NCIt concepts in the OMOP vocabulary.

# Value sets whose code system is http://cancerstaging.org, with column names
# passed through rubix::format_colnames().
cancer_staging_library <- rubix::format_colnames(
  value_sets_by_vocab$`http://cancerstaging.org`
)
head(cancer_staging_library)

As it is represented in the OMOP Vocabulary, NCIt is not separated by Tumor, Node, and Metastasis like it is in mCode.

# NCIt concepts of class 'AJCC Category' from the OMOP concept table.
ncit_omop_library <-
  chariot::queryAthena(
    conn = conn,
    "SELECT *
                     FROM omop_vocabulary.concept 
                     WHERE vocabulary_id = 'NCIt' AND concept_class_id = 'AJCC Category';"
  )
head(ncit_omop_library)

NCIt AJCC Category concepts are therefore grouped based on pattern matching with the Concept Code.

# Tag each NCIt AJCC Category concept with the mCode value set it belongs to,
# based on the prefix of its concept code (c/p followed by M, N or T).
# The column is created directly by mutate(); the former `<- ""` placeholder
# assignment was dead code, since mutate() replaced every value anyway.
# Codes matching none of the patterns get NA because case_when() has no
# fallback branch.
ncit_omop_library2 <-
  ncit_omop_library %>%
  mutate(value_set_name =
           case_when(
             grepl("^[cp]{1}M", concept_code) ~ "TNMDistantMetastasesCategoryVS",
             grepl("^[cp]{1}N", concept_code) ~ "TNMRegionalNodesCategoryVS",
             grepl("^[cp]{1}T", concept_code) ~ "TNMPrimaryTumorCategoryVS"
             )
         )
head(ncit_omop_library2)

A correlate to mCode's TNMStageGroupVS value set is not present in the OMOP Vocabularies, but since it can be derived from the TNM categories, it is skipped.

# Attach the tagged NCIt concepts to the mCode staging value sets by value set
# name; staging rows with no NCIt counterpart are retained by the left join.
cancer_staging_omop_library <- left_join(
  cancer_staging_library,
  ncit_omop_library2,
  by = "value_set_name"
)
head(cancer_staging_omop_library)

This data is written to a cancer_staging.csv in the data-raw/ folder for distribution if it does not already exist.

# Write data-raw/cancer_staging.csv under the working directory unless the file
# is already there.
file <- file.path(getwd(), "data-raw", "cancer_staging.csv")
if (!file.exists(file)) {
  write_csv(cancer_staging_omop_library, file = file)
}

Genomics

HGVS, HGNC, and ClinVar are collapsed into a single Genomics category.

HGVS

mCode uses HGVS (http://varnomen.hgvs.org), but these codes are not in the OMOP Vocabulary and cannot be derived from the website. Therefore, they are skipped for now.

# HGVS subset of the value sets, shown for reference only; nothing further in
# this vignette uses it.
hgvs_library <- rubix::format_colnames(
  value_sets_by_vocab$`http://varnomen.hgvs.org`
)
head(hgvs_library)

HGNC

Gene Names in mCode are presumed to be derived from HGNC, and the entire HGNC subset of the OMOP Vocabularies is included in the library.

# mCode side: value sets whose code system is http://www.genenames.org/geneId.
hgnc_library <- rubix::format_colnames(
  value_sets_by_vocab$`http://www.genenames.org/geneId`
)
head(hgnc_library)
# OMOP side: every concept in the HGNC vocabulary.
hgnc_omop_library1 <-
  chariot::queryAthena(
    conn = conn,
    "SELECT *
                     FROM omop_vocabulary.concept 
                     WHERE vocabulary_id = 'HGNC';"
  )
head(hgnc_omop_library1)

The OMOP HGNC concepts are joined to the mCode data dictionary set.

# Give every OMOP HGNC concept the constant value set name "HGNCVS" so it can
# serve as the join key against the mCode rows.
hgnc_omop_library2 <- mutate(hgnc_omop_library1, value_set_name = "HGNCVS")
head(hgnc_omop_library2)
# Left join keeps every mCode row and attaches the concepts sharing its name.
hgnc_omop_library <- left_join(
  hgnc_library,
  hgnc_omop_library2,
  by = "value_set_name"
)
head(hgnc_omop_library)

ClinVar

Like HGNC, the entire ClinVar subset of the OMOP Vocabularies is included in the library.

# mCode side: the ClinVar code system subset of the value sets.
clinvar_library <- rubix::format_colnames(value_sets_by_vocab$ClinVar)
head(clinvar_library)
# OMOP side: every concept in the ClinVar vocabulary.
clinvar_omop_library1 <-
  chariot::queryAthena(
    conn = conn,
    "SELECT *
                     FROM omop_vocabulary.concept 
                     WHERE vocabulary_id = 'ClinVar';"
  )
head(clinvar_omop_library1)
# Constant join key matching the mCode value set name.
clinvar_omop_library2 <- mutate(clinvar_omop_library1, value_set_name = "ClinVarVS")
head(clinvar_omop_library2)

# Left join keeps every mCode row and attaches the concepts sharing its name.
clinvar_omop_library <- left_join(
  clinvar_library,
  clinvar_omop_library2,
  by = "value_set_name"
)
head(clinvar_omop_library)

Final Genomics Library

The final genomics library file is written if it does not already exist.

# Stack the HGNC and ClinVar libraries into one genomics table.
genomics_omop_library <- bind_rows(hgnc_omop_library, clinvar_omop_library)

# Write data-raw/genomics.csv under the working directory unless it already
# exists.
file <- file.path(getwd(), "data-raw", "genomics.csv")
if (!file.exists(file)) {
  write_csv(genomics_omop_library, file = file)
}

Units of Measure

Like Specimen, the Units of Measure representation is limited and requires incorporation of the OMOP UCUM vocabulary valueset.

# mCode side: value sets whose code system is http://unitsofmeasure.org.
uom_library <- rubix::format_colnames(
  value_sets_by_vocab$`http://unitsofmeasure.org`
)
head(uom_library)
# OMOP side: every concept in the UCUM vocabulary.
ucum_omop_library1 <-
  chariot::queryAthena(
    conn = conn,
    "SELECT *
                     FROM omop_vocabulary.concept 
                     WHERE vocabulary_id = 'UCUM';"
  )
head(ucum_omop_library1)
# The two tables are stacked rather than joined; bind_rows() fills columns
# present in only one of them with NA.
uom_omop_library <- bind_rows(uom_library, ucum_omop_library1)
head(uom_omop_library)

Write the data to a file if it does not already exist.

# Write data-raw/unitsofmeasurement.csv under the working directory unless it
# already exists.
file <- file.path(getwd(), "data-raw", "unitsofmeasurement.csv")
if (!file.exists(file)) {
  write_csv(uom_omop_library, file = file)
}

SNOMED

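The SNOMED subset of the library contains three kinds of entries:

  1. Concepts stated explicitly by code
  2. Concepts defined as "includes codes descending from" a given code
  3. Concepts defined as "excludes codes descending from" a given code

To derive all concepts, the code in the logical_definition field is extracted based on the scenarios listed above.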

# Build the SNOMED working table from the `SNOMED CT` value sets.
#   descendants_of: code captured from "includes codes descending from <code>"
#   ancestors_of:   code captured from "excludes codes descending from <code>"
#   all_codes:      first non-NA of code, ancestors_of, descendants_of
# across() replaces the superseded mutate_all(), and extract() is qualified
# with tidyr:: so that an attached magrittr::extract() cannot mask it.
# NOTE(review): both regexes require one non-digit character after the code,
# so a definition that ends right after the digits is not captured -- confirm
# this is intended.
snomed_library <-
  value_sets_by_vocab$`SNOMED CT` %>%
  rubix::format_colnames() %>%
  distinct() %>%
  mutate(across(everything(), as.character)) %>%
  tidyr::extract(col = logical_definition,
                 into = "descendants_of",
                 regex = "includes codes descending from ([0-9]{1,})[^0-9]{1}.*$",
                 remove = FALSE) %>%
  tidyr::extract(col = logical_definition,
                 into = "ancestors_of",
                 regex = "excludes codes descending from ([0-9]{1,})[^0-9]{1}.*$",
                 remove = FALSE) %>%
  mutate(all_codes = coalesce(code, ancestors_of, descendants_of)) %>%
  # Strip leading/trailing whitespace from every column, as before.
  mutate(across(everything(), trimws))
head(snomed_library)

OMOP Concept table is joined to the mCode's SNOMED library by code.

# Look up `all_codes` in the concept table, restricted to vocabulary_id =
# "SNOMED".
# NOTE(review): unlike the earlier join helpers, no `conn` is passed here --
# confirm that chariot::join_on_concept_code() opens its own connection.
snomed_omop_library1 <-
  chariot::join_on_concept_code(
    kind = "LEFT",
    data = snomed_library,
    column = "all_codes",
    where_in_concept_field = "vocabulary_id",
    where_in_concept_field_value = "SNOMED"
  )
# Re-attach the lookup to the library on all of the library's own columns so
# that every library row is retained, then de-duplicate.
snomed_omop_library1 <-
  left_join(
    snomed_library,
    snomed_omop_library1,
    by = c(
      "value_set_name",
      "ancestors_of",
      "descendants_of",
      "all_codes",
      "code_system",
      "logical_definition",
      "code",
      "code_description"
    )
  ) %>%
  distinct()
head(snomed_omop_library1)

The resultset is then split into 3 based on whether the descendants (a) or ancestors (b) need to be derived, or if the concept is explicitly stated by code (c).

The descendants are derived for those mCode concepts that included descendants.

# (a) Rows whose logical definition includes descendants of a code.
snomed_omop_library2a <-
  snomed_omop_library1 %>%
  filter(!is.na(descendants_of))

# Expand each of those rows to its descendants, treating `concept_id` as the
# ancestor. Only the original library columns and the `descendant_*` columns
# are kept; the prefix is then stripped so the descendant fields carry the
# plain concept column names. rename_with() replaces the superseded
# rename_all().
snomed_omop_library2a2 <-
  join_for_descendants(kind = "LEFT",
                       data = snomed_omop_library2a,
                       ancestor_id_column = "concept_id") %>%
  select(all_of(colnames(snomed_library)),
         starts_with("descendant_")) %>%
  rename_with(~ str_remove_all(.x, pattern = "^descendant_"))
head(snomed_omop_library2a2)

The ancestors are derived for those mCode concepts that excluded descendants.

# (b) Rows whose logical definition excludes descendants of a code.
snomed_omop_library2b <-
  snomed_omop_library1 %>%
  filter(!is.na(ancestors_of))

# Expand each of those rows to its ancestors, treating `concept_id` as the
# descendant. Rows with zero levels of separation are dropped; filter() also
# drops rows where that value is NA. The `ancestor_` prefix is then stripped
# from the retained ancestor columns. rename_with() replaces the superseded
# rename_all().
snomed_omop_library2b2 <-
  join_for_ancestors(kind = "LEFT",
                     data = snomed_omop_library2b,
                     descendant_id_column = "concept_id") %>%
  filter(min_levels_of_separation != 0) %>%
  select(all_of(colnames(snomed_library)),
         starts_with("ancestor_")) %>%
  rename_with(~ str_remove_all(.x, pattern = "^ancestor_"))
head(snomed_omop_library2b2)

The concepts that are explicitly stated do not require any additional derivation.

# (c) Rows with neither a descendants_of nor an ancestors_of code, i.e. the
# concept is given directly by `code`.
snomed_omop_library2c <- filter(
  snomed_omop_library1,
  is.na(descendants_of) & is.na(ancestors_of)
)
head(snomed_omop_library2c)

All 3 subsets are recombined before writing to file.

# Stack the three SNOMED subsets (descendants, ancestors, explicit codes).
snomed_omop_library <- bind_rows(
  snomed_omop_library2a2,
  snomed_omop_library2b2,
  snomed_omop_library2c
)
head(snomed_omop_library)
# Write data-raw/snomed.csv under the working directory unless it already
# exists.
file <- file.path(getwd(), "data-raw", "snomed.csv")
if (!file.exists(file)) {
  write_csv(snomed_omop_library, file = file)
}

LOINC

# LOINC subset of the value sets: every column is coerced to character and
# trimmed of surrounding whitespace, then exact duplicate rows are removed.
# across() replaces the superseded mutate_all().
loinc_library <-
  value_sets_by_vocab$LOINC %>%
  rubix::format_colnames() %>%
  mutate(across(everything(), as.character)) %>%
  mutate(across(everything(), trimws)) %>%
  distinct()
head(loinc_library)
# Look up `code` in the concept table, restricted to vocabulary_id = "LOINC".
# NOTE(review): no `conn` is passed here -- confirm that
# chariot::join_on_concept_code() opens its own connection.
loinc_omop_library <-
  chariot::join_on_concept_code(
    kind = "LEFT",
    data = loinc_library,
    column = "code",
    where_in_concept_field = "vocabulary_id",
    where_in_concept_field_value = "LOINC"
  )
# Re-attach the lookup to the library on the library's own columns so every
# library row is retained, then de-duplicate.
loinc_omop_library <-
  left_join(
    loinc_library,
    loinc_omop_library,
    by = c(
      "value_set_name",
      "code_system",
      "logical_definition",
      "code",
      "code_description"
    )
  ) %>%
  distinct()
head(loinc_omop_library)
# Write data-raw/loinc.csv under the working directory unless it already
# exists.
file <- file.path(getwd(), "data-raw", "loinc.csv")
if (!file.exists(file)) {
  write_csv(loinc_omop_library, file = file)
}

ICD-10 CM

# ICD-10-CM subset of the value sets: every column is coerced to character and
# trimmed. across() replaces the superseded mutate_all().
icd10cm_library <-
  value_sets_by_vocab$`ICD-10-CM` %>%
  rubix::format_colnames() %>%
  mutate(across(everything(), as.character)) %>%
  mutate(across(everything(), trimws)) %>%
  # Insert a "." after the first three characters of codes written without
  # one. A code that already has a "." in fourth position does not match the
  # pattern and is left unchanged.
  mutate(code = str_replace_all(string = code,
                                pattern = "(^[A-Z]{1}[0-9A-Z]{2})([0-9A-Z]{1}.*$)",
                                replacement = "\\1.\\2")) %>%
  distinct()

head(icd10cm_library)
# Look up the dotted `code` in the concept table, restricted to vocabulary_id =
# "ICD10CM".
# NOTE(review): no `conn` is passed here -- confirm that
# chariot::join_on_concept_code() opens its own connection.
icd10cm_omop_library <-
  chariot::join_on_concept_code(
    kind = "LEFT",
    data = icd10cm_library,
    column = "code",
    where_in_concept_field = "vocabulary_id",
    where_in_concept_field_value = "ICD10CM"
  )

# Re-attach the lookup to the library on the library's own columns so every
# library row is retained, then de-duplicate.
icd10cm_omop_library <-
  left_join(
    icd10cm_library,
    icd10cm_omop_library,
    by = c(
      "value_set_name",
      "code_system",
      "logical_definition",
      "code",
      "code_description"
    )
  ) %>%
  distinct()
head(icd10cm_omop_library)
# Write data-raw/icd10cm.csv under the working directory unless it already
# exists.
file <- file.path(getwd(), "data-raw", "icd10cm.csv")
if (!file.exists(file)) {
  write_csv(icd10cm_omop_library, file = file)
}
# Close the connection opened with chariot::connectAthena() at the top.
chariot::dcAthena(conn = conn)


meerapatelmd/mOMOP documentation built on June 15, 2021, 3:01 p.m.