extras/CentralProcessingDQDThresholds.R

#DQD Centralized processing
#----------------------------------------
#----------------------------------------
#this script assumes coordinating-center infrastructure for loading
#Athena dictionaries (.rda files instead of a database)
#now truly doing 2 sites

#load athena dictionary
library(tidyverse);library(magrittr);options(tibble.print_max = 200)
load('o:/athena/concept.rda')

#lkup<-concept %>% filter(vocabulary_id %in% c('CPT4','ICD9Proc','CDT','HCPCS','ICD9CM','ICD10CM','ICD10PCS'))

#reading single-site data (for now)
f<-'d:/OneDrive - National Institutes of Health/temp/dqd/export'
f<-'d:/OneDrive - National Institutes of Health/ohdsi/thresholds'

sfiles<-c(file.path(f,'1ThresholdsA.csv'))
sfiles<-c(file.path(f,'test-ThresholdsA.csv'))
sfiles<-c(file.path(f,'1ThresholdsA.csv'),file.path(f,'ThresholdsA.csv'),file.path(f,'test-ThresholdsA.csv'))
#multi-site processing (6 ThresholdsB export files)
sfiles<-c(file.path(f,'01ThresholdsB.csv')
          ,file.path(f,'02ThresholdsB.csv')
          ,file.path(f,'03ThresholdsB.csv')
          ,file.path(f,'04ThresholdsB.csv')
          ,file.path(f,'05ThresholdsB.csv')
          ,file.path(f,'06ThresholdsB.csv')
)
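#optional sanity check (a minimal sketch, not part of the original pipeline): confirm all export files exist before reading them
stopifnot(all(file.exists(sfiles)))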
ll<-map(sfiles,read_csv)
ll

#ll<-map(p$pid,doProperty())
#strip name from full path trick

#make lowercase the column names
llmoded<-map(ll,~{names(.x)<-tolower(names(.x));return(.x)})
#llmoded[[1]]
#ll[[1]]
ll2<-map2(llmoded,basename(sfiles),~mutate(.x,site=.y))
d<-bind_rows(ll2)


#---end of reading input files

#add terminology concepts
sconcept<-concept %>% select(concept_id,concept_name,concept_code)
names(d) <- tolower(names(d))
names(d)
#remove rows with no unit or no test concept, drop small counts (<11), and expand the concept IDs
#stratum columns have the suffix _id (stratum1_id, stratum2_id)
# d2<-d %>% filter(stratum_1 != 0) %>% filter(stratum_2 != 0) %>% left_join(sconcept,by=c('stratum_1'='concept_id')) %>%
#   left_join(sconcept,by=c('stratum_2'='concept_id')) 


d2<-d %>% filter(count_value >=11 ) %>% filter(stratum1_id != 0) %>% filter(stratum2_id != 0) %>% 
  left_join(concept %>% select(concept_id,concept_name,concept_code),by=c('stratum1_id'='concept_id')) %>%
  left_join(concept %>% select(concept_id,concept_name),by=c('stratum2_id'='concept_id')) %>% filter(!is.na(concept_name.x))
#tests with concept IDs in the 2-billion (site-local) range are excluded by the last filter

names(d2)

#overview of sites
soverview<-d2 %>% count(site)
soverview
#soverview %>% write_csv('extras/DqdResults/S1_overview.csv')




ba<-d2 %>% group_by(stratum1_id,stratum2_id,concept_name.x,concept_name.y,concept_code) %>% summarize(tcnt=sum(count_value),n=n())
ba %>% filter(n>=2) %>% nrow()
nrow(ba)
#4465 distinct test-unit pairs
#872 test-unit pairs have 2+ sites



#tests with more than one unit
ba %>% ungroup() %>%  count(stratum1_id,concept_name.x) %>% filter(n>=2)
#TODO improve later




#end of section


#---------------comparison with expert driven

#read expert driven checks
library(stats);library(tidyverse);library(magrittr)
#message("\n*** Successfully loaded .Rprofile ***\n")


url='https://raw.githubusercontent.com/OHDSI/DataQualityDashboard/master/inst/csv/OMOP_CDMv5.3.1_Concept_Level.csv'
dqd<-read_csv(url)
str(dqd)
names(dqd)
dqd %>% dplyr::filter(cdmTableName=='MEASUREMENT' & cdmFieldName=='MEASUREMENT_CONCEPT_ID' ) 
dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' ) %>% nrow()
dqd %>% count(cdmTableName,cdmFieldName)


#compare data-driven and expert-driven sets
#d$STRATUM_1 %<>% as.integer()
dqd$unitConceptId %<>% as.integer()
expert <-dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' )
nrow(expert)
#856 threshold checks are in expert driven KB
names(expert)
elabs<-expert %>% group_by(conceptId,conceptName) %>% summarise(unitcnt=n(),units=paste(unitConceptName,collapse = "|"))
nrow(elabs)
# for 330 distinct lab tests
#elabs %>% write_csv('extras/DqdResults/DQD-expert-driven-A-lab-list.csv')  

names(expert)

names(d2)
#renaming d2 (data driven) data to prepare for comparison of expert and data driven
ddriven<-d2 %>% rename(conceptId=stratum1_id,unitConceptId=stratum2_id) 

#ddriven2 is on test-unit pair level (only one row per lab-unit pair)
ddriven2<-ba %>% rename(conceptId=stratum1_id,unitConceptId=stratum2_id) 



#ddriven %<>% filter(conceptId!=0)
#ddriven %<>% filter(unitConceptId!=0)

over=expert %>% inner_join(ddriven2) 
nrow(over)
#451 pairs are overlapping between ddriven (data driven) and expert (expert driven)
#View(over)

not2<-expert %>% anti_join(ddriven2) 
nrow(not2)
#405 are in expert list but not in data from any site

#freq rank for lab test

freq<-ddriven %>% group_by(conceptId) %>% summarize(tcnt=sum(count_value)) %>% 
   mutate(lab_test_freq_rank = dense_rank(desc(tcnt)))
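
#quick look (a minimal sketch, not part of the original pipeline): top-ranked lab tests by total count across sites
freq %>% left_join(sconcept,by=c('conceptId'='concept_id')) %>% arrange(lab_test_freq_rank) %>% head(20)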





#---more analysis in March 2020
#mean and median should be close; one super-high value shifts the mean (this could be used in outlier logic)
#3020891 body temp
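
#a minimal sketch of that idea (not part of the original pipeline): the export used here carries
#min/max/percentiles rather than a mean, so a comparable screen contrasts max_value with p99_value;
#a max far above the 99th percentile suggests a few extreme values distort the distribution
outlierScreen<-ddriven %>% filter(p99_value > 0) %>%
  mutate(max_to_p99 = max_value / p99_value) %>%
  filter(max_to_p99 > 10) %>%
  select(conceptId,concept_name.x,unitConceptId,concept_name.y,site,median_value,p99_value,max_value,max_to_p99)
#outlierScreen %>% arrange(desc(max_to_p99))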

#list units for each lab test
names(ddriven)

# bb<-ddriven %>% group_by(conceptId,concept_name.x) %>% dplyr::summarize(n=n()                                                            
#                                                                         ,unitIds=paste(unitConceptId,collapse = '|')
#                                                             ,units=paste(concept_name.y,collapse = '|'))
#by lab test view
bb2<-ddriven2 %>% ungroup() %>% arrange(desc(n),desc(tcnt)) %>%
  group_by(conceptId,concept_name.x) %>%
  dplyr::summarize(unitIds=paste(unitConceptId,collapse = '|')
                   ,units=paste(concept_name.y,collapse = '|')
                   ,siteCnts=paste(n,collapse='|')
                   ,distinct_unit_cnt=n())

names(ba)
viewSites<-ba %>% ungroup() %>%  count(n)
viewSites
# # A tibble: 6 x 2
#       n    nn
#   <int> <int>
# 1     1  4593
# 2     2   641
# 3     3   434
# 4     4   247
# 5     5    27
# 6     6     1


#bset<-ba %>% filter(n==3)
#bc<-ddriven %>% inner_join(bset)
#3009542 hematocrit
options(scipen=999) #disable scientific notation
names(ddriven)
prekb<-ddriven %>% group_by(conceptId,concept_name.x,unitConceptId,concept_name.y,concept_code) %>% summarize(
  n=n()
  ,sum_count_value=sum(count_value)
  #,kb_min_mean=mean(min_value)
  #,kb_max_mean=mean(max_value)
  ,kb_min_median=median(min_value)
  ,kb_max_median=median(max_value)
  ,kb_p01_value_median=median(p01_value)
  ,kb_p99_value_median=median(p99_value)
  ,kb_p50_median=median(median_value)
  ,kb_stdev_median=median(stdev_value)
)  %>% ungroup() %>% 
    mutate(freq_rank = dense_rank(desc(sum_count_value)))

kb<- prekb%>% filter(n>=2) %>% filter(sum_count_value>=1)

#analysis on lab test only level
#expert has 330 distinct tests

#kb <- read_csv('local/S01-benchmark-data2.csv')
kb %>% write_csv('local/S01-benchmark-data2.csv')

kbExport<-kb %>% filter(sum_count_value>=100) %>% select(-sum_count_value) %>% arrange(freq_rank)
kbExport %>% write_csv('extras/DqdResults/S01-benchmark-kb-subset.csv')
kbExport %>% write_csv(            'local/S01-benchmark-kb-subset.csv')

#conservative
kbExport<-kb %>% filter(sum_count_value>=100) %>% select(-sum_count_value) %>% arrange(freq_rank)



names(kb)
names(ddriven)
                                        

#analysis on lab test only level
#expert has 330 distinct tests
#bb2 is essentially that
  #labs<-prekb  %>% group_by(conceptId,concept_name.x) %>% summarise(n=n(),sum_rank=sum(freq_rank)) 
#add freq rank to it

bb3<-bb2 %>% left_join(freq %>% select(-tcnt)) %>% arrange(lab_test_freq_rank)
names(bb3)
bb3 %>%  nrow()
bb3 %>% write_csv('extras/DqdResults/S02-KB-lab-test-view.csv')

#tests that are in data but are not in expert-driven KB (on lab test level)
not1<-bb3 %>% anti_join(expert) 
nrow(not1)
#3993 are in data (full KB) but are absent in expert driven KB


bb3 %>% nrow()
#4282 distinct lab tests (including data from a single site)

#terms
#benchmark KB = has content only if 2+ sites contribute thresholds
#full KB = provides thresholds even if they come from a single site (is bigger than benchmark)
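
#a minimal sketch contrasting the two KBs defined above (uses prekb and kb from earlier)
nrow(prekb)   #full KB: test-unit pairs contributed by at least one site
nrow(kb)      #benchmark KB: test-unit pairs contributed by 2+ sites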





#compare the thresholds
#over is at the test-unit pair level and carries no observed min/max values, so join the expert
#thresholds to the per-site data-driven rows for the value comparison
overThresh<-expert %>% inner_join(ddriven)
names(overThresh)
overThresh %>% select(conceptName,unitConceptName,site,plausibleValueLow,min_value)
overThresh %>% select(conceptName,unitConceptName,site,plausibleValueHigh,max_value)
#%>% knitr::kable()
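
#a minimal sketch (assumes overThresh above and numeric plausibleValueLow/High columns):
#flag site-level rows where the observed extremes fall outside the expert plausible range
overThresh %>% mutate(below_expert_low = min_value < plausibleValueLow
                      ,above_expert_high = max_value > plausibleValueHigh) %>%
  count(below_expert_low,above_expert_high)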


#expert thresholds don't follow unit conversion logic (max and min are the same even if the units indicate an order-of-magnitude difference)
#MEASUREMENT	MEASUREMENT_CONCEPT_ID	3013721	Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma	8713	gram per deciliter	5	5	2000	5	NA	NA	NA	NA	NA	NA	NA	NA
#MEASUREMENT	MEASUREMENT_CONCEPT_ID	3013721	Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma	8840	milligram per deciliter	5	5	2000

#5 g/dL converted to mg/dL is 5000 mg/dL
#in the data this test is in fact reported in unit/L
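
#a minimal sketch of the conversion arithmetic (values taken from the example above)
low_g_per_dl <- 5
low_g_per_dl * 1000   #1 g/dL = 1000 mg/dL, so a 5 g/dL threshold corresponds to 5000 mg/dL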



#unit morphing: the same lab test reported in several different units
#Protein [Mass/volume] in Serum or Plasma	7096851	4	gram per deciliter|unit|milligram per deciliter|gram per liter
#	gram per deciliter|   |milligram per deciliter |   gram per liter
names(d2)
#per-site unit morphing for the ThresholdsA export; assumes d2 from above holds the combined data
bb<-d2 %>% filter(site=='ThresholdsA.csv') %>% group_by(stratum1_id,concept_name.x) %>%
  summarize(tcnt=sum(count_value)
            ,n=n(),units=paste(concept_name.y,collapse = '|')
            ,cnts=paste(count_value ,collapse = '|')
            ,unitcids=paste(stratum2_id,collapse = '|')
  )

bb %>% write_csv('local/morphA.csv')

#more unit work
u<-read_csv('local/conv-dev.csv')
names(u)
u %<>% filter(is.na(measurement_concept_id))
u2<-tibble(unit_concept_id=u$target_unit_concept_id
       ,target_unit_concept_id=u$unit_concept_id
       ,factor=1/u$factor)
ub<-bind_rows(u,u2)  %>% 
  left_join(sconcept,by=c('unit_concept_id'='concept_id')) %>% 
  left_join(sconcept,by=c('target_unit_concept_id'='concept_id'))
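
#a minimal sketch (assumes factor converts a value expressed in unit_concept_id into target_unit_concept_id):
#attach candidate conversions to the benchmark KB and rescale the threshold medians
kbConverted<-kb %>%
  inner_join(ub %>% select(unit_concept_id,target_unit_concept_id,factor)
             ,by=c('unitConceptId'='unit_concept_id')) %>%
  mutate(kb_min_median_converted = kb_min_median * factor
         ,kb_max_median_converted = kb_max_median * factor)
#kbConverted %>% select(conceptId,concept_name.x,unitConceptId,target_unit_concept_id,factor,kb_min_median,kb_min_median_converted)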


             