knitr::opts_chunk$set( collapse = TRUE, comment = "#>", echo = FALSE, warning = FALSE, error = FALSE, out.width = "100%", dpi = 300 )
library(knitr) library(ggplot2) library(dplyr) library(gt) library(patchwork) library(markovchain)
This is a mid-stream quality evaluation and does not check for schema conformity. This should be conducted separately through the application of the DECOVID-OMOP dll file constraints. It is assumed that these have been applied successfully before running this report.
ctn <- setup_ctn(params) schema <- params$schema st <- prepare_tables(ctn, schema) overview <- prepare_overview(st) vo_id_true <- all(!is.na(overview$visit_occurrence_id))
Check that all visit_occurrence_id is populated: r vo_id_true
Row counts for core clinical tables:
clinical_tbls <- prepare_tally(ctn, schema) zero_vals <- clinical_tbls %>% filter(n == 0) %>% select(table) %>% pull() clinical_tbls %>% arrange(table) %>% gt() %>% tab_options(table.width = pct(100))
The proportion of missing data is shown. Sometimes is can be entirely appropriate. Rows are shown in red if they have a higher level of missingness than acceptable tolerances.
st %>% prepare_null_counts() %>% gt() %>% tab_style( style = cell_fill(color = "red1", alpha = 0.5), locations = cells_body( rows = `% rows missing` > tolerance) ) %>% tab_options(table.width = pct(100))
if ("person" %in% zero_vals) { cat("There is no data in the person table") } else { age_plot <- plot_age(overview) check_age <- min(overview$age, na.rm = TRUE) >= 18 sex_plot <- plot_sex(st[["person"]]) ethnic_plot <- plot_ethnicity(st[["person"]]) age_plot + sex_plot + ethnic_plot }
Madatory checks
r check_age
Date boundaries of the cohort:
if ("visit_occurrence" %in% zero_vals) { cat("There is no data in the visit_occurrence table") } else { st[["visit_occurrence"]] %>% prepare_cohort_boundaries() %>% gt() %>% tab_style( style = cell_fill(color = "red1", alpha = 0.5), locations = cells_body( rows = tolerance == "fail") ) %>% tab_options(table.width = pct(100)) }
Number of visits stratified by visit type:
if (!("visit_occurrence" %in% zero_vals)) { st[["visit_occurrence"]] %>% group_by(visit_concept_id) %>% summarise( count = n(), percentage = round((n()/nrow(st[["visit_occurrence"]]))*100, 0) ) %>% gt() %>% tab_options(table.width = pct(100)) }
plot_visit_profile(st[["visit_occurrence"]])
Admission codes outside the approved list:
asci <- vo_ans %>% filter(target_columnn == "admitting_source_concept_id") %>% select(concept_id, concept_name) %>% distinct() st[["visit_occurrence"]] %>% filter(!(admitting_source_concept_id %in% asci$concept_name)) %>% group_by(admitting_source_concept_id) %>% tally() %>% gt() %>% tab_options(table.width = pct(100))
st[["visit_occurrence"]] %>% # mutate(admitting_source_concept_id = factor( # x = admitting_source_concept_id, levels = asci$concept_name, labels = asci$concept_name # )) %>% mutate(admitting_source_concept_id = forcats::fct_infreq(admitting_source_concept_id)) %>% group_by(admitting_source_concept_id) %>% tally() %>% check_zero_tally(n) %>% ggplot(aes(x = admitting_source_concept_id)) + geom_point(aes(y = n, colour = .is_zero)) + geom_segment(aes( y = 0, yend = n, xend = admitting_source_concept_id)) + scale_colour_manual(values = c("red", "black"), drop = FALSE) + theme_classic() + labs(y = "count") + theme( legend.position = "none", plot.title.position = "plot", axis.title.y = element_blank()) + coord_flip() + ggtitle("Admission Source")
Discharge codes outside the approved list:
dtci <- vo_ans %>% filter(target_columnn == "discharge_to_concept_id") %>% select(concept_id, concept_name) %>% distinct() st[["visit_occurrence"]] %>% filter(!(discharge_to_concept_id %in% dtci$concept_name)) %>% group_by(discharge_to_concept_id) %>% tally() %>% gt()
st[["visit_occurrence"]] %>% # mutate(discharge_to_concept_id = factor( # x = discharge_to_concept_id, levels = dtci$concept_name, labels = dtci$concept_name # )) %>% mutate(discharge_to_concept_id = forcats::fct_infreq(discharge_to_concept_id)) %>% group_by(discharge_to_concept_id) %>% tally() %>% check_zero_tally(n) %>% ggplot(aes(x = discharge_to_concept_id)) + geom_point(aes(y = n, colour = .is_zero)) + geom_segment(aes( y = 0, yend = n, xend = discharge_to_concept_id)) + scale_colour_manual(values = c("red", "black"), drop = FALSE) + theme_classic() + labs(y = "count") + theme( legend.position = "none", plot.title.position = "plot", axis.title.y = element_blank()) + coord_flip() + ggtitle("Discharge To")
All deaths inside the hospital should be aligned with the final visit end datetime. This is an OMOP convention and doesn't necessarily mean the data is bad.
ghost_visit <- st[["visit_occurrence"]] %>% left_join(st[["death"]] %>% select(person_id, death_date, death_datetime), by = "person_id") %>% filter(visit_start_date > death_date) testthat::expect_true(nrow(ghost_visit) == 0)
All records that have a discharge recorded at "Patient died" should also have a death record in the death table.
st[["visit_occurrence"]] %>% filter(discharge_to_concept_id == "Patient died") %>% left_join( st[["death"]] %>% select(person_id, death_date, death_datetime) %>% mutate(death_record = TRUE), by = "person_id") %>% group_by(death_record = !is.na(death_record)) %>% tally() %>% gt()
Deviations of more than 24 hours (by datetime) or 48 hours (by date) should be investigated:
death_discrep <- overview %>% filter( !is.na(visit_concept_id) & (!is.na(death_datetime) | !is.na(death_date)) & discharge_to_concept_id == "Patient died") %>% group_by(person_id) %>% filter(max(visit_start_datetime) == visit_start_datetime) %>% ungroup() %>% select(person_id, death_datetime, death_date, visit_end_datetime) %>% mutate( diff_date = as.integer(difftime(visit_end_datetime, death_date, units = "hours")), diff_dttm = as.integer(difftime(visit_end_datetime, death_datetime, units = "hours"))) death_discrep %>% summarise( `death 24 hrs < visit end (dttm)` = sum(diff_dttm > 24, na.rm = TRUE), `death within tolerance (dttm)` = sum(diff_dttm <= 24 & diff_dttm >= -24, na.rm = TRUE), `death 24 hrs > visit end (dttm)` = sum(diff_dttm < -24, na.rm = TRUE), `death 48 hrs < visit end (date)` = sum(diff_date > 48, na.rm = TRUE), `death within tolerance (date)` = sum(diff_date <= 48 & diff_date >= -48, na.rm = TRUE), `death 48 hrs > visit end (date)` = sum(diff_date < -48, na.rm = TRUE), ) %>% tidyr::pivot_longer(everything(), names_to = "tolerance test", values_to = "n") %>% group_by(dttm = grepl(x = `tolerance test`, pattern = "dttm")) %>% mutate( total = sum(n), `%` = round((n/total)*100, 0)) %>% ungroup() %>% mutate(tolerance = c(5, 90, 5, 5, 90, 5)) %>% select(-dttm, -total) %>% gt() %>% tab_style( style = cell_fill(color = "red1", alpha = 0.5), locations = cells_body( rows = (`%` < tolerance & grepl("tolerance", `tolerance test`)) | (`%` > tolerance & !grepl("tolerance", `tolerance test`))) ) %>% tab_options(table.width = pct(100))
plot_min <- min(pmin(death_discrep$diff_date, death_discrep$diff_dttm), na.rm = TRUE) plot_max <- max(pmax(death_discrep$diff_date, death_discrep$diff_dttm), na.rm = TRUE) death_discrep_date <- death_discrep %>% filter(!is.na(diff_date)) %>% ggplot(aes(x = diff_date)) + geom_density() + theme_classic() + ggtitle("Hours difference between visit end and death date") + theme( axis.text.y = element_blank(), axis.title = element_blank()) + lims(x = c(plot_min, plot_max)) death_discrep_dttm <- death_discrep %>% filter(!is.na(diff_dttm)) %>% ggplot(aes(x = diff_dttm)) + geom_density() + theme_classic() + labs(x = "time difference (hours)") + ggtitle("Hours difference between visit end and death date time") + theme( axis.text.y = element_blank(), axis.title.y = element_blank()) + lims(x = c(plot_min, plot_max)) death_discrep_date / death_discrep_dttm
Are all visits ordered sequentially:
overlapping <- overview %>% select(person_id, visit_start_datetime, visit_end_datetime) %>% group_by(person_id) %>% arrange(person_id, visit_start_datetime) %>% mutate(lead_start = quick_lead(visit_start_datetime)) %>% mutate(overlap = lead_start < visit_end_datetime) %>% mutate(overlap = if_else(is.na(overlap), FALSE, overlap)) overlapping %>% group_by(overlap) %>% tally() %>% gt()
What are the lengths of stay:
res_adm <- st[["visit_occurrence"]] %>% mutate( los = as.numeric( difftime(visit_end_datetime, visit_start_datetime, units = "days"))) %>% select(person_id, los) los_plot <- res_adm %>% filter(!is.na(los)) %>% mutate(outlier = outliers(los)) %>% mutate(los = if_else( outlier == "main", los, scales::rescale(los, to = c(max(los)-0.02*max(los), max(los))))) %>% ggplot(aes(x = los)) + geom_density() + facet_grid(cols = vars(outlier), scales = "free_x", space = "free_x") + theme_classic() + theme(strip.background = element_rect(fill = "grey80", colour = "grey80")) + labs(x = "length of stay (days)") los_tab <- ggplotGrob(los_plot) p_filtered <- gtable_filter_remove(los_tab, name = "axis-b-2", trim = TRUE) grid::grid.newpage() grid::grid.draw(p_filtered)
res_adm_tbl <- res_adm %>% mutate( `length of stay` = case_when( los < 0 ~ "negative", los == 0 ~ "zero", los > 0 ~ "positive", TRUE ~ "missing los data" ) ) %>% group_by(`length of stay`) %>% tally() %>% mutate( total = sum(n), `%` = round((n/total)*100, 0) ) %>% select(-total) res_adm_toll <- tibble( `length of stay` = c("negative", "zero", "positive", "missing los data"), tolerance = c(1, 1, 99, 1) ) res_adm_toll %>% left_join(res_adm_tbl, by = "length of stay") %>% gt() %>% tab_style( style = cell_fill(color = "red1", alpha = 0.5), locations = cells_body( rows = (`%` > tolerance & `length of stay` != "positive") | (`%` < tolerance & `length of stay` == "positive") )) %>% tab_options(table.width = pct(100))
visit_n <- overview %>% group_by(person_id) %>% tally(name = "attendance_number") %>% group_by(attendance_number) %>% tally() %>% mutate(attendance_number = cut( x = attendance_number, breaks = c(1:10, 20, 100), labels = c(paste0(1:9), "[10, 20)", "[20, 100)"), right = FALSE)) %>% group_by(attendance_number) %>% summarise(n = sum(n)) %>% mutate(attendance_number = forcats::fct_rev(attendance_number)) visit_n %>% ggplot() + geom_linerange(aes(xmin = 0, xmax = n, y = attendance_number, group = attendance_number)) + geom_point(aes(x = n, y = attendance_number)) + theme_classic() + labs(y = "attendances per patient", x = "count") + ggtitle("Number of Admissions Observed per Patient")
Are all visits ordered sequentially:
overlapping <- st[["visit_detail"]] %>% select(person_id, visit_detail_start_datetime, visit_detail_end_datetime) %>% group_by(person_id) %>% arrange(person_id, visit_detail_start_datetime) %>% mutate(lead_start = quick_lead(visit_detail_start_datetime)) %>% mutate(overlap = lead_start < visit_detail_end_datetime) %>% mutate(overlap = if_else(is.na(overlap), FALSE, overlap)) overlapping %>% group_by(overlap) %>% tally() %>% gt()
What are the lengths of stay:
cs <- tbl(ctn, in_schema(schema, "care_site")) %>% collect() %>% select( concept_id = care_site_id, concept_name = care_site_name ) %>% mutate(concept_id = as.integer(concept_id)) st[["visit_detail"]]$care_site_id <- match_concepts(st[["visit_detail"]]$care_site_id, cs) ## Temporary fix st[["visit_detail"]]$care_site_id <- if_else(is.na(st[["visit_detail"]]$care_site_id), "Out Patient Department", st[["visit_detail"]]$care_site_id) res_adm <- st[["visit_detail"]] %>% mutate( los = as.numeric( difftime(visit_detail_end_datetime, visit_detail_start_datetime, units = "days"))) %>% select(person_id, care_site_id, los) res_adm_tbl <- res_adm %>% mutate( `length of stay` = case_when( los < 0 ~ "negative", los == 0 ~ "zero", los > 0 ~ "positive", TRUE ~ "missing los data" ) ) %>% group_by(`length of stay`) %>% tally() %>% mutate( total = sum(n), `%` = round((n/total)*100, 0) ) %>% select(-total) res_adm_toll <- tibble( `length of stay` = c("negative", "zero", "positive", "missing los data"), tolerance = c(1, 1, 99, 1) ) res_adm_toll %>% left_join(res_adm_tbl, by = "length of stay") %>% gt() %>% tab_style( style = cell_fill(color = "red1", alpha = 0.5), locations = cells_body( rows = (`%` > tolerance & `length of stay` != "positive") | (`%` < tolerance & `length of stay` == "positive") )) %>% tab_options(table.width = pct(100))
The following is a transition matrix of moves within the visit_detail, fit under MLE. It does not consider transitions as a function of time spent in any location.
cs <- tbl(ctn, in_schema(schema, "care_site")) %>% collect() loc_order <- c("External (Pt Home)", "External (Pt Other)", unique(cs$care_site_name), "Died") tm <- transition_matrix(overview, st, possible_states = loc_order) plot_tm(tm, loc_order)
tm_fit <- as_tibble(tm$estimate@transitionMatrix) %>% tibble::add_column( source = rownames(tm$estimate@transitionMatrix), .before = TRUE) %>% pivot_longer(-contains("source"), names_to = "destination", values_to = "value") %>% mutate(across(c("source", "destination"), factor, levels = loc_order)) tm_fit %>% filter(source == destination, value > 0) %>% gt()
Death is an absorbing state, so we shouldn't be able to see the patient appear anywhere else afterwards
testthat::expect_true(nrow(filter(tm_fit, source == "Died", value > 0)) == 0)
The following list the outcomes for patients attending at r params$local_hospital
:
overview %>% distinct(person_id, .keep_all = TRUE) %>% filter(!is.na(visit_occurrence_id)) %>% group_by(visit_concept_id, death = !is.na(death_datetime)) %>% tally() %>% group_by(visit_concept_id) %>% mutate( total = sum(n), `%` = round((n/total)*100, 0) ) %>% gt()
Typically patients are admitted earlier in the day, and discarged later. Death often has a uniform distribution, with perhaps a slight peak in the early morning or later afternoon.
overview %>% select(visit_start_datetime, visit_end_datetime, death_datetime) %>% mutate_all(~ as.POSIXct(., tz = "UTC")) %>% filter_all(any_vars(!is.na(.))) %>% mutate_all(~ hms::as_hms(.)) %>% tidyr::pivot_longer(everything(), names_to = "time_point", values_to = "time") %>% ggplot(aes(x = time, fill = time_point)) + geom_density(alpha = 0.5) + theme_classic() + labs(x = "event time distribution", fill = "event") + theme(legend.position = "bottom")
Missing concepts are highlighted in red:
meas <- tbl(ctn, in_schema(schema, "measurement")) %>% group_by(measurement_concept_id) %>% tally() %>% collect() %>% mutate(across(everything(), as.integer)) # Measurements that are in the roadmap & the DB meas_in_ref <- meas$measurement_concept_id[meas$measurement_concept_id %in% dq_ref$concept_id] # Measurements that are not in the roadmap but appear in the DB meas_nin_ref <- meas$measurement_concept_id[!(meas$measurement_concept_id %in% dq_ref$concept_id)] # Measurements that appear in the DB, but are not in the roadmap meas_out <- dq_ref$concept_id[!(dq_ref$concept_id %in% meas$measurement_concept_id)] meas_all <- sort(unique(c(meas_in_ref, meas_nin_ref, meas_out))) meas_all <- mini_dict(ctn, schema, meas_all) meas_all <- meas_all %>% mutate( status = case_when( concept_id %in% meas_in_ref ~ "Onboarded", concept_id %in% meas_out ~ "Pending", concept_id %in% meas_nin_ref ~ "Not in roadmap" )) %>% select(concept_id, concept_name, status) %>% arrange(concept_id) %>% left_join(meas, by = c("concept_id" = "measurement_concept_id")) %>% mutate(n = if_else(is.na(n), 0L, n)) meas_all %>% gt() %>% tab_style( style = cell_fill(color = "red1", alpha = 0.5), locations = cells_body( rows = status == "Pending") ) %>% tab_options(table.width = pct(100))
This section details all measurements:
Error codes:
meas_dq <- meas_all %>% left_join(dq_ref[,c("concept_id", "short_name", "target_column", "units_concept_id", "limits")], by = "concept_id") rmd <- vector(mode = "list", length = nrow(meas_dq)) for (i in seq_along(meas_dq$concept_id)) { if (meas_dq$status[i] == "Onboarded") { rmd[[i]] <- try(knitr::knit_expand("child_measurement.Rmd", x = meas_dq$concept_id[i])) } else if (meas_dq$status[i] == "Pending") { rmd[[i]] <- try(knitr::knit_expand("child_pending.Rmd", x = meas_dq$concept_id[i])) } else { rmd[[i]] <- try(knitr::knit_expand("child_off_roadmap.Rmd", x = meas_dq$concept_id[i])) } } # rmd <- meas_dq$concept_id %>% # purrr::map( # ~ knitr::knit_expand("child_measurement.Rmd", x = .x) # ) rmd <- paste(rmd, collapse = "\n")
rendered <- knit(text = rmd, quiet = TRUE) cat(rendered, sep = "\n")
Unique specimen concepts:
if (!("specimen" %in% zero_vals)) { spec <- tally_concept(ctn, schema, "specimen", "specimen_concept_id") spec %>% arrange(desc(n)) %>% gt() %>% tab_options(table.width = pct(100)) }
Unique anatamical sites:
if (!("specimen" %in% zero_vals)) { spec_as <- tally_concept(ctn, schema, "specimen", "anatomic_site_concept_id") spec_as %>% arrange(desc(n)) %>% gt() %>% tab_options(table.width = pct(100)) }
The top 50 most common drugs given:
if (!("drug_exposure" %in% zero_vals)) { drug <- tally_concept(ctn, schema, "drug_exposure", "drug_concept_id") drug %>% arrange(desc(n)) %>% slice(1:50) %>% gt() %>% tab_options(table.width = pct(100)) }
The top 20 most common conditions:
if (!("condition_occurrence" %in% zero_vals)) { cond <- tally_concept(ctn, schema, "condition_occurrence", "condition_concept_id") cond %>% arrange(desc(n)) %>% slice(1:20) %>% gt() %>% tab_options(table.width = pct(100)) }
Condition types seen:
if (!("condition_occurrence" %in% zero_vals)) { cond_t <- tally_concept(ctn, schema, "condition_occurrence", "condition_type_concept_id") cond_t %>% arrange(desc(n)) %>% gt() %>% tab_options(table.width = pct(100)) }
Condition status seen:
if (!("condition_occurrence" %in% zero_vals)) { cond_s <- tally_concept(ctn, schema, "condition_occurrence", "condition_status_concept_id") cond_s %>% arrange(desc(n)) %>% gt() %>% tab_options(table.width = pct(100)) }
All procedures seen:
if (!("procedure_occurrence" %in% zero_vals)) { proc <- tally_concept(ctn, schema, "procedure_occurrence", "procedure_concept_id") proc %>% arrange(desc(n)) %>% gt() %>% tab_options(table.width = pct(100)) }
if (!("fact_relationship" %in% zero_vals)) { fr_types <- tally_concept(ctn, schema, "fact_relationship", "relationship_concept_id") fr_types %>% arrange(desc(n)) %>% gt() %>% tab_options(table.width = pct(100)) }
DBI::dbDisconnect(ctn)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.