In DocEd/d.inspectEHR: Data Quality Evaluation for DECOVID

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  echo = FALSE, warning = FALSE, error = FALSE,
  out.width = "100%",
  dpi = 300
)

library(knitr)
library(ggplot2)
library(dplyr)
library(gt)
library(patchwork)
library(markovchain)

Schema Checks

This is a mid-stream quality evaluation and does not check for schema conformity. This should be conducted separately through the application of the DECOVID-OMOP dll file constraints. It is assumed that these have been applied successfully before running this report.

Structural Checks

ctn <- setup_ctn(params)
schema <- params$schema

st <- prepare_tables(ctn, schema)
overview <- prepare_overview(st)

vo_id_true <- all(!is.na(overview$visit_occurrence_id))

Check that all visit_occurrence_id is populated: r vo_id_true

Row counts for core clinical tables:

clinical_tbls <- prepare_tally(ctn, schema)

zero_vals <- clinical_tbls %>%
  filter(n == 0) %>%
  select(table) %>%
  pull()

clinical_tbls %>%
  arrange(table) %>%
  gt() %>%
  tab_options(table.width = pct(100))

The proportion of missing data is shown. Sometimes is can be entirely appropriate. Rows are shown in red if they have a higher level of missingness than acceptable tolerances.

st %>%
  prepare_null_counts() %>%
  gt() %>%
  tab_style(
    style = cell_fill(color = "red1", alpha = 0.5),
    locations = cells_body(
      rows = `% rows missing` > tolerance)
  ) %>%
  tab_options(table.width = pct(100))

Person Table

if ("person" %in% zero_vals) {
  cat("There is no data in the person table")
} else {
  age_plot <- plot_age(overview)
  check_age <- min(overview$age, na.rm = TRUE) >= 18

  sex_plot <- plot_sex(st[["person"]])
  ethnic_plot <- plot_ethnicity(st[["person"]])

  age_plot + sex_plot + ethnic_plot
}

Madatory checks

Minimum age >= 18: r check_age

Visit Characteristics

Date boundaries of the cohort:

if ("visit_occurrence" %in% zero_vals) {
  cat("There is no data in the visit_occurrence table")
} else {
  st[["visit_occurrence"]] %>%
    prepare_cohort_boundaries() %>%
    gt() %>%
    tab_style(
      style = cell_fill(color = "red1", alpha = 0.5),
      locations = cells_body(
        rows = tolerance == "fail")
    ) %>%
    tab_options(table.width = pct(100))
}

Number of visits stratified by visit type:

if (!("visit_occurrence" %in% zero_vals)) {
  st[["visit_occurrence"]] %>%
    group_by(visit_concept_id) %>%
    summarise(
      count = n(),
      percentage = round((n()/nrow(st[["visit_occurrence"]]))*100, 0)
    ) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

plot_visit_profile(st[["visit_occurrence"]])

Admission codes outside the approved list:

asci <- vo_ans %>%
  filter(target_columnn == "admitting_source_concept_id") %>%
  select(concept_id, concept_name) %>%
  distinct()

st[["visit_occurrence"]] %>%
  filter(!(admitting_source_concept_id %in% asci$concept_name)) %>%
  group_by(admitting_source_concept_id) %>%
  tally() %>%
  gt() %>%
  tab_options(table.width = pct(100))

st[["visit_occurrence"]] %>%
  # mutate(admitting_source_concept_id = factor(
  #   x = admitting_source_concept_id, levels = asci$concept_name, labels = asci$concept_name
  # )) %>%
  mutate(admitting_source_concept_id = forcats::fct_infreq(admitting_source_concept_id)) %>%
  group_by(admitting_source_concept_id) %>%
  tally() %>%
  check_zero_tally(n) %>%
  ggplot(aes(x = admitting_source_concept_id)) +
  geom_point(aes(y = n, colour = .is_zero)) +
  geom_segment(aes(
    y = 0,
    yend = n,
    xend = admitting_source_concept_id)) +
  scale_colour_manual(values = c("red", "black"), drop = FALSE) +
  theme_classic() +
  labs(y = "count") +
  theme(
    legend.position = "none",
    plot.title.position = "plot",
    axis.title.y = element_blank()) +
    coord_flip() +
  ggtitle("Admission Source")

Discharge codes outside the approved list:

dtci <- vo_ans %>%
  filter(target_columnn == "discharge_to_concept_id") %>%
  select(concept_id, concept_name) %>%
  distinct()

st[["visit_occurrence"]] %>%
  filter(!(discharge_to_concept_id %in% dtci$concept_name)) %>%
  group_by(discharge_to_concept_id) %>%
  tally() %>%
  gt()

st[["visit_occurrence"]] %>%
  #   mutate(discharge_to_concept_id = factor(
  #   x = discharge_to_concept_id, levels = dtci$concept_name, labels = dtci$concept_name
  # )) %>%
  mutate(discharge_to_concept_id = forcats::fct_infreq(discharge_to_concept_id)) %>%
  group_by(discharge_to_concept_id) %>%
  tally() %>%
  check_zero_tally(n) %>%
  ggplot(aes(x = discharge_to_concept_id)) +
  geom_point(aes(y = n, colour = .is_zero)) +
  geom_segment(aes(
    y = 0,
    yend = n,
    xend = discharge_to_concept_id)) +
  scale_colour_manual(values = c("red", "black"), drop = FALSE) +
  theme_classic() +
  labs(y = "count") +
  theme(
    legend.position = "none",
    plot.title.position = "plot",
    axis.title.y = element_blank()) +
  coord_flip() +
  ggtitle("Discharge To")

All deaths inside the hospital should be aligned with the final visit end datetime. This is an OMOP convention and doesn't necessarily mean the data is bad.

ghost_visit <- st[["visit_occurrence"]] %>%
  left_join(st[["death"]] %>%
              select(person_id, death_date, death_datetime),
            by = "person_id") %>%
  filter(visit_start_date > death_date)

testthat::expect_true(nrow(ghost_visit) == 0)

All records that have a discharge recorded at "Patient died" should also have a death record in the death table.

st[["visit_occurrence"]] %>%
  filter(discharge_to_concept_id == "Patient died") %>%
  left_join(
    st[["death"]] %>%
      select(person_id, death_date, death_datetime) %>%
      mutate(death_record = TRUE),
    by = "person_id") %>%
  group_by(death_record = !is.na(death_record)) %>%
  tally() %>%
  gt()

Deviations of more than 24 hours (by datetime) or 48 hours (by date) should be investigated:

death_discrep <- overview %>%
  filter(
    !is.na(visit_concept_id) &
    (!is.na(death_datetime) | !is.na(death_date)) &
    discharge_to_concept_id == "Patient died") %>%
  group_by(person_id) %>%
  filter(max(visit_start_datetime) == visit_start_datetime) %>%
  ungroup() %>%
  select(person_id, death_datetime, death_date, visit_end_datetime) %>%
  mutate(
    diff_date = as.integer(difftime(visit_end_datetime, death_date, units = "hours")),
    diff_dttm = as.integer(difftime(visit_end_datetime, death_datetime, units = "hours")))

death_discrep %>%
  summarise(
    `death 24 hrs < visit end (dttm)` = sum(diff_dttm > 24, na.rm = TRUE),
    `death within tolerance (dttm)` = sum(diff_dttm <= 24 & diff_dttm >= -24, na.rm = TRUE),
    `death 24 hrs > visit end (dttm)` = sum(diff_dttm < -24, na.rm = TRUE),
    `death 48 hrs < visit end (date)` = sum(diff_date > 48, na.rm = TRUE),
    `death within tolerance (date)` = sum(diff_date <= 48 & diff_date >= -48, na.rm = TRUE),
    `death 48 hrs > visit end (date)` = sum(diff_date < -48, na.rm = TRUE),
  ) %>%
  tidyr::pivot_longer(everything(), names_to = "tolerance test", values_to = "n") %>%
  group_by(dttm = grepl(x = `tolerance test`, pattern = "dttm")) %>%
  mutate(
    total = sum(n),
    `%` = round((n/total)*100, 0)) %>%
  ungroup() %>%
  mutate(tolerance = c(5, 90, 5, 5, 90, 5)) %>%
  select(-dttm, -total) %>%
  gt() %>%
    tab_style(
      style = cell_fill(color = "red1", alpha = 0.5),
      locations = cells_body(
        rows = (`%` < tolerance & grepl("tolerance", `tolerance test`)) |
               (`%` > tolerance & !grepl("tolerance", `tolerance test`)))
    ) %>%
  tab_options(table.width = pct(100))

plot_min <- min(pmin(death_discrep$diff_date, death_discrep$diff_dttm), na.rm = TRUE)
plot_max <- max(pmax(death_discrep$diff_date, death_discrep$diff_dttm), na.rm = TRUE)

death_discrep_date <- death_discrep %>%
  filter(!is.na(diff_date)) %>%
  ggplot(aes(x = diff_date)) +
  geom_density() +
  theme_classic() +
  ggtitle("Hours difference between visit end and death date") +
  theme(
    axis.text.y = element_blank(),
    axis.title = element_blank()) +
  lims(x = c(plot_min, plot_max))

death_discrep_dttm <- death_discrep %>%
  filter(!is.na(diff_dttm)) %>%
  ggplot(aes(x = diff_dttm)) +
  geom_density() +
  theme_classic() +
  labs(x = "time difference (hours)") +
  ggtitle("Hours difference between visit end and death date time") +
  theme(
    axis.text.y = element_blank(),
    axis.title.y = element_blank()) +
  lims(x = c(plot_min, plot_max))

death_discrep_date / death_discrep_dttm

Visit Occurrence

Are all visits ordered sequentially:

overlapping <- overview %>%
  select(person_id, visit_start_datetime, visit_end_datetime) %>%
  group_by(person_id) %>%
  arrange(person_id, visit_start_datetime) %>%
  mutate(lead_start = quick_lead(visit_start_datetime)) %>%
  mutate(overlap = lead_start < visit_end_datetime) %>%
  mutate(overlap = if_else(is.na(overlap), FALSE, overlap))

overlapping %>%
  group_by(overlap) %>%
  tally() %>%
  gt()

What are the lengths of stay:

res_adm <- st[["visit_occurrence"]] %>%
  mutate(
    los = as.numeric(
        difftime(visit_end_datetime, visit_start_datetime, units = "days"))) %>%
  select(person_id, los)

los_plot <- res_adm %>%
  filter(!is.na(los)) %>%
  mutate(outlier = outliers(los)) %>%
  mutate(los = if_else(
    outlier == "main", los, scales::rescale(los, to = c(max(los)-0.02*max(los), max(los))))) %>%
  ggplot(aes(x = los)) +
  geom_density() +
  facet_grid(cols = vars(outlier), scales = "free_x", space = "free_x") +
  theme_classic() +
  theme(strip.background = element_rect(fill = "grey80", colour = "grey80")) +
  labs(x = "length of stay (days)")

los_tab <- ggplotGrob(los_plot)
p_filtered <- gtable_filter_remove(los_tab, name = "axis-b-2",
                                   trim = TRUE)

grid::grid.newpage()
grid::grid.draw(p_filtered)

res_adm_tbl <- res_adm %>%
  mutate(
    `length of stay` = case_when(
      los < 0 ~ "negative",
      los == 0 ~ "zero",
      los > 0 ~ "positive",
      TRUE ~ "missing los data"
    )
  ) %>%
  group_by(`length of stay`) %>%
  tally() %>%
  mutate(
    total = sum(n),
    `%` = round((n/total)*100, 0)
  ) %>%
  select(-total)

res_adm_toll <- tibble(
  `length of stay` = c("negative", "zero", "positive", "missing los data"),
  tolerance = c(1, 1, 99, 1)
)

res_adm_toll %>%
  left_join(res_adm_tbl, by = "length of stay") %>%
  gt() %>%
  tab_style(
    style = cell_fill(color = "red1", alpha = 0.5),
    locations = cells_body(
      rows = (`%` > tolerance & `length of stay` != "positive") |
             (`%` < tolerance & `length of stay` == "positive")
  )) %>%
  tab_options(table.width = pct(100))

Readmissions

visit_n <- overview %>%
  group_by(person_id) %>%
  tally(name = "attendance_number") %>%
  group_by(attendance_number) %>%
  tally() %>%
  mutate(attendance_number = cut(
    x = attendance_number,
    breaks = c(1:10, 20, 100),
    labels = c(paste0(1:9), "[10, 20)", "[20, 100)"),
    right = FALSE)) %>%
  group_by(attendance_number) %>%
  summarise(n = sum(n)) %>%
  mutate(attendance_number = forcats::fct_rev(attendance_number))

visit_n %>%
  ggplot() +
  geom_linerange(aes(xmin = 0, xmax = n, y = attendance_number, group = attendance_number)) +
  geom_point(aes(x = n, y = attendance_number)) +
  theme_classic() +
  labs(y = "attendances per patient", x = "count") +
  ggtitle("Number of Admissions Observed per Patient")

Visit Detail

Are all visits ordered sequentially:

overlapping <- st[["visit_detail"]] %>%
  select(person_id, visit_detail_start_datetime, visit_detail_end_datetime) %>%
  group_by(person_id) %>%
  arrange(person_id, visit_detail_start_datetime) %>%
  mutate(lead_start = quick_lead(visit_detail_start_datetime)) %>%
  mutate(overlap = lead_start < visit_detail_end_datetime) %>%
  mutate(overlap = if_else(is.na(overlap), FALSE, overlap))

overlapping %>%
  group_by(overlap) %>%
  tally() %>%
  gt()

What are the lengths of stay:

cs <- tbl(ctn, in_schema(schema, "care_site")) %>%
  collect() %>%
  select(
    concept_id = care_site_id,
    concept_name = care_site_name
  ) %>%
  mutate(concept_id = as.integer(concept_id))

st[["visit_detail"]]$care_site_id <- match_concepts(st[["visit_detail"]]$care_site_id, cs)

## Temporary fix 

st[["visit_detail"]]$care_site_id <-
  if_else(is.na(st[["visit_detail"]]$care_site_id),
          "Out Patient Department",
          st[["visit_detail"]]$care_site_id)

res_adm <- st[["visit_detail"]] %>%
  mutate(
    los = as.numeric(
        difftime(visit_detail_end_datetime, visit_detail_start_datetime, units = "days"))) %>%
  select(person_id, care_site_id, los)

res_adm_tbl <- res_adm %>%
  mutate(
    `length of stay` = case_when(
      los < 0 ~ "negative",
      los == 0 ~ "zero",
      los > 0 ~ "positive",
      TRUE ~ "missing los data"
    )
  ) %>%
  group_by(`length of stay`) %>%
  tally() %>%
  mutate(
    total = sum(n),
    `%` = round((n/total)*100, 0)
  ) %>%
  select(-total)

res_adm_toll <- tibble(
  `length of stay` = c("negative", "zero", "positive", "missing los data"),
  tolerance = c(1, 1, 99, 1)
)

res_adm_toll %>%
  left_join(res_adm_tbl, by = "length of stay") %>%
  gt() %>%
  tab_style(
    style = cell_fill(color = "red1", alpha = 0.5),
    locations = cells_body(
      rows = (`%` > tolerance & `length of stay` != "positive") |
             (`%` < tolerance & `length of stay` == "positive")
  )) %>%
  tab_options(table.width = pct(100))

The following is a transition matrix of moves within the visit_detail, fit under MLE. It does not consider transitions as a function of time spent in any location.

cs <- tbl(ctn, in_schema(schema, "care_site")) %>%
  collect()

loc_order <- c("External (Pt Home)",
               "External (Pt Other)",
               unique(cs$care_site_name),
               "Died")

tm <- transition_matrix(overview, st, possible_states = loc_order)
plot_tm(tm, loc_order)

tm_fit <- as_tibble(tm$estimate@transitionMatrix) %>%
    tibble::add_column(
      source = rownames(tm$estimate@transitionMatrix), .before = TRUE) %>%
    pivot_longer(-contains("source"),
                 names_to = "destination",
                 values_to = "value") %>%
    mutate(across(c("source", "destination"), factor, levels = loc_order))

tm_fit %>%
  filter(source == destination, value > 0) %>%
  gt()

Death is an absorbing state, so we shouldn't be able to see the patient appear anywhere else afterwards

testthat::expect_true(nrow(filter(tm_fit, source == "Died", value > 0)) == 0)

Outcomes

The following list the outcomes for patients attending at r params$local_hospital:

overview %>%
  distinct(person_id, .keep_all = TRUE) %>%
  filter(!is.na(visit_occurrence_id)) %>%
  group_by(visit_concept_id, death = !is.na(death_datetime)) %>%
  tally() %>%
  group_by(visit_concept_id) %>%
  mutate(
    total = sum(n),
    `%` = round((n/total)*100, 0)
  ) %>%
  gt()

Distributions of Times

Typically patients are admitted earlier in the day, and discarged later. Death often has a uniform distribution, with perhaps a slight peak in the early morning or later afternoon.

overview %>%
  select(visit_start_datetime, visit_end_datetime, death_datetime) %>%
  mutate_all(~ as.POSIXct(., tz = "UTC")) %>%
  filter_all(any_vars(!is.na(.))) %>%
  mutate_all(~ hms::as_hms(.)) %>%
  tidyr::pivot_longer(everything(), names_to = "time_point", values_to = "time") %>%
  ggplot(aes(x = time, fill = time_point)) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  labs(x = "event time distribution", fill = "event") +
  theme(legend.position = "bottom")

Measurements

Missing concepts are highlighted in red:

meas <- tbl(ctn, in_schema(schema, "measurement")) %>%
  group_by(measurement_concept_id) %>%
  tally() %>%
  collect() %>%
  mutate(across(everything(), as.integer))

# Measurements that are in the roadmap & the DB
meas_in_ref <- meas$measurement_concept_id[meas$measurement_concept_id %in% dq_ref$concept_id]

# Measurements that are not in the roadmap but appear in the DB
meas_nin_ref <- meas$measurement_concept_id[!(meas$measurement_concept_id %in% dq_ref$concept_id)]

# Measurements that appear in the DB, but are not in the roadmap
meas_out <- dq_ref$concept_id[!(dq_ref$concept_id %in% meas$measurement_concept_id)]

meas_all <- sort(unique(c(meas_in_ref, meas_nin_ref, meas_out)))
meas_all <- mini_dict(ctn, schema, meas_all)

meas_all <- meas_all %>%
  mutate(
    status = case_when(
      concept_id %in% meas_in_ref ~ "Onboarded",
      concept_id %in% meas_out ~ "Pending",
      concept_id %in% meas_nin_ref ~ "Not in roadmap"
  )) %>%
  select(concept_id, concept_name, status) %>%
  arrange(concept_id) %>%
  left_join(meas, by = c("concept_id" = "measurement_concept_id")) %>%
  mutate(n = if_else(is.na(n), 0L, n))

meas_all %>%
  gt() %>%
  tab_style(
    style = cell_fill(color = "red1", alpha = 0.5),
    locations = cells_body(
      rows = status == "Pending")
  ) %>%
  tab_options(table.width = pct(100))

This section details all measurements:

Error codes:

before: sample taken before visit
after: sample taken after visit
no visit: sample taken without an associated visit
duplications: suspicious for duplicate samples

meas_dq <- meas_all %>%
  left_join(dq_ref[,c("concept_id", "short_name", "target_column",
                      "units_concept_id", "limits")],
            by = "concept_id")

rmd <- vector(mode = "list", length = nrow(meas_dq))

for (i in seq_along(meas_dq$concept_id)) {
  if (meas_dq$status[i] == "Onboarded") {
      rmd[[i]] <- try(knitr::knit_expand("child_measurement.Rmd",
                                 x = meas_dq$concept_id[i]))
  } else if (meas_dq$status[i] == "Pending") {
      rmd[[i]] <- try(knitr::knit_expand("child_pending.Rmd",
                                 x = meas_dq$concept_id[i]))
  } else {
      rmd[[i]] <- try(knitr::knit_expand("child_off_roadmap.Rmd",
                                 x = meas_dq$concept_id[i]))
  }

}

# rmd <- meas_dq$concept_id %>%
#   purrr::map(
#     ~ knitr::knit_expand("child_measurement.Rmd", x = .x)
#   )

rmd <- paste(rmd, collapse = "\n")

rendered <- knit(text = rmd, quiet = TRUE)
cat(rendered, sep = "\n")

Specimen Table

Unique specimen concepts:

if (!("specimen" %in% zero_vals)) {
  spec <- tally_concept(ctn, schema, "specimen", "specimen_concept_id")

  spec %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Unique anatamical sites:

if (!("specimen" %in% zero_vals)) {
  spec_as <- tally_concept(ctn, schema, "specimen", "anatomic_site_concept_id")

  spec_as %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Drug Exposure Table

The top 50 most common drugs given:

if (!("drug_exposure" %in% zero_vals)) {
  drug <- tally_concept(ctn, schema, "drug_exposure", "drug_concept_id")

  drug %>%
    arrange(desc(n)) %>%
    slice(1:50) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Condition Occurrence Table

The top 20 most common conditions:

if (!("condition_occurrence" %in% zero_vals)) {
  cond <- tally_concept(ctn, schema, "condition_occurrence", "condition_concept_id")

  cond %>%
    arrange(desc(n)) %>%
    slice(1:20) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Condition types seen:

if (!("condition_occurrence" %in% zero_vals)) {
  cond_t <- tally_concept(ctn, schema, "condition_occurrence", "condition_type_concept_id")

  cond_t %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Condition status seen:

if (!("condition_occurrence" %in% zero_vals)) {
  cond_s <- tally_concept(ctn, schema, "condition_occurrence", "condition_status_concept_id")

  cond_s %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Procedure Occurrence Table

All procedures seen:

if (!("procedure_occurrence" %in% zero_vals)) {
  proc <- tally_concept(ctn, schema, "procedure_occurrence", "procedure_concept_id")

  proc %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Fact Relationship

check that all the relationships are listed both ways around
check how many:
specimens link to measurements
how many terminal branch (metadata) tags we have

if (!("fact_relationship" %in% zero_vals)) {
  fr_types <- tally_concept(ctn, schema, "fact_relationship", "relationship_concept_id")

  fr_types %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

DBI::dbDisconnect(ctn)

DocEd/d.inspectEHR documentation built on Aug. 24, 2020, 1:44 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

DocEd/d.inspectEHR
Data Quality Evaluation for DECOVID

In DocEd/d.inspectEHR: Data Quality Evaluation for DECOVID

Schema Checks

Structural Checks

Person Table

Visit Characteristics

Visit Occurrence

Readmissions

Visit Detail

Outcomes

Distributions of Times

Measurements

Specimen Table

Drug Exposure Table

Condition Occurrence Table

Procedure Occurrence Table

Fact Relationship

R Package Documentation

Browse R Packages

We want your feedback!

DocEd/d.inspectEHR Data Quality Evaluation for DECOVID

In DocEd/d.inspectEHR: Data Quality Evaluation for DECOVID

Schema Checks

Structural Checks

Person Table

Visit Characteristics

Visit Occurrence

Readmissions

Visit Detail

Outcomes

Distributions of Times

Measurements

Specimen Table

Drug Exposure Table

Condition Occurrence Table

Procedure Occurrence Table

Fact Relationship

R Package Documentation

Browse R Packages

We want your feedback!

DocEd/d.inspectEHR
Data Quality Evaluation for DECOVID