knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  echo = FALSE, warning = FALSE, error = FALSE,
  out.width = "100%",
  dpi = 300
)
library(knitr)
library(ggplot2)
library(dplyr)
library(gt)
library(patchwork)
library(markovchain)

Schema Checks

This is a mid-stream quality evaluation and does not check for schema conformity. This should be conducted separately through the application of the DECOVID-OMOP dll file constraints. It is assumed that these have been applied successfully before running this report.

Structural Checks

ctn <- setup_ctn(params)
schema <- params$schema

st <- prepare_tables(ctn, schema)
overview <- prepare_overview(st)

vo_id_true <- all(!is.na(overview$visit_occurrence_id))

Check that all visit_occurrence_id is populated: r vo_id_true

Row counts for core clinical tables:

clinical_tbls <- prepare_tally(ctn, schema)

zero_vals <- clinical_tbls %>%
  filter(n == 0) %>%
  select(table) %>%
  pull()

clinical_tbls %>%
  arrange(table) %>%
  gt() %>%
  tab_options(table.width = pct(100))

The proportion of missing data is shown. Sometimes is can be entirely appropriate. Rows are shown in red if they have a higher level of missingness than acceptable tolerances.

st %>%
  prepare_null_counts() %>%
  gt() %>%
  tab_style(
    style = cell_fill(color = "red1", alpha = 0.5),
    locations = cells_body(
      rows = `% rows missing` > tolerance)
  ) %>%
  tab_options(table.width = pct(100))

Person Table

if ("person" %in% zero_vals) {
  cat("There is no data in the person table")
} else {
  age_plot <- plot_age(overview)
  check_age <- min(overview$age, na.rm = TRUE) >= 18

  sex_plot <- plot_sex(st[["person"]])
  ethnic_plot <- plot_ethnicity(st[["person"]])

  age_plot + sex_plot + ethnic_plot
}

Madatory checks

Visit Characteristics

Date boundaries of the cohort:

if ("visit_occurrence" %in% zero_vals) {
  cat("There is no data in the visit_occurrence table")
} else {
  st[["visit_occurrence"]] %>%
    prepare_cohort_boundaries() %>%
    gt() %>%
    tab_style(
      style = cell_fill(color = "red1", alpha = 0.5),
      locations = cells_body(
        rows = tolerance == "fail")
    ) %>%
    tab_options(table.width = pct(100))
}

Number of visits stratified by visit type:

if (!("visit_occurrence" %in% zero_vals)) {
  st[["visit_occurrence"]] %>%
    group_by(visit_concept_id) %>%
    summarise(
      count = n(),
      percentage = round((n()/nrow(st[["visit_occurrence"]]))*100, 0)
    ) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}
plot_visit_profile(st[["visit_occurrence"]])

Admission codes outside the approved list:

asci <- vo_ans %>%
  filter(target_columnn == "admitting_source_concept_id") %>%
  select(concept_id, concept_name) %>%
  distinct()

st[["visit_occurrence"]] %>%
  filter(!(admitting_source_concept_id %in% asci$concept_name)) %>%
  group_by(admitting_source_concept_id) %>%
  tally() %>%
  gt() %>%
  tab_options(table.width = pct(100))
st[["visit_occurrence"]] %>%
  # mutate(admitting_source_concept_id = factor(
  #   x = admitting_source_concept_id, levels = asci$concept_name, labels = asci$concept_name
  # )) %>%
  mutate(admitting_source_concept_id = forcats::fct_infreq(admitting_source_concept_id)) %>%
  group_by(admitting_source_concept_id) %>%
  tally() %>%
  check_zero_tally(n) %>%
  ggplot(aes(x = admitting_source_concept_id)) +
  geom_point(aes(y = n, colour = .is_zero)) +
  geom_segment(aes(
    y = 0,
    yend = n,
    xend = admitting_source_concept_id)) +
  scale_colour_manual(values = c("red", "black"), drop = FALSE) +
  theme_classic() +
  labs(y = "count") +
  theme(
    legend.position = "none",
    plot.title.position = "plot",
    axis.title.y = element_blank()) +
    coord_flip() +
  ggtitle("Admission Source")

Discharge codes outside the approved list:

dtci <- vo_ans %>%
  filter(target_columnn == "discharge_to_concept_id") %>%
  select(concept_id, concept_name) %>%
  distinct()

st[["visit_occurrence"]] %>%
  filter(!(discharge_to_concept_id %in% dtci$concept_name)) %>%
  group_by(discharge_to_concept_id) %>%
  tally() %>%
  gt()
st[["visit_occurrence"]] %>%
  #   mutate(discharge_to_concept_id = factor(
  #   x = discharge_to_concept_id, levels = dtci$concept_name, labels = dtci$concept_name
  # )) %>%
  mutate(discharge_to_concept_id = forcats::fct_infreq(discharge_to_concept_id)) %>%
  group_by(discharge_to_concept_id) %>%
  tally() %>%
  check_zero_tally(n) %>%
  ggplot(aes(x = discharge_to_concept_id)) +
  geom_point(aes(y = n, colour = .is_zero)) +
  geom_segment(aes(
    y = 0,
    yend = n,
    xend = discharge_to_concept_id)) +
  scale_colour_manual(values = c("red", "black"), drop = FALSE) +
  theme_classic() +
  labs(y = "count") +
  theme(
    legend.position = "none",
    plot.title.position = "plot",
    axis.title.y = element_blank()) +
  coord_flip() +
  ggtitle("Discharge To")

All deaths inside the hospital should be aligned with the final visit end datetime. This is an OMOP convention and doesn't necessarily mean the data is bad.

ghost_visit <- st[["visit_occurrence"]] %>%
  left_join(st[["death"]] %>%
              select(person_id, death_date, death_datetime),
            by = "person_id") %>%
  filter(visit_start_date > death_date)

testthat::expect_true(nrow(ghost_visit) == 0)

All records that have a discharge recorded at "Patient died" should also have a death record in the death table.

st[["visit_occurrence"]] %>%
  filter(discharge_to_concept_id == "Patient died") %>%
  left_join(
    st[["death"]] %>%
      select(person_id, death_date, death_datetime) %>%
      mutate(death_record = TRUE),
    by = "person_id") %>%
  group_by(death_record = !is.na(death_record)) %>%
  tally() %>%
  gt()

Deviations of more than 24 hours (by datetime) or 48 hours (by date) should be investigated:

death_discrep <- overview %>%
  filter(
    !is.na(visit_concept_id) &
    (!is.na(death_datetime) | !is.na(death_date)) &
    discharge_to_concept_id == "Patient died") %>%
  group_by(person_id) %>%
  filter(max(visit_start_datetime) == visit_start_datetime) %>%
  ungroup() %>%
  select(person_id, death_datetime, death_date, visit_end_datetime) %>%
  mutate(
    diff_date = as.integer(difftime(visit_end_datetime, death_date, units = "hours")),
    diff_dttm = as.integer(difftime(visit_end_datetime, death_datetime, units = "hours")))

death_discrep %>%
  summarise(
    `death 24 hrs < visit end (dttm)` = sum(diff_dttm > 24, na.rm = TRUE),
    `death within tolerance (dttm)` = sum(diff_dttm <= 24 & diff_dttm >= -24, na.rm = TRUE),
    `death 24 hrs > visit end (dttm)` = sum(diff_dttm < -24, na.rm = TRUE),
    `death 48 hrs < visit end (date)` = sum(diff_date > 48, na.rm = TRUE),
    `death within tolerance (date)` = sum(diff_date <= 48 & diff_date >= -48, na.rm = TRUE),
    `death 48 hrs > visit end (date)` = sum(diff_date < -48, na.rm = TRUE),
  ) %>%
  tidyr::pivot_longer(everything(), names_to = "tolerance test", values_to = "n") %>%
  group_by(dttm = grepl(x = `tolerance test`, pattern = "dttm")) %>%
  mutate(
    total = sum(n),
    `%` = round((n/total)*100, 0)) %>%
  ungroup() %>%
  mutate(tolerance = c(5, 90, 5, 5, 90, 5)) %>%
  select(-dttm, -total) %>%
  gt() %>%
    tab_style(
      style = cell_fill(color = "red1", alpha = 0.5),
      locations = cells_body(
        rows = (`%` < tolerance & grepl("tolerance", `tolerance test`)) |
               (`%` > tolerance & !grepl("tolerance", `tolerance test`)))
    ) %>%
  tab_options(table.width = pct(100))
plot_min <- min(pmin(death_discrep$diff_date, death_discrep$diff_dttm), na.rm = TRUE)
plot_max <- max(pmax(death_discrep$diff_date, death_discrep$diff_dttm), na.rm = TRUE)

death_discrep_date <- death_discrep %>%
  filter(!is.na(diff_date)) %>%
  ggplot(aes(x = diff_date)) +
  geom_density() +
  theme_classic() +
  ggtitle("Hours difference between visit end and death date") +
  theme(
    axis.text.y = element_blank(),
    axis.title = element_blank()) +
  lims(x = c(plot_min, plot_max))

death_discrep_dttm <- death_discrep %>%
  filter(!is.na(diff_dttm)) %>%
  ggplot(aes(x = diff_dttm)) +
  geom_density() +
  theme_classic() +
  labs(x = "time difference (hours)") +
  ggtitle("Hours difference between visit end and death date time") +
  theme(
    axis.text.y = element_blank(),
    axis.title.y = element_blank()) +
  lims(x = c(plot_min, plot_max))

death_discrep_date / death_discrep_dttm

Visit Occurrence

Are all visits ordered sequentially:

overlapping <- overview %>%
  select(person_id, visit_start_datetime, visit_end_datetime) %>%
  group_by(person_id) %>%
  arrange(person_id, visit_start_datetime) %>%
  mutate(lead_start = quick_lead(visit_start_datetime)) %>%
  mutate(overlap = lead_start < visit_end_datetime) %>%
  mutate(overlap = if_else(is.na(overlap), FALSE, overlap))

overlapping %>%
  group_by(overlap) %>%
  tally() %>%
  gt()

What are the lengths of stay:

res_adm <- st[["visit_occurrence"]] %>%
  mutate(
    los = as.numeric(
        difftime(visit_end_datetime, visit_start_datetime, units = "days"))) %>%
  select(person_id, los)

los_plot <- res_adm %>%
  filter(!is.na(los)) %>%
  mutate(outlier = outliers(los)) %>%
  mutate(los = if_else(
    outlier == "main", los, scales::rescale(los, to = c(max(los)-0.02*max(los), max(los))))) %>%
  ggplot(aes(x = los)) +
  geom_density() +
  facet_grid(cols = vars(outlier), scales = "free_x", space = "free_x") +
  theme_classic() +
  theme(strip.background = element_rect(fill = "grey80", colour = "grey80")) +
  labs(x = "length of stay (days)")

los_tab <- ggplotGrob(los_plot)
p_filtered <- gtable_filter_remove(los_tab, name = "axis-b-2",
                                   trim = TRUE)

grid::grid.newpage()
grid::grid.draw(p_filtered)
res_adm_tbl <- res_adm %>%
  mutate(
    `length of stay` = case_when(
      los < 0 ~ "negative",
      los == 0 ~ "zero",
      los > 0 ~ "positive",
      TRUE ~ "missing los data"
    )
  ) %>%
  group_by(`length of stay`) %>%
  tally() %>%
  mutate(
    total = sum(n),
    `%` = round((n/total)*100, 0)
  ) %>%
  select(-total)

res_adm_toll <- tibble(
  `length of stay` = c("negative", "zero", "positive", "missing los data"),
  tolerance = c(1, 1, 99, 1)
)

res_adm_toll %>%
  left_join(res_adm_tbl, by = "length of stay") %>%
  gt() %>%
  tab_style(
    style = cell_fill(color = "red1", alpha = 0.5),
    locations = cells_body(
      rows = (`%` > tolerance & `length of stay` != "positive") |
             (`%` < tolerance & `length of stay` == "positive")
  )) %>%
  tab_options(table.width = pct(100))

Readmissions

visit_n <- overview %>%
  group_by(person_id) %>%
  tally(name = "attendance_number") %>%
  group_by(attendance_number) %>%
  tally() %>%
  mutate(attendance_number = cut(
    x = attendance_number,
    breaks = c(1:10, 20, 100),
    labels = c(paste0(1:9), "[10, 20)", "[20, 100)"),
    right = FALSE)) %>%
  group_by(attendance_number) %>%
  summarise(n = sum(n)) %>%
  mutate(attendance_number = forcats::fct_rev(attendance_number))

visit_n %>%
  ggplot() +
  geom_linerange(aes(xmin = 0, xmax = n, y = attendance_number, group = attendance_number)) +
  geom_point(aes(x = n, y = attendance_number)) +
  theme_classic() +
  labs(y = "attendances per patient", x = "count") +
  ggtitle("Number of Admissions Observed per Patient")

Visit Detail

Are all visits ordered sequentially:

overlapping <- st[["visit_detail"]] %>%
  select(person_id, visit_detail_start_datetime, visit_detail_end_datetime) %>%
  group_by(person_id) %>%
  arrange(person_id, visit_detail_start_datetime) %>%
  mutate(lead_start = quick_lead(visit_detail_start_datetime)) %>%
  mutate(overlap = lead_start < visit_detail_end_datetime) %>%
  mutate(overlap = if_else(is.na(overlap), FALSE, overlap))

overlapping %>%
  group_by(overlap) %>%
  tally() %>%
  gt()

What are the lengths of stay:

cs <- tbl(ctn, in_schema(schema, "care_site")) %>%
  collect() %>%
  select(
    concept_id = care_site_id,
    concept_name = care_site_name
  ) %>%
  mutate(concept_id = as.integer(concept_id))

st[["visit_detail"]]$care_site_id <- match_concepts(st[["visit_detail"]]$care_site_id, cs)

## Temporary fix 

st[["visit_detail"]]$care_site_id <-
  if_else(is.na(st[["visit_detail"]]$care_site_id),
          "Out Patient Department",
          st[["visit_detail"]]$care_site_id)

res_adm <- st[["visit_detail"]] %>%
  mutate(
    los = as.numeric(
        difftime(visit_detail_end_datetime, visit_detail_start_datetime, units = "days"))) %>%
  select(person_id, care_site_id, los)

res_adm_tbl <- res_adm %>%
  mutate(
    `length of stay` = case_when(
      los < 0 ~ "negative",
      los == 0 ~ "zero",
      los > 0 ~ "positive",
      TRUE ~ "missing los data"
    )
  ) %>%
  group_by(`length of stay`) %>%
  tally() %>%
  mutate(
    total = sum(n),
    `%` = round((n/total)*100, 0)
  ) %>%
  select(-total)

res_adm_toll <- tibble(
  `length of stay` = c("negative", "zero", "positive", "missing los data"),
  tolerance = c(1, 1, 99, 1)
)

res_adm_toll %>%
  left_join(res_adm_tbl, by = "length of stay") %>%
  gt() %>%
  tab_style(
    style = cell_fill(color = "red1", alpha = 0.5),
    locations = cells_body(
      rows = (`%` > tolerance & `length of stay` != "positive") |
             (`%` < tolerance & `length of stay` == "positive")
  )) %>%
  tab_options(table.width = pct(100))

The following is a transition matrix of moves within the visit_detail, fit under MLE. It does not consider transitions as a function of time spent in any location.

cs <- tbl(ctn, in_schema(schema, "care_site")) %>%
  collect()

loc_order <- c("External (Pt Home)",
               "External (Pt Other)",
               unique(cs$care_site_name),
               "Died")

tm <- transition_matrix(overview, st, possible_states = loc_order)
plot_tm(tm, loc_order)
tm_fit <- as_tibble(tm$estimate@transitionMatrix) %>%
    tibble::add_column(
      source = rownames(tm$estimate@transitionMatrix), .before = TRUE) %>%
    pivot_longer(-contains("source"),
                 names_to = "destination",
                 values_to = "value") %>%
    mutate(across(c("source", "destination"), factor, levels = loc_order))

tm_fit %>%
  filter(source == destination, value > 0) %>%
  gt()

Death is an absorbing state, so we shouldn't be able to see the patient appear anywhere else afterwards

testthat::expect_true(nrow(filter(tm_fit, source == "Died", value > 0)) == 0)

Outcomes

The following list the outcomes for patients attending at r params$local_hospital:

overview %>%
  distinct(person_id, .keep_all = TRUE) %>%
  filter(!is.na(visit_occurrence_id)) %>%
  group_by(visit_concept_id, death = !is.na(death_datetime)) %>%
  tally() %>%
  group_by(visit_concept_id) %>%
  mutate(
    total = sum(n),
    `%` = round((n/total)*100, 0)
  ) %>%
  gt()

Distributions of Times

Typically patients are admitted earlier in the day, and discarged later. Death often has a uniform distribution, with perhaps a slight peak in the early morning or later afternoon.

overview %>%
  select(visit_start_datetime, visit_end_datetime, death_datetime) %>%
  mutate_all(~ as.POSIXct(., tz = "UTC")) %>%
  filter_all(any_vars(!is.na(.))) %>%
  mutate_all(~ hms::as_hms(.)) %>%
  tidyr::pivot_longer(everything(), names_to = "time_point", values_to = "time") %>%
  ggplot(aes(x = time, fill = time_point)) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  labs(x = "event time distribution", fill = "event") +
  theme(legend.position = "bottom")

Measurements

Missing concepts are highlighted in red:

meas <- tbl(ctn, in_schema(schema, "measurement")) %>%
  group_by(measurement_concept_id) %>%
  tally() %>%
  collect() %>%
  mutate(across(everything(), as.integer))

# Measurements that are in the roadmap & the DB
meas_in_ref <- meas$measurement_concept_id[meas$measurement_concept_id %in% dq_ref$concept_id]

# Measurements that are not in the roadmap but appear in the DB
meas_nin_ref <- meas$measurement_concept_id[!(meas$measurement_concept_id %in% dq_ref$concept_id)]

# Measurements that appear in the DB, but are not in the roadmap
meas_out <- dq_ref$concept_id[!(dq_ref$concept_id %in% meas$measurement_concept_id)]

meas_all <- sort(unique(c(meas_in_ref, meas_nin_ref, meas_out)))
meas_all <- mini_dict(ctn, schema, meas_all)

meas_all <- meas_all %>%
  mutate(
    status = case_when(
      concept_id %in% meas_in_ref ~ "Onboarded",
      concept_id %in% meas_out ~ "Pending",
      concept_id %in% meas_nin_ref ~ "Not in roadmap"
  )) %>%
  select(concept_id, concept_name, status) %>%
  arrange(concept_id) %>%
  left_join(meas, by = c("concept_id" = "measurement_concept_id")) %>%
  mutate(n = if_else(is.na(n), 0L, n))

meas_all %>%
  gt() %>%
  tab_style(
    style = cell_fill(color = "red1", alpha = 0.5),
    locations = cells_body(
      rows = status == "Pending")
  ) %>%
  tab_options(table.width = pct(100))

This section details all measurements:

Error codes:

meas_dq <- meas_all %>%
  left_join(dq_ref[,c("concept_id", "short_name", "target_column",
                      "units_concept_id", "limits")],
            by = "concept_id")

rmd <- vector(mode = "list", length = nrow(meas_dq))

for (i in seq_along(meas_dq$concept_id)) {
  if (meas_dq$status[i] == "Onboarded") {
      rmd[[i]] <- try(knitr::knit_expand("child_measurement.Rmd",
                                 x = meas_dq$concept_id[i]))
  } else if (meas_dq$status[i] == "Pending") {
      rmd[[i]] <- try(knitr::knit_expand("child_pending.Rmd",
                                 x = meas_dq$concept_id[i]))
  } else {
      rmd[[i]] <- try(knitr::knit_expand("child_off_roadmap.Rmd",
                                 x = meas_dq$concept_id[i]))
  }

}

# rmd <- meas_dq$concept_id %>%
#   purrr::map(
#     ~ knitr::knit_expand("child_measurement.Rmd", x = .x)
#   )

rmd <- paste(rmd, collapse = "\n")
rendered <- knit(text = rmd, quiet = TRUE)
cat(rendered, sep = "\n")

Specimen Table

Unique specimen concepts:

if (!("specimen" %in% zero_vals)) {
  spec <- tally_concept(ctn, schema, "specimen", "specimen_concept_id")

  spec %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Unique anatamical sites:

if (!("specimen" %in% zero_vals)) {
  spec_as <- tally_concept(ctn, schema, "specimen", "anatomic_site_concept_id")

  spec_as %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Drug Exposure Table

The top 50 most common drugs given:

if (!("drug_exposure" %in% zero_vals)) {
  drug <- tally_concept(ctn, schema, "drug_exposure", "drug_concept_id")

  drug %>%
    arrange(desc(n)) %>%
    slice(1:50) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Condition Occurrence Table

The top 20 most common conditions:

if (!("condition_occurrence" %in% zero_vals)) {
  cond <- tally_concept(ctn, schema, "condition_occurrence", "condition_concept_id")

  cond %>%
    arrange(desc(n)) %>%
    slice(1:20) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Condition types seen:

if (!("condition_occurrence" %in% zero_vals)) {
  cond_t <- tally_concept(ctn, schema, "condition_occurrence", "condition_type_concept_id")

  cond_t %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Condition status seen:

if (!("condition_occurrence" %in% zero_vals)) {
  cond_s <- tally_concept(ctn, schema, "condition_occurrence", "condition_status_concept_id")

  cond_s %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Procedure Occurrence Table

All procedures seen:

if (!("procedure_occurrence" %in% zero_vals)) {
  proc <- tally_concept(ctn, schema, "procedure_occurrence", "procedure_concept_id")

  proc %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}

Fact Relationship

if (!("fact_relationship" %in% zero_vals)) {
  fr_types <- tally_concept(ctn, schema, "fact_relationship", "relationship_concept_id")

  fr_types %>%
    arrange(desc(n)) %>%
    gt() %>%
    tab_options(table.width = pct(100))
}
DBI::dbDisconnect(ctn)


DocEd/d.inspectEHR documentation built on Aug. 24, 2020, 1:44 p.m.