In DocEd/d.inspectEHR: Data Quality Evaluation for DECOVID

```r}} current_concept <- {{x}} current_name <- meas_dq$concept_name[meas_dq$concept_id == current_concept] target_col <- meas_dq$target_column[meas_dq$concept_id == current_concept]

curr_title <- stringr::str_sub(current_name, 1, 30) if (nchar(curr_title) >= 30) { curr_title <- paste0(curr_title, "...") }

### `r paste0(current_concept, ": ", curr_title)`

```r}}
## Collect Just the working data we need
working <- tbl(ctn, in_schema(schema, "measurement")) %>%
  filter(measurement_concept_id %in% !! current_concept) %>%
  collect() %>%
  mutate(across(where(is.integer64), as.integer)) %>%
  mutate(across(c(contains("date"), -contains("datetime")), as.Date))

working_units <- as.integer(na.omit(unique(working$unit_concept_id)))
working_operator <- as.integer(na.omit(unique(working$operator_concept_id)))

if (length(working_units) > 0) {
  working_units_dict <- mini_dict(ctn, schema, working_units)
  label_units <- paste0(working_units_dict$concept_name, collapse = ", ")
} else {
  label_units <- "None in use!"
}

if (length(working_operator) > 0) {
  working_operator_dict <- mini_dict(ctn, schema, working_operator)
  operator_units <- paste0(working_operator_dict$concept_name, collapse = ", ")
} else {
  operator_units <- "None in use!"
}

measure_n <- nrow(working)

The number of units in use: r length(working_units).

Units in use: r label_units.

The number of operators in use: r length(operator_units).

Operators in use: r operator_units.

```r}} working <- left_join( working %>% select( person_id, measurement_id, measurement_date, measurement_datetime, value_as_number, value_as_concept_id, visit_occurrence_id), st[["visit_occurrence"]] %>% select(visit_occurrence_id, visit_start_datetime, visit_end_datetime), by = "visit_occurrence_id")

Boundaries

boundaries <- working %>% summarise( before = sum(measurement_datetime < visit_start_datetime, na.rm = TRUE), after = sum(measurement_datetime > visit_end_datetime, na.rm = TRUE) ) %>% tidyr::pivot_longer(everything(), names_to = "condition", values_to = "count")

Duplications

dup <- working %>% select(.data$person_id, .data$measurement_datetime, .data[[target_col]]) %>% add_count( .data$person_id, .data$measurement_datetime, .data[[target_col]], name = "dupe_count") %>% filter(dupe_count > 1) %>% nrow()

dup <- tibble::tribble( ~condition, ~count, "duplications", dup )

miss <- tibble::tribble( ~condition, ~count, "no visit", sum(is.na(working$visit_occurrence_id)) )

bind_rows(boundaries, dup, miss) %>% mutate( total = measure_n, % = round((count/total)*100, 0), tolerance = c(1, 1, 1, 100) ) %>% gt() %>% tab_style( style = cell_fill(color = "red1", alpha = 0.5), locations = cells_body( rows = % > tolerance) ) %>% tab_options(table.width = pct(100))

```r}}
# value distribution
if (target_col == "value_as_number") {
  val_dist <- working %>%
    select(value_as_number) %>%
    filter(!is.na(value_as_number))

  if (nrow(val_dist) == 0) {
    val_dist <- "bad"
  } else {
    val_dist <- val_dist %>%
      ggplot(aes(x = value_as_number)) +
      geom_density() +
      theme_classic() +
      labs(x = label_units)
  }
} else {
  opt <- dq_ans[dq_ans$concept_id == current_concept, c("option_concept_id", "option_name")]

  val_dist <- working %>%
    select(value_as_concept_id) %>%
    filter(!is.na(value_as_concept_id))

  if (nrow(val_dist) == 0) {
    val_dist <- "bad"
  } else {
    val_dist <- val_dist %>%
      group_by(value_as_concept_id) %>%
      tally() %>%
      mutate(value_as_concept_id = factor(
        value_as_concept_id,
        levels = opt$option_concept_id,
        labels = opt$option_name
      )) %>%
      ggplot(aes(
        x = value_as_concept_id)) +
      geom_point(aes(y = n)) +
      geom_segment(aes(
        y = 0,
        yend = n,
        xend = as.factor(value_as_concept_id))) +
      theme_classic() +
      labs(y = "number of respones", x = "categories") +
      theme(axis.title.y = element_blank()) +
      coord_flip()
  }
}

# timing distribution
timing_dist <- working %>%
  select(measurement_datetime) %>%
  filter(!is.na(measurement_datetime))

if (nrow(timing_dist) == 0) {
  timing_dist <- "bad"
} else {
  timing_dist <- timing_dist %>%
    mutate(measurement_datetime = hms::as_hms(measurement_datetime)) %>%
    ggplot(aes(x = measurement_datetime)) +
    geom_density() +
    theme_classic() +
    labs(x = "time of sample")
}


# samples over time
sample_timing <- working %>%
  select(measurement_date) %>%
  filter(!is.na(measurement_date))

if (nrow(sample_timing) == 0) {
  sample_timing <- "bad"
} else {
  sample_timing <- sample_timing %>%
    group_by(measurement_date) %>%
    tally() %>%
    ggplot(aes(x = measurement_date, y = n)) +
    geom_path() +
    theme_classic() +
    labs(x = "measurement date", y = "daily samples")
}

cant_plot <- any(c(class(sample_timing), class(timing_dist), class(val_dist)) %in% "character")

if (!cant_plot) {
  (val_dist | timing_dist) / sample_timing
}