R/characterise_episodes.R

Defines functions weekly_admissions report_cases_daily daily_admissions event_occurrances unit_admissions characterise_episodes characterise_spells resolve_date_time episode_varacity

Documented in characterise_episodes characterise_spells daily_admissions episode_varacity event_occurrances report_cases_daily resolve_date_time unit_admissions weekly_admissions

#' Report Weekly Admissions
#'
#' Shows the number of admissions to each each in terms of patients and episodes
#' stratified in weeks
#'
#' @param distinct_episodes from reference table
#' @return a summary containing patient and episode numbers
#'
#' @importFrom dplyr group_by summarise n_distinct
#' @importFrom magrittr %>%
#' @importFrom lubridate day month year
weekly_admissions <- function(distinct_episodes = NULL) {
  admissions <- distinct_episodes %>%
    dplyr::mutate(
      year = lubridate::year(start_date),
      month = lubridate::month(start_date, label = TRUE),
      week_of_month = as.integer(ceiling(lubridate::day(start_date) / 7))
    ) %>%
    dplyr::group_by(site, year, month, week_of_month) %>%
    dplyr::summarise(
      patients = dplyr::n_distinct(nhs_number),
      episodes = dplyr::n_distinct(episode_id)
    )
}


#' Reports Case Numbers according to day
#'
#' @param unique_cases_tbl from \code{pull_cases_all}
#'
#' @return breakdown of unique daily cases
#'
#' @importFrom dplyr group_by summarise n_distinct
#' @importFrom magrittr %>%
#' @importFrom lubridate day month year wday
report_cases_daily <- function(unique_cases_tbl = NULL) {
  cases <- unique_cases_tbl %>%
    mutate(
      year = lubridate::year(start_date),
      month = lubridate::month(start_date, label = TRUE),
      week_of_month = as.integer(ceiling(lubridate::day(start_date) / 7)),
      wday = lubridate::wday(start_date, label = TRUE)
    ) %>%
    dplyr::group_by(site, year, month, week_of_month, wday) %>%
    dplyr::summarise(
      patients = dplyr::n_distinct(nhs_number),
      episodes = dplyr::n_distinct(episode_id)
    )
}


#' Daily Admissions for each Site
#'
#' Calculates the number of admissions for each calendar day, stratified
#' by site. This is a complete table, i.e. days with 0 admissions are not
#' listed
#'
#' @param reference the reference table generated by \code{\link{make_reference}}
#' @param by_site the named site of interest as a character
#'
#' @return a tibble with the number of unique episodes admitted for a given day
#'
#' @importFrom dplyr filter mutate group_by summarise n_distinct
#' @importFrom lubridate date
daily_admissions <- function(reference = NULL, by_site = NULL) {

  admissions <- reference %>%
    filter(site == by_site) %>%
    mutate(date = lubridate::date(start_date)) %>%
    group_by(date) %>%
    summarise(episodes = n_distinct(episode_id)) %>%
    filter(episodes > 0)
}


#' Event Occurrances for each Site
#'
#' Calculates the number of event occurances for each calendar day, stratified
#' by site. Days with 0 event submissions are not listed
#'
#' @param extracted_event extracted HIC event
#' @param by_site a site code as a character vector
#'
#' @return a tibble with the number of unique episodes admitted for a given day
event_occurrances <- function(extracted_event = NULL, by_site = "UCL") {
  occurances <- extracted_event %>%
    filter(site == by_site) %>%
    mutate(date = lubridate::date(datetime)) %>%
    group_by(date) %>%
    summarise(events = n_distinct(internal_id)) %>%
    filter(events > 0)

  return(occurances)
}


#' Report Admission Numbers for each Unit
#'
#' Reports on ICNARC CMP unit codes to describe the numbers of reported cases by
#' site. This is dependent on accurate parsing of the xml schema for hic code
#' 0002 (INCARC CMP unit code). And as such, may dramatically under-report. To
#' see a comprehensive list of acual cases, see the
#' \code{\link{weekly_admissions}} function.
#'
#' @param events_table the events table
#' @param reference_table the reference table
#' @export
#'
#' @importFrom dplyr filter select collect right_join group_by summarise
#'   n_distinct rename
#' @importFrom rlang .data
#' @importFrom magrittr %>%
#'
#' @return a tibble with unique episodes and patients reported by ICU
unit_admissions <- function(events_table = NULL, reference_table = NULL) {
  unit_numbers <- events_table %>%
    filter(.data$code_name == "NIHR_HIC_ICU_0002") %>%
    select(.data$episode_id, .data$string) %>%
    collect() %>%
    right_join(reference_table, by = "episode_id") %>%
    select(.data$episode_id, .data$nhs_number,
                  .data$string, .data$site) %>%
    group_by(.data$site, .data$string) %>%
    summarise(
      patients = n_distinct(.data$nhs_number),
      episodes = n_distinct(.data$episode_id)
    ) %>%
    rename(unit = .data$string)

  return(unit_numbers)
}


#' Characterise Episodes
#'
#' There are several ways to potentially define an episode. Fundamentally, we
#' need a start and an end datetime. The start is already required to onboard
#' patients into the database and so is 100% complete. This function primary
#' serves to identify the end of an epsiode, which is often much less clearly
#' defined. Potential sources of information, in order of precidence include:
#' \itemize{
#'   \item date and time of in unit death: 0042 and 0043
#'   \item date and time of brainstem death: 0044 and 0045
#'   \item episode end date time: 0412
#'   \item date and time of body removed: 0038 and 0039
#'   \item the last regularly recorded observation from HR (0108) and SpO2
#'     (0129)
#' }
#' The goal is not to create a perfect record, as doing so drops too many cases.
#' Rather, the goal is to characterise the episode in a meaningful way. This
#' means that episodes cannot have negative episode length, and you should not
#' be able to record a vital sign after a death has occurred. If date and time
#' of death preceed the discharge time we overwrite the episode end datetime
#' with the date and time of death. The same is not true for brainstem death and
#' body removal. This would result in a situation whereby a body is kept
#' physiologically active on the ICU, but the episode is now listed as finished.
#'
#' Since a patient can only die once, checks are also performed to ensure that
#' death events, can only be linked to a single episode. This is relevent for
#' the following situations:
#' \itemize{
#'   \item More than one episode has become linked with the same death event. In
#'     this situation, the final episode is closed with an end datetime of the
#'     datetime of death. The death is then uncoupled from any prior episodes
#'   \item A patient was discharged alive, but subseqnetly died.
#' }
#' Checks are taken to ensure that episodes are non-overlapping, though they can
#' run immediately from one to the next.
#'
#' @param connection a connection to the CC-HIC database
#'
#' @return a tibble that characterises each episode. The attribute
#'   "invalid_records" contains information related to invalid records and the
#'   reason for invalidation
#' @export
#'
#' @importFrom rlang abort
#' @importFrom dplyr setdiff tbl
#' @importFrom tidyselect ends_with
characterise_episodes <- function(connection = NULL) {

  if (is.null(connection)) {
    rlang::abort("connection must be supplied")
  }

  df_extract <- tibble::tribble(
    ~codes, ~names,
    "NIHR_HIC_ICU_0411", "epi_start_dttm",
    "NIHR_HIC_ICU_0412", "src_end_dttm",
    "NIHR_HIC_ICU_0042", "death_date",
    "NIHR_HIC_ICU_0043", "death_time",
    "NIHR_HIC_ICU_0044", "bsd_date",
    "NIHR_HIC_ICU_0045", "bsd_time",
    "NIHR_HIC_ICU_0038", "body_date",
    "NIHR_HIC_ICU_0039", "body_time",
    "NIHR_HIC_ICU_0073", "nhs",
    "NIHR_HIC_ICU_0097", "outcome",
    "NIHR_HIC_ICU_0400", "bsd"
  )

  df <- extract_demographics(
    connection = connection,
    code_names = df_extract$codes,
    rename = df_extract$names)

  ## For this particular purpse, we need to add in columns that might
  ## be all NA.

  missing_names <- setdiff(df_extract$names, names(df))

  for (i in seq_along(missing_names)) {
    df[[missing_names[i]]] <- as.character(NA)
  }

  df <- df %>% mutate_at(vars(ends_with("time")), hms::as_hms)
  df <- df %>% mutate_at(vars(ends_with("date")), lubridate::ymd)
  df <- df %>% mutate_at(vars(ends_with("dttm")), lubridate::ymd_hms)

df <- df %>%
  mutate(
    death_dttm = if_else(
      !is.na(.data$death_date) & !is.na(.data$death_time),
      paste(
        format(.data$death_date),
        format(.data$death_time)), as.character(NA)
    )
  ) %>%
  mutate(death_dttm = if_else(
    !is.na(.data$death_dttm),
    lubridate::ymd_hms(.data$death_dttm),
    as.POSIXct(NA))) %>%
  mutate(
    bsd_dttm = if_else(
      !is.na(.data$bsd_date) & !is.na(.data$bsd_time),
      paste(format(.data$bsd_date),
            format(.data$bsd_time)), as.character(NA)
    )
  ) %>%
  mutate(
    bsd_dttm = if_else(
      !is.na(.data$bsd_dttm),
      lubridate::ymd_hms(.data$bsd_dttm),
      as.POSIXct(NA))) %>%
  mutate(
    body_dttm = if_else(
      !is.na(.data$body_date) & !is.na(.data$body_time),
      paste(format(.data$body_date),
            format(.data$body_time)), as.character(NA)
    )
  ) %>%
  mutate(
    body_dttm = if_else(
      !is.na(.data$body_dttm),
      lubridate::ymd_hms(.data$body_dttm), as.POSIXct(NA)))

  df <- df %>%
    select(-ends_with("date"), -ends_with("time")) %>%
    mutate(nhs_validation = if_else(
      verify_nhs(.data$nhs), 1L, 0L
    ))

  invalid_records <- df %>% filter(.data$nhs_validation == 0) %>%
    select(.data$episode_id) %>%
    mutate(reason = "invalid nhs number")

  df <- df %>%
    filter(.data$nhs_validation == 1) %>%
    select(-.data$nhs_validation)

  invalid_records <- df %>%
    filter(.data$outcome == "E") %>%
    select(.data$episode_id) %>%
    mutate(reason = "open episode") %>%
    bind_rows(invalid_records)

  df <- df %>% filter(.data$outcome != "E")

  ## Discharged alive and no src end date is invalid
  ## Disacharge alive, with an end date, automatically VALID

  df <- df %>%
    mutate(
      epi_end_dttm = case_when(
        .data$outcome == "A" & is.na(.data$src_end_dttm) ~ as.POSIXct(NA),
        .data$outcome == "A" & !is.na(.data$src_end_dttm)
          ~ .data$src_end_dttm,
        .data$outcome == "D" & !is.na(.data$death_dttm) & .data$bsd == 0 | is.na(.data$bsd)
          ~ .data$death_dttm,
        .data$outcome == "D" & bsd == 1 & !is.na(.data$death_dttm)
          ~ .data$bsd_dttm,
  TRUE ~ as.POSIXct(NA)
      )
    )

  broken_timings <- df %>%
    filter(is.na(.data$epi_end_dttm) | .data$epi_end_dttm < .data$epi_start_dttm) %>%
    select(-.data$epi_end_dttm)

  if (nrow(broken_timings) > 0) {
    recover_timings <- tbl(connection, "events") %>%
      filter(.data$episode_id %in% !!broken_timings$episode_id,
             .data$code_name %in% c("NIHR_HIC_ICU_0108", "NIHR_HIC_ICU_0129")) %>%
      select(.data$episode_id, .data$datetime) %>%
      collect() %>%
      group_by(.data$episode_id) %>%
      summarise(epi_end_dttm = max(.data$datetime)) %>%
      filter(!is.na(.data$epi_end_dttm)) %>%
      left_join(broken_timings, ., by = "episode_id")

    df <- df %>%
      filter(!(.data$episode_id %in% broken_timings$episode_id)) %>%
      bind_rows(recover_timings)

    invalid_records <- df %>%
      filter(is.na(.data$epi_end_dttm)) %>%
      select(.data$episode_id) %>%
      mutate(reason = "no end datetime") %>%
      bind_rows(invalid_records)

    invalid_records <- df %>%
      filter(!is.na(.data$epi_end_dttm)) %>%
      filter(.data$epi_end_dttm < .data$epi_start_dttm) %>%
      select(.data$episode_id) %>%
      mutate(reason = "end time prior to admission") %>%
      bind_rows(invalid_records)

    df <- df %>%
      filter(!is.na(.data$epi_end_dttm)) %>%
      filter(.data$epi_end_dttm >= .data$epi_start_dttm)
  }

  duplicate_start <- df %>%
    ungroup() %>%
    distinct(.data$nhs, .data$epi_start_dttm, .keep_all = TRUE) %>%
    select(.data$episode_id) %>%
    anti_join(df, by = "episode_id") %>%
    select(.data$episode_id) %>%
    mutate(reason = "duplicate start datetime")

  duplicate_end <- df %>%
    ungroup() %>%
    distinct(.data$nhs, .data$epi_start_dttm, .keep_all = TRUE) %>%
    select(.data$episode_id) %>%
    anti_join(df, by = "episode_id") %>%
    select(.data$episode_id) %>%
    mutate(reason = "duplicate end datetime")

  overlapping <- df %>%
    group_by(.data$nhs) %>%
    arrange(.data$nhs, .data$epi_start_dttm) %>%
    mutate(time_out = difftime(
      .data$epi_start_dttm, lag(.data$epi_end_dttm))) %>%
    ungroup() %>%
    filter(.data$time_out < 0) %>%
    select(.data$episode_id) %>%
    mutate(reason = "overlapping episode")

  invalid_records <- bind_rows(
    invalid_records,
    duplicate_start,
    duplicate_end,
    overlapping
  )

  df <- df %>%
    select(
      .data$episode_id, .data$nhs, .data$epi_start_dttm,
      .data$epi_end_dttm, .data$outcome) %>%
    rename(nhs_number = .data$nhs) %>%
    anti_join(invalid_records, by = "episode_id") %>%
    arrange(.data$nhs_number, .data$epi_start_dttm) %>%
    mutate(los_days = as.numeric(
      difftime(
        .data$epi_end_dttm, .data$epi_start_dttm, units = "hours"))/24)

  df <- left_join(
    tbl(connection, "episodes"),
    tbl(connection, "provenance"),
    by = c("provenance" = "file_id")
    ) %>%
    select(.data$episode_id, .data$site) %>%
    collect() %>%
    left_join(df, ., by = "episode_id")

  attr(df, "invalid_records") <- invalid_records

  return(df)
}


#' Characterise Spells
#'
#' Some sites have patients check out of one ICU and into another (for example
#' ICU stepdown to HDU). This checks to see if patients are discharged from one
#' unit and admitted to another wihtin a pre-defined time period, specified in
#' the minutes argument.
#'
#' This only evaluates episodes that have already been flagged as valid by the
#' \code{\link{characterise_episodes}} function.
#'
#' @param df episode length table
#' @param minutes numeric scalar to define transition period
#' @export
#'
#' @return a table with episodes reconciled as spells
characterise_spells <- function(df = NULL, minutes = 30) {
  df %>%
    arrange(.data$nhs_number, .data$epi_start_dttm) %>%
    group_by(.data$nhs_number) %>%
    mutate(time_out = .data$epi_start_dttm[-1] %>%
      difftime(
        .data$epi_end_dttm[-length(.data$epi_end_dttm)], units = "mins") %>%
      as.integer() %>%
      c(NA)) %>%
    mutate(new_spell = if_else(
      lag(.data$time_out) > minutes | is.na(lag(.data$time_out)),
      TRUE, FALSE)) %>%
    ungroup() %>%
    mutate(spell_id = cumsum(.data$new_spell)) %>%
    select(.data$spell_id, .data$episode_id, .data$nhs_number, .data$site,
           .data$epi_start_dttm, .data$epi_end_dttm, .data$los_days)
}


#' Resolve DateTime
#'
#' Many events in CC-HIC are stored in separate date and time columns/objects.
#' This function attempts to reconcile and combine these times when possible. Of
#' note, date and time information is not always stored with consistent rules.
#' For example, death date and time, are often stored for every patient in every
#' episode, even though the patient can only die once. The following are some
#' date and time pairings that denote a singular event:
#' \itemize{
#'   \item "NIHR_HIC_ICU_0042", "NIHR_HIC_ICU_0043" - Unit Death
#'   \item "NIHR_HIC_ICU_0038", "NIHR_HIC_ICU_0039" - Body Removal
#'   \item "NIHR_HIC_ICU_0044", "NIHR_HIC_ICU_0045" - Brain stem death
#'   \item "NIHR_HIC_ICU_0048", "NIHR_HIC_ICU_0049" - Treatment Withdrawal
#'   \item "NIHR_HIC_ICU_0050", "NIHR_HIC_ICU_0051" - Discharge ready
#' }
#' If a date or time component is missing, nothing is returned as the datetime
#' cannot be accurately formed.
#'
#' @param df a table that contains columns for the date and time of interest
#' @param date_code the column name for the date of interest
#' @param time_code the column name for the time of interest
#'
#' @return a table with the correct datetime pairing for the codes given
#'
#' @importFrom rlang .data sym
resolve_date_time <- function(df = NULL,
                              date_code = NULL,
                              time_code = NULL) {

  if (any(is.null(c(df, date_code, time_code)))) {
    rlang::abort("you must supply a dataframe and two column names")
  }

  dc <- rlang::enquo(date_code)
  tc <- rlang::enquo(time_code)

  df <- df %>%
    mutate(
      dttm = if_else(
        !is.na(!!dc) & !is.na(!!tc),
      paste0(format(!!dc), " ", format(!!tc)), as.character(NA)
      )
    ) %>%
    mutate(
      dttm = if_else(!is.na(dttm), lubridate::ymd_hms(dttm), as.POSIXct(NA)))

  return(df)
}

#' Summarise Non-Verifiable Episodes
#'
#' Provides an overview of the reasons for episode invalidation
#'
#' @param df the episode table returned from \code{\link{characterise_episodes}}
#' @export
#'
#' @return a tibble containing summary information for validation at episode
#'   level
episode_varacity <- function(df) {

  attr(df, "invalid_records") %>%
    group_by(reason) %>%
    tally()

}
CC-HIC/inspectEHR documentation built on Jan. 16, 2020, 11:24 p.m.