create_smr_data.R

#########################################################################
# Name of file - create_hsmr_data.R
# Data release - Quarterly HSMR publication
# Original Authors - David Caldwell
# Orginal Date - February 2018
#
# Type - Reproducible Analytical Pipeline
# Written/run on - RStudio server
# Version of R - 3.5.1
#
# Description - Extracts SMR01 & deaths data and carries out required
# manipulations and modelling to create the minimal tidy datasets for HSMR
#
# Approximate run time - xx minutes
#########################################################################


### SECTION 1 - HOUSE KEEPING ----

### 1 - Load environment file ----
source("setup_environment.R")

# Define the database connection with SMRA
smra_connect  <- suppressWarnings(dbConnect(odbc(),  dsn="SMRA",
                                            uid=.rs.askForPassword("SMRA Username:"),
                                            pwd=.rs.askForPassword("SMRA Password:")))

### 2 - Read in lookup files ----

# Primary Diagnosis Groupings
pdiag_grp_data <- read_csv(here("reference_files",
                                 "diag_grps_lookup_updated.csv")) %>%
  select(diag1_4, DIAGNOSIS_GROUP) %>%
  clean_names()


# ICD-10 codes, their Charlson Index Groupings and CIG weights
morbs <- read_csv(here("reference_files", "morbs.csv")) %>%
  # Gather ICD codes into a single column
  gather(code, diag, diag_3:diag_4) %>%
  select(-code) %>%
  # Remove all NAs from the ICD-10 column
  filter(!is.na(diag))

# Postcode lookups for SIMD 2020, 2016 and 2012
# These files will be combined, so create a year variable in each one, to allow
# them to be differentiated from one another
simd_2020 <- readRDS(paste0(plat_filepath,
                              "lookups/Unicode/Deprivation",
                              "/postcode_2024_1_simd2020v2.rds")) %>%
  select(pc7, simd2020v2_sc_quintile) %>%
  rename(postcode = pc7,
         simd = simd2020v2_sc_quintile) %>%
  mutate(year = "simd_2020")

simd_2016 <- readRDS(paste0(plat_filepath,
                              "lookups/Unicode/Deprivation",
                              "/postcode_2019_2_simd2016.rds")) %>%
  select(pc7, simd2016_sc_quintile) %>%
  rename(postcode = pc7,
         simd = simd2016_sc_quintile) %>%
  mutate(year = "simd_2016")

simd_2012 <- readRDS(paste0(plat_filepath,
                              "lookups/Unicode/Deprivation/",
                              "postcode_2016_1_simd2012.rds")) %>%
  select(pc7, simd2012_sc_quintile) %>%
  rename(postcode = pc7,
         simd = simd2012_sc_quintile) %>%
  mutate(year = "simd_2012")

# Combine postcode lookups into a single dataset
simd_all <- bind_rows(simd_2020, simd_2016, simd_2012) %>%
  pivot_wider(names_from = year, values_from = simd)

rm(simd_2020, simd_2016, simd_2012) # saving a bit of space

### SECTION 2 - DATA EXTRACTION AND MANIPULATION ----

### 1 - Extract data Extract deaths and SMR01 data from SMRA databases ----

# Deaths data
deaths  <- as_tibble(dbGetQuery(smra_connect,
                                query_gro_smr(extract_start = start_date))) %>%
  clean_names()

# SMR01 data
smr01 <- as_tibble(dbGetQuery(smra_connect,
                              query_smr01(extract_start = start_date,
                                          extract_end = end_date))) %>%
  clean_names()

# Prior morbidities within previous 1 & 5 years data
data_pmorbs <- as_tibble(dbGetQuery(smra_connect,
                                    query_smr01_minus5(
                                      extract_start = start_date_5,
                                      extract_end = end_date))) %>%
  clean_names()

# Save basefiles
save_file(deaths, "GRO_deaths", "base_files", "rds",  dev = F, overwrite = F)
save_file(smr01, "SMR01_basefile", "base_files", "rds",  dev = F, overwrite = F)
save_file(data_pmorbs, "SMR01_minus_5_basefile", "base_files", "rds",
          dev = F, overwrite = F)


# 2 - Pipeline ----
# smr01    = The SMR01 extract used to produce SMR data. This should contain
#            ONLY the quarters being published
# gro      = The deaths extract used to produce SMR data. This should contain
#            ALL data AFTER the start of the first publication quarter
# pdiags   = The primary diagnosis lookup dataframe/tibble
# postcode = The postcode lookup dataframe for SIMD matching
# morbs    = The lookup tibble for morbidity groupings
# spec     = The specialty grouping lookup tibble
#
# This function does most of the wrangling required for producing HSMR
smr01 <- smr_wrangling(smr01    = smr01,
                       gro      = deaths,
                       pdiags   = pdiag_grp_data,
                       postcode = simd_all,
                       morbs    = morbs,
                       spec     = specialty_group)

# smr01        = The output from smr_wrangling()
# smr01_minus5 = The SMR01 extract used to calculate the prior morbidities.
#                This should contain all publication quarters plus an extra
#                five years at the start
# morbs        = The lookup tibble for morbidity groupings
#
# This function does the final bits of wrangling required for HSMR. These
# are done separately from the rest because they are quite resource-heavy
# and prone to crashing
smr01 <- smr_pmorbs(smr01        = smr01,
                    smr01_minus5 = data_pmorbs,
                    morbs        = morbs)

# smr01      = The output from smr_pmorbs()
# base_start = The beginning of the baseline period
# base_end   = The end of the baseline period
# index      = Indicating whether the patient indexing is done quarterly
#              or annually
# save_model = Boolean flag for saving out the full model as an RDS
#
# This function runs the risk model and appends the probability of death on
# to the SMR01 extract. As of April 2024 the full model is written out to an RDS
# in base_files. This takes quite a while; smr_model() is likely to run
# for ~ 90 minutes. Boolean flag can be set to F to skip this step.

smr01 <- smr_model(smr01      = smr01,
                   base_start = start_date,
                   base_end   = base_end,
                   index      = "Y",
                   save_model = T)

# smr01 = The output from smr_model()
# index = Indicating whether the patient indexing is done quarterly
#         or annually
#
# This function aggregates the data down into quarterly/annual SMR figures
smr_data <- smr_data(smr01 = smr01,
                     index = "Y",
                     hospital_lookup = hospitals)


### 3 - Save data ----
# This is the level 3 caselisting file
save_file(smr01 %>% filter(admission_date >= start_date + years(2)) %>%
                    change_hbcodes(version_to = "14", code_cols = "hbtreat_currentdate"),
          "SMR-with-predprob", "base_files", "csv", dev = F, overwrite = F)

save_file(smr_data, "SMR-data", "output", "csv", dev = F, overwrite = F)


# File for dashboard, bringing previous publication data and adding new period
smr_data_dash <- readr::read_csv(paste0(data_folder, previous_pub,
                                 "/output/", previous_pub, "_SMR-data_dashboard.csv")) %>%
                mutate(completeness_date = paste0(substr(completeness_date,7,10),
                                                  "-", substr(completeness_date,4,5),
                                                  "-", substr(completeness_date,1,2)))

smr_data_dash <- smr_data %>%
  # Required locations specified in setup_environment
  filter(location %in% locations_filter) %>%
  change_hbcodes(version_to = "14") %>%  # Tableau uses 2014 codes, but code produces 2019
  rbind(smr_data_dash) # add old dashboard data, already uses 2014 codes and has correct location filtering

# Used for the offline dashboard
save_file(smr_data_dash, "SMR-data_dashboard", "output", "csv", dev = F, overwrite = F)

# Create TDE files
# yyyy-mm-dd_SMR-data_dashboard.csv – Discovery HSMR Level 1 SMR & Discovery HSMR Level 1 SMR Live
save_file(smr_data_dash, "Discovery HSMR Level 1 SMR", out_folder = "tde",
          type = "xlsx", dev = F, overwrite = F)
save_file(smr_data_dash, "Discovery HSMR Level 1 SMR Live", out_folder = "tde",
          type = "xlsx", dev = F, overwrite = F)


# Create file for RShiny public dashboard
# Update the HB codes back to the 2019 codes
public_dash <- smr_data_dash %>%
  change_hbcodes(version_to = "19") %>%
  # Create a variable that is used to sort time periods
  mutate(year = stringr::word(period_label, 2, 2),
         month = sprintf("%02d", match(stringr::word(period_label, 1, 1), month.name)),
         order_var = paste0(year, "-", month))

# Create a Scotland row to add after funnel limits have been calculated
public_dash_scot <- public_dash %>%
  filter(location == 'Scot' & period == 3)

# Create warning and control confidence limits for funnel plot
public_dash_hosps <- public_dash %>%
  filter(period == 3 & location %in% c(hosp_filter)) %>%
  mutate(st_err = round_half_up(sqrt(1/round_half_up(pred, 8)), 8),
         z = if_else(location_type == "hospital",
                     round_half_up(((round_half_up(smr, 8) - 1)/round_half_up(st_err,8)), 8),
                     0)) %>%
  mutate(z_max = max(z),
         z_min = min(z),
         z_flag = case_when(z == z_max ~ 1,
                            z == z_min ~ -1,
                            TRUE ~ 0),
         z = if_else(z == z_max | z == z_min, 0, z),
         z_max = max(z),
         z_min = min(z),
         z = case_when(z_flag == 1 ~ z_max,
                       z_flag == -1 ~ z_min,
                       TRUE ~ z),
         z_flag = if_else(z != 0, 1, 0),
         w_score = round_half_up(sqrt(sum(round_half_up(z * z, 8))/sum(z_flag)),8)) %>%
  # Calculate funnel limits for funnel plot
  mutate(uwl = 1 + 1.96 * round_half_up(st_err * w_score,8),
         ucl = 1 + 3.09 * round_half_up(st_err * w_score,8),
         lwl = 1 - 1.96 * round_half_up(st_err * w_score,8),
         lcl = 1 - 3.09 * round_half_up(st_err * w_score,8)) %>%

  # Create flag for where hospital sits on funnel plot
  mutate(flag = case_when(smr > ucl ~ "1",
                          smr < lcl ~ "2",
                          smr > uwl & smr <= ucl ~ "3",
                          smr <lwl & smr >= lcl ~ "4",
                          TRUE ~ "0"))

# Keep only variables that are required for dashboard
public_dash_all <- bind_rows(public_dash_scot, public_dash_hosps) %>%
  select(hb, location, location_name, order_var, period_label, deaths, pred,
                 pats, smr, crd_rate, smr_scot, death_scot, pats_scot,
                 uwl, ucl, lwl, lcl, flag) %>%
  arrange(order_var, location_name)


# Save into output folder
save_file(public_dash_all, "SMR_data_public_dashboard", "output", "rds", dev = F, overwrite = F)

### 4 - Model diagnostic -----------------------------------

rmarkdown::render("roc/model_checks.Rmd")


### END OF SCRIPT ###
Public-Health-Scotland/hsmr documentation built on June 24, 2024, 1:48 a.m.