R/smr_wrangling.R

Defines functions smr_wrangling

Documented in smr_wrangling

#' @title Initial Wrangling for HSMR data
#'
#' @description Creates the majority of necessary variables to be used in the
#' HSMR model.
#'
#'
#' @details \code{smr_wrangling} expects a \code{tibble} of data extracted from
#' SMR01. It also expects \code{tibbles} of data extracted from the GRO deaths
#' database and lookups for primary diagnosis, postcode (for deprivation) and
#' morbidities (comorbs_sum). This is the first step in a four-step process.
#' The next steps are \code{\link{smr_pmorbs}}, then \code{\link{smr_model}}
#' and finally \code{\link{smr_data}}.
#'
#'
#' @param smr01 Input tibble for admissions, see details.
#' @param gro Input tibble for deaths, see details.
#' @param pdiags Input tibble for primary diagnosis groupings lookup.
#' @param postcode Input tibble for deprivation lookup.
#' @param morbs Input tibble for the charlson index for comorbidities lookup.
#' @param spec Input tibble for for specialty groupings lookup.
#'
#' @importFrom magrittr %>%
#' @importFrom magrittr %<>%
#'
#' @export

smr_wrangling <- function(smr01, gro, pdiags, postcode, morbs, spec) {

  ### 1 - Error handling ----

  if(!tibble::is_tibble(smr01) | !tibble::is_tibble(gro) |
     !tibble::is_tibble(pdiags) | !tibble::is_tibble(postcode) |
     !tibble::is_tibble(morbs) | !tibble::is_tibble(spec)) {

    stop(paste0("All arguments provided to the function ",
                "must be in tibble format. Verify whether ",
                "an object is a tibble or not with ",
                "the tibble::is_tibble() function"))
  }

  if(!all(c("link_no", "admission_date", "discharge_date", "cis_marker",
            "postcode", "discharge_type", "sex", "admgrp", "ci_chi_number",
            "admfgrp", "ipdc", "age_grp", "quarter", "location",
            "main_condition", "other_condition_1", "other_condition_2",
            "other_condition_3", "other_condition_4", "other_condition_5",
            "specialty", "ipdc", "age_in_years", "hbtreat_currentdate",
            "year") %in% names(smr01))) {

    stop(paste0("Object smr01 does not contain the required variables. ",
                "Must contain:
                link_no
                admission_date
                discharge_date
                cis_marker
                postcode
                discharge_type
                sex
                admgrp
                ci_chi_number
                admfgrp
                ipdc
                age_in_years
                age_grp
                quarter
                year
                location
                hbtreat_currentdate
                main_condition
                other_condition_1 to other_condition_5
                specialty
                ipdc"))
  }

  if(!all(c("diag1_4", "diagnosis_group") %in% names(pdiags))){

    stop(paste0("Object pdiags does not contain the required variables.",
                "Must contain:
                diag1_4
                diagnosis_group"))

  }

  if(!all(c("morb", "wmorbs", "diag") %in% names(morbs))){

    stop(paste0("Object morbs does not contain the required variables.",
                "Must contain:
                morb
                wmorbs
                diag"))

  }

  if(!all(c("link_no", "date_of_death") %in% names(gro))) {

    stop(paste0("Object gro does not contain the required variables.",
                "Must contain:
                link_no
                date_of_death"))
  }

  if(!is.numeric(smr01$link_no)){

    stop(paste0("Link_no must be a numeric"))

  }

  if(!is.numeric(smr01$cis_marker)){

    stop(paste0("cis_marker must be a numeric"))

  }

  if(!lubridate::is.POSIXct(smr01$admission_date)){

    stop(paste0("Admission_date variable must be POSIXct of format",
                " %Y-%m-%d"))

  }

  if(!lubridate::is.POSIXct(smr01$discharge_date)){

    stop(paste0("Discharge_date variable must be POSIXct of format",
                " %Y-%m-%d"))

  }

  if(!all(1:140 %in% pdiags$diagnosis_group)){

    stop(paste0("Primary diagnosis lookup does not contain all 140 ",
                "primary diagnosis groupings."))

  }

  if(!all(1:17 %in% morbs$morb)){

    stop(paste0("Co-morbidities lookup does not contain all 17 comorbidity
                groups."))

  }

  ### 2 - Match deaths data to SMR01 ----
  # Remove duplicate records on link_no
  # The deaths file is matched on to SMR01 by link_no,
  # therefore link_no needs to be unique
  gro %<>%
    tidylog::distinct(link_no, .keep_all = TRUE)

  # Match deaths data on to SMR01 data
  smr01 %<>%
    tidylog::left_join(gro, by = "link_no") %>%

    # Sort data by link_no, cis_marker, adm_date and dis_date as per guidance
    dplyr::arrange(link_no, cis_marker, admission_date, discharge_date)


  ### 3 - Basic SMR01 processing ----

  # Create the following variables:
  # death_inhosp = 1 if the patient died in hospital during that episode of care
  # dthdays      = the number of days from admission till death
  # death30      = 1 if the patient died within 30 days of admission date
  # quarter_name = quarter name in text form
  # quarter      = quarter in number form (quarter 1 = Jan - Mar 2011)
  # location     = recodes some hospital codes for combined sites
  # diag1_4      = main condition ICD-10 code to 4 digits
  # diagx        = ICD-10 code to 3 and 4 digits, separated by an underscore
  # pdiag_grp    = matches the primary diagnosis group on the 4-digit ICD-10
  #                code
  # wcomorbsx    = matches the charlson index weighting if the relevant ICD-10
  #                codes are present in any of the five "other diagnosis"
  #                positions
  # comorbs_sum  = sum of the wcomorbsx values across the episode

  smr01 %<>%
    tidylog::mutate(death_inhosp = dplyr::if_else(dplyr::between(
      as.numeric(discharge_type), 40, 49),
      1, 0),
      dthdays = lubridate::interval(admission_date, date_of_death) /
        lubridate::days(1),
      death30 = dplyr::case_when(
        dplyr::between(dthdays, 0, 30) ~ 1,
        TRUE ~ 0),
      quarter_name = paste0(year, "Q", quarter),
      quarter = as.numeric(as.factor(quarter_name)),
      diag1_4 = substr(main_condition, 1, 4),
      diag2 = paste(substr(other_condition_1, 1, 3),
                    substr(other_condition_1, 1, 4),
                    sep = "_"),
      diag3 = paste(substr(other_condition_2, 1, 3),
                    substr(other_condition_2, 1, 4),
                    sep = "_"),
      diag4 = paste(substr(other_condition_3, 1, 3),
                    substr(other_condition_3, 1, 4),
                    sep = "_"),
      diag5 = paste(substr(other_condition_4, 1, 3),
                    substr(other_condition_4, 1, 4),
                    sep = "_"),
      diag6 = paste(substr(other_condition_5, 1, 3),
                    substr(other_condition_5, 1, 4),
                    sep = "_")) %>%

    # Create the pdiag_grp and wcomorbsx variables using joins to the morbs
    # dataset
    tidylog::left_join(tidylog::select(pdiags,
                                       pdiag_grp = diagnosis_group,
                                       diag1_4),
                       by = "diag1_4") %>%

    # Match on specialty grouping by the specialty variable
    tidylog::left_join(spec, by = "specialty") %>%

    # Fuzzy joins add the (in this case, not needed) joining variable by
    # default, so append these with "_z" so they can be easily removed
    # afterwards
    fuzzyjoin::fuzzy_left_join(tidylog::select(morbs,
                                               wcomorbs1 = wmorbs,
                                               comorbs1 = morb,
                                               diag2_z = diag),
                               by = c("diag2" = "diag2_z"),
                               match_fun = stringr::str_detect) %>%
    fuzzyjoin::fuzzy_left_join(tidylog::select(morbs,
                                               wcomorbs2 = wmorbs,
                                               comorbs2 = morb,
                                               diag3_z = diag),
                               by = c("diag3" = "diag3_z"),
                               match_fun = stringr::str_detect) %>%
    fuzzyjoin::fuzzy_left_join(tidylog::select(morbs,
                                               wcomorbs3 = wmorbs,
                                               comorbs3 = morb,
                                               diag4_z = diag),
                               by = c("diag4" = "diag4_z"),
                               match_fun = stringr::str_detect) %>%
    fuzzyjoin::fuzzy_left_join(tidylog::select(morbs,
                                               wcomorbs4 = wmorbs,
                                               comorbs4 = morb,
                                               diag5_z = diag),
                               by = c("diag5" = "diag5_z"),
                               match_fun = stringr::str_detect) %>%
    fuzzyjoin::fuzzy_left_join(tidylog::select(morbs,
                                               wcomorbs5 = wmorbs,
                                               comorbs5 = morb,
                                               diag6_z = diag),
                               by = c("diag6" = "diag6_z"),
                               match_fun = stringr::str_detect) %>%

    # Remove joining variables
    tidylog::select(-dplyr::ends_with("_z")) %>%

    # Replace cases with no match with zero
    tidyr::replace_na(list(wcomorbs1 = 0,
                           wcomorbs2 = 0,
                           wcomorbs3 = 0,
                           wcomorbs4 = 0,
                           wcomorbs5 = 0,
                           comorbs1  = 0,
                           comorbs2  = 0,
                           comorbs3  = 0,
                           comorbs4  = 0,
                           comorbs5  = 0)) %>%
    # Comorbs cleaning
    tidylog::mutate(wcomorbs1 = replace(wcomorbs1,
                                        comorbs1 == 12 &
                                          (comorbs2 != 6 & comorbs3 != 6 &
                                             comorbs4 != 6 & comorbs5 != 6),
                                        2),
                    wcomorbs2 = replace(wcomorbs2,
                                        comorbs2 == 12 &
                                          (comorbs1 != 6 & comorbs3 != 6 &
                                             comorbs4 != 6 & comorbs5 != 6),
                                        2),
                    wcomorbs3 = replace(wcomorbs3,
                                        comorbs3 == 12 &
                                          (comorbs2 != 6 & comorbs1 != 6 &
                                             comorbs4 != 6 & comorbs5 != 6),
                                        2),
                    wcomorbs4 = replace(wcomorbs4,
                                        comorbs4 == 12 &
                                          (comorbs2 != 6 & comorbs3 != 6 &
                                             comorbs1 != 6 & comorbs5 != 6),
                                        2),
                    wcomorbs5 = replace(wcomorbs5,
                                        comorbs5 == 12 &
                                          (comorbs2 != 6 & comorbs3 != 6 &
                                             comorbs4 != 6 & comorbs1 != 6),
                                        2)) %>%
    tidylog::mutate(wcomorbs1 = replace(wcomorbs1,
                                        comorbs1 == 11 & (comorbs2 == 15 |
                                                            comorbs3 == 15 |
                                                            comorbs4 == 15 |
                                                            comorbs5 == 15),
                                        0),
                    wcomorbs2 = replace(wcomorbs2,
                                        comorbs2 == 11 & (comorbs1 == 15 |
                                                            comorbs3 == 15 |
                                                            comorbs4 == 15 |
                                                            comorbs5 == 15),
                                        0),
                    wcomorbs3 = replace(wcomorbs3,
                                        comorbs3 == 11 & (comorbs2 == 15 |
                                                            comorbs1 == 15 |
                                                            comorbs4 == 15 |
                                                            comorbs5 == 15),
                                        0),
                    wcomorbs4 = replace(wcomorbs4,
                                        comorbs4 == 11 & (comorbs2 == 15 |
                                                            comorbs3 == 15 |
                                                            comorbs1 == 15 |
                                                            comorbs5 == 15),
                                        0),
                    wcomorbs5 = replace(wcomorbs5,
                                        comorbs5 == 11 & (comorbs2 == 15 |
                                                            comorbs3 == 15 |
                                                            comorbs4 == 15 |
                                                            comorbs1 == 15),
                                        0)) %>%
    tidylog::mutate(wcomorbs2 = replace(wcomorbs2,
                                        comorbs2 == comorbs1,
                                        0),
                    wcomorbs3 = replace(wcomorbs3,
                                        comorbs3 == comorbs1 |
                                          comorbs3 == comorbs2,
                                        0),
                    wcomorbs4 = replace(wcomorbs4,
                                        comorbs4 == comorbs1 |
                                          comorbs4 == comorbs2 |
                                          comorbs4 == comorbs3,
                                        0),
                    wcomorbs5 = replace(wcomorbs5,
                                        comorbs5 == comorbs1 |
                                          comorbs5 == comorbs2 |
                                          comorbs5 == comorbs3 |
                                          comorbs5 == comorbs4,
                                        0)) %>%
    tidylog::mutate(comorbs_sum = rowSums(
      tidylog::select(., dplyr::starts_with("wcomorbs")))) %>%

    # Create two further variables at CIS level:
    # epinum = the episode number for each individual episode within the CIS
    # death_inhosp_max = 1 if the patient died in hospital during any episode
    # of the CIS
    tidylog::group_by(link_no, cis_marker) %>%
    tidylog::mutate(epinum = dplyr::row_number(),
                    death_inhosp_max = max(death_inhosp)) %>%
    dplyr::ungroup() %>%

    # Sort data as per guidance and remove variables no longer required
    dplyr::arrange(link_no, cis_marker, admission_date, discharge_date) %>%
    tidylog::select(-dplyr::contains("condition"),
                    -dplyr::starts_with("wcomorbs"),
                    -comorbs1, -comorbs2, -comorbs3, -comorbs4, -comorbs5,
                    -quarter_name)

  ### 4 - SIMD ----

  # Fix formatting of postcode variable
  smr01 %<>%

    # First remove all spaces from postcode variable
    tidylog::mutate(postcode = gsub("\\s", "", postcode),

                    # Then add space (or spaces) at appropriate juncture
                    # (depending on the number of characters) to get the
                    # postcode into 7-character format
                    postcode = dplyr::case_when(
                      is.na(postcode)
                      ~ NA_character_,
                      stringr::str_length(postcode) == 5
                      ~ sub("(.{2})", "\\1  ", postcode),
                      stringr::str_length(postcode) == 6
                      ~ sub("(.{3})", "\\1 ", postcode),
                      TRUE
                      ~ postcode
                    )) %>%

    # Join to the postcode lookup
    tidylog::left_join(postcode, by = "postcode") %>%

    # Assign the appropriate SIMD value to a patient depending on the year they
    # were admitted
    tidylog::mutate(simd = dplyr::case_when(
      year >=2017 ~ simd_2020,
      year >= 2014 & year < 2017 ~ simd_2016,
      year < 2014 ~ simd_2012
    )) %>%

    # Remove the not needed year-specific SIMD variables
    tidylog::select(-c(simd_2012, simd_2016, simd_2020)) %>%
    tidylog::mutate(simd = dplyr::case_when(
      is.na(simd) & postcode == "NK010AA" ~ 6,
      is.na(simd) & postcode != "NK010AA" ~ 7,
      TRUE ~ simd
    ))

  return(smr01)

}

### END OF SCRIPT ###
Public-Health-Scotland/hsmr documentation built on June 24, 2024, 1:48 a.m.