R/chars_functions.R

Defines functions chars_icd_ccs_count chars_icd_ccs chars_injury_matrix_count chars_injury_matrix

Documented in chars_icd_ccs chars_icd_ccs_count chars_injury_matrix chars_injury_matrix_count

# chars_injury_matrix() ----
#' View available combinations of CHARS injury mechanisms and intents
#'
#' @description
#' Function to view all combinations of the Comprehensive
#' Hospital Abstract Reporting System (CHARS) Injury Matrix mechanisms and
#' intents available in rads.
#'
#' Generates a table with two columns, \code{mechanism} & \code{intent}.
#' Use it to identify the combinations of \code{mechanism} & \code{intent} that
#' you want to use in \code{chars_injury_matrix_count}.
#'
#' @details
#' This function provides the terms used by the hospitalization function
#' \code{chars_injury_matrix_count} and may not be the same as those used with
#' \code{death_injury_matrix_count}.
#'
#' @note
#' This function does not take any arguments.
#'
#' This function will return all available categories, some of which are specific
#' to ICD9-cm (2012-2015) and some of which are specific to ICD10-cm (2016+).
#'
#' @source
#' Derived from columns beginning with `mechanism_` and `intent_` in
#' Azure Server 16 (chars.final_analytic) that were created during the CHARS
#' ETL process.
#'
#' @return
#' A data.table with two columns: \code{mechanism} & \code{intent}. The number
#' of rows are determined dynamically by scanning the data available in SQL.
#'
#' @references
#' WA DOH CHAT: \url{https://secureaccess.wa.gov/doh/chat/Content/FilesForDownload/CodeSetDefinitions/Hospitalization%20Injury%20Matrix%20ICD10CM.xlsx}
#'
#' @export
#'
#' @name chars_injury_matrix
#'
#' @examples
#' # Save and view table as a data.table named 'blah'
#' blah <- chars_injury_matrix()
#' print(blah)
#'
#' @import data.table rads.data
#'
chars_injury_matrix<- function(){
  # Global variables used by data.table declared as NULL here to play nice with devtools::check() ----
  chars_injury_matrix_list <- mechanism <- intent <-  NULL

  # get column names from SQL table
  con <- validate_hhsaw_key('hhsaw')
  chars.names <- names(DBI::dbGetQuery(con, "SELECT TOP (0) * FROM [chars].[final_analytic]"))

  # create a matrix of every possible mechanism and intent
  chars_injury_matrix_list = unique(data.table::setDT(expand.grid(
    mechanism = sort(c("motor_vehicle_traffic", setdiff(gsub("^mechanism_", "", grep("^mech", chars.names, value = T)), ""))), # motor vehicle is not in data, but will combine mvt variables
    intent = setdiff(gsub("^intent_", "", grep("^intent", chars.names, value = T)), '') #
  )))

  return(chars_injury_matrix_list)
}

# chars_injury_matrix_count() ----
#' Generate injury matrix counts from line-level Comprehensive Hospital
#' Abstract Reporting System (CHARS) data
#'
#' @description
#' Generate hospitalization counts for an injury matrix
#' specifying the intent and mechanism of injury. Needs line-level CHARS data
#' with columns beginning with `mechanism_` and `intent_`. Covers both ICD9-cm
#' (2012-2015) and ICD10-cm (2016+).
#'
#' @param ph.data a data.table or data.frame. Must contain CHARS data structured
#' with one person per row and predetermined `mechanism_` and `intent_` columns.
#' In other words, this ph.data should come from use of \code{get_data_chars()}.
#'
#' The default is \code{ph.data = NULL}
#'
#' @param intent a character vector. It specifies the
#' intent of the injury related hospitalization that you want returned (e.g.,
#' "assault", "intentional", "unintentional", etc.). "none"
#' will ignore the intent and only return the mechanism of the injury leading
#' to hospitalization.
#'
#' **NOTE**
#' You do not have to type the entire keyword for the intent, a
#' partial string match is sufficient and is case insensitive. E.g.,
#' \code{intent = c("intent")} would return both "intentional" and "unintentional" and
#' \code{intent = c("un")} would return both "undetermined" and "unintentional".
#'
#' The default is \code{intent = '*'}, which selects all possible intents
#'
#' @param mechanism a character vector. It specifies the
#' mechanism of the injury related hospitalization that you want returned
#' (e.g., "cut_pierce", "drowning", "fall", "firearm", etc.). "none" will ignore
#' the mechanism and only return the intent of the injury leading to hospitalization.
#'
#' To see the complete list of mechanisms, type
#' \code{unique(chars_injury_matrix()$mechanism)} in your
#' R console.
#'
#' **NOTE**
#' You do not have to type the entire keyword for the mechanism, a
#' partial string match is sufficient and is case insensitive. E.g.,
#' \code{mechanism = c("fire")} would return both "firearm" and
#' "fire_burn".
#'
#' The default is \code{mechanism = '*'}, which selects all possible mechanisms
#'
#' @param group_by a character vector of indeterminate length. This is used to
#' specify all the variables by which you want to group (a.k.a. stratify) the
#' results. For example, if you specified \code{group_by = c('chi_sex',
#' 'chi_race_6')}, the results would be stratified by each combination of sex
#' and race.
#'
#' The default is \code{group_by = NULL}
#'
#' @param def a character vector of length one, limited to 'narrow' or 'broad'.
#' It specifies whether you want to use the CDC's recommended 'narrow' approach,
#' which requires that the \strong{principal diagnosis} of an injury
#' hospitalization be a nature-of-injury ICD10-CM code. Or, alternatively, the
#' 'broad' definition that searches all available diagnosis fields on the
#' hospital discharge record (there can be a maximum of 54 diagnosis fields in
#' CHARS data).
#'
#' **NOTE**
#' ph.data must contain the columns named \code{injury_nature_narrow} &
#' \code{injury_nature_broad}.
#'
#' The default is \code{def = 'narrow'}
#'
#' @param primary_ecode a logical vector of length one. It specifies whether you
#' want to limit the analysis to using just the primary ecode (i.e., the
#' \code{injury_ecode} variable), rather than all available ecodes.
#'
#' As of RADS 1.1.7.7, the only valid argument is TRUE (T). Those wanting to perform
#' an analysis with other ecodes would need to perform a custom analysis in SQL using
#' the `chars.stage_diag` & `chars.stage_ecode` tables.
#'
#' The default is \code{primary_ecode = TRUE}
#'
#' @param kingco a logical vector of length one. It specifies whether you want to
#' limit the analysis to King County. Note that this only works when you have
#' the column `chi_geo_kc` imported from the \code{get_data_chars()} function.
#'
#' The default is \code{kingco = TRUE}
#'
#' @details
#' Since the injury analysis uses many columns, we suggest that you obtain
#' ph.data with get_data_chars(cols = NA), rather than trying to specify the
#' columns of interest.
#'
#' This function will only count injuries where the type of injury (from
#' ICD10-CM codes) AND the corresponding external cause information (from an
#' ecode) is present. To examine injuries that do not have an external cause,
#' please examine the analytic data directly.
#'
#' See Jeremy Whitehurst's documentation for the CHARS ETL to understand exactly
#' which codes are mapped to which intents and mechanisms.
#'
#' @return
#' The function returns a data.table with a minimum of three columns:
#' \code{mechanism}, \code{intent}, & \code{hospitalizations}. Any
#' \code{group_by} variables would also have their own columns.
#'
#' The function default is to return the matrix of all intents and mechanisms
#' of injury related hospitalizations. You can choose to only return the intent
#' or only return the mechanism. If you set both to "none", you will receive a
#' summary of all injury hospitalizations without regard to the intent or mechanism.
#'
#' @references
#' WA DOH CHAT: \url{https://secureaccess.wa.gov/doh/chat/Content/FilesForDownload/CodeSetDefinitions/Hospitalization%20Injury%20Matrix%20ICD10CM.xlsx}
#'
#' @export
#'
#' @name chars_injury_matrix_count
#'
#' @examples
#' # example: 2019 King County hospitalizations due to intentional injury, by sex
#' \donttest{
#' blah = get_data_chars(year = 2019, kingco = TRUE)
#' myresult <- chars_injury_matrix_count(ph.data = blah,
#'                                       intent = '^intentional',
#'                                       mechanism = 'none',
#'                                       group_by = c('chi_sex'))
#' print(myresult)
#' }
#' @import data.table rads.data
#'
chars_injury_matrix_count<- function(ph.data = NULL,
                                     intent = "*",
                                     mechanism = "*",
                                     group_by = NULL,
                                     def = 'narrow',
                                     primary_ecode = T,
                                     kingco = T){
  # Global variables used by data.table declared as NULL here to play nice with devtools::check() ----
    chi_geo_kc <- icd10 <- bingo <- hospitalizations <- icd10cm <- icd10cm_desc <- NULL
    var.names <- injury_nature_narrow <- injury_ecode <- injury_nature_broad <- NULL
    injury_intent <- injury_mechanism <- NULL
    chi_geo_kc <- NULL
    yage4 <- age <- age6 <- geo_type <- geo_id <- pov200grp <- race4 <- race3 <- NULL
    chi_race_aic_hisp <- race3_hispanic <- NULL
    mechanism_motor_vehicle_traffic <- NULL

  # Check arguments ----
    # ph.data ----
      if (missing(ph.data) || !is.data.frame(ph.data)) {
        stop("\n\U0001f47f `ph.data` must be the unquoted name of a data.frame or data.table")
      }
      if (!data.table::is.data.table(ph.data)) {
        data.table::setDT(ph.data)
      }
      ph.data <- data.table::copy(ph.data) # to prevent changing of original by reference

    # seq_no (unique identifier) ----
        if(!'seq_no' %in% names(ph.data)){
          stop("\U2620\U0001f47f\U2620\nph.data must contain the 'seq_no' column, which is the unique identifier.")}
        if('seq_no' %in% names(ph.data) & length(unique(ph.data$seq_no)) != nrow(ph.data)){
          stop("\U2620\U0001f47f\U2620\nThe 'seq_no' is a unique patient identifier and should not be repeated across rows.")}

    # intent ----
        if("none" %in% intent & length(intent) != 1){
          stop("\n\U0001f47f The intent 'none' cannot be specified with any other intents.")
        }
        if("*" %in% intent & length(intent) != 1){
          stop("\n\U0001f47f The intent '*' cannot be specified with any other intents.")
        }

        if(nrow(setDT(quiet(list_dataset_columns('chars')))[grepl('^intent_', var.names)]) >
           length(grep('^intent_', names(ph.data), value = T))){
          mi_col_intent <- setdiff(setDT(list_dataset_columns('chars'))[grepl('^intent_', var.names)]$var.names, grep('^intent_', names(ph.data), value = T))
          warning(paste0("\n\u26A0\ufe0f ph.data is missing the following `intent_**` columns: ", paste0(mi_col_intent, collapse = ', '), ". This may impact the completeness of your results."))
        }

    # mechanism ----
        if("none" %in% mechanism & length(mechanism) != 1){
          stop("\n\U0001f47f The mechanism 'none' cannot be specified with any other mechanisms.")
        }
        if("*" %in% mechanism & length(mechanism) != 1){
          stop("\n\U0001f47f The mechanism '*' cannot be specified with any other mechanisms.")
        }

        if(nrow(setDT(quiet(list_dataset_columns('chars')))[grepl('^mechanism_', var.names)]) >
           length(grep('^mechanism_', names(ph.data), value = T))){
          mi_col_mechanism <- setdiff(setDT(list_dataset_columns('chars'))[grepl('^mechanism_', var.names)]$var.names, grep('^mechanism_', names(ph.data), value = T))
          warning(paste0("\n\u26A0\ufe0f ph.data is missing the following `mechanism_**` columns: ", paste0(mi_col_mechanism, collapse = ', '), ". This may impact the completeness of your results."))
        }

    # group_by ----
        if(!is.null(group_by)){
          if(!inherits(group_by, 'character')){
            stop("\n\U0001f47f `group_by` must either be NULL or specify a character vector of column names.")}
          if(length(setdiff(group_by, names(ph.data))) > 0){
            stop(paste0("\n\U0001f47f `group_by` contains the following column names which do not exist in ph.data: ", paste(setdiff(group_by, names(ph.data)), collapse = ', ') ))}
        }

    # def ----
        if(!def %in% c('narrow', 'broad') | length(def) != 1){
          stop("\n\U0001f47f `def` can only have ONE of two values, 'narrow' or 'broad'.")
        }
        if(!"injury_nature_narrow" %in% names(ph.data) | !is.logical(ph.data$injury_nature_narrow)){
          stop("\n\U0001f47f `injury_nature_narrow` must exist exist in ph.data and must be of type logical (TRUE|FALSE).")
        }
        if(!"injury_nature_broad" %in% names(ph.data) | !is.logical(ph.data$injury_nature_broad)){
          stop("\n\U0001f47f `injury_nature_broad` must exist exist in ph.data and must be of type logical (TRUE|FALSE).")
        }

    # primary_ecode ----
        if(!is.logical(primary_ecode)){stop("\n\U0001f47f `primary_ecode` must be a logical vector of length 1, i.e,. TRUE or FALSE.")}
        if(isFALSE(primary_ecode)){stop(paste0("\n\U1F6D1 \U2620 \U0001f47f\n",
                                        " You set 'primary_ecode = F'. This is no longer a valid option. If you want to use other ecodes\n",
                                        " you will have to perform a custom analysis using [chars].[stage_diag] & [chars].[stage_ecode]."))}

    # kingco ----
        if(!is.logical(kingco)){stop("\n\U0001f47f `kingco` must be a logical vector of length 1, i.e,. TRUE or FALSE.")}
        if (isTRUE(kingco) & (!"chi_geo_kc" %in% names(ph.data))){
          stop("\n\U0001f47f You specified kingco=TRUE, but `ph.data` does not have the following columns that identify King County data:
                   chi_geo_kc")
        }
        if (isTRUE(kingco)){ph.data <- ph.data[chi_geo_kc == "King County"]}

  # Apply narrow or broad definition ----
      if(def == 'narrow'){ph.data <- ph.data[injury_nature_narrow == T & !is.na(injury_intent)]}
      if(def == 'broad'){ph.data <- ph.data[injury_nature_broad == T & !is.na(injury_intent)]}

  # Get complete list of all possible mechanisms and intents ----
      possible.intents <- as.character(unique(chars_injury_matrix()$intent))
      possible.mechanisms <- as.character(unique(chars_injury_matrix()$mechanism))

  # Identify intent of interest ----
      intent = tolower(intent)

      if("none" %in% intent){ # none means 'any intent', i.e., 'ignore' the intent
        x_intent = "any"
      }

      if("*" %in% intent){x_intent = possible.intents}

      if(length(intersect(c("*", "none"), intent)) == 0){
        x_intent = c()
        for(i in intent){
          x_intent <- unique(c(x_intent, grep(i, possible.intents, value = TRUE, ignore.case = TRUE)))
        }
      }
      if(length(x_intent) == 0){stop(paste0(
        "\n\U0001f47f \nYour `intent` value (", intent, ") has filtered out all of the hospitalization injury intents.\nPlease enter 'none', '*', or a new partial keyword term and try again."))}

  # Identify mechanism of interest ----
      mechanism = tolower(mechanism)

      if("none" %in% mechanism){ # none means 'any mechanism', i.e., 'ignore' the mechanism
        x_mechanism = "any"
      }

      if("*" %in% mechanism){x_mechanism = possible.mechanisms}

      if(length(intersect(c("*", "none"), mechanism)) == 0){
        x_mechanism = c()
        for(i in mechanism){
          x_mechanism <- unique(c(x_mechanism, grep(i, possible.mechanisms, value = TRUE, ignore.case = TRUE)))
        }
      }
      if(length(x_mechanism) == 0){stop(paste0(
        "\n\U0001f47f \nYour `mechanism` value (", mechanism, ") has filtered out all of the hospitalization injury mechanisms.\nPlease enter 'none', '*', or a new partial keyword term and try again.\n",
        "Entering `rads::chars_injury_matrix()` into the console will provide you with a table of valid options."))}

  # Create motor_vehicle_traffic column when needed ----
      if('motor_vehicle_traffic' %in% x_mechanism){
        mvt_cols <- paste0('mechanism_', grep('mvt_', possible.mechanisms, value = TRUE))
        ph.data[, mechanism_motor_vehicle_traffic := Reduce(function(x, y) pmax(x, y, na.rm = TRUE),
                                                            .SD,
                                                            init = NA_real_),
                                                      .SDcols = mvt_cols]
      }

  # Count hospitalizations for each intent_x_mechanism of interest ----
        # create matrix of all mechanisms and intents of interest ----
        x_grid <- data.table::setDT(expand.grid(mechanism = x_mechanism, intent = x_intent))

        # count number of hospitalizations (i.e., rows) when def == 'narrow' ----
        x_combo <- rbindlist(lapply(1:nrow(x_grid), function(ii) {
          temp.ph.data <- copy(ph.data)

        # Identify whether the combination of mech & intent in x_grid has any hospitalizations in person level data ----
          # could theoretically use injury_mechanism & injury_intent, but would need extra coding to address when either has value 'any'
            temp.ph.data[, bingo := as.integer(get(paste0("mechanism_", x_grid[ii]$mechanism)) >= 1 &
                                                 get(paste0("intent_", x_grid[ii]$intent)) >= 1)]

        # Aggregate (sum) the number of hospitalizations for the mech / intent combination from x_grid ----
            if(!is.null(group_by)){
              temp.ph.data <- temp.ph.data[, list(mechanism = as.character(x_grid[ii]$mechanism),
                                                  intent = as.character(x_grid[ii]$intent),
                                                  hospitalizations = sum(bingo)),
                                           by = group_by]}
            if(is.null(group_by)){
              temp.ph.data <- temp.ph.data[, list(mechanism = as.character(x_grid[ii]$mechanism),
                                                  intent = as.character(x_grid[ii]$intent),
                                                  hospitalizations = sum(bingo))]}

          # create grid of all possible combinations of group_by vars ----
            gridvars <- setdiff(names(temp.ph.data), 'hospitalizations')
            complete.grid <- do.call(CJ, lapply(gridvars, function(x) unique(temp.ph.data[[x]])))
            setnames(complete.grid, gridvars)

          # merge temp.ph.data onto complete.grid ----
            temp.ph.data <- merge(complete.grid, temp.ph.data, all = T)
            temp.ph.data[is.na(hospitalizations), hospitalizations := 0]

          return(temp.ph.data)
        }), fill=TRUE)

  # Tidy ----
    # Additional collapse/aggregate if mechanism == 'none' ----
      if("none" %in% mechanism){
        x_combo[, mechanism := "Any mechanism"]
        x_combo <- x_combo[, list(hospitalizations = sum(hospitalizations)), by = setdiff(names(x_combo), "hospitalizations")]
      }

      x_combo[mechanism == 'any', mechanism := "Any mechanism"]

    # Additional collapse/aggregate if intent == 'none' ----
      if("none" %in% intent){
        x_combo[, intent := "Any intent"]
        x_combo <- x_combo[, list(hospitalizations = sum(hospitalizations)), by = setdiff(names(x_combo), "hospitalizations")]
      }

      x_combo[intent == 'any', intent := "Any intent"]

    # Sort columns and rows ----
        setcolorder(x_combo, c("mechanism", "intent", "hospitalizations"))
        setorderv(x_combo, c("mechanism", "intent", setdiff(names(x_combo), c("hospitalizations", "mechanism", "intent")) ))

  # Return data ----
    return(x_combo)
}

# chars_icd_ccs() ----
#' View available CHARS ICD-9-CM OR ICD-10-CM (diagnosis) codes, descriptions,
#'  and summary 'broad' and 'detailed' classifications that can be used with
#' \code{chars_icd_ccs_count}
#'
#' @description
#' A function to view the complete list of ICD-9-CM OR ICD-10-CM codes and
#' descriptions as well as corresponding 'broad' and 'detailed' classifications.
#' The 'broad' and 'detailed' classifications broadly follow AHRQ's HCUP Clinical
#' Classifications Software Refined (CCSR) standards for ICD-10-CM. ICD-9-CM
#' codes were then mapped to the same 'broad' and 'detailed' categories to maximize
#' comparability across time. The 'superlevel' and 'midlevel' categorizations
#' were developed by APDE, based in large part on the 'broad' and 'detailed'
#' classifications.
#'
#' Output is provided in the form of a table. Use this table to inform your
#' arguments in the  \code{chars_icd_ccs_count} function.
#'
#' @note
#' If you do not specify any arguments, the function will return a table with
#' all ICD-10-CM codes as well ICD-10-CM, superlevel, broad, midlevel, and
#' detailed descriptions.
#'
#' @source
#' \code{kcitazrhpasqlprp16.azds.kingcounty.gov >> hhs_analytics_workspace >>
#' ref.icdcm_codes}
#'
#' @references
#' \url{https://hcup-us.ahrq.gov/toolssoftware/ccsr/ccs_refined.jsp}
#'
#' @param ref_type a character vector of length one specifying the hospital diagnosis
#' descriptions that are of interest to you. Acceptable options include: \code{'all'},
#' \code{'icdcm'}, \code{'superlevel'}, \code{'broad'}, \code{'midlevel'}, & \code{'detailed'}.
#'
#' The default is \code{ref_type = 'all'}.
#'
#' @param mykey Character vector of length 1. Identifies
#' the keyring:: service that can be used to access the Health & Human Services
#' Analytic Workspace (HHSAW).
#'
#' The default is \code{mykey = 'hhsaw'}
#'
#' @param icdcm_version an integer vector of length one specifying the ICD CM
#' version that you want to reference. Acceptable options include: \code{9}
#' & \code{10}.
#'
#' The default is \code{icdcm_version = 10}.
#'
#' @return
#' A data.table. The number of rows and columns are dependent upon the arguments
#' submitted to the function.
#'
#' @export
#'
#' @name chars_icd_ccs
#'
#' @examples
#' # Save and view table as a data.table named 'blah'
#' blah <- chars_icd_ccs(ref_type = 'all')
#' head(blah)
#'
#' @import data.table rads.data
#'
chars_icd_ccs <- function(ref_type = 'all',
                          mykey = "hhsaw",
                          icdcm_version = 10){

  # Global variables used by data.table declared as NULL here to play nice with devtools::check() ----
  chars_list <- icdcm <- icdcm_code <- superlevel <- broad <- midlevel <- detailed  <- NULL

  # check arguments ----
  if(length(ref_type) != 1){
    stop("\n \U0001f47f the `ref_type` must have a single value. Valid options are 'all', 'icdcm', 'superlevel, 'broad', 'midlevel', & 'detailed'")}
  if(!ref_type %in% c('all', 'icdcm', 'superlevel', 'broad', 'midlevel', 'detailed')){
    stop(paste0("\n \U0001f47f'", ref_type, "' is not a valid option for the `ref_type` argument. \nValid options are 'all', 'icdcm', 'broad', 'midlevel', & 'detailed'."))}

  if(length(mykey) != 1 | !is.character(mykey)){
    stop("\n \U0001f47f the `mykey` argument must be a string of length == 1, \nwhich is the name of your keyring:: service providing the \npassword for connecting to HHSAW, it is typically 'hhsaw'.")}
  if(!mykey %in% keyring::key_list()[]$service){
    stop("\n \U0001f47f the `mykey` value passed to this function ('", mykey, "') is not in your `keyring::key_list`.\nPlease create it using `keyring::key_set` and try again.")
  }
  if(!icdcm_version %in% c(9, 10) | length(icdcm_version) != 1){stop("\n \U0001f47f the `icdcm_version` argument is limited to the integers '9' OR '10'")}

  # get data ----
  con <- validate_hhsaw_key(hhsaw_key = mykey)
  chars_list <- setDT(DBI::dbGetQuery(conn = con,
                                   paste0("SELECT icdcm_version,
                                   icdcm_code = icdcm,
                                   icdcm = icdcm_description,
                                   superlevel = ccs_superlevel_desc,
                                   broad = ccs_broad_desc,
                                   detailed = ccs_detail_desc,
                                   midlevel = ccs_midlevel_desc
                                   FROM [ref].[icdcm_codes] WHERE icdcm_version = ", icdcm_version )))

  if(ref_type == 'all'){chars_list <- chars_list[, list(icdcm_code, icdcm, superlevel, broad, midlevel, detailed, icdcm_version)]}
  if(ref_type == 'icdcm'){chars_list <- chars_list[, list(icdcm_code, icdcm, icdcm_version)]}
  if(ref_type == 'superlevel'){chars_list <- chars_list[, list(superlevel, icdcm_version)]}
  if(ref_type == 'broad'){chars_list <- chars_list[, list(broad, icdcm_version)]}
  if(ref_type == 'midlevel'){chars_list <- chars_list[, list(midlevel, icdcm_version)]}
  if(ref_type == 'detailed'){chars_list <- chars_list[, list(detailed, icdcm_version)]}

  return(unique(chars_list))
}


# chars_icd_ccs_count() ----
#' Count (non-injury) Comprehensive Hospital Abstract Reporting System
#' (CHARS) hospitalizations
#'
#' @description
#' Generate hospitalization counts from WA State Comprehensive Hospital
#' Abstract Reporting System (CHARS) data using partial strings from the ICD-10-CM
#' or ICD-9-CM descriptions or AHRQ HCUP's CCSR based 'broad' and 'detailed'
#' classifications. Needs line-level CHARS data with a properly formatted
#' ICD-CM column (e.g., the data available from \code{get_data_chars()}).
#'
#' See \code{chars_icd_ccs()} for a complete list of available ICD-10-CM,
#' ICD-9-CM, and superlevel, broad, midlevel, and narrow classifications.
#'
#' \strong{¡¡¡REMEMBER!!!} ICD-10-CM started in 2016! Be sure to use the correct
#' **\code{icdcm_version}**.
#'
#'
#' @details
#' This function needs the user to enter a search string in one or more of the
#' following arguments in order to search the CHARS data for the corresponding
#' ICD CM codes: \code{icdcm}, \code{superlevel}, \code{broad}, \code{midlevel},
#' or \code{detailed}.
#' Partial search terms are acceptable and they are case-insensitive. For
#' example, if you set \code{broad = 'ex'} with \code{icdcm_version = 10}, the
#' function would return counts for "Diseases of the eye and adnEXa" as well as
#' "EXternal causes of morbidity". It also understands simple regex syntax,
#' including **\code{^}**, **\code{$}**, and **\code{|}**.
#'
#' **Note:** If you submit values for more than one of \code{icdcm},
#' \code{superlevel}, \code{broad}, \code{midlevel}, or \code{detailed} they must
#' be nested. For example, \code{broad = 'neoplasms', detailed = 'sarcoma'} will
#' give results because sarcomas are type of cancers. However,
#' \code{broad = 'neoplasms', detailed = 'intestinal infection'} will return an
#' error because your resulting table will have zero rows.
#'
#' @param ph.data a data.table or data.frame. Must contain CHARS data structured
#' with one person per row and with at least one column of ICD CM codes.
#'
#' **NOTE!** ph.data must have a column named `seq_no`, which is a unique patient
#' level identifier.
#'
#' The default is \code{ph.data = NULL}
#'
#' @param icdcm_version an integer vector of length one specifying the ICD CM
#' version that you want to reference. Acceptable options include: \code{9}
#' & \code{10}.
#'
#' The default is \code{icdcm_version = 10}.
#'
#' @param CMtable An optional data.table containing the reference table of ICD codes
#' and their classifications. This should come from \code{\link{chars_icd_ccs}}
#' and have the following columns: \code{icdcm_code}, \code{icdcm}, \code{superlevel},
#' \code{broad}, \code{midlevel}, \code{detailed}, and \code{icdcm_version}. If
#' provided, the function will use this table instead of making a new call to
#' \code{\link{chars_icd_ccs}}, which can significantly improve performance when
#' making multiple calls to this function.
#'
#' The default is \code{CMtable = NULL}, which means the function will fetch the reference
#' table from the database using \code{\link{chars_icd_ccs}}.
#'
#' @param icdcm a character vector of length 1. An ICD CM description OR code.
#' It is case agnostic and works with partial strings. For example, both
#' 'rotavira' & 'A080' would provide the results for 'Rotaviral enteritis' for
#' ICD-10-CM. You can also combine multiple search terms. For example,
#' 'rotavira|choler' would count all Rotaviral enteritis AND cholera
#' hospitalizations. View available options with
#' \code{chars_icd_ccs(ref_type = 'icdcm', icdcm_version = 10)}.
#'
#' The default is \code{icdcm = NULL}
#'
#' @param superlevel a character vector of length 1. Case agnostic and works
#' with partial strings. View available options with
#' \code{chars_icd_ccs(ref_type = 'superlevel', icdcm_version = 10)}.
#'
#' The default is \code{superlevel = NULL}
#'
#' @param broad a character vector of length 1. Case agnostic and works with
#' partial strings. View available options with
#' \code{chars_icd_ccs(ref_type = 'broad', icdcm_version = 10)}.
#'
#' The default is \code{broad = NULL}
#'
#' @param midlevel a character vector of length 1. Case agnostic and works with
#' partial strings. View available options with
#' \code{chars_icd_ccs(ref_type = 'midlevel', icdcm_version = 10)}.
#'
#' The default is \code{midlevel = NULL}
#'
#' @param detailed a character vector of length 1. Case agnostic and works with
#' partial strings. View available options with
#' \code{chars_icd_ccs(ref_type = 'detailed', icdcm_version = 10)}.
#'
#' The default is \code{detailed = NULL}
#'
#' @param icdcol a character vector of length one that specifies the name of the
#' column in ph.data that contains the ICD10-cm codes of interest.
#'
#' The default is \code{icdcol = 'diag1'}, which refers to the principal
#' diagnosis code provided by \code{get_data_chars()}).
#'
#' @param group_by a character vector of indeterminate length. This is used to
#' specify all the variables by which you want to group (a.k.a. stratify) the
#' results. For example, if you specified \code{group_by = c('chi_sex',
#' 'chi_race_6')}, the results would be stratified by each combination of sex
#' and race.
#'
#' The default is \code{group_by = NULL}
#'
#' @param kingco a logical vector of length one. It specifies whether you want to
#' limit the analysis to King County.
#'
#' **NOTE** this only works with data imported
#' with the \code{get_data_chars()} function because it needs the variable
#' \code{chi_geo_kc}.
#'
#' The default is \code{kingco = TRUE}.
#'
#' @param mykey Character vector of length 1. Identifies the keyring:: service
#' that can be used to access the Health & Human Services Analytic Workspace
#' (HHSAW).
#'
#' The default is \code{mykey = 'hhsaw'}
#'
#' @return
#' Generates a table with columns for each of the search terms you entered (e.g.,
#' \code{icdcm}, \code{broad}, and/or \code{detailed}) as well as
#' any \code{group_by} variables and a column named \code{hospitalizations} that
#' contains the relevant counts.
#'
#' @export
#'
#' @name chars_icd_ccs_count
#'
#' @examples
#' # example #1: 2019 King County hospitalizations for chemotherapy, by sex
#' \donttest{
#' blah = get_data_chars(year = 2019, kingco = TRUE)
#' myresult <- chars_icd_ccs_count(ph.data = blah,
#'                                 detailed = 'headache',
#'                                 group_by = c('chi_sex'))
#' print(myresult)
#' }
#'
#' # example #2: 2022 King County hospitalizations for asthma using
#' # an external reference table
#' \donttest{
#' myrefTable <- chars_icd_ccs()
#'
#' mydata = get_data_chars(year = 2022, kingco = TRUE)
#' myresult <- chars_icd_ccs_count(ph.data = mydata,
#'                                 CMtable = myrefTable,
#'                                 detailed = 'asthma',
#'                                 group_by = c('chi_sex'))
#' print(myresult)
#' }
#'
#' @import data.table rads.data
#'
chars_icd_ccs_count <- function(ph.data = NULL,
                                icdcm_version = 10,
                                CMtable = NULL,
                                icdcm = NULL,
                                superlevel = NULL,
                                broad = NULL,
                                midlevel = NULL,
                                detailed = NULL,
                                icdcol = 'diag1',
                                group_by = NULL,
                                kingco = T,
                                mykey = 'hhsaw'){

  # Global variables used by data.table declared as NULL here to play nice with devtools::check() ----
    CMtable_cols <- CMtable.expanded <- filter.count <- problem.icds <- superlevel_desc <- broad_desc <-
      midlevel_desc <- detailed_desc <- chi_geo_kc <- hospitalizations <- icdcm_code <- KeepMe <-
      icdcm_desc <- icdcm_code <- query.group <- diag1 <- intent_ignore <-
      chars_injury_matrix_count <- mechanism_ignore <- dummy <- NULL

  # Check arguments & filter reference table of all ICD CM (CMtable) ----
    # ph.data ----
        if (missing(ph.data) || !is.data.frame(ph.data)) {
          stop("\n\U0001f47f `ph.data` must be the unquoted name of a data.frame or data.table")
        }
        if (!data.table::is.data.table(ph.data)) {
          data.table::setDT(ph.data)
        }
        ph.data <- data.table::copy(ph.data) # to prevent changing of original by reference

    # icdcm_version ----
        if(!icdcm_version %in% c(9, 10) | length(icdcm_version) != 1){stop("\n \U0001f47f the `icdcm_version` argument is limited to the integers '9' OR '10'")}

    # CMtable ----
        if (!is.null(CMtable)){
          if (!is.data.frame(CMtable)) {
            stop("\n\U0001f47f `CMtable` must be the unquoted name of a data.frame or data.table, typically from rads::chars_icd_ccs()")
          }
          if (!data.table::is.data.table(CMtable)) {
            data.table::setDT(CMtable)
          }

          CMtable_cols <- c("icdcm_code", "icdcm", "superlevel", "broad", "midlevel", "detailed", "icdcm_version")
          if (!identical(sort(CMtable_cols), sort(names(CMtable)))){
            stop(paste0("\n\U0001f47f CMtable must have the following columns as specified in rads::chars_icd_ccs:",
                        "\n ", paste(CMtable_cols, collapse = ', ')))
          }
        }

    # seq_no (unique identifier) ----
        if(!'seq_no' %in% names(ph.data)){
          stop("\U2620\U0001f47f\U2620\nph.data must contain the 'seq_no' column, which is the unique identifier.")}
        if('seq_no' %in% names(ph.data) & length(unique(ph.data$seq_no)) != nrow(ph.data)){
          stop("\U2620\U0001f47f\U2620\nThe 'seq_no' is a unique patient identifier and should not be repeated across rows.")}

    # icdcm + superlevel + broad + midlevel + detailed ----
        if(is.null(icdcm) & is.null(superlevel) & is.null(broad) & is.null(midlevel) & is.null(detailed)){
          stop("\n\U0001f47f `icdcm`, `superlevel`, `broad`, `midlevel`, and `detailed` are all NULL. This doesn't make sense! Specify a value for at least one of these arguments.")
        }

        if(is.null(CMtable)){
          CMtable <- chars_icd_ccs(icdcm_version = icdcm_version) # reference table of all potential search terms for this function
        }
        CMtable <- CMtable[, list(icdcm_code, icdcm_desc = icdcm, superlevel_desc = superlevel, broad_desc = broad, midlevel_desc = midlevel, detailed_desc = detailed)]

        filter.count <- sum(!is.null(icdcm), !is.null(superlevel), !is.null(broad), !is.null(midlevel), !is.null(detailed))

    # icdcm ----
        if(!is.null(icdcm)){
          if(length(icdcm) != 1){
            stop("\n\U0001f47f When specified, `icdcm` must be a character vector of length one.")
            }
          CMtable <- CMtable[grepl(icdcm, icdcm_desc, ignore.case = T) | grepl(icdcm, icdcm_code, ignore.case = T)]
          if(nrow(CMtable) < 1){
            stop(paste0("\n\U0001f47f Setting the argument <icdcm='", icdcm, "'> filtered out all possible ICD CM codes in the reference table. Please change your argument(s)."))
          }
        }

    # superlevel ----
        if(!is.null(superlevel)){
          if(length(superlevel) != 1){
            stop("\n\U0001f47f When specified, `superlevel` must be a character vector of length one.")
          }
          CMtable <- CMtable[grepl(superlevel, superlevel_desc, ignore.case = T)]
          if(nrow(CMtable) < 1){
            if(filter.count == 1){
              stop(paste0("\n\U0001f47f Setting the argument <superlevel='", superlevel, "'> filtered out all possible ICD CM codes in the reference table. Please change your argument(s)."))
            }
            if(filter.count > 1){
              stop(paste0("\n\U0001f47f Setting the argument <superlevel='", superlevel, "'>, either alone or in combinaton with other arguments, filtered out all possible ICD CM codes in the reference table. Please change your argument(s)."))
            }
          }
        }

    # broad ----
        if(!is.null(broad)){
          if(length(broad) != 1){
            stop("\n\U0001f47f When specified, `broad` must be a character vector of length one.")
          }
          CMtable <- CMtable[grepl(broad, broad_desc, ignore.case = T)]
          if(nrow(CMtable) < 1){
            if(filter.count == 1){
              stop(paste0("\n\U0001f47f Setting the argument <broad='", broad, "'> filtered out all possible ICD CM codes in the reference table. Please change your argument(s)."))
            }
            if(filter.count > 1){
              stop(paste0("\n\U0001f47f Setting the argument <broad='", broad, "'>, either alone or in combinaton with other arguments, filtered out all possible ICD CM codes in the reference table. Please change your argument(s)."))
            }
          }
        }

    # midlevel ----
        if(!is.null(midlevel)){
          if(length(midlevel) != 1){
            stop("\n\U0001f47f When specified, `midlevel` must be a character vector of length one.")
          }
          CMtable <- CMtable[grepl(midlevel, midlevel_desc, ignore.case = T)]
          if(nrow(CMtable) < 1){
            if(filter.count == 1){
              stop(paste0("\n\U0001f47f Setting the argument <midlevel='", midlevel, "'> filtered out all possible ICD CM codes in the reference table. Please change your argument(s)."))
            }
            if(filter.count > 1){
              stop(paste0("\n\U0001f47f Setting the argument <midlevel='", midlevel, "'>, either alone or in combinaton with other arguments, filtered out all possible ICD CM codes in the reference table. Please change your argument(s)."))
            }
          }
        }

    # detailed ----
        if(!is.null(detailed)){
          if(length(detailed) != 1){
            stop("\n\U0001f47f When specified, `detailed` must be a character vector of length one.")
          }
          CMtable <- CMtable[grepl(detailed, detailed_desc, ignore.case = T)]
          if(nrow(CMtable) < 1){
            if(filter.count == 1){
              stop(paste0("\n\U0001f47f Setting the argument <detailed='", detailed, "'> filtered out all possible ICD CM codes in the reference table. Please change your argument(s)."))
            }
            if(filter.count > 1){
              stop(paste0("\n\U0001f47f Setting the argument <detailed='", detailed, "'>, either alone or in combinaton with other arguments, filtered out all possible ICD CM codes in the reference table. Please change your argument(s)."))
            }
          }
        }

    # icdcol ----
        if(is.null(icdcol)){stop("\n\U0001f47f The `icdcol` argument cannot be NULL. If you are unsure of what enter, try using `icdcol = 'diag1`, which is the default")}
        if(isFALSE(icdcol %in% colnames(ph.data))){
          stop(paste0("\n\U0001f47f You specified icdcol='", icdcol, "', but '", icdcol, "' does not exist in `ph.data`."))
        }

        ph.data[, (icdcol) := toupper(get(icdcol))]

        if(length(grep("\\.|-", ph.data[[icdcol]], value = T) >0 )){
          warning(paste0("\u26A0\ufe0f There is at least one row where `icdcol` (",
          icdcol, ") contains a hyphen (-), period (.), space or some other ",
          "non alpha-numeric character. These characters will be deleted, e.g., ",
          "A85.2 will become A852. This is necessary because causeids in ",
          "rads::chars_icd_ccs contains no hyphens or periods."
          ))
          ph.data[, paste0(icdcol) := gsub("[[:space:].]+", "", gsub("([^A-Za-z0-9 ])+", "", x = get(icdcol)))]
        }

        if(icdcm_version == 10){
          if(nrow(ph.data[is.na(get(icdcol)) | !grepl("^[A-Z][0-9]", get(icdcol))]) > 0){
            problem.icds <- ph.data[is.na(get(icdcol)) | !grepl("^[A-Z][0-9]", get(icdcol)), ][[icdcol]]
            warning(paste0("\u26A0\ufe0f There is/are ", length(problem.icds), " row(s) where `icdcol` (",
            icdcol, ") does not follow the proper ICD-10-CM pattern. All ICD-10-CMs that do not begin with a ",
            "single capital letter followed by a number have been replaced with NA."))
            ph.data[!grepl("^[A-Z][0-9]", get(icdcol)) , paste0(icdcol) := NA]
          }
        }

    # group_by ----
        if(!is.null(group_by)){
          if(!inherits(group_by, 'character')){
            stop("\n\U0001f47f `group_by` must either be NULL or specify a character vector of column names.")}
          if(length(setdiff(group_by, names(ph.data))) > 0){
            stop(paste0("\n\U0001f47f `group_by` contains the following column names which do not exist in ph.data: ", paste(setdiff(group_by, names(ph.data)), collapse = ', ') ))}
          # convert group_by values to character b/c factors cause an error
          for(gb_var in group_by){
            ph.data[, paste0(gb_var) := as.character(get(gb_var))]
          }
        }

    # kingco ----
        if (isTRUE(kingco) & (!"chi_geo_kc" %in% names(ph.data))){
          stop("\U0001f47f You specified kingco=TRUE, but `ph.data` does not have the column `chi_geo_kc` that identifies King County")
        }
        if (isTRUE(kingco)){ph.data <- ph.data[chi_geo_kc == 'King County']}

    # mykey ----
        if(length(mykey) != 1 | !is.character(mykey)){
          stop("\n \U0001f47f the `mykey` argument must be a string of length == 1, \nwhich is the name of your keyring:: service providing the \npassword for connecting to HHSAW, it is typically 'hhsaw'.")
        }
        if(!mykey %in% keyring::key_list()[]$service){
          stop("\n \U0001f47f the `mykey` value passed to this function ('", mykey, "') is not in your `keyring::key_list`.\nPlease create it using `keyring::key_set` and try again.")
        }

  # Drop unnecessary columns from reference table (CMtable) ----
    KeepMe <- c("icdcm_code")
    for(grr in c('icdcm', 'superlevel', 'broad', 'midlevel', 'detailed')){
     if(!is.null(get(grr))){
       KeepMe <- c(KeepMe, paste0(grr, "_desc"))
     }
    }

    CMtable <- CMtable[, KeepMe, with = FALSE]

  # Get counts of hospitalizations for each group of search terms in CMtable (search term reference table) ----
    # flatten CMtable for regex search ----
      CMtable = CMtable[, list(icdcm_code = list(icdcm_code)), by = setdiff(names(CMtable), c("icdcm_code"))]

    # Create a 'query.group' for each combination of levels/icdcm ----
      CMtable[, query.group := .GRP, by = setdiff(names(CMtable), "icdcm_code")]

    # generate counts for each query.group ----
      HospCounts <- rbindlist(lapply(unique(CMtable$query.group), function(QG) {
        tempHospCounts <- if (is.null(group_by)) {
          ph.data[diag1 %in% unlist(CMtable[query.group == QG]$icdcm_code), list(hospitalizations = .N)]
        } else {
          ph.data[diag1 %in% unlist(CMtable[query.group == QG]$icdcm_code), list(hospitalizations = .N), by = group_by]
        }
        tempHospCounts[, query.group := QG]
        # tempHospCounts <- CMtable[tempHospCounts, on = "query.group"] # native data.table merge syntax
        tempHospCounts <- merge(CMtable, tempHospCounts, by = 'query.group', all = FALSE)
        tempHospCounts[, icdcm_code := NULL]
        return(tempHospCounts)
      }), fill = TRUE)

  # Expand reference table for each combination of group_by variables ----
      CMtable[, icdcm_code := NULL]
      if (is.null(group_by)) {
        CMtable.expanded <- CMtable
      } else {
        # Create a list of unique values for each group_by variable
        unique_vals_list <- lapply(group_by, function(col) unique(ph.data[[col]]))
        names(unique_vals_list) <- group_by

        # Create the Cartesian product of unique values using CJ
        template.xyz <- do.call(CJ, unique_vals_list)

        # Expand CMtable for each combination of group_by variables
        CMtable.expanded <- merge(CMtable[, dummy := 1],
                                  template.xyz[, dummy := 1],
                                  by = 'dummy',
                                  allow.cartesian = TRUE)[, dummy := NULL]
      }

  # Merge counts onto the expanded table to get table with all possible combination, even when counts == 0 ----
    HospCounts <- merge(CMtable.expanded, HospCounts,
                        by = intersect(names(HospCounts), names(CMtable.expanded)),
                        all = TRUE)
    HospCounts[is.na(hospitalizations), hospitalizations := 0]
    setorderv(HospCounts, c('query.group', group_by))
    HospCounts[, c("query.group") := NULL]

  # Return data ----
  return(HospCounts)
}


# The end ----
PHSKC-APDE/rads documentation built on April 14, 2025, 10:47 a.m.