R/get_data.R

# get_data() ----
#' Get (micro)data from APDE storage.
#'
#' @description Simple front-end for pulling in standard APDE data
#'
#' @param dataset Character vector of length 1. Identifies the dataset to be
#' fetched. Use \code{list_apde_data} for available options
#' @param cols Character vector of length >=1. Identifies which columns should
#' be returned. NULL returns all columns in the analytic dataset. See
#' \code{\link{list_dataset_columns}} for more information on which columns are
#' considered default by dataset.
#' @param year Numeric vector. Identifies which year(s) of data should be pulled
#' @param ... Additional named arguments based on the specific dataset. To see
#' what these options should be, do \code{help(get_data_`dataset`)}
#'
#'
#' @return Typically a data.table (administrative data) or
#' \code{\link[dtsurvey]{dtsurvey}} object (survey data) for further analysis.
#' When requesting HRA or Region data from the BRFSS dataset, it will return an
#' \code{\link[mitools]{imputationList}} comprised of
#' survey-weighted \code{\link[dtsurvey]{dtsurvey}} objects.
#'
#' @export
#' @references \url{https://github.com/PHSKC-APDE/rads/wiki/get_data}
#' @examples
#' \donttest{
#'  test <- get_data(
#'           dataset = 'death',
#'           cols = c('chi_year', 'chi_sex'),
#'           year = c(2021))
#'
#'  head(test)
#' }
get_data <- function(dataset, cols = NULL, year = 2021, ...){

  # TODO: validate inputs before dispatch --
  #   - ensure that the requested dataset exists
  #   - ensure that the years are legit
  #   - ensure that the selected columns are valid
  #   - validate ... for the specific dataset function
  #   - add dataset to the class list

  # dispatch the dataset-specific function and return its result
  f <- match.fun(paste0('get_data_', dataset))
  f(cols = cols, year = year, ...)

}
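
# Illustration (a hedged sketch, not part of the package API): `match.fun()`
# resolves the constructed function name, so `get_data('birth', ...)` dispatches
# to `get_data_birth(...)` with all arguments passed through unchanged. Wrapped
# in `if (FALSE)` so it never executes at package load.
if (FALSE) {
  f <- match.fun(paste0('get_data_', 'birth'))
  identical(f, get_data_birth) # TRUE: the same function object
  # equivalent calls:
  # get_data(dataset = 'birth', cols = c('chi_year'), year = 2019)
  # get_data_birth(cols = c('chi_year'), year = 2019)
}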

# get_data_birth() ----
#' Get Birth microdata from storage.
#'
#' @param cols Character vector of length >=1. Identifies which columns should be returned. NA returns all columns in the analytic dataset.
#'     See \code{\link{list_dataset_columns}} for more information on which columns are considered default by dataset.
#'
#' Default = NA
#'
#' @param year Numeric vector. Identifies which year(s) of data should be pulled. Defaults to the most recent year.
#'
#' Default = most recent year only
#'
#' @param kingco Logical. Return dataset for analyses where mother's residence is in King County only.
#'
#' Default = T
#'
#' @param version Character vector of length 1. Either 'final' or 'stage'.
#'
#' Default = 'final'
#'
#' @param mykey Character vector of length 1. Identifies
#' the keyring:: 'service' name that can be used to access the Health & Human Services
#' Analytic Workspace (HHSAW).
#'
#' Default = 'hhsaw'
#'
#' @return a single data.table
#'
#' @import data.table
#' @export
#'
#' @examples
#'
#' \donttest{
#'  test <- get_data_birth(
#'             cols = c('chi_year', 'chi_geo_kc', 'chi_sex'),
#'             year = c(2019),
#'             kingco = FALSE,
#'             version = 'final',
#'             mykey = 'hhsaw')
#'
#'  head(test)
#' }
get_data_birth <- function(cols = NA,
                           year = NA,
                           kingco = T,
                           version = 'final',
                           mykey = 'hhsaw'){
  if(is.null(cols)) cols <- NA
  if(is.null(year)) year <- NA

  # validate arguments other than mykey ----
    if(!(length(cols) == 1 && is.na(cols))){
      if(!is.character(cols)){stop('\n\U0001f6d1 `cols` must specify a vector of variables or be NA (to get all possible columns).')}
    }
    if(!(length(year) == 1 && is.na(year))){
      if( (!is.numeric(year)) || sum(year%%1) != 0 ) {stop('\n\U0001f6d1 `year` must specify a vector of integers (e.g., c(2017, 2019)) or be NA (to get the most recent year).')}
    }
    if((length(kingco) == 1 && is.na(kingco)) || !is.logical(kingco)){stop('\n\U0001f6d1 `kingco` must be a logical (TRUE | FALSE, or equivalently, T | F).')}
    if(length(version) != 1){stop("\n\U0001f6d1 `version` must have a single value, either 'final' or 'stage'.")}
    if((length(version) == 1 && is.na(version)) || !version %in% c('final', 'stage')){stop("\n\U0001f6d1 `version` must have the value 'final' or 'stage'.")}

  # validate mykey ----
    con <- validate_hhsaw_key(hhsaw_key = mykey)

  # create SQL table name ----
    mysqltable <- DBI::Id(schema = 'birth', table = paste0(version, '_analytic'))

  # get list of all colnames from SQL ----
    birth.names <- tolower(names(DBI::dbGetQuery(con, glue::glue_sql("SELECT top (0) * FROM {`mysqltable`}", .con = con))))
    birth.years <- unique(DBI::dbGetQuery(con, glue::glue_sql("SELECT DISTINCT chi_year FROM {`mysqltable`}", .con = con))$chi_year)

  # identify columns and years to pull from SQL ----
    cols <- tolower(cols)
    if(!all(is.na(cols))){
      invalid.cols <- setdiff(cols, birth.names)
      valid.cols <- intersect(birth.names, cols)
      if(length(valid.cols) > 0){cols <- glue::glue_sql_collapse(valid.cols, sep=", ")}
      if(length(valid.cols) == 0){stop("Birth data cannot be extracted because no valid column names have been submitted. To get all columns, use the argument 'cols = NA'")}
      if(length(invalid.cols) > 0){message(paste0("The following column names do not exist in the birth data and have not been extracted: ", paste0(invalid.cols, collapse = ", ")))}
    }
    if(all(is.na(cols))){cols <- "*"}

    if(length(year) == 1 && is.na(year)){
      year = max(birth.years)
      message(paste0("You did not specify a year so the most recent available year, ", max(birth.years), ", was selected for you. Available years include ", format_time(birth.years)))}
    invalid.year <- setdiff(year, birth.years)
    year <- intersect(year, birth.years)
    if(length(year) == 0){stop(paste0("Birth data cannot be extracted because no valid years have been provided. Valid years include: ", format_time(birth.years)))}
    if(length(invalid.year)>0){message(paste0("The following years do not exist in the birth data and have not been extracted: ", format_time(invalid.year)))}

  # pull columns and years from SQL ----
    validyears <- glue::glue_sql_collapse(year, sep=", ")

    if(isTRUE(kingco)){
      kco_sub <- DBI::SQL(" AND chi_geo_kc = 'King County'")
    } else {
      kco_sub <- DBI::SQL('')
    }

    query.string <- glue::glue_sql('select {DBI::SQL(cols)} from {`mysqltable`} where chi_year in ({`validyears`*})', .con = con)

    dat <- data.table::setDT(DBI::dbGetQuery(con, query.string))

  # Format string variables due to SQL import quirks ----
    original.order <- names(dat)
    string_clean(dat, stringsAsFactors = TRUE) # clean random white spaces and change strings to factors

  # reorder table----
    setcolorder(dat, original.order)

    setDT(dat) # set it as a data.table again b/c otherwise, ascribing the new class above makes a copy

  # return object ----
    return(dat)
}
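
# Illustration (a hedged sketch): the glue_sql pattern used above. Given a live
# `con` (e.g., from validate_hhsaw_key()), the query is assembled roughly as
# below; backtick-quoted names are escaped as identifiers and DBI::SQL()
# splices raw SQL. Wrapped in `if (FALSE)` so it never executes at package load.
if (FALSE) {
  tbl <- DBI::Id(schema = 'birth', table = 'final_analytic')
  mycols <- glue::glue_sql_collapse(c('chi_year', 'chi_sex'), sep = ", ")
  qs <- glue::glue_sql('select {DBI::SQL(mycols)} from {`tbl`} where chi_year in (2019)',
                       .con = con)
  # qs renders approximately as:
  # select chi_year, chi_sex from "birth"."final_analytic" where chi_year in (2019)
}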

# get_data_brfss() ----
#' Get BRFSS microdata with adjusted weights from storage.
#'
#' @description
#' Retrieves Behavioral Risk Factor Surveillance System (BRFSS) data
#' and adjusts survey weights to ensure accurate representation for multi-year
#' analysis.
#'
#' @param cols Character vector specifying which columns to include in the
#' returned data. If NULL, all columns identified by
#' \code{list_dataset_columns('brfss')} will be included. Defaults to
#' \code{cols = NULL}
#' @param year Integer vector specifying which years to include in the data.
#' If NULL, the most recent year available in the data set will be used. Defaults
#' to \code{year = NULL}
#' @param kingco Logical. \code{TRUE} returns a dataset for King
#' County analyses. \code{FALSE} returns a dataset for WA State
#' analyses and \emph{should not be used for King County analyses}. Defaults to
#' \code{kingco = TRUE}
#' @param wt_method Character string specifying the name of the method used
#' to rescale the weights when selecting multiple years. Options include:
#' - '\code{obs}': Rescales weights based on the number of observations per year.
#' This is WA DOH's recommendation
#' - '\code{pop}': Rescales weights by the survey weighted population for each year
#' - '\code{simple}': Rescales weights uniformly by the number of surveys. Use
#' when the survey years have approximately the same sample sizes
#'
#'  Defaults to '\code{obs}'
#'
#' @details
#' Note that while \code{get_data_brfss} automatically creates multi-year weights
#' for all years included in the data download, these weights may not be
#' appropriate for all analyses. Some BRFSS questions are only asked in specific
#' years, requiring custom weights to be calculated for those specific time
#' periods. Please refer to \code{\link{pool_brfss_weights}}
#' to learn how to easily re-weight and survey set the data.
#'
#' As stated in the \bold{Value} section below, this function will return a
#' \code{\link[mitools]{imputationList}} when selecting King County Health
#' Reporting Area (HRA) or region variables. This
#' is necessary because BRFSS is provided at the ZIP code level and ZIP codes
#' do not nest perfectly within HRAs (and regions are defined by HRAs). When using
#' a BRFSS \code{\link[mitools]{imputationList}} as the \code{ph.data} argument
#' in \code{\link{calc}}, the function will properly process the
#' \code{\link[mitools]{imputationList}} to account for the uncertainty in
#' allocation of ZIP codes to HRAs. In other words, it is fine if you are
#' unfamiliar with imputation because \code{\link{calc}} will deal with the
#' details for you.
#'
#' @return
#' If '\code{hra20_id}', '\code{hra20_name}', and '\code{chi_geo_region}'
#' \emph{\bold{are not requested}}: Returns a survey-weighted
#' \code{\link[dtsurvey]{dtsurvey}}/data.table object with the specified columns,
#' years, and '\code{default_wt}' (the rescaled / adjusted weight).
#'
#' If any of '\code{hra20_id}', '\code{hra20_name}', or '\code{chi_geo_region}'
#' \emph{\bold{are requested}}: Returns an \code{\link[mitools]{imputationList}} comprised of
#' survey-weighted \code{\link[dtsurvey]{dtsurvey}}/data.table objects with the
#' specified columns, years, and '\code{default_wt}' (the rescaled / adjusted
#' weight).
#'
#' @references
#' For information regarding the BRFSS ETL process, file locations, etc.,
#' see: \url{https://github.com/PHSKC-APDE/BRFSS}
#'
#' @examples
#' \dontrun{
#' # Get data for specific columns and years
#' brfss_data <- get_data_brfss(
#'   cols = c('chi_sex'),
#'   year = 2019:2022
#' )
#'
#' # Get data for all columns for the most recent year
#' brfss_data <- get_data_brfss()
#' }
#'
#' @import data.table
#' @import mitools
#' @export
#'
get_data_brfss <- function(cols = NULL,
                           year = NULL,
                           kingco = TRUE,
                           wt_method = 'obs'){

  # Visible bindings for data.table/check global variables ----
    chi_year <- finalwt1 <- x_llcpwt <- x_ststr <- hra20_id <- hra20_name <- NULL
    my_weight_var <- chi_geo_region <- region_name <- NULL

  # Validate kingco and load data ----
    if (length(kingco) != 1 || !is.logical(kingco) || is.na(kingco)) {
      stop("\n\U0001f6d1 `kingco` must be a logical (TRUE | FALSE, or equivalently, T | F).")
    }

    if(isTRUE(kingco)){
      myfile_path <- "//dphcifs/APDE-CDIP/BRFSS/prog_all/final_analytic.rds"
      my_weight_var <- 'finalwt1'
    } else {
      myfile_path <- "//dphcifs/APDE-CDIP/BRFSS/WA/wa_final_analytic.rds"
      my_weight_var <- 'x_llcpwt'
    }

    validate_network_path(myfile_path, is_directory = FALSE)
    dt <- setDT(readRDS(myfile_path))

  # Validate other arguments ----
    # Validate the `cols` argument
    if (!is.null(cols)) {
      cols <- unique(c(cols, 'chi_year'))
      impute_cols <- intersect(c('hra20_id', 'hra20_name', 'chi_geo_region'), cols)
      cols <- setdiff(cols, impute_cols)
      invalid_cols <- cols[!cols %in% names(dt)]

      if (length(invalid_cols) > 0) {
        stop(sprintf("\n\U1F6D1 The following columns are not available in the dataset: %s",
                     paste(invalid_cols, collapse = ", ")))
      }
    } else {
      cols <- names(dt)
      if(isTRUE(kingco)){
        impute_cols <- c('hra20_id', 'hra20_name', 'chi_geo_region')
      } else {
        impute_cols <- NULL
      }
    }

    # Validate the `year` argument
    if (!is.null(year)) {
      # Check if year is numeric or can be converted to integer losslessly
      if (!is.numeric(year) || !all(year == as.integer(year))) {
        stop('\n\U0001f6d1 `year` must specify a vector of integers (e.g., c(2019, 2022)) or be NULL (to get the most recent year).')
      }
      # Check if all years are available in the dataset
      if (!all(year %in% unique(dt$chi_year))) {
        missing_years <- year[!year %in% unique(dt$chi_year)]
        if (length(missing_years) > 0) {
          stop(sprintf("\n\U1F6D1 The following years are not available in the dataset: %s",
                       paste(missing_years, collapse = ", ")))
        }
      }
    } else {year <- max(dt$chi_year)}

    # Validate the `wt_method` argument
    if (!is.character(wt_method) || length(wt_method) != 1 ||
        is.na(wt_method) || !wt_method %in% c("simple", "obs", "pop")) {
      stop("\n\U1F6D1 'wt_method' must be one of: 'obs', 'pop', or 'simple'")
    }

  # Subset the data ----
    dt <- dt[chi_year %in% year]

    if(length(impute_cols) > 0){
        dt <- dt[, unique(c(cols, my_weight_var, 'x_ststr', grep('hra20_id', names(dt), value = T, ignore.case = T))), with = FALSE]
    } else {dt <- dt[, unique(c(cols, my_weight_var, 'x_ststr')), with = FALSE]}

  # Adjust weights and survey set ----
    dt <- pool_brfss_weights(
      ph.data = dt,
      years = year,
      year_var = 'chi_year',
      old_wt_var = my_weight_var,
      new_wt_var = 'default_wt',
      wt_method = wt_method,
      strata = 'x_ststr')

  # Create an imputation list if requesting HRA or region (because xwalked from ZIP codes) ----
    if(length(impute_cols) > 0){
      dt <- as_imputed_brfss(dt, impute_cols = impute_cols)
    }

  # Return object ----
    if(isFALSE(kingco)){
      message('\033[33m', 'Note!!\n',
              'You submitted the argument ', '\033[36m', '`kingco = FALSE`', '\033[33m', ', which provides WA State data.\n',
              'This data set cannot be used for King County, KC Regions, or KC HRAs.', '\033[0m')
    }

    return(dt)
}
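
# Illustration (a hedged sketch): the intent behind the `wt_method` options that
# are passed to pool_brfss_weights(). This toy function rescales each record's
# weight by its year's share of the pooled observations (one plausible reading
# of the 'obs' method documented above); pool_brfss_weights() is the
# authoritative implementation. Wrapped in `if (FALSE)` so it never executes.
if (FALSE) {
  rescale_obs <- function(wt, yr) {
    share <- table(yr) / length(yr)           # share of observations per year
    wt * as.numeric(share[as.character(yr)])  # down-weight each year proportionally
  }
  # e.g., pooling two years with 300 and 700 records rescales the first
  # year's weights by 0.3 and the second year's by 0.7
}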

# get_data_chars() ----
#' Get CHARS (Comprehensive Hospital Abstract Reporting System) microdata from storage.
#'
#' @param cols Character vector of length >=1. Identifies which columns should be returned. NA returns all columns in the analytic dataset.
#'     See \code{\link{list_dataset_columns}} for more information on which columns are considered default by dataset.
#'
#' Default = NA
#' @param year Numeric vector. Identifies which years of data should be pulled. Defaults to the most recent year.
#'
#' Default = most recent year only
#' @param kingco logical OR 'zip'.
#'
#' When `kingco = T`, returns dataset where the patient county is King County -- based on county code.
#'
#' When `kingco = 'zip'`, returns a dataset where the patient county is King County -- based on ZIP code.
#'
#' Default = T
#'
#' @param wastate logical. Return dataset for Washington State CHARS data only. When FALSE includes Oregon.
#'
#' Default = T
#'
#' @param version Character vector of length 1. Either 'final' or 'stage'.
#'
#' Default = 'final'
#'
#' @param inpatient logical. Return dataset for inpatients only. When FALSE includes observation patients.
#'
#' Default = T
#'
#' @param deaths logical. Return dataset with or without patients who died in the hospital. When TRUE the
#' dataset includes those who died.
#'
#' Default = T
#'
#' @param topcode logical. Do you want to top code chi_age at 100 to match population data?
#'
#' Default = T
#'
#' @param mykey Character vector of length 1. Identifies
#' the keyring:: 'service' name that can be used to access the Health & Human Services
#' Analytic Workspace (HHSAW).
#'
#' Default = 'hhsaw'
#'
#' @return a single data.table
#'
#' @import data.table
#' @import DBI
#' @import odbc
#' @import dtsurvey
#' @export
#'
#' @examples
#'
#' \donttest{
#'  test <- get_data_chars(
#'           cols = c('seq_no', 'chi_age'),
#'           year = c(2020),
#'           kingco = TRUE,
#'           wastate = TRUE,
#'           version = 'final',
#'           inpatient = TRUE,
#'           deaths = FALSE,
#'           topcode = FALSE,
#'           mykey = 'hhsaw')
#'
#'  head(test)
#' }
get_data_chars <- function(cols = NA,
                           year = NA,
                           kingco = T,
                           version = 'final',
                           wastate = T,
                           inpatient = T,
                           deaths = T,
                           topcode = T,
                           mykey = 'hhsaw'){

  chi_age <- date_of_birth <- date_of_chars <- age_years <- chi_race_eth7 <- NULL
  chi_race_6 <- chi_race_eth8 <- chi_race_7 <- geo_id_code <- chi_geo_wastate <- NULL
  chi_sex <- chi_geo_kc <- chi_race_aic_hisp <- yage4 <- age <- age6 <- NULL
  geo_type <- geo_id <- race4 <- race3 <- race3_hispanic <- NULL

  # validate arguments other than mykey ----
    if(!(length(cols) == 1 && is.na(cols))){
      if(!is.character(cols)){stop('\n\U0001f6d1 `cols` must specify a vector of variables or be NA (to get all possible columns).')}
    }
    if(!(length(year) == 1 && is.na(year))){
      if( (!is.numeric(year)) || sum(year%%1) != 0 ) {stop('\n\U0001f6d1 `year` must specify a vector of integers (e.g., c(2017, 2019)) or be NA (to get the most recent year).')}
    }
    if((length(kingco) == 1 && is.na(kingco)) || !(is.logical(kingco) || kingco == 'zip') ){stop("\n\U0001f6d1 `kingco` must be a logical (TRUE | FALSE, or equivalently, T | F) or the string 'zip'.")}
    if((length(wastate) == 1 && is.na(wastate)) || !is.logical(wastate)){stop('\n\U0001f6d1 `wastate` must be a logical (TRUE | FALSE, or equivalently, T | F).')}
    if((length(inpatient) == 1 && is.na(inpatient)) || !is.logical(inpatient)){stop('\n\U0001f6d1 `inpatient` must be a logical (TRUE | FALSE, or equivalently, T | F).')}
    if((length(deaths) == 1 && is.na(deaths)) || !is.logical(deaths)){stop('\n\U0001f6d1 `deaths` must be a logical (TRUE | FALSE, or equivalently, T | F).')}
    if((length(topcode) == 1 && is.na(topcode)) || !is.logical(topcode)){stop('\n\U0001f6d1 `topcode` must be a logical (TRUE | FALSE, or equivalently, T | F).')}

  # Validate mykey ----
    con <- validate_hhsaw_key(hhsaw_key = mykey)

  # create SQL table name ----
    mysqltable <- DBI::Id(schema = 'chars', table = paste0(version, '_analytic'))

  # Get list of all colnames from SQL ----
      chars.names <- tolower(names(DBI::dbGetQuery(con, glue::glue_sql("SELECT TOP (0) * FROM {`mysqltable`}", .con = con))))
      chars.years <- sort(unique(DBI::dbGetQuery(con, glue::glue_sql("SELECT DISTINCT chi_year FROM {`mysqltable`}", .con = con))$chi_year))

  # Identify columns and years to pull from SQL ----
      cols <- tolower(cols)
      if(!all(is.na(cols))){
        # for custom CHI/CHNA vars
          original.cols <- copy(cols)
          var.2.calc <- c()
          if('wastate' %in% cols){cols <- c(cols, 'chi_geo_wastate'); var.2.calc=c(var.2.calc, 'wastate')}
          if('yage4' %in% cols){cols <- c(cols, 'age'); var.2.calc=c(var.2.calc, 'yage4')}
          if('age6' %in% cols){cols <- c(cols, 'age'); var.2.calc=c(var.2.calc, 'age6')}
          if('race3' %in% cols){cols <- c(cols, 'chi_race_6', 'chi_race_aic_hisp'); var.2.calc=c(var.2.calc, 'race3')}
          if('race4' %in% cols){cols <- c(cols, 'chi_race_eth7'); var.2.calc=c(var.2.calc, 'race4')}
          cols <-  unique(setdiff(cols, var.2.calc))
        # generic vars
          invalid.cols <- setdiff(cols, chars.names)
          valid.cols <- intersect(chars.names, cols)
          if(length(valid.cols) > 0){cols <- glue::glue_sql_collapse(valid.cols, sep=", ")}
          if(length(invalid.cols) > 0){message(paste0("The following column names do not exist in the chars data and have not been extracted: ", paste0(invalid.cols, collapse = ", ")))}
          if(length(valid.cols) == 0){stop("chars data cannot be extracted because no valid column names have been submitted. To get all columns, use the argument 'cols = NA'")}
      }
      if(all(is.na(cols))){
        original.cols <- copy(cols)
        cols <- glue::glue_sql_collapse("*", sep = ', ')
        }


      if(length(year) == 1 && is.na(year)){
        year = max(chars.years)
        message(paste0("You did not specify a year so the most recent available year, ", max(chars.years), ", was selected for you. Available years include ", format_time(chars.years)))}

      invalid.year <- setdiff(year, chars.years)
      year <- intersect(year, chars.years)
      if(length(year) == 0){stop(paste0("chars data cannot be extracted because no valid years have been provided. Valid years include: ", format_time(chars.years)))}
      if(length(invalid.year)>0){message(paste0("The following years do not exist in the chars data and have not been extracted: ", format_time(invalid.year)))}

  # Pull columns and years from SQL ----
      validyears <- glue::glue_sql_collapse(year, sep=", ")
      query.string <- glue::glue_sql('select {DBI::SQL(cols)} from {`mysqltable`} where chi_year in ({`validyears`*})', .con = con)

      if(isTRUE(inpatient)){query.string <- glue::glue_sql(query.string, " AND STAYTYPE = 1", .con = con)}
      if(isFALSE(deaths)){query.string <- glue::glue_sql(query.string, " AND STATUS != 20", .con = con)} # 20 means Expired / did not recover
      if(isTRUE(wastate)){query.string <- glue::glue_sql(query.string, " AND chi_geo_wastate = 1", .con = con)}
      if(isTRUE(kingco)){query.string <- glue::glue_sql(query.string, " AND chi_geo_kc = 1", .con = con)}
      if(kingco == 'zip'){query.string <- glue::glue_sql(query.string, " AND chi_geo_kczip = 1", .con = con)}

      dat <- data.table::setDT(DBI::dbGetQuery(con, query.string))
      setnames(dat, tolower(names(dat)))

      DBI::dbDisconnect(con)

  # Top code age (if wanted) ----
      if( 'chi_age' %in% cols | 'chi_age' %in% names(dat) ){
        dat[chi_age < 0, chi_age := NA] # cannot have a negative age (due to 9999 as year of birth)
        if(isTRUE(topcode)){
          dat[chi_age > 100, chi_age := 100] # top code to 100 to match population data
        }
      }

  # Format string variables due to SQL import quirks ----
      original.order <- names(dat)

      string_clean(dat, stringsAsFactors = F) # clean random white spaces and ensure all factors are strings
      string.vars <- setdiff(names(dat)[sapply(dat, is.character)], c('diag1', 'proc1', 'ecode1'))
      if(length(string.vars) > 0){dat[, (string.vars) := lapply(.SD, tolower), .SDcols = string.vars]}

  # Label race_ethnicity data ----
      if( 'chi_race_eth7' %in% cols | 'chi_race_eth7' %in% names(dat) ){
        dat[, chi_race_eth7 := factor(chi_race_eth7,
                                      levels = c(2, 1, 5, 7, 8, 6, 3),
                                      labels = c("Black", "White", "Multiple", "Asian", "NHPI", "Hispanic", "AIAN"))]
      }
      if( 'chi_race_eth8' %in% cols | 'chi_race_eth8' %in% names(dat) ){
        dat[, chi_race_eth8 := factor(chi_race_eth8,
                                      levels = c(2, 1, 5, 7, 8, 6, 3, 9),
                                      labels = c("Black", "White", "Multiple", "Asian", "NHPI", "Hispanic", "AIAN", "Oth/unk"))]
      }
      if( 'chi_race_6' %in% cols | 'chi_race_6' %in% names(dat) ){
        dat[, chi_race_6 := factor(chi_race_6,
                                   levels = c(3, 6, 4, 2, 1, 5),
                                   labels = c("Black", "White", "Multiple", "Asian", "AIAN", "NHPI"))]
      }
      if( 'chi_race_7' %in% cols | 'chi_race_7' %in% names(dat) ){
        dat[, chi_race_7 := factor(chi_race_7,
                                   levels = c(3, 6, 4, 2, 1, 5, 9),
                                   labels = c("Black", "White", "Multiple", "Asian", "AIAN", "NHPI", "Oth/unk"))]
      }

  # Label gender ----
      if( 'chi_sex' %in% cols | 'chi_sex' %in% names(dat) ){
        dat[, chi_sex := factor(chi_sex, levels = 0:1, labels = c("Female", "Male"))]
      }

  # Create custom CHI vars if requested ----
    if('chi_geo_kc' %in% names(dat)){
      dat[, chi_geo_kc := as.character(chi_geo_kc)]
      dat[, chi_geo_kc := fcase(chi_geo_kc=='TRUE', 'King County',
                                default = NA_character_)]
    }
    if('wastate' %in% original.cols || (length(original.cols) == 1 && is.na(original.cols))){
      dat[chi_geo_wastate == TRUE, wastate := 'Washington State'] # row-wise comparison (isTRUE() would collapse the column to a single FALSE)
    }
    if('yage4' %in% original.cols || (length(original.cols) == 1 && is.na(original.cols))){
      dat[, yage4 := fcase(age %in% 0:4, '0-4',
                           age %in% 5:9, '5-9',
                           age %in% 10:14, '10-14',
                           age %in% 15:17, '15-17',
                           default = NA_character_)]
    }
    if('age6' %in% original.cols || (length(original.cols) == 1 && is.na(original.cols))){
      dat[, age6 := fcase(age %in% 0:17, '<18',
                          age %in% 18:24, '18-24',
                          age %in% 25:44, '25-44',
                          age %in% 45:64, '45-64',
                          age %in% 65:74, '65-74',
                          age >= 75, '75+',
                          default = NA_character_)]
    }
    if('race4' %in% original.cols || (length(original.cols) == 1 && is.na(original.cols))){
      dat[, race4 := chi_race_eth7]
    }
    if('race3' %in% original.cols || (length(original.cols) == 1 && is.na(original.cols))){
      dat[, race3 := chi_race_6]
      dat[chi_race_aic_hisp == TRUE, race3_hispanic := 'Hispanic'] # row-wise comparison (isTRUE() would collapse the column to a single FALSE)
    }

  # reorder table ----
      if(!(length(original.cols) == 1 && is.na(original.cols))){
        if('race3_hispanic' %in% names(dat)){
          original.cols <- c(original.cols, 'race3_hispanic')
          }
        dat <- dat[, original.cols, with = F]
        }

      data.table::setDT(dat) # set it as a data.table again b/c otherwise, ascribing the new class above makes a copy

  # return object ----
      return(dat)
}
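
# Illustration (a hedged sketch): the age cleaning and top coding applied above,
# shown on a toy table. Wrapped in `if (FALSE)` so it never executes at package load.
if (FALSE) {
  library(data.table)
  toy <- data.table(chi_age = c(-1, 35, 102))
  toy[chi_age < 0, chi_age := NA]    # negative ages (from 9999 birth years) become NA
  toy[chi_age > 100, chi_age := 100] # top code at 100 to match population data
  toy$chi_age                        # NA 35 100
}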


# get_data_death() ----
#' Get Death microdata from storage.
#'
#' @param cols Character vector of length >=1. Identifies which columns should be
#' returned. NA returns all columns in the analytic dataset. See
#' \code{\link{list_dataset_columns}} for more information on which columns are
#' considered default by dataset.
#'
#' Default = NA
#' @param year Numeric vector. Identifies which years of data should be pulled. NA returns the most recent year.
#'
#' Default = NA
#'
#' @param kingco logical. Return dataset for analyses where county of decedent's residence is King County.
#'
#' Default = TRUE
#'
#' @param version Character vector of length 1. Either 'final' or 'stage'.
#'
#' Default = 'final'
#'
#' @param topcode logical. Whether to top code chi_age at 100 to match population data.
#'
#' Default = TRUE
#'
#' @param mykey Character vector of length 1. Identifies the keyring:: 'service'
#' name that can be used to access the Health & Human Services Analytic Workspace (HHSAW).
#'
#' Default = 'hhsaw'
#'
#' @param include_prelim logical. Whether to include preliminary data.
#'   WARNING: Keep as FALSE for production use. Preliminary data is
#'   incomplete and unsuitable for analysis.
#'
#' Default = FALSE
#'
#' @return a single data.table
#'
#' @import data.table
#' @import DBI
#' @import odbc
#' @import dtsurvey
#' @export
#'
#' @examples
#'
#' \donttest{
#'  test <- get_data_death(
#'           cols = c('chi_year', 'race4', 'chi_sex'),
#'           year = c(2019),
#'           kingco = TRUE,
#'           version = 'final',
#'           topcode = FALSE,
#'           mykey = 'hhsaw')
#'
#'  head(test)
#' }
get_data_death <- function(cols = NA,
                           year = NA,
                           kingco = TRUE,
                           version = 'final',
                           topcode = TRUE,
                           mykey = 'hhsaw',
                           include_prelim = FALSE){
  # Visible bindings for data.table/check global variables ----
    chi_age <- chi_geo_kc <- chi_year <- NULL

  # Validate arguments other than mykey ----
    if(!(length(cols) == 1 && is.na(cols))){
      if(!is.character(cols)){
        stop('\n\U0001f6d1 `cols` must specify a vector of variables or be NA (to get all possible columns).')
        }
    }
    if(!(length(year) == 1 && is.na(year))){
      if( (!is.numeric(year)) || sum(year%%1) != 0 ) {
        stop('\n\U0001f6d1 `year` must specify a vector of integers (e.g., c(2017, 2019)) or be NA (to get the most recent year).')
        }
    }
    if(!is.logical(kingco) || length(kingco) != 1 || is.na(kingco)){
      stop('\n\U0001f6d1 `kingco` must be a logical (TRUE | FALSE, or equivalently, T | F).')
      }
    if(!is.character(version) || length(version) != 1 ||
       !(tolower(version) %in% c('final', 'stage'))){
      stop('\n\U0001f6d1 `version` must be either "final" or "stage".')
    }
    if(!is.logical(topcode) || length(topcode) != 1 || is.na(topcode)){
      stop('\n\U0001f6d1 `topcode` must be a logical (TRUE | FALSE, or equivalently, T | F).')
    }
    if(!is.logical(include_prelim) || length(include_prelim) != 1 || is.na(include_prelim)){
      stop('\n\U0001f6d1 `include_prelim` must be a logical (TRUE | FALSE, or equivalently, T | F).')
    }

  # Validate mykey ----
      con <- validate_hhsaw_key(hhsaw_key = mykey)

  # create SQL table name
    mysqltable <- DBI::Id(schema = 'death', table = paste0(version, '_analytic'))

  # Get list of all colnames & years from SQL ----
      death.names <- tolower(names(DBI::dbGetQuery(con, glue::glue_sql("SELECT TOP (0) * FROM {`mysqltable`}", .con = con))))
      if(isTRUE(include_prelim)){
        death.years <- sort(unique(DBI::dbGetQuery(con, glue::glue_sql("SELECT DISTINCT chi_year FROM {`mysqltable`}",.con = con))$chi_year))
      } else {
        death.years <- sort(unique(DBI::dbGetQuery(con, glue::glue_sql("SELECT DISTINCT chi_year FROM {`mysqltable`} WHERE apde_file_status = 'F'",.con = con))$chi_year))
      }

  # Identify columns and years to pull from SQL ----
      cols <- tolower(cols)
      if(!all(is.na(cols))){
        # convert user defined vector of columns to a SQL statement
        invalid.cols <- setdiff(cols, death.names)
        valid.cols <- intersect(death.names, cols)
        if(length(valid.cols) > 0){cols <- glue::glue_sql_collapse(valid.cols, sep=", ")}
        if(length(invalid.cols) > 0){message(paste0("The following column names do not exist in the death data and have not been extracted: ", paste0(invalid.cols, collapse = ", ")))}
        if(length(valid.cols) == 0){stop("death data cannot be extracted because no valid column names have been submitted. To get all columns, use the argument 'cols = NA'")}
      }

      if(all(is.na(cols))){
        cols <- glue::glue_sql("*")
        }

      if(length(year) == 1 && is.na(year)){
        year = max(death.years)
        message(paste0("You did not specify a year so the most recent available year, ", max(death.years), ", was selected for you. Available years include ", format_time(death.years)))}

      invalid.year <- setdiff(year, death.years)
      year <- intersect(year, death.years)
      if(length(year) == 0){stop(paste0("Death data cannot be extracted because no valid years have been provided. Valid years include: ", format_time(death.years)))}
      if(length(invalid.year)>0){message(paste0("The following years do not exist in the death data and have not been extracted: ", format_time(invalid.year)))}

  # Pull columns and years from SQL ----
      validyears <- glue::glue_sql_collapse(year, sep = ", ")
      query.string <- glue::glue_sql('select {DBI::SQL(cols)} from {`mysqltable`} where chi_year in ({`validyears`*})', .con = con)

      if(isTRUE(kingco)){query.string <- paste0(query.string, " AND chi_geo_kc = 'King County'")}

      if(isFALSE(include_prelim)){query.string <- paste0(query.string, " AND apde_file_status = 'F'")}

      dat <- data.table::as.data.table(DBI::dbGetQuery(con, query.string))

  # Top code age (if wanted) ----
      if('chi_age' %in% names(dat) ){
        dat[chi_age < 0, chi_age := NA] # cannot have a negative age (due to 9999 as year of birth)
        if(isTRUE(topcode)){
          dat[chi_age > 100, chi_age := 100] # top code to 100 to match population data
        }
      }

  # Identify and format class == DATE ----
      datevars <- DBI::dbGetQuery(con, "select column_name FROM death.ref_column_list WHERE table_name = 'analytic' AND column_type = 'DATE'")[]$column_name
      datevars <- intersect(datevars, names(dat)) # names of all date variables that are in actual dataset

      if(length(datevars) > 0){
        dat[, (datevars) := lapply(.SD, function(x) as.Date(as.character(x))), .SDcols = datevars]
      }

  # Format string variables due to SQL import quirks ----
      # Clean string variables to handle SQL import quirks like extra whitespace and encoding issues
      string_clean(dat, stringsAsFactors = FALSE) # clean random white spaces

  # Return object ----
      return(dat)
}
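
# Illustration (a hedged sketch): the DATE formatting step above, shown on a toy
# table. SQL date columns can arrive as character, so each is coerced to Date
# via character. Wrapped in `if (FALSE)` so it never executes at package load.
if (FALSE) {
  library(data.table)
  toy <- data.table(date_of_death = c('2019-01-15', '2019-06-02'))
  datevars <- 'date_of_death'
  toy[, (datevars) := lapply(.SD, function(x) as.Date(as.character(x))), .SDcols = datevars]
  class(toy$date_of_death) # "Date"
}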

# get_data_hys() ----
#' Get HYS microdata from storage.
#'
#'
#' @param cols Character vector of length >=1. Identifies which columns should be returned. NULL or NA returns all columns in the analytic dataset.
#'     See \code{\link{list_dataset_columns}} for more information on which columns are considered default by dataset.
#' @param year Numeric vector. Identifies which year(s) of data should be pulled
#' @param weight_variable Character vector of length 1. Identifies which weight column should be used to survey set the data
#' @param kingco logical. Return dataset for analyses in King County only. Default = TRUE
#' @param version Character vector of length 1. The version of the HYS dataset to pull. Defaults to 'best'. Don't change unless you know what you are doing.
#' @param ar logical. Whether to pull from the analytic ready dataset. FALSE will load stage data
#' @return dataset either in data.table (administrative data) or svy_tbl (survey data) for further analysis/tabulation
#'
#' @import dtsurvey
#' @importFrom data.table ":=" .I
#' @export
#'
#' @examples
#'
#' \donttest{
#'  get_data_hys(cols = NULL, year = c(2016, 2018), weight_variable = 'wt_grade_kc')
#' }
get_data_hys <- function(cols = NULL, year = c(2021), weight_variable = 'wt_grade_kc', kingco = TRUE, version = 'best', ar = TRUE){

  colname <- chi_geo_kc <- weight1 <- psu <- chi_year <- NULL

  stopifnot(all(year %in% c(seq(2004,2018,2), 2021, 2023)))

  # Ensure directory is accessible
  hysDir <- '//dphcifs/APDE-CDIP/HYS/releases/'
  validate_network_path(hysDir, is_directory = TRUE)

  # pull the list of vars
  vars = file.path(hysDir, version, 'hys_cols.csv')
  vars = data.table::fread(vars)

  # subset by year
  yyy = year
  vars = vars[year %in% yyy]

  # confirm that the columns requested exist in the dataset
  # and load them if that is the case
  if(!is.null(cols)){
    noexist = setdiff(cols, vars[, colname])
    if(length(noexist)>0){
      stop(paste('The following columns were requested but do not exist for the supplied years:',
                 paste(noexist, collapse = ',')))
    }
  } else{
    if(ar) cols = vars[ar == TRUE, colname]
    if(!ar) cols = vars[ar == FALSE, colname]
  }
  cols = unique(cols)

  #figure out whether to load stage, analytic ready or both
  vars = vars[colname %in% cols]
  arfp = c()
  sfp = c()
  if(any(vars[, ar])){
    arfp = file.path(hysDir, version, paste0('hys_ar_', year, '.rds'))
    ardat = data.table::rbindlist(lapply(arfp, readRDS), use.names = T, fill = T)
  }
  if(any(!vars[, ar])){
    sfp = file.path(hysDir, version, paste0('hys_stage_', year, '.rds'))
    sdat = data.table::rbindlist(lapply(sfp, readRDS), use.names = T, fill = T)
  }
  # If both were loaded, merge them
  if(exists('ardat') && exists('sdat')){
    scols = unique(vars[ar == FALSE, colname]) # prevent column duplication
    dat = merge(ardat, sdat[, .SD, .SDcols = c('obs_id', scols)], all.x = T, by = 'obs_id')
  }else if(exists('ardat')){
    dat = ardat
  }else{
    dat = sdat
  }

  #create the survey object
  if(isTRUE(kingco)){
    dat <- dat[chi_geo_kc == 1,]
  }else{
    warning('Survey will be set to self-weighting so that rows outside of KC do not get dropped for having weights of 0')
    dat[, weight1 := 1]
    weight_variable = 'weight1'
  }
  if(all(vars[, ar == FALSE])){
    warning('Requested staged data only. This dataset does not have weights. Survey set to be self weighting')
    dat[, weight1 := 1]
    weight_variable = 'weight1'
  }

  #prep the dataset
  dat[is.na(psu), psu := -1 * .I]
  dat[is.na(get(weight_variable)), (weight_variable) := 0]

  #subset by year
  yvar = year
  dat = dat[chi_year %in% yvar, ]

  dat = dat[get(weight_variable)>0]
  svy <- dtsurvey::dtsurvey(dat, psu = 'psu', strata = 'chi_year', weight = weight_variable, nest = T)

  svy <- svy[, .SD, .SDcols = c(cols, '_id')]

  return(svy)
}
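
# Illustration (a hedged sketch): the PSU fill-in used above. Rows missing a PSU
# get a unique negative id (the negated row index), so the survey design treats
# each as its own sampling unit rather than dropping it. Wrapped in `if (FALSE)`
# so it never executes at package load.
if (FALSE) {
  library(data.table)
  toy <- data.table(psu = c(10, NA, 12, NA))
  toy[is.na(psu), psu := -1 * .I] # .I gives row positions in the original table
  toy$psu # 10 -2 12 -4
}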


# get_data_pums() ----
#' Get PUMS microdata from storage
#'
#' @description
#' Retrieves American Community Survey (ACS) Public Use Microdata Sample (PUMS) data
#' from storage. Can return person-level, household-level, or combined records
#' with appropriate survey weights applied.
#'
#' @param cols Character vector specifying which columns to include in the
#' returned data. If NULL, all columns will be included. Note that survey weight
#' columns (wgtp/pwgtp) and chi_year are always included regardless of selection.
#' Defaults to \code{cols = NULL}
#' @param year Integer vector specifying which years to include in the data.
#' Can be either a single year for 1-year estimates or five consecutive years
#' for 5-year estimates. If NULL, the most recent single year available will be used.
#' Note that 2020 is not available due to COVID-19 pandemic survey disruptions.
#' Defaults to \code{year = NULL}
#' @param kingco Logical indicating whether to restrict the data to King County
#' records only. Defaults to \code{kingco = TRUE}
#' @param records Character string specifying whether to return person-level,
#' household-level, or combined records. Must be one of "person", "household",
#' or "combined". When 'combined' is selected, person and household records are
#' merged using the household identifier (serialno) and survey set for
#' person-level analyses. Defaults to \code{records = 'person'}
#'
#' @details
#' The function automatically applies the appropriate survey weights (person or
#' household) based on the \code{records} parameter. For person-level and
#' combined records, it uses the person weight (pwgtp) and its replicate weights.
#' For household-level records, it uses the household weight (wgtp) and its
#' replicate weights.
#'
#' The function uses the JK1 (jackknife) method for variance estimation with
#' 80 replicate weights, following Census Bureau recommendations for PUMS data.
#'
#' When you select \code{records = "combined"}, household-level variables with
#' the same names as person-level variables are given a '_hh' suffix to
#' distinguish them. You are strongly encouraged to review the Census Bureau's
#' [ACS PUMS documentation](https://www.census.gov/programs-surveys/acs/microdata.html)
#' if you plan to set \code{records = "combined"}.
#'
#' @return
#' Returns a survey-weighted \code{\link[dtsurvey]{dtsurvey}}/data.table object
#' with the specified columns and years that is ready for use with
#' \code{\link{calc}}.
#'
#' @references
#' For information regarding the ACS PUMS ETL process, file locations, data
#' dictionaries, etc., see: \url{https://github.com/PHSKC-APDE/svy_acs}
#'
#' @examples
#' \donttest{
#' # Get person-level data for specific columns from the most recent year
#' pums_person <- get_data_pums(
#'   cols = c("agep", "race4"),
#'   kingco = TRUE
#' )
#'
#' # Get household-level data for a 5-year period
#' pums_households <- get_data_pums(
#'   year = 2018:2022,
#'   records = "household"
#' )
#'
#' # Get combined person-household level data for WA State in 2022
#' pums_combo <- get_data_pums(
#'   year = 2022,
#'   records = "combined",
#'   kingco = FALSE
#' )
#' }
#'
#'
#'
#' @import data.table
#' @import dtsurvey
#' @importFrom survey svrepdesign
#' @export
#'
get_data_pums <- function(cols = NULL,
                          year = NULL,
                          kingco = TRUE,
                          records = "person") {
# Visible bindings for data.table/check global variables ----
  chi_geo_kc <- serialno <- NULL

# Get PUMS file availability ----
baseDir <- "//dphcifs/APDE-CDIP/ACS/PUMS_data/"
validate_network_path(baseDir, is_directory = TRUE)

preppedFiles <- grep("household|person", grep("\\.rds$", grep("prepped_R_files", list.files(baseDir, full.names = TRUE, recursive = TRUE), value = TRUE), value = TRUE, ignore.case = T), value = T)
if (uniqueN(preppedFiles) == 0){
  stop("\n\U1F6D1 There are no available analytic ready RData files in: ", baseDir, "\n\n",
       "The standard file paths should have the form:\n",
       "- ", gsub('/', '\\\\', paste0(baseDir, "YYYY_1_year/prepped_R_files/YYYY_1_year_person.rds")), "\n",
       "- ", gsub('/', '\\\\', paste0(baseDir, "YYYY_yyyy_5_year/prepped_R_files/YYYY_yyyy_5_year_household.rds")))
}

maxYear = max(as.integer(substr(gsub(baseDir, "", grep("1_year", preppedFiles, value = TRUE)), 1, 4)))
minYear = min(as.integer(substr(gsub(baseDir, "", grep("1_year", preppedFiles, value = TRUE)), 1, 4)))
max5Year = max(as.integer(substr(gsub(baseDir, "", grep("5_year", preppedFiles, value = TRUE)), 1, 4)))
min5Year = min(as.integer(substr(gsub(baseDir, "", grep("5_year", preppedFiles, value = TRUE)), 1, 4)))

# Validate arguments ----
## Validate the `year` argument ----
  if (is.null(year)) {
    year <- maxYear
    useFile <- grep(paste0(year, "_1_year"), preppedFiles, value = TRUE)
  } else {
    # Check if year is numeric or can be converted to integer losslessly & is correct length
    if (!is.numeric(year) || !all(year == as.integer(year)) || !length(year) %in% c(1, 5)) {
      stop("\n\U1F6D1 `year` must be an integer vector with: \n one value (e.g., 2022), ",
           "\n five continuous values (e.g., 2018:2022), or \n NULL for the most recent year.")
    }

    # Check if select years are available
    if (length(year) == 1) {
      if (year == 2020) {
        stop("\n\U1F6D1 `year` cannot equal 2020 due to the COVID-19 pandemic survey disruptions.")
      }
      useFile <- grep(paste0(year, "_1_year"), preppedFiles, value = TRUE)
    } else if (length(year) == 5) {
      if (any(diff(sort(year)) != 1)) {
        stop("\n\U1F6D1 the values of `year` are not continuous.")
      }
      useFile <- grep(paste0(max(year), "_", min(year), "_5_year"), preppedFiles, value = TRUE)
    }

    if (length(useFile) == 0) {
      stop("\n\U1F6D1 The `year` value is invalid.\n",
           "The minimum single year is ", minYear, " and the maximum single year is ", maxYear, ".\n",
           "The minimum five-year period is ", min5Year-4, ":", min5Year, " and the maximum is ", max5Year-4, ":", max5Year, ".")
    }
  }

## Validate the records argument ----
if (length(records) != 1 || !records %in% c("person", "household", "combined")) {
  stop("\n\U1F6D1 `records` must have the value 'person', 'household', or 'combined'.")
}

### Load RDS data for remaining validation ----
  if (records == "person") {
    person_file <- useFile[grep("person", useFile)]
    dt <- readRDS(person_file)

  } else if (records == "household") {
    household_file <- useFile[grep("household", useFile)]
    dt <- readRDS(household_file)

  } else if (records == "combined") {
    person_file <- useFile[grep("person", useFile)]
    household_file <- useFile[grep("household", useFile)]

    person_dt <- readRDS(person_file)
    household_dt <- readRDS(household_file)

    # Ensure serialno exists in both datasets (should be there, but you never know!)
    if (!all(c("serialno") %in% names(person_dt)) ||
        !all(c("serialno") %in% names(household_dt))) {
      stop("\n\U1F6D1 Required column 'serialno' missing from person or household data")
    }

    # Merge person and household data
    dt <- merge(
      person_dt,
      household_dt,
      by = "serialno",
      all.x = TRUE,
      suffixes = c("", "_hh")
    )

    # Verify combined has same number of rows as person level data
    if (nrow(dt) != nrow(person_dt)) {
      stop("\n\U1F6D1 Merge resulted in unexpected number of records")
    }
  }

## Validate the cols argument ----
  if (is.null(cols)) {
    cols <- names(dt)
  } else {
    cols <- unique(c(cols,
                     "chi_year", # always include CHI year
                     grep("wgtp", names(dt), value = TRUE))) # pwgtp = weights for person, wgtp = weights for household
    missing_cols <- cols[!cols %in% names(dt)]
    if (length(missing_cols) > 0) {
      stop(sprintf("\n\U1F6D1 The following columns are not available in the dataset: %s",
                   paste(missing_cols, collapse = ", ")))
    }
  }


## Validate the kingco argument ----
  if (length(kingco) != 1 || !is.logical(kingco) || is.na(kingco)) {
    stop("\n\U0001f6d1 `kingco` must be a logical (TRUE | FALSE, or equivalently, T | F).")
  }
  if (isTRUE(kingco) && !("chi_geo_kc" %in% names(dt))) {
    stop("\n\U1F6D1 Column 'chi_geo_kc' not found in the dataset, required when `kingco` is TRUE.")
  }
  if (isTRUE(kingco)) { cols <- unique(c(cols, "chi_geo_kc")) } # keep the KC flag for subsetting below

# Subset the data ----
  if (isTRUE(kingco)) {dt <- dt[chi_geo_kc == "King County"]} # subset to KC rows before selecting columns
  dt <- dt[, c(cols), with = FALSE]

# Survey set ----
  # confirm all replicate weights exist
    weight_cols <- if (records %in% c("person", "combined")) {
      c("pwgtp", grep("pwgtp[0-9]+", names(dt), value = TRUE))
    } else {
      c("wgtp", grep("wgtp[0-9]+", names(dt), value = TRUE))
    }
    if (length(weight_cols) < 81) { # 1 base weight + 80 replicates
      stop("\n\U1F6D1 Missing required weight columns")
    } else {weight_cols <- sort(weight_cols)}

  if (records %in% c("person", "combined")) {
    dt <- survey::svrepdesign(
      data = dt,
      weights = ~pwgtp,
      repweights = dt[, grep("pwgtp[0-9]+", names(dt), value = TRUE), with = FALSE], # need matrix, not col names
      type = "JK1",
      combined.weights = TRUE,
      scale = 4/80,
      rscales = rep(1, 80),
      mse = TRUE
    )
  } else {
    dt <- survey::svrepdesign(
      data = dt,
      weights = ~wgtp,
      repweights = dt[, grep("wgtp[0-9]+", names(dt), value = TRUE), with = FALSE],
      type = "JK1",
      combined.weights = TRUE,
      scale = 4/80,
      rscales = rep(1, 80),
      mse = TRUE
    )
  }

  dt <- dtsurvey::dtrepsurvey(dt)

# Return dtsurvey object ----
  message("Your data was survey set with the following parameters is ready for rads::calc():\n",
          " - record type = ", records, "\n",
          " - valid years = ", format_time(year), "\n",
          " - replicate weights = ", weight_cols[1], ", ",  weight_cols[2], " ... ", weight_cols[length(weight_cols)], " \n")

  return(dt)
}
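
# Illustration (a hedged sketch): the replicate-weight variance implied by the
# svrepdesign() settings above (80 replicate weights, scale = 4/80, mse = TRUE).
# theta is the estimate computed with the full-sample weight and theta_r the
# estimate recomputed with replicate weight r:
#   var(theta) = (4/80) * sum_r (theta_r - theta)^2
# The Census Bureau's PUMS documentation is the authoritative reference.
if (FALSE) {
  replicate_var <- function(theta, theta_r) {
    (4 / 80) * sum((theta_r - theta)^2) # mse = TRUE: deviations from the full-sample estimate
  }
}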


# The end ----