R/data_pull_fcns.R

Defines functions clean_histTable pull_wedss pull_histTable

Documented in clean_histTable pull_histTable pull_wedss

#' Pull down DHS COVID-19 Historical Data Table
#'
#' This function uses the REST API to pull down the data that are currently
#' posted at https://data.dhsgis.wi.gov/datasets/covid-19-historical-data-table/
#' using the GeoJSON REST API. \emph{Note: currently it does not pull down any
#' geometry data used to produce maps.}
#'
#' @param end_date (default = NULL) If specified it is the end date of the
#'                 time series that you wish to analyze. It should be something
#'                 coercible to Date format with as.Date. If you want all data
#'                 leave it blank.
#'
#' @return a cleaned version of the COVID-19 Historical Data Table including
#' HERC regions with the following columns
#'   \describe{
#'     \item{fips}{renamed from GEOID}
#'     \item{geo_type}{renamed from GEO}
#'     \item{geo_name}{renamed from NAME}
#'     \item{post_date}{DATE converted to Date format}
#'     \item{case_daily}{cleaned daily new positive cases from POS_NEW except for the first day which is from POSITIVE}
#'     \item{test_daily}{cleaned daily new total tests from TEST_NEW except for the first day which is from POSITIve + NEGATIVE}
#'     \item{death_daily}{cleaned daily new deaths from DTH_NEW except for the first day which is from DEATHS}
#'     \item{pop_2020}{2020 Population from NCHS}
#'     \item{case_cum}{daily cumulative positive cases calculated from case_daily}
#'     \item{test_cum}{daily cumulative total tests calculated from test_daily}
#'     \item{death_cum}{daily cumulative deaths calculated from death_daily}
#'   }
#'   and likely one or more of the following columns if applicable
#'   \describe{
#'     \item{case_daily_raw}{original daily new positive cases before cleaning}
#'     \item{test_daily_raw}{original daily new total tests before cleaning}
#'     \item{death_daily_raw}{cleaned daily new deaths before cleaning}
#'   }
#'
#' @export
#'
#' @importFrom sf st_read
#' @importFrom dplyr group_by
#' @importFrom dplyr transmute
#' @importFrom dplyr mutate
#' @importFrom dplyr arrange
#' @importFrom dplyr %>%
#' @importFrom dplyr distinct
#' @importFrom dplyr if_else
#' @importFrom dplyr left_join
#' @importFrom dplyr filter
#' @importFrom dplyr bind_rows
#' @importFrom rlang .data
#' @importFrom utils data
#'
#' @examples
#' #for all available data
#' hdt <- pull_histTable()
#'
#' #for data through a certain date
#' hdt_old <- pull_histTable(end_date = "2020-06-17")
#'
pull_histTable <- function(end_date = NULL) {
  #Pull down the data
  #REsT API URL
  api_url <- "https://dhsgis.wi.gov/server/rest/services/DHS_COVID19/COVID19_WI/MapServer/12/query?where=1%3D1&outFields=GEOID,GEO,NAME,DATE,POSITIVE,POS_NEW,NEGATIVE,DEATHS,DTH_NEW,TEST_NEW&outSR=4326&f=json"
  message("Downloading data from DHS ...")
  hdt <- sf::st_read(api_url, quiet = TRUE, stringsAsFactors = FALSE)

  hdt <- dplyr::distinct(hdt)

  #Protect
  hdt$NEGATIVE <- as.integer(hdt$NEGATIVE)
  hdt$TEST_NEW <- as.integer(hdt$TEST_NEW)

  clean_histTable(hdt, end_date)
}

#' Pull Confirmed Case data from WEDSS
#'
#' This function supplies a wrapper to fetch the results from a SQL query
#' from WEDSS and then performs basic data cleaning to calculate the
#' confirmed case metrics.
#'
#' @param query query to pull data from WEDSS. This call should only pull
#'              counties or jurisdictions that will automatically be
#'              aggregated into HERC regions and Statewide.
#' @param conn connection to database (see \code{\link[RODBC]{odbcConnect}})
#'              or \code{\link[odbc]{OdbcConnection}}
#' @inheritParams pull_histTable
#'
#' @return a cleaned version of the COVID-19 Historical Data Table including
#' HERC regions with the following columns
#'   \describe{
#'     \item{fips}{renamed from GEOID}
#'     \item{geo_type}{renamed from GEO}
#'     \item{geo_name}{renamed from NAME}
#'     \item{post_date}{LoadDttm converted to Date format}
#'     \item{case_daily}{cleaned daily new positive cases from POS_NEW except for the first day which is from POSITIVE}
#'     \item{test_daily}{cleaned daily new total tests from TEST_NEW except for the first day which is from POSITIve + NEGATIVE}
#'     \item{death_daily}{cleaned daily new deaths from DTH_NEW except for the first day which is from DEATHS}
#'     \item{pop_2020}{2020 Population from NCHS}
#'     \item{case_cum}{daily cumulative positive cases calculated from case_daily}
#'     \item{test_cum}{daily cumulative total tests calculated from test_daily}
#'     \item{death_cum}{daily cumulative deaths calculated from death_daily}
#'   }
#'   and likely one or more of the following columns if applicable
#'   \describe{
#'     \item{case_daily_raw}{original daily new positive cases before cleaning}
#'     \item{test_daily_raw}{original daily new total tests before cleaning}
#'     \item{death_daily_raw}{cleaned daily new deaths before cleaning}
#'   }
#'
#' @export
#'
#' @importFrom RODBC sqlQuery
#' @importFrom odbc dbGetQuery
#'
#' @examples
#' \dontrun{
#'    library(RODBC)
#'    channel <- odbcConnect("databasename", ...)
#'
#'    sql_query <- "SELECT * FROM TABLE WHERE GEO = 'COUNTY'"
#'
#'    hdt <- pull_wedss(query = sql_query, conn = channel, end_date = as.Date("2020-06-24"))
#' }
#'
pull_wedss <- function(query, conn, end_date = NULL) {
  if (inherits(conn, "RODBC")) {
    hdt <- RODBC::sqlQuery(conn, query)
  } else if (inherits(conn, "DBIConnection")) {
    hdt <- odbc::dbGetQuery(conn, query)
  }

  #Might need to do some basic data cleaning in here

  clean_histTable(hdt, end_date)
}

#' INTERNAL function to clean case data for metrics
#'
#' @param hdt a data.frame from the historical data table
#' @inheritParams pull_histTable
#'
#' @importFrom dplyr group_by
#' @importFrom dplyr ungroup
#' @importFrom dplyr transmute
#' @importFrom dplyr mutate
#' @importFrom dplyr arrange
#' @importFrom dplyr %>%
#' @importFrom dplyr if_else
#' @importFrom dplyr left_join
#' @importFrom dplyr filter
#' @importFrom dplyr bind_rows
#' @importFrom rlang .data
#' @importFrom utils data
#' @importFrom tidyr nesting
#' @importFrom tidyr complete
#'
#' @return raw historical case data ready for cleaning
clean_histTable <- function(hdt, end_date) {
  utils::data("county_data")

  #Basic Selection/wrangling
  hdt <- hdt %>%
    dplyr::arrange(.data$GEOID, .data$DATE) %>%
    dplyr::rename(fips = .data$GEOID) %>%
    dplyr::group_by(.data$fips) %>%
    dplyr::transmute(
      geo_type = .data$GEO,
      geo_name = .data$NAME,
      post_date = .data$DATE,
      case_daily = dplyr::if_else(is.na(.data$POS_NEW), .data$POSITIVE, .data$POS_NEW),
      test_daily = dplyr::if_else(is.na(.data$TEST_NEW), .data$POSITIVE + dplyr::if_else(is.na(.data$NEGATIVE), 0L, as.integer(.data$NEGATIVE)), as.integer(.data$TEST_NEW)),
      death_daily = dplyr::if_else(is.na(.data$DTH_NEW), .data$DEATHS, .data$DTH_NEW)
    ) %>%
    dplyr::ungroup(.) %>%
    tidyr::complete(tidyr::nesting(fips, geo_type, geo_name), post_date,
                    fill = list(case_daily = 0L, test_daily = 0L, death_daily = 0L)) %>%
    dplyr::left_join(dplyr::select(county_data, .data$fips, .data$herc_region, .data$pop_2020), by = "fips")

  if (inherits(hdt$post_date, "POSIXt")) {
    hdt$post_date <- as.Date(hdt$post_date, tz = "America/Chicago")
  } else {
    hdt$post_date <- as.Date(as.POSIXct(as.numeric(hdt$post_date)/1000, origin = "1970-01-01 00:00.000 UTC"), tz = "America/Chicago")
  }

  if (!is.null(end_date)) {
    hdt <- dplyr::filter(hdt, .data$post_date <= as.Date(end_date))
  }

  #Clean reversals at county level
  if (any(hdt$case_daily < 0)) {
    message("Cleaning reversals in daily confirmed cases")
    hdt$case_daily_raw <- hdt$case_daily
    hdt <- hdt %>%
      dplyr::mutate(
        case_daily = clean_reversals(.data$case_daily, verbose = FALSE)
      )

    num_negs <- sum(hdt$case_daily < 0)
    i <- 1
    while (num_negs > 0 & i < 101) {
      hdt$case_daily <- clean_reversals(hdt$case_daily, verbose = FALSE)
      num_negs <- sum(hdt$case_daily < 0)
      i <- i + 1
    }
    message("  I had to call clean_reversals() on case_daily ", i, " times.")
  }

  if (any(hdt$test_daily < 0)) {
    message("Cleaning reversals in daily tests")
    hdt$test_daily_raw <- hdt$test_daily
    hdt <- hdt %>%
      dplyr::mutate(
        test_daily = clean_reversals(.data$test_daily, verbose = FALSE)
      )

    num_negs <- sum(hdt$test_daily < 0)
    i <- 1
    while (num_negs > 0 & i < 101) {
      hdt$test_daily <- clean_reversals(hdt$test_daily, verbose = FALSE)
      num_negs <- sum(hdt$test_daily < 0)
      i <- i + 1
    }
    message("  I had to call clean_reversals() on test_daily ", i, " times.")
  }

  if (any(hdt$death_daily < 0)) {
    message("Cleaning reversals in daily deaths")
    hdt$death_daily_raw <- hdt$death_daily
    hdt <- hdt %>%
      dplyr::mutate(
        death_daily = clean_reversals(.data$death_daily, verbose = FALSE)
      )

    num_negs <- sum(hdt$death_daily < 0)
    i <- 1
    while (num_negs > 0 & i < 101) {
      hdt$death_daily <- clean_reversals(hdt$death_daily, verbose = FALSE)
      num_negs <- sum(hdt$death_daily < 0)
      i <- i + 1
    }
    message("  I had to call clean_reversals() on death_daily ", i, " times.")
  }

  #Add in HERC and STATE rows
  herc <- hdt %>%
    dplyr::group_by(.data$post_date, .data$herc_region) %>%
    dplyr::summarize_at(dplyr::vars("case_daily", "test_daily", "death_daily", "pop_2020"), sum) %>%
    dplyr::mutate(
      fips = .data$herc_region,
      geo_name = .data$herc_region,
      geo_type = "HERC Region"
    )

  state <- hdt %>%
    dplyr::group_by(.data$post_date) %>%
    dplyr::summarize_at(dplyr::vars("case_daily", "test_daily", "death_daily", "pop_2020"), sum) %>%
    dplyr::mutate(
      fips = "55",
      geo_name = "Wisconsin",
      geo_type = "State"
    )

  hdt <- dplyr::bind_rows(hdt, herc, state) %>%
    dplyr::mutate(
      case_cum = cumsum(.data$case_daily),
      test_cum = cumsum(.data$test_daily),
      death_cum = cumsum(.data$death_daily)
    ) %>%
    select(-.data$herc_region)

}

#' Imports Hospitalization file produced from EM Resource
#'
#' @param file path to EM Resource hospitalization summary .csv file
#' @inheritParams pull_histTable
#'
#'
#' @return a data.frame suitable for metric calculations
#' @export
#'
#' @importFrom readr read_csv
#' @importFrom readr cols
#' @importFrom readr col_date
#' @importFrom readr col_datetime
#' @importFrom readr col_character
#' @importFrom readr col_double
#'
#' @examples
#' \dontrun{
#'   pull_hospital("hospdatafile.csv")
#' }
pull_hospital <- function(file, end_date = NULL) {
  #Enforce correct column types and names
  hosp_cols <- readr::cols(
    Report_Date = readr::col_date(format = "%m/%d/%Y"),
    BBB_Facility_Use_Status = readr::col_character(),
    BBB_Testing_Status = readr::col_character(),
    BBB_Crit_Supply_Status = readr::col_character(),
    BBB_Staff_Status = readr::col_character(),
    Total_ICU_Beds = readr::col_double(),
    Region = readr::col_double(),
    Most_Recent_Report_Date = readr::col_date(format = "%m/%d/%Y"),
    County = readr::col_character(),
    Run_Date = readr::col_datetime(format = "%m/%d/%Y %H:%M"),
    Hospital = readr::col_character(),
    `__Gen_Use_Bedside_Vent` = readr::col_double(),
    IBA__ICU = readr::col_double(),
    IBA__Intermediate_Care = readr::col_double(),
    IBA__Medical_Surgical = readr::col_double(),
    IBA__Neg_Flow_Isolation = readr::col_double(),
    Total___COVID_patients = readr::col_double(),
    `__ICU_COVID_patients` = readr::col_double(),
    Total_Intermediate_Care_Beds = readr::col_double(),
    Total_Medical_Surgical_Beds = readr::col_double(),
    Total_Neg_Flow_Isolation_Beds = readr::col_double(),
    Number_of_Ventilated_Patients = readr::col_double()
  )

  hosp_in <- readr::read_csv(file, col_types = hosp_cols)

  if (sum(is.na(hosp_in$County))) {
    stop("There are missing values in the 'County' column. Please fix this before proceeding.")
  }

  clean_hospital(hosp_in, end_date)
}

#' Clean Hospital Data for metric calculation
#'
#' @param hosp a data.frame from EMResource
#' @inheritParams pull_histTable
#'
#' @return a cleaned data.frame
#'
#' @examples
#' \dontrun{
#'   #example here
#' }
clean_hospital <- function(hosp, end_date) {
  #Grab Run date to append when we are finished
  run_date <- unique(as.Date(hosp$Run_Date, tz = "America/Chicago"))
  if (length(run_date) > 1) {
    stop("The input file has more than one Run_Date. Fix this and try again.")
  }

  #Basic Wrangling/Agg to County
  hosp <- hosp %>%
    mutate(
      Region = dplyr::case_when(
        Region == 1 ~ "Northwest",
        Region == 2 ~ "North Central",
        Region == 3 ~	"Northeast",
        Region == 4 ~ "Western" ,
        Region == 5 ~ "South Central",
        Region == 6 ~ "Fox Valley Area",
        Region == 7 ~ "Southeast"
      )
    )

  if (!is.null(end_date)) {
    hosp <- dplyr::filter(hosp, .data$Report_Date <= as.Date(end_date))
  }

  hosp_clean <- hosp  %>%
    dplyr::group_by(County, Report_Date) %>%
    dplyr::summarise(
      dailyCOVID_px = sum(Total___COVID_patients, na.rm=TRUE),
      dailyCOVID_ICUpx = sum(`__ICU_COVID_patients`, na.rm=TRUE)
    )

  hosp_cty <- fill_dates(hosp_clean, grouping_vars = "County", date_var = "Report_Date") %>%
    dplyr::mutate(
      dailyCOVID_px = dplyr::if_else(is.na(dailyCOVID_px), 0, dailyCOVID_px),
      dailyCOVID_ICUpx = dplyr::if_else(is.na(dailyCOVID_ICUpx), 0, dailyCOVID_ICUpx),
      geo_type = "County"
    ) %>%
    dplyr::left_join(select(county_data, fips, county, pop_2020, herc_region),
              by = c("County" = "county")) %>%
    dplyr::ungroup()

  hosp_herc <- hosp_cty %>%
    dplyr::group_by(herc_region, Report_Date) %>%
    dplyr::summarize_if(is.numeric, sum) %>%
    dplyr::mutate(
      County = herc_region,
      geo_type = "Region",
      fips = herc_region,
    )

  hosp_state <- hosp_cty %>%
    dplyr::group_by(Report_Date) %>%
    dplyr::summarize_if(is.numeric, sum) %>%
    dplyr::mutate(
      County = "Wisconsin",
      geo_type = "State",
      fips = "55",
    )

  hosp_summary <- bind_rows(hosp_herc, hosp_state) %>%
    dplyr::ungroup() %>%
    dplyr::select(-.data$herc_region)

  #HERC Daily Counts
  herc_daily <- hosp %>%
    dplyr::group_by(Report_Date, Region) %>%
    dplyr::summarize(
      totalbeds = sum(Total_Intermediate_Care_Beds, na.rm=TRUE) +
                  sum(Total_ICU_Beds, na.rm=TRUE) +
                  sum(Total_Neg_Flow_Isolation_Beds, na.rm=TRUE) +
                  sum(Total_Medical_Surgical_Beds, na.rm=TRUE),

      beds_IBA = sum(IBA__Intermediate_Care, na.rm=TRUE) +
                 sum(IBA__Medical_Surgical, na.rm=TRUE) +
                 sum(IBA__Neg_Flow_Isolation, na.rm=TRUE) +
                 sum(IBA__ICU, na.rm=TRUE),

      dailyCOVID_px = sum(Total___COVID_patients, na.rm=TRUE),

      totalICU = sum(Total_ICU_Beds, na.rm=TRUE),

      ICU_IBA = sum(IBA__ICU, na.rm=TRUE),

      dailyCOVID_ICUpx = sum(`__ICU_COVID_patients`, na.rm=TRUE),

      num_px_vent = sum(Number_of_Ventilated_Patients, na.rm=TRUE),

      total_vents = sum(`__Gen_Use_Bedside_Vent`, na.rm=TRUE),

      intermed_beds_IBA = sum(IBA__Intermediate_Care, na.rm=TRUE),

      negflow_beds_IBA = sum(IBA__Neg_Flow_Isolation, na.rm=TRUE),

      medsurg_beds_IBA = sum(IBA__Medical_Surgical, na.rm=TRUE),
    ) %>%
    ungroup() %>%
    rename(County = Region)

  #Agg HERC daily to State Daily
  state_daily <- herc_daily %>%
    dplyr::group_by(Report_Date) %>%
    dplyr::summarize_if(is.numeric, sum, na.rm=TRUE) %>%
    dplyr::mutate(
      County = "Wisconsin"
    )

  #Calc final vars for combined daily series
  hosp_daily <- dplyr::bind_rows(state_daily, herc_daily) %>%
    dplyr::mutate(
      PrctBeds_Used = (1 - (beds_IBA/totalbeds)) * 100,
      PrctICU_Used  = (1 - (ICU_IBA/totalICU)) * 100,
      PrctVent_Used = (num_px_vent/total_vents) * 100
    ) %>%
    dplyr::left_join(dplyr::select(county_data, fips, County = county), by = "County")

  rm(state_daily, herc_daily)

  #Bind summary and Daily
  out <- bind_rows(
    mutate(hosp_summary, RowType = "Summary"),
    mutate(hosp_daily, RowType = "Daily")
  ) %>%
  mutate(Run_Date = run_date) %>%
  group_by(County) %>%
  mutate(
    fips = unique(fips[!is.na(fips)])
  )
}

#' Pull data from WEDSS for Testing Metrics
#'
#' @param lab_query SQL query string for data from Lab table
#' @param stg_query SQL query string for data from Staging table
#'
#' @inheritParams pull_wedss
#'
#' @return a data.frame
#' @export
#'
#' @importFrom odbc dbGetQuery
#' @importFrom RODBC sqlQuery
#' @importFrom dplyr %>%
#' @importFrom dplyr select
#' @importFrom dplyr mutate
#' @importFrom readxl read_excel
#'
#' @examples
#' \dontrun{
#'   #write me an example please.
#' }
pull_testing <- function(lab_query, stg_query, conn, end_date = NULL) {
  if (inherits(conn, "RODBC")) {
    lab <- RODBC::sqlQuery(conn, lab_query)
    stg <- RODBC::sqlQuery(conn, stg_query)
  } else if (inherits(conn, "DBIConnection")) {
    lab <- odbc::dbGetQuery(conn, lab_query)
    stg <- odbc::dbGetQuery(conn, stg_query)
  }

  clean_testing(lab, stg, end_date)
}

#' Internal function to clean testing data
#'
#' @param lab data.frame from Lab table pull
#' @param stg data.frame from Staging table pull
#' @inheritParams pull_testing
#'
#' @return a data.frame
#'
#' @importFrom dplyr filter
#' @importFrom dplyr full_join
#' @importFrom dplyr arrange
#' @importFrom dplyr mutate
#' @importFrom dplyr across
#' @importFrom dplyr if_else
#' @importFrom tidyr pivot_wider
#'
#' @examples
#' \dontrun{
#'   #write me an example
#' }
clean_testing <- function(lab, stg, end_date) {

  message("  Counting the number of positive and negative specimens...")
  specimens <- calc_pos_neg(lab, as.Date(end_date))

  message("  Final wrangling on the testing data ...")

  test_raw <- dplyr::full_join(total_tests, specimens, by = c("Area", "resultdateonly")) %>%
    dplyr::arrange(Area, resultdateonly) %>%
    dplyr::mutate(dplyr::across(c("Tests", "NotPositive", "Positive"),
                                ~ dplyr::if_else(is.na(.x), 0L, .x)))

  #filter up to end date and discard dates before Jan 01, 2020 and missing dates
  if (!is.null(end_date)) {
    test_raw <- dplyr::filter(test_raw, .data$resultdateonly <= as.Date(end_date),
                                        .data$resultdateonly >= as.Date(end_date) - 13,
                                        !is.na(.data$resultdateonly))
  } else {
    test_raw <- dplyr::filter(test_raw, .data$resultdateonly >= Sys.Date() - 13,
                                        !is.na(.data$resultdateonly))
  }

  #run thru fill_dates ... might not be necessary since we don't need daily or weekly counts
  test_raw <- fill_dates(test_raw, "Area", "resultdateonly")

  #add volume targets
  names(test_vol) <- c("Area", "Testing_Volume")

  test_vol <- test_vol %>%
    dplyr::mutate(
      Testing_Volume = 2 * Testing_Volume,
      Area = std_region(Area)
    )

  test_cty <- dplyr::left_join(test_raw,
                               dplyr::select(county_data, Area = county,
                                             Region_ID = fips, herc_region),
                               by = "Area")

  #add on HERC rows and WI rows by date
  test_herc <- test_cty %>%
    dplyr::group_by(herc_region, resultdateonly) %>%
    dplyr::summarize_if(is.numeric, sum) %>%
    dplyr::mutate(
      Area = herc_region,
      Region_ID = herc_region
    )

  test_state <- test_cty %>%
    dplyr::group_by(resultdateonly) %>%
    dplyr::summarize_if(is.numeric, sum) %>%
    dplyr::mutate(
      Area = "Wisconsin",
      Region_ID = "55"
    )

  bind_rows(test_cty, test_herc, test_state) %>%
    dplyr::ungroup() %>%
    dplyr::select(-herc_region) %>%
    arrange(Area, resultdateonly) %>%
    dplyr::left_join(test_vol, by = "Area")
}

#' INTERNAL function to calculate ingredients for percent positive per county per day
#'
#' @inheritParams clean_testing
#'
#' @return data.frame
#'
#' @importFrom dplyr filter
#' @importFrom dplyr mutate
#' @importFrom dplyr group_by
#' @importFrom dplyr count
#' @importFrom dplyr pull
#' @importFrom dplyr if_else
#' @importFrom dplyr bind_rows
#' @importFrom dplyr full_join
#' @importFrom dplyr case_when
#' @importFrom dplyr select
#' @importFrom dplyr distinct
#' @importFrom dplyr rename
#' @importFrom tidyr pivot_wider
#'
#' @examples
#' \dontrun{
#'   #write me an example
#' }
calc_pos_neg <- function(lab, end_date) {
  max_date <- end_date
  min_date <- end_date - 13

  lab.result <- lab %>%
    dplyr::mutate(
      scdflag = dplyr::if_else(is.na(SpecCollectedDate),
                                "missing.scd", "present.scd")
    )

  lab.cast <- lab.result %>%
    dplyr::group_by(IncidentID, scdflag) %>%
    dplyr::count() %>%
    tidyr::pivot_wider(id_cols = "IncidentID",
                names_from = "scdflag",
                values_from = "n",
                values_fill = 0)

  #Divide into three groups
  ##Basin 1 ----
  ##  incident ids that have all scds
  basin1.ids <- lab.cast %>%
    dplyr::filter(missing.scd == 0) %>%
    dplyr::pull(IncidentID)

  basin1 <- lab.result %>%
    dplyr::filter(IncidentID %in% basin1.ids) %>%
    dplyr::mutate(
      date = SpecCollectedDate
    )

  ##Basin 2 ----
  ##  incident ids that have at least one scd and is missing some other scds
  basin2.ids <- lab.cast %>%
    dplyr::filter(missing.scd >= 1, present.scd >= 1) %>%
    dplyr::pull(IncidentID)

  basin2 <- lab.result %>%
    dplyr::filter(IncidentID %in% basin2.ids) %>%
    dplyr::mutate(
      date = dplyr::if_else(is.na(ResultDate), SpecReceivedDate, ResultDate)
    )

  basin2.leftover <- basin2 %>%
    dplyr::filter(is.na(date)) %>%
    dplyr::mutate(
      dateonly = AccessionNumber
    )

  basin2 <- basin2 %>%
    dplyr::filter(!is.na(date))

  ##Basin 3 ----
  ##  incident ids that don't have any scds
  basin3.ids <- lab.cast %>%
    dplyr::filter(present.scd == 0) %>%
    dplyr::pull(IncidentID)

  basin3 <- lab.result %>%
    dplyr::filter(IncidentID %in% basin3.ids) %>%
    dplyr::mutate(
      date = if_else(is.na(ResultDate), SpecReceivedDate, ResultDate)
    )

  basin3.leftover <- basin3 %>%
    dplyr::filter(is.na(date)) %>%
    dplyr::mutate(
      dateonly <- AccessionNumber
    )

  basin3 <- basin3 %>%
    dplyr::filter(!is.na(date))

  #Assemble basins ----
  lab2 <- dplyr::bind_rows(basin1, basin2, basin3) %>%
    dplyr::mutate(
      dateonly = as.Date(date, tz = "America/Chicago"),
      date = as.character(date)
    ) %>%
    dplyr::bind_rows(basin2.leftover, basin3.leftover) %>%
    dplyr::mutate(
      newid  = paste(IncidentID, dateonly, sep = ""),
      newid2 = paste(IncidentID, dateonly, SpecimenSourceText, sep = ""),
      displaydate = dplyr::case_when(
        !is.na(ResultDate) ~ as.Date(ResultDate, tz = "America/Chicago"),
        is.na(ResultDate) & !is.na(SpecCollectedDate) ~ as.Date(SpecCollectedDate, tz = "America/Chicago"),
        is.na(ResultDate) & is.na(SpecCollectedDate) & !is.na(SpecReceivedDate) ~ as.Date(SpecReceivedDate, tz = "America/Chicago"),
        TRUE ~ as.Date(NA)
      )
    )

  # lab3 <- lab2[!duplicated(lab2$newid2),]
  #
  # #NEXT IS LINE 147 in Nathan_All_Specimens.R
  #
  #     DerivedCounty = ifelse(trimws(DerivedCounty) == "Fond Du Lac",
  #                            "Fond du Lac", trimws(DerivedCounty)),
  #     resultdateonly = as.Date(ResultDate, tz = "America/Chicago")
  #   ) %>%
  #   dplyr::filter(!is.na(resultdateonly))

  #decided to assign first resultdateonly within the window to deduplicate rows
  #but keep the totals consistent with the original code.
  lab2 %>%
    dplyr::filter(resultdateonly >= min_date & resultdateonly <= max_date) %>%
    dplyr::group_by(newid) %>%
    dplyr::arrange(result, resultdateonly) %>%
    dplyr::mutate(
      first_positive = dplyr::first(resultdateonly[result == "Positive"]),
      first_negative = dplyr::first(resultdateonly[result %in% c("Inconclusive", "Indeterminate", "Negative")])
    ) %>%
    dplyr::count(newid, result, first_positive, first_negative) %>%
    tidyr::pivot_wider(id_cols = c("newid", "first_positive", "first_negative"),
                       names_from = "result",
                       values_from = "n",
                       values_fill = 0) %>%
    dplyr::mutate(
      notpositive = Inconclusive + Indeterminate + Negative,
    ) %>%
    dplyr::group_by(newid, first_positive, first_negative) %>%
    dplyr::mutate(
      anypos = sum(Positive),
      anyneg = sum(notpositive),
      result2 = dplyr::case_when(
        anyneg >= 1 & anypos == 0 ~ "NotPositive",
        anyneg == 0 & anypos >= 1 ~ "Positive",
        anyneg >= 1 & anypos >= 1 ~ "Positive",
        TRUE ~ "other"
      )
    ) %>%
    dplyr::full_join(lab2, by = c("newid")) %>%
    dplyr::filter(!is.na(result2)) %>%
    dplyr::mutate(
      DerivedCounty = if_else(trimws(DerivedCounty)== "Fond Du Lac",
                              "Fond du Lac", trimws(DerivedCounty))
    ) %>%
    dplyr::filter(!is.na(result2),
                  !is.na(DerivedCounty),
                  (!DerivedCounty %in% c("Non-Wisconsin", "Unknown")),
                  (resultdateonly >= min_date & resultdateonly <= max_date)) %>%
    dplyr::select(newid, result2, DerivedCounty, first_positive, first_negative) %>%
    dplyr::distinct() %>%
    dplyr::mutate(
      resultdateonly = dplyr::if_else(result2 == "NotPositive", first_negative, first_positive)
    ) %>%
    dplyr::group_by(DerivedCounty, resultdateonly) %>%
    dplyr::count(result2) %>%
    tidyr::pivot_wider(id_cols = c("DerivedCounty", "resultdateonly"),
                       names_from = "result2",
                       values_from = "n",
                       values_fill = 0) %>%
    dplyr::rename(Area = DerivedCounty)
}

#' Pulls Essence Data for 3 metrics: CLI, ILI, Total ED Visits
#'
#' @param api_url character string matching Essence API format
#' @param start_date Start date of the time series that you wish to analyze.
#'                   It should be something coercible to Date format with as.Date.
#' @inheritParams pull_histTable
#' @param metric one of "cli", "ili", "total_ed" depending on which metrics you
#'               wish to calculate.
#'
#' @return a data.frame
#' @export
#'
#' @importFrom dplyr tibble
#' @importFrom dplyr mutate
#' @importFrom dplyr rowwise
#' @importFrom purrr map2
#' @importFrom purrr map_dfr
#'
#' @examples
#' \dontrun{
#'   #write me an example please
#' }
pull_essence <- function(api_url, start_date, end_date = NULL, metric = c("cli", "ili", "total_ed")) {
  if (metric == "total_ed") {
    message("The TOTAL ED Metric has been removed from the dashboard, this function has been deprecated and will be phased out soon.")
  }

  start_date <- as.Date(start_date)
  if (is.null(end_date)) {
    end_date <- Sys.Date()
  } else {
    end_date <- as.Date(end_date)
  }

  chunk_dates <- dplyr::tibble(
    start = seq(start_date, end_date, by = 14)
  ) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(
      end = min(start + 13, end_date)
    )

  out <- purrr::map2(chunk_dates$start, chunk_dates$end, essence_query, url = api_url) %>%
    purrr::map_dfr(essence_data)

  switch(metric,
         "cli" = clean_cli(out),
         "ili" = clean_ili(out),
         "total_ed" = clean_total_ed(out))
}

#' Clean ESSENCE data for CLI metrics
#'
#' @param cli data.frame from \code{\link{pull_essence}}
#'
#' @return a cleaned data.frame
#'
#' @importFrom dplyr %>%
#' @importFrom dplyr filter
#' @importFrom dplyr mutate
#' @importFrom dplyr if_else
#' @importFrom dplyr full_join
#' @importFrom dplyr select
#' @importFrom dplyr across
#' @importFrom dplyr group_by
#' @importFrom dplyr first
#' @importFrom dplyr rename
#' @importFrom dplyr summarize
#' @importFrom dplyr bind_rows
#' @importFrom dplyr ungroup
#' @importFrom dplyr arrange
#' @importFrom tidyr complete
#'
#' @examples
#' \dontrun{
#'   #write me an example please
#' }
clean_cli <- function(cli){
  cli_raw <- cli %>%
    dplyr::mutate(
      ED_Visit = dplyr::if_else(FacilityType == "Emergency Care", 1L, 0L),
      Non_Resident = dplyr::if_else(grepl("WI_", Region), 0L, 1L),
      Visit_Date = as.Date(trunc(C_Visit_Date_Time, units = "days")),
      Hx_County = sub("WI_", "", HospitalRegion),
      County = sub("WI_", "", Region)
    ) %>%
    dplyr::filter(ED_Visit == 1L, Non_Resident == 0L)

  #retangularize the dates too so all counties are represented for all days
  cli_cty <- cli_raw %>%
    dplyr::group_by(County, Visit_Date) %>%
    dplyr::summarize(
      DailyED = sum(ED_Visit),
      .groups = "drop"
    ) %>%
    dplyr::full_join(dplyr::select(county_data, County = county, fips, herc_region, pop_2020),
                     by = "County") %>%
    tidyr::complete(County, Visit_Date,
                    fill = list("DailyED" = 0L)) %>%
    dplyr::group_by(County) %>%
    dplyr::mutate(dplyr::across(fips:pop_2020, ~dplyr::first(.x[!is.na(.x)]))) %>%
    dplyr::filter(!is.na(Visit_Date))

  #Aggregate to HERC and State and Append
  cli_herc <- cli_cty %>%
    dplyr::group_by(herc_region, Visit_Date) %>%
    dplyr::summarize(
      DailyED = sum(DailyED),
      pop_2020 = sum(pop_2020),
      .groups = "drop"
    ) %>%
    dplyr::rename(fips = herc_region) %>%
    dplyr::mutate(
      County = fips
    )

  cli_state <- cli_cty %>%
    dplyr::group_by(Visit_Date) %>%
    dplyr::summarize(
      DailyED = sum(DailyED),
      pop_2020 = sum(pop_2020),
      .groups = "drop"
    ) %>%
    dplyr::mutate(
      County = "Wisconsin",
      fips = "55"
    )

  dplyr::bind_rows(cli_cty, cli_herc, cli_state) %>%
    dplyr::select(-herc_region) %>%
    dplyr::arrange(County, Visit_Date) %>%
    dplyr::ungroup()
}

#' Clean ESSENCE data for ILI metrics
#'
#' @param ili data.frame from \code{\link{pull_essence}}
#'
#' @return a cleaned data.frame
#'
#' @importFrom dplyr %>%
#' @importFrom dplyr mutate
#' @importFrom dplyr if_else
#' @importFrom dplyr filter
#' @importFrom dplyr group_by
#' @importFrom dplyr summarize
#' @importFrom dplyr across
#' @importFrom dplyr left_join
#' @importFrom dplyr select
#' @importFrom dplyr rename
#' @importFrom dplyr bind_rows
#' @importFrom dplyr arrange
#' @importFrom dplyr ungroup
#' @importFrom tidyr complete
#'
#' @examples
#' \dontrun{
#'   #write me an example please
#' }
clean_ili <- function(ili) {
  ili_raw <- ili %>%
    dplyr::mutate(
      ED_Visit = dplyr::if_else(FacilityType == "Emergency Care", 1L, 0L),
      Non_Resident = dplyr::if_else(grepl("WI_", Region), 0L, 1L),
      Visit_Date = as.Date(trunc(C_Visit_Date_Time, units = "days")),
      County = sub("WI_", "", Region),
      Total_Visits = ifelse(is.na(C_BioSense_ID),0,1),
      ILI_Visits = ifelse(grepl("ILI CCDD v1", CCDDCategory_flat),1,0),
      ILI_dx = ifelse(grepl("CDC Influenza DD v1", CCDDCategory_flat),1,0)
    ) %>%
    dplyr::filter(ED_Visit == 1L, Non_Resident == 0L)

  #retangularize the dates too so all counties are represented for all days
  ili_cty <- ili_raw %>%
    dplyr::group_by(County, Visit_Date) %>%
    dplyr::summarize(
      dplyr::across(c("Total_Visits", "ILI_Visits", "ILI_dx"), sum),
      .groups = "drop"
    )  %>%
    dplyr::full_join(dplyr::select(county_data, County = county, fips, herc_region),
                     by = "County") %>%
    tidyr::complete(County, Visit_Date,
                    fill = list("Total_Visits" = 0L, "ILI_Visits" = 0L, "ILI_dx" = 0L)) %>%
    dplyr::group_by(County) %>%
    dplyr::mutate(dplyr::across(fips:herc_region, ~dplyr::first(.x[!is.na(.x)]))) %>%
    dplyr::filter(!is.na(Visit_Date))

  #Aggregate to HERC and State and Append
  ili_herc <- ili_cty %>%
    dplyr::group_by(herc_region, Visit_Date) %>%
    dplyr::summarize(
      dplyr::across(c("Total_Visits", "ILI_Visits", "ILI_dx"), sum),
      .groups = "drop"
    ) %>%
    dplyr::rename(fips = herc_region) %>%
    dplyr::mutate(
      County = fips
    )

  ili_state <- ili_cty %>%
    dplyr::group_by(Visit_Date) %>%
    dplyr::summarize(
      dplyr::across(c("Total_Visits", "ILI_Visits", "ILI_dx"), sum),
      .groups = "drop"
    ) %>%
    dplyr::mutate(
      County = "Wisconsin",
      fips = "55"
    )

  dplyr::bind_rows(ili_cty, ili_herc, ili_state) %>%
    dplyr::select(-herc_region) %>%
    dplyr::arrange(County, Visit_Date) %>%
    dplyr::ungroup()
}
carlbfrederick/ohiCovidMetrics documentation built on Jan. 10, 2022, 12:20 p.m.