#' Assemble a data frame of incident hospitalizations due to
#' COVID-19 or influenza as they were available as of a specified issue date.
#'
#' @param issue_date character issue date (i.e. report date) to use for
#' constructing truths in format 'yyyy-mm-dd'
#' @param location_code character vector of location codes. Default to NULL
#' This should be a list of state FIPS code and/or 'US'.
#' @param spatial_resolution character vector specifying spatial unit types to
#' include: state' and/or 'national'
#' This parameter will be ignored if location_code is provided.
#' @param temporal_resolution character vector specifying temporal resolution
#' to include: 'daily' or 'weekly'
#' @param measure character vector specifying measure of disease prevalence:
#' either 'hospitalizations' for COVID hospitalizations or
#' 'flu hospitalizations' for hospitalizations with influenza
#' @param replace_negatives boolean to replace negative incs with imputed data
#' Currently only FALSE is supported
#' @param adjustment_cases character vector specifying times and locations with
#' reporting anomalies to adjust. Only the value "none" is currently supported
#' @param adjustment_method string specifying how anomalies are adjusted.
#' Only the value "none" is currently supported.
#' @param geography character, which data to read. Only "US" is supported.
#' @param drop_last_date boolean indicating whether to drop the last 1 day of
#' data for the influenza and COVID hospitalization signals. The last day of
#' data from the HHS data source is unreliable, so it is recommended to set this
#' to `TRUE`. However, the default is `FALSE` so that the function maintains
#' fidelity to the authoritative data source.
#'
#' @return data frame with columns location (fips code), date, inc, and cum
#' all values of cum will currently be NA
#'
#' @export
load_healthdata_data <- function(
issue_date = NULL,
as_of = NULL,
location_code = NULL,
spatial_resolution = "state",
temporal_resolution = "weekly",
measure = c("hospitalizations", "flu hospitalizations"),
replace_negatives = FALSE,
adjustment_cases = "none",
adjustment_method = "none",
geography = "US",
drop_last_date = FALSE) {
# validate measure and pull in correct data set
measure <- match.arg(measure,
choices = c("hospitalizations", "flu hospitalizations"))
# retrieve data update history
healthdata_timeseries_history <- healthdata_timeseries_history()
healthdata_dailyrevision_history <- healthdata_dailyrevision_history()
all_avail_issue_date <- unique(c(healthdata_timeseries_history$issue_date,
healthdata_dailyrevision_history$issue_date))
# a vector of date objects
all_avail_issue_date <- unique(c(all_avail_issue_date,
covidData::healthdata_hosp_early_data$issue_date))
# validate issue_date and as_of
# and convert as_of and issue_date to date objects
if (!is.null(issue_date) && !is.null(as_of)) {
stop("Cannot provide both arguments issue_date and as_of to load_healthcare_data.")
} else if (is.null(issue_date) && is.null(as_of)) {
issue_date <- max(all_avail_issue_date)
} else if (!is.null(as_of)) {
avail_issues <- all_avail_issue_date[
all_avail_issue_date <= as.Date(as_of)
]
if (length(avail_issues) == 0) {
stop("Provided as_of date is earlier than all available issue dates.")
} else {
issue_date <- max(avail_issues)
}
} else {
issue_date <- lubridate::ymd(issue_date)
}
if (!(issue_date %in% all_avail_issue_date)) {
stop(paste0(
'Invalid issue date; must be one of: ',
paste0(all_avail_issue_date, collapse = ', ')
))
}
# validate spatial_resolution
spatial_resolution <- match.arg(
spatial_resolution,
choices = c("state", "national"),
several.ok = TRUE
)
# validate temporal_resolution
temporal_resolution <- match.arg(
temporal_resolution,
choices = c("daily", "weekly"),
several.ok = FALSE
)
# validate replace_negatives
if (replace_negatives) {
stop("Currently, only replace_negatives = FALSE is supported")
}
# validate adjustment_cases and adjustment_method
adjustment_cases <- match.arg(
adjustment_cases,
choices = "none",
several.ok = FALSE
)
adjustment_method <- match.arg(
adjustment_method,
choices = "none",
several.ok = FALSE
)
#download and preprocess data based on issue_date
raw_healthdata_data <- build_healthdata_data(
issue_date,
healthdata_timeseries_history,
healthdata_dailyrevision_history)
if (issue_date > as.Date("2021-03-12")) {
raw_healthdata_data <- preprocess_healthdata_data(
raw_healthdata_data,
covidData::fips_codes,
measure = measure)
} else if (measure == "flu hospitalizations") {
stop("Flu hospitalizations not available for specified issue date.")
}
# data with as_of/issue_date before/on "2021-03-12"
# is already in the correct format
healthdata_data <- raw_healthdata_data %>%
dplyr::pull(data) %>%
`[[`(1)
all_locations <- unique(healthdata_data$location)
if (!is.null(location_code)){
locations_to_keep <- match.arg(
location_code,
choices = all_locations,
several.ok = TRUE)
# ignore spatial_resolution
spatial_resolution <- NULL
} else {
# drop results for irrelevant locations
locations_to_keep <- NULL
if ("state" %in% spatial_resolution) {
locations_to_keep <- all_locations[all_locations != "US"]
}
if ("national" %in% spatial_resolution) {
locations_to_keep <- c(locations_to_keep, "US")
}
}
results <- healthdata_data %>%
dplyr::select(location, date, inc) %>%
dplyr::filter(location %in% locations_to_keep)
# If requested, drop the last day of data within each location
if (drop_last_date) {
results <- results %>%
dplyr::group_by(location) %>%
dplyr::filter(date < max(date)) %>%
dplyr::ungroup()
}
# aggregate daily incidence to weekly incidence
if (temporal_resolution == "weekly") {
results <- results %>%
dplyr::mutate(
sat_date = lubridate::ceiling_date(
lubridate::ymd(date), unit = "week") - 1
) %>%
dplyr::group_by(location) %>%
# if the last week is not complete, drop all observations from the
# previous Saturday in that week
dplyr::filter(
if (max(date) < max(sat_date)) date <= max(sat_date) - 7 else TRUE
) %>%
dplyr::ungroup() %>%
dplyr::select(-date) %>%
dplyr::rename(date = sat_date) %>%
dplyr::group_by(location, date) %>%
dplyr::summarize(inc = sum(inc, na.rm = FALSE), .groups = "drop")
}
# aggregate inc to get the correct cum
results <- results %>%
dplyr::mutate(
date = lubridate::ymd(date),
cum = results %>%
dplyr::group_by(location) %>%
dplyr::mutate(cum = cumsum(inc)) %>%
dplyr::ungroup() %>%
dplyr::pull(cum)
)
return(results)
}
#' Preprocess healthdata data set, calculating incidence, adjusting date, and
#' calculating national incidence.
#'
#' @param raw_healthdata_data tibble one row and columns issue_date and data
#' The data column should be a list of data frames, with column names
#' date, state, previous_day_admission_adult_covid_confirmed, and
#' previous_day_admission_pediatric_covid_confirmed
#' @param fips_codes covidData::fips_codes data object
#' @param measure the measure to retrieve, either "hospitalizations" or
#' "flu hospitalizations"
#'
#' @return a result of similar format to raw_healthdata_data, but columns
#' date, location, and inc
preprocess_healthdata_data <- function(raw_healthdata_data, fips_codes,
measure) {
result <- raw_healthdata_data
# calculate incidence column, change date to previous day, and
# rename state to abbreviation
if (measure == "hospitalizations") {
result$data[[1]] <- result$data[[1]] %>%
dplyr::transmute(
abbreviation = state,
date = as.Date(date) - 1,
inc = previous_day_admission_adult_covid_confirmed +
previous_day_admission_pediatric_covid_confirmed
)
} else if (measure == "flu hospitalizations") {
if (!("previous_day_admission_influenza_confirmed" %in% colnames(result$data[[1]]))) {
stop("Flu hospitalizations not available for specified issue date.")
}
result$data[[1]] <- result$data[[1]] %>%
dplyr::transmute(
abbreviation = state,
date = as.Date(date) - 1,
inc = previous_day_admission_influenza_confirmed
)
} else {
stop("Invalid measure: must be either 'hospitalizations' or ",
"'flu hospitalizations'.")
}
# add US location by summing across all others
result$data[[1]] <- dplyr::bind_rows(
result$data[[1]],
result$data[[1]] %>%
dplyr::group_by(date) %>%
dplyr::summarise(inc = sum(inc), .groups = "drop") %>%
dplyr::mutate(abbreviation = "US")
)
# add location column, remove abbreviation
result$data[[1]] <- result$data[[1]] %>%
dplyr::left_join(
fips_codes %>% dplyr::select(location, abbreviation),
by = "abbreviation"
) %>%
dplyr::select(-abbreviation)
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.