#' Prepare CE data for calculating an estimated mean or median
#'
#' @description Reads in the family characteristics (FMLI/-D) and expenditure
#' tabulation (MTBI/EXPD) files and merges the relevant data for calculating a
#' weighted mean or median.
#'
#' @param year A year between 1997 and the last year of available CE PUMD.
#' @param survey One of either interview, diary, or integrated as a character or
#' symbol.
#' @param uccs A character vector of UCCs corresponding to expenditure
#' categories in the hierarchical grouping (HG) for a given year and survey.
#' @param hg A data frame that has, at least, the title, level, ucc, and
#' factor columns of a CE HG file. Calling [ce_hg()] will generate a
#' valid HG file.
#' @param ... Variables to include in the dataset from the family
#' characteristics file. This is intended to allow the user to calculate
#' estimates for subsets of the data.
#' @param recode_variables A logical indicating whether to recode all coded
#' variables except 'UCC' using the codes in the CE's excel dictionary which can
#' be downloaded from the
#' [CE Documentation Page](https://www.bls.gov/cex/pumd_doc.htm)
#' @param int_zp String indicating the path of the Interview data zip file(s) if
#' already stored. If the file(s) does not exist its corresponding zip file will
#' be stored in that path. The default is `NULL` which causes the zip file
#' to be stored in temporary memory during function operation.
#' @param dia_zp Same as `int_zp` above, but for Diary data.
#' @param dict_path A string indicating the path where the CE PUMD dictionary
#' is stored if already stored. If the file does not exist and
#' `recode_variables = TRUE` the dictionary will be stored in this path.
#' The default is `NULL` which causes the zip file to be stored in
#' temporary memory during function operation. Automatically changed to
#' `NULL` if a valid input for `own_codebook` is given.
#' @param own_codebook An optional data frame containing a user-defined codebook
#' containing the same columns as the CE Dictionary "Codes " sheet. If the input
#' is not a data frame or does not have all of the required columns, the
#' function will give an error message. See details for the required columns.
#'
#' @return A data frame containing the following columns:
#' \itemize{
#' \item newid - A consumer unit (CU), or household, identifier
#' \item finlwt21 - CU weight variable
#' \item wtrep01 through wtrep44 - CU replicate weight variables (see details)
#' \item ... - Any family characteristics variables that were kept
#' \item mo_scope - Months in scope (see details)
#' \item popwt - An adjusted weight meant to account for the fact that a CUs
#' value of finlwt21 is meant to be representative of only 1 quarter of
#' data (see details)
#' \item ucc - The UCC for a given expenditure
#' \item ref_yr - The year in which the corresponding expenditure occurred
#' \item ref_mo - The month in which the corresponding expenditure occurred
#' \item cost - The value of the expenditure (in U.S. Dollars)
#' \item survey - An indicator of which survey the data come from: "I" for
#' Interview and "D" for Diary.
#' }
#'
#' @details
#' CE microdata include 45 weights. The primary weight that is used for
#' calculating estimated means and medians is finlwt21. The 44 replicate weights
#' are computed using Balanced Repeated Replication (BRR) and are used for
#' calculating weighted standard errors.
#'
#' "Months in scope" refers to the proportion of the data collection quarter for
#' which a CU reported expenditures. For the Diary survey the months in scope is
#' always 3 because the expenditure data collected are meant to be reported for
#' the quarter in which they are collected. The Interview Survey, on the other
#' hand, is a quarterly, rolling, recall survey and the CU's report expenditures
#' for the 3 months previous to the month in which the data are collected. For
#' example, if a CU was interviewed in February 2017, then they would be
#' providing data for November 2016, December 2016, and January 2017. If one is
#' calculating a weighted estimated mean for the 2017 calendar year, then only
#' the January 2017 data would be "in scope."
#'
#' CE data are reported quarterly, but the sum of the weights (finlwt21) is
#' for all CU's is meant to represent the total number of U.S. CU's for a given
#' year. Since a calculating a calendar year estimate requires the use of 4
#' quarters of data and the sum of the weights in each quarter equals the
#' number of households in the U.S. for a given year, adding up the sums of the
#' weights in the 4 quarters of data would yield a total number of households
#' that is approximately 4 times larger than the actual number of households in
#' the U.S. in the corresponding year.
#'
#' Since some UCC's can appear in both surveys, for the purposes of integration,
#' the CE has a source selection procedure by which to choose which source data
#' will be taken from for a given UCC. For example, of the 4 UCC's in the "Pets"
#' category in 2017 two were sourced for publication from the Diary and two from
#' the Interview. Please download the CE Source Selection Document for a
#' complete listing: <https://www.bls.gov/cex/ce_source_integrate.xlsx>.
#'
#' Family characteristic variables added through "..." will be read in as
#' character data type.
#'
#' @export
#'
#' @importFrom readxl excel_sheets
#' @importFrom readxl cell_cols
#' @importFrom dplyr select
#' @importFrom dplyr filter
#' @importFrom dplyr across
#' @importFrom dplyr mutate
#' @importFrom tidyr replace_na
#' @importFrom janitor clean_names
#'
#' @examples
#' \dontrun{
#' # The following workflow will prepare a dataset for calculating integrated
#' # pet expenditures for 2021 keep the "sex_ref" variable in the data to
#' # potentially calculate means by sex of the reference person.
#'
#' # First generate an HG file
#' my_hg <- ce_hg(2021, "integrated", "CE-HG-Inter-2021.txt")
#'
#' # Store a vector of UCC's in the "Pets" category
#' pet_uccs <- ce_uccs(my_hg, "Pets")
#'
#' # Store the diary data (not run)
#' pets_dia <- ce_prepdata(
#' year = 2021,
#' survey = "integrated",
#' uccs = pet_uccs,
#' integrate_data = FALSE,
#' hg = my_hg,
#' dia_zip = "diary21.zip"
#' sex_ref
#' )
#' }
# !diagnostics suppress = last_year, first_year, variable_name, code_value
# !diagnostics suppress = code_description
ce_prepdata <- function(year,
survey,
hg,
uccs,
...,
int_zp = NULL,
dia_zp = NULL,
recode_variables = FALSE,
dict_path = NULL,
own_codebook = NULL) {
survey <- rlang::ensym(survey)
survey_name <- rlang::as_string(survey) |> tolower()
grp_vars <- rlang::ensyms(...)
grp_var_names <- purrr::map(grp_vars, rlang::as_string) |>
unlist() |>
tolower()
if (year < 1997) {
stop("cepumd only works with data from 1997 onward.")
}
if (!survey_name %in% c("interview", "diary", "integrated")) {
stop("'survey' must be one of 'interview,' 'diary,' or 'integrated.'")
}
if (length(uccs) > 0 && is.character(uccs)) {
for (u in uccs) {
if (is.na(as.numeric(u)) || nchar(u) != 6) {
stop(
paste0(
"'", u, "' is not a valid UCC. Please review the CE PUMD",
" documentation."
)
)
}
}
} else {
stop("Please enter a valid UCC")
}
if (!is.null(hg)) {
if (
!is.data.frame(hg) ||
!all(c("title", "level", "ucc", "factor") %in% names(hg))
) {
stop(
paste(
"'hg' requires a valid HG dataframe. Please generate one using",
"ce_hg()."
)
)
}
}
if (recode_variables) {
if (!is.null(own_codebook)) {
if (
!is.data.frame(own_codebook) ||
!all(
c(
"survey", "file", "variable", "code_value", "code_description",
"first_year", "first_quarter", "last_year", "last_quarter"
) %in%
names(janitor::clean_names(own_codebook))
)
) {
stop(
stringr::str_c(
"Your codebook either is not a data frame or does not have ",
"the required columns. It should have:\n",
"survey, file, variable, code_value, code_description, ",
"first_year, first_quarter, last_year, last_quarter"
)
)
}
if (!all({{grp_var_names}} %in% tolower(own_codebook$variable))) {
warning(
"Some of your grouping variable(s) is (are) were not found in your.",
"codebook. Only variables found in the codebook will be recoded."
)
}
ce_codes <- own_codebook |>
dplyr::mutate(
variable = stringr::str_to_lower(.data$variable),
survey = stringr::str_to_upper(.data$survey) |> stringr::str_sub(1, 1)
)
rm(dict_path)
} else {
if (is.null(dict_path)) {
stop(
"Please provide a valid file path to your codebook (CE Dictionary) ",
"in order to recode variables."
)
} else if (isFALSE(file.exists(dict_path))) {
stop(
"Please provide a valid file path to your codebook (CE Dictionary) ",
"in order to recode variables."
)
}
code_sheet <- grep(
"^Codes",
readxl::excel_sheets(dict_path),
value = TRUE
)
ce_codes <- suppressWarnings(
readxl::read_excel(
dict_path,
sheet = code_sheet,
range = readxl::cell_cols("A:J"),
guess_max = 4000
)
) |>
janitor::clean_names() |>
dplyr::mutate(
survey = stringr::str_sub(.data$survey, 1, 1),
variable = stringr::str_to_lower(.data$variable),
last_year = dplyr::if_else(
is.na(.data$last_year),
max(.data$last_year, na.rm = TRUE),
.data$last_year
)
) |>
dplyr::filter(
.data$first_year <= year,
.data$last_year >= year,
) |>
dplyr::group_by(
.data$survey, .data$file, .data$variable, .data$code_value
) |>
dplyr::slice_max(.data$first_year, n = 1, with_ties = FALSE) |>
dplyr::slice_max(.data$first_quarter, n = 1, with_ties = FALSE) |>
dplyr::ungroup()
}
} # end "if (recode_variables)"
if (is.null(int_zp) && is.null(dia_zp)) {
stop(
"You must provide at least 1 zip file with data for either 'dia_zip' or ",
"'int_zip'. In previous versions of 'cepumd' can no longer download ",
"data automatically."
)
}
integrate_data <- ifelse(survey_name == "integrated", TRUE, FALSE)
if (survey_name %in% c("interview", "integrated")) {
# Create a vector of years for which data are required
if (year >= 2020) {
int_yrs <- stringr::str_sub(c(year - 1, year), 3, 4)
} else {
int_yrs <- stringr::str_sub(year, 3, 4)
}
# Create a vector of the required quarters for the given year(s)
int_qtrs <- c(
stringr::str_c(stringr::str_sub(year, 3, 4), 1:4),
stringr::str_c(stringr::str_sub((year + 1), 3, 4), 1)
)
interview_files <- get_survey_files(
year = year,
survey = "interview",
file_yrs = int_yrs,
qtrs = int_qtrs,
zp_file = int_zp
)
fmli <- purrr::map2_df(
interview_files$family$Name,
interview_files$family$zip_file,
\(x, y) read.fmli(x, y, year, !!!grp_vars)
) |>
dplyr::bind_rows()
mtbi <- purrr::map2_df(
interview_files$expenditure$Name,
interview_files$expenditure$zip_file,
\(x, y) {
read.mtbi(
x,
y,
year = year,
uccs = uccs,
integrate_data = integrate_data,
hg = hg
)
}
) |>
dplyr::bind_rows()
interview <- dplyr::left_join(fmli, mtbi, by = "newid") |>
dplyr::mutate(cost = dplyr::if_else(is.na(.data$cost), 0, .data$cost)) |>
dplyr::mutate(survey = "I")
if (recode_variables) {
interview <- recode_ce_variables(interview, ce_codes, "I")
}
}
if (survey_name %in% c("diary", "integrated")) {
dia_yrs <- stringr::str_sub(year, 3, 4)
dia_qtrs <- stringr::str_c(stringr::str_sub(year, 3, 4), 1:4)
diary_files <- get_survey_files(
year = year,
survey = "diary",
file_yrs = dia_yrs,
qtrs = dia_qtrs,
zp_file = dia_zp
)
fmld <- purrr::map2_df(
diary_files$family$Name,
diary_files$family$zip_file,
\(x, y) read.fmld(x, y, !!!grp_vars)
) |>
dplyr::bind_rows()
expd <- purrr::map2_df(
diary_files$expenditure$Name,
diary_files$expenditure$zip_file,
\(x, y) {
read.expd(
x,
y,
year = year,
uccs = uccs,
integrate_data = integrate_data,
hg = hg
)
}
) |>
dplyr::bind_rows()
diary <- dplyr::left_join(fmld, expd, by = "newid") |>
dplyr::mutate(cost = dplyr::if_else(is.na(.data$cost), 0, .data$cost)) |>
dplyr::mutate(survey = "D")
if (recode_variables) diary <- recode_ce_variables(diary, ce_codes, "D")
}
if (survey_name == "integrated") {
return(dplyr::bind_rows(interview, diary))
} else if (survey_name == "interview") {
return(interview)
} else if (survey_name == "diary") {
return(diary)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.