# Attach the packages used throughout this script.
library(glptools)
glp_load_packages()
library(arrow)

# Main ACS microdata extract (CSV) read by the chunked import below.
path <- "../../../glp_data_files/microdata/usa_00108.csv"

# Folder holding the replicate-weight extracts. File names are glued onto this
# string with %p% further down, so the trailing slash is kept.
data_path <- "../../../glp_data_files/microdata/acs_repwts/"
Read in the ACS microdata from usa_00108.csv, clean it, and write it to a feather file (acs_micro.feather).
Uses these files:

1. usa_00108.csv - variables for 2000-2021
2. usa_00088.csv - repwt person for 2005-2009
3. usa_00109.csv - repwt person for 2010-2017
4. usa_00105.csv - repwt person for 2018-2022
5. usa_00110.csv - repwt household for 2005-2009
6. usa_00106.csv - repwt household for 2010-2017
7. usa_00107.csv - repwt household for 2018-2022
#setwd("../../../glp_data_files/microdata/acs_repwts") #R.utils::gunzip("usa_00109.csv.gz")
# Read the American Community Survey extract in chunks, clean each chunk, and
# stack the cleaned chunks into acs_micro.

# Parse every column as double, because NA values throw off type guessing.
# A default column type is used so this does not have to be edited when the
# number of columns in the extract changes. To inspect a new extract, run:
#   acs_micro_test <- read_csv(path, n_max = 1)
#   spec(acs_micro_test)
types <- readr::cols(.default = readr::col_double())

# Number of data rows read per pass.
# As of 2022 the extract has 26-28M rows and the cleaned result is 5.9M rows.
chunk_rows <- 2000000

# The loop never initialises acs_micro itself (assign_row_join() has to cope
# with it being absent on the first pass), so drop any copy left over from an
# earlier run in this session; otherwise the new rows would be appended to it.
if (exists("acs_micro", inherits = FALSE)) {
  rm(acs_micro)
}

start_row <- 0
column_names <- NULL
continue <- TRUE

while (continue) {
  print(scales::comma(start_row))

  # Read data
  if (start_row == 0) {
    # First pass: the header line supplies the column names.
    temp_acs <- read_csv(path, n_max = chunk_rows, col_types = types)
    column_names <- names(temp_acs)
  } else {
    # Later passes: skip the header line plus the start_row data rows that
    # were already read. Skipping only start_row lines would re-read the last
    # data row of the first chunk.
    temp_acs <- read_csv(
      path,
      col_names = column_names,
      skip = start_row + 1,
      n_max = chunk_rows,
      col_types = types
    )
  }

  # A chunk shorter than chunk_rows means the end of the file was reached.
  # This is evaluated on every pass, including the first one.
  continue <- nrow(temp_acs) == chunk_rows
  gc()

  # When the file length is an exact multiple of chunk_rows the final read is
  # empty; there is nothing to clean or append in that case.
  if (nrow(temp_acs) > 0) {
    # Clean data
    temp_acs %<>% clean_acs_micro(gq = TRUE)

    # Join to output
    acs_micro <- assign_row_join(acs_micro, temp_acs)
  }

  rm(temp_acs)
  gc()

  # Increase start row
  start_row <- start_row + chunk_rows
}

write_feather(acs_micro, "acs_micro.feather")
This part reads in the replicate weights and chops them down to only the observations in acs_micro. It saves the results to feather files to make them easier to read back into R. This code only needs to be run when adding new observations to the replicate weights.
# Read in the ACS microdata, keeping only the identifiers needed to match
# replicate-weight records. The columns are selected at read time so the full
# table is never loaded into memory.
acs_micro <- arrow::read_feather(
  "acs_micro.feather",
  col_select = c(year, SERIAL, PERNUM)
)
gc()

# For each replicate weight file, use the chunked package to subset the file
# to only the observations included in acs_micro, then write the result to a
# feather file to be combined in the next step.

# Person files are matched on year + household serial + person number;
# household files are matched on year + household serial only.
person_keys <- c("YEAR" = "year", "SERIAL", "PERNUM")
household_keys <- c("YEAR" = "year", "SERIAL")

# One entry per extract: input CSV, output feather file, join keys, and
# whether duplicate rows are dropped after collecting (household files only).
repwt_files <- list(
  # Person 2005 - 2009
  list(csv = "usa_00088.csv", out = "rep1.feather",
       by = person_keys, dedupe = FALSE),
  # Person 2010 - 2017
  list(csv = "usa_00109.csv", out = "rep2.feather",
       by = person_keys, dedupe = FALSE),
  # Person 2018 - 2022
  list(csv = "usa_00105.csv", out = "rep3.feather",
       by = person_keys, dedupe = FALSE),
  # Household 2005 - 2009
  list(csv = "usa_00110.csv", out = "rep4.feather",
       by = household_keys, dedupe = TRUE),
  # Household 2010 - 2017
  list(csv = "usa_00106.csv", out = "rep5.feather",
       by = household_keys, dedupe = TRUE),
  # Household 2018 - 2022
  list(csv = "usa_00107.csv", out = "rep6.feather",
       by = household_keys, dedupe = TRUE)
)

for (repwt_file in repwt_files) {
  repwt <- chunked::read_csv_chunkwise(
    data_path %p% repwt_file$csv,
    chunk_size = 1000000
  )
  repwt %<>% semi_join(acs_micro, by = repwt_file$by)
  repwt %<>% collect()

  if (repwt_file$dedupe) {
    repwt %<>% distinct()
  }

  write_feather(repwt, data_path %p% repwt_file$out)

  # Free the subset before the next file is read.
  rm(repwt)
  gc()
}

rm(repwt_file, repwt_files, person_keys, household_keys)
gc()
This section cleans and organizes the replicate weights saved to feather files above. It keeps only the necessary variables and outputs feather files that can be joined to acs_micro. This code only needs to be run when adding new observations to the replicate weights.
## PERSON WEIGHTS

# Read and combine the person-level replicate weights
acs_replicate_person <- bind_rows(
  read_feather(data_path %p% "rep1.feather"),
  read_feather(data_path %p% "rep2.feather"),
  read_feather(data_path %p% "rep3.feather")
)
gc()

# Check the weights: the number of replicate-weight rows per year should equal
# the number of acs_micro person rows for that year.
test1 <- acs_replicate_person %>%
  count(YEAR, name = "repwt_n")

test2 <- acs_micro %>%
  filter(year > 2000) %>%
  count(year, name = "micro_n")

test3 <- left_join(test1, test2, by = c("YEAR" = "year")) %>%
  mutate(check = (repwt_n == micro_n))

# Report a mismatch (or a year missing from acs_micro, which gives NA) instead
# of leaving the comparison table unread.
if (!isTRUE(all(test3$check))) {
  print(test3)
  warning(
    "Person replicate-weight row counts do not match acs_micro ",
    "for at least one year.",
    call. = FALSE
  )
}

# Organize the data frame: drop the columns that are not needed for the join
# and move the identifiers to the front.
acs_replicate_person %<>%
  select(
    -REPWTP, -SAMPLE, -CBSERIAL, -HHWT, -CLUSTER, -STRATA, -GQ, -PERWT
  ) %>%
  select(year = YEAR, SERIAL, PERNUM, everything())

write_feather(acs_replicate_person, data_path %p% "person_repwts.feather")

rm(acs_replicate_person)
gc()

## HOUSEHOLD WEIGHTS

# Read and combine the household-level replicate weights
acs_replicate_household <- bind_rows(
  read_feather(data_path %p% "rep4.feather"),
  read_feather(data_path %p% "rep5.feather"),
  read_feather(data_path %p% "rep6.feather")
)
gc()

# Check the weights: the number of replicate-weight rows per year should equal
# the number of households per year in acs_micro. Households are counted as
# distinct (year, SERIAL) pairs rather than as PERNUM == 1 rows, so a
# household whose first person is absent from acs_micro is still counted.
test1 <- acs_replicate_household %>%
  count(YEAR, name = "repwt_n")

test2 <- acs_micro %>%
  filter(year > 2000) %>%
  distinct(year, SERIAL) %>%
  count(year, name = "micro_n")

test3 <- left_join(test1, test2, by = c("YEAR" = "year")) %>%
  mutate(check = (repwt_n == micro_n))

if (!isTRUE(all(test3$check))) {
  print(test3)
  warning(
    "Household replicate-weight row counts do not match acs_micro ",
    "for at least one year.",
    call. = FALSE
  )
}

# Organize the data frame: drop the columns that are not needed for the join,
# move the identifiers to the front, and drop duplicate rows.
acs_replicate_household %<>%
  select(-REPWT, -SAMPLE, -CBSERIAL, -HHWT, -CLUSTER, -STRATA, -GQ) %>%
  select(year = YEAR, SERIAL, everything()) %>%
  distinct()

write_feather(acs_replicate_household, data_path %p% "household_repwts.feather")

rm(acs_replicate_household)
rm(test1, test2, test3)
gc()
This section reads in the final acs_micro file and the replicate-weight files, joins them, and saves the output.
MR- I create one county-level output and one MSA-level output. Alternatively, if it would make your computer happier, you could create a person-level file and a household-level file. acs_micro_repwts
# Join the person- and household-level replicate weights to acs_micro and save
# the full file plus a version subset with pull_peers(geog = "FIPS").

# Re-read the full microdata: an acs_micro left in the session by the
# replicate-weight subsetting step holds only the identifier columns.
acs_micro <- read_feather("acs_micro.feather")
acs_replicates_person <- read_feather(data_path %p% "person_repwts.feather")
acs_replicates_household <- read_feather(data_path %p% "household_repwts.feather")

# Join person and household weights to acs_micro. Each person row should match
# at most one row in either weights table; relationship = "many-to-one" makes
# the join fail loudly instead of silently duplicating people if it does not.
acs_micro_repwts <- acs_micro %>%
  left_join(
    acs_replicates_person,
    by = c("year", "SERIAL", "PERNUM"),
    relationship = "many-to-one"
  ) %>%
  left_join(
    acs_replicates_household,
    by = c("year", "SERIAL"),
    relationship = "many-to-one"
  )

rm(acs_replicates_person, acs_replicates_household)
gc()

# Write to the working directory. `path` is the name of the input CSV file,
# not a folder, so it must not be used as a prefix for output file names.
write_feather(acs_micro_repwts, "acs_micro_repwts.feather")

acs_micro_FIPS_repwts <- acs_micro_repwts %>%
  pull_peers(geog = "FIPS")

write_feather(acs_micro_FIPS_repwts, "acs_micro_FIPS_repwts.feather")
Current Population Survey
# Read the Current Population Survey extract, clean it with
# clean_cps_micro(), and save the result as a feather file.
cps_micro <- read_csv("cps_micro.csv") %>%
  clean_cps_micro()

arrow::write_feather(x = cps_micro, sink = "cps_micro.feather")
SMART Behavioral Risk Factor Surveillance System
# Build the BRFSS microdata with brfss_time() and save it as a feather file.
# NOTE(review): "brfss" is presumably the folder holding the raw BRFSS files;
# confirm against the glptools documentation.
brfss_micro <- brfss_time("brfss")

arrow::write_feather(x = brfss_micro, sink = "brfss_micro.feather")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.