library(glptools)
glp_load_packages()
library(arrow)
library(labelled)

Read in ACS microdata from acs_micro.csv, clean it, and write it to a feather file.

MR: I was a bit confused about the function "process_acs", so I renamed it to "clean_acs_micro" to make its purpose clear and to keep it from being confused with "process_census". It is located in glptools/R/microdata. I wonder if it should actually just live here, since this is the only place it's really used?

After you download the IPUMS extract, you can unzip it and rename it to acs_micro.csv to run the code below. You can rename the old file to hang onto it just in case, or you can delete it.
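The rename can also be scripted. A one-liner in base R, where usa_00087.csv stands in for whatever your extract is actually named:

file.rename("usa_00087.csv", "acs_micro.csv") #substitute the real extract name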

Depending on how much RAM you have, you might either A) run into issues reading acs_micro.csv in the first place or B) get an error that you can't allocate enough vector space when you run clean_acs_micro. If that happens, we can figure out how to do it piecewise or chunkwise in a way that works. (FYI: the function gc() instructs R to clear out any objects you removed from memory, hopefully giving you a fighting chance at running clean_acs_micro.)
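If you're not sure how many rows the extract has (and so how to size the pieces), you can count them without loading the file. A minimal sketch, assuming the R.utils package is installed:

n_rows <- R.utils::countLines("acs_micro.csv") - 1 #subtract 1 for the header row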

You could write a loop that reads acs_micro.csv and runs clean_acs_micro in several pieces, like so:

header <- names(read_csv("acs_micro.csv", n_max = 0, col_types = types)) #reuse column names; types defined below

for(i in 1:10) {
  temp <- read_csv("acs_micro.csv", col_names = header, col_types = types,
                   skip = (i - 1) * 800000 + 1, n_max = 800000) #skip the header row plus prior chunks
  temp %<>% clean_acs_micro(gq = FALSE)
  final_df <- assign_row_join(final_df, temp)
  rm(temp)
  gc()
}
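assign_row_join is the glptools helper that appends each cleaned chunk onto final_df. A plain-dplyr stand-in for the pattern it presumably follows (hypothetical, just to make the loop's logic explicit):

row_join <- function(df, new_rows) {
  # bind the new chunk onto the accumulator, treating a missing accumulator as empty
  if (is.null(df)) new_rows else dplyr::bind_rows(df, new_rows)
}
final_df <- NULL #initialize the accumulator before the loop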

# Read American Community Survey in chunks

#make every col_type double because NA values throw off readr's type guessing
#the number of repetitions below needs to match the number of columns in your data frame
#to check your column specs and see how many columns there are, you can run these lines:
#acs_micro_test <- read_csv("acs_micro.csv", n_max = 1)
#spec(acs_micro_test)
types <- list("d", "d")
types <- rep(types, 28)
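If you'd rather not count columns at all, readr can apply one type to every column:

#alternative: types <- cols(.default = col_double())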

#now actually read the data, in four chunks of 2,000,000 rows
acs_micro1 <- read_csv("acs_micro.csv", n_max = 2000000, col_types = types)
acs_micro2 <- read_csv("acs_micro.csv", col_names = names(acs_micro1), skip = 2000001,
                       n_max = 2000000, col_types = types) #skip counts the header row, hence the extra 1
acs_micro3 <- read_csv("acs_micro.csv", col_names = names(acs_micro1), skip = 4000001,
                       n_max = 2000000, col_types = types)
acs_micro4 <- read_csv("acs_micro.csv", col_names = names(acs_micro1), skip = 6000001,
                       col_types = types)
acs <- list(acs_micro1, acs_micro2, acs_micro3, acs_micro4)
gc()
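The four reads above follow a fixed pattern, so you could also generate the chunks in a loop; a sketch, assuming the same 2,000,000-row chunks:

#header <- names(read_csv("acs_micro.csv", n_max = 0, col_types = types))
#acs <- list()
#i <- 1
#repeat {
#  chunk <- read_csv("acs_micro.csv", col_names = header, col_types = types,
#                    skip = (i - 1) * 2000000 + 1, n_max = 2000000)
#  if (nrow(chunk) == 0) break
#  acs[[i]] <- chunk
#  i <- i + 1
#}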

#make loop because my computer can't handle all of it at once
for(x in acs) {

 temp <- x
 temp %<>% clean_acs_micro(gq = FALSE)
 final_df <- assign_row_join(final_df, temp)
 rm(temp)
 gc()

}

rm(acs, acs_micro1, acs_micro2, acs_micro3, acs_micro4, x)
gc()

#Do this instead if your computer is a stud
# Bind rows and clean the data frame
#acs_micro <- bind_rows(acs_micro1, acs_micro2, acs_micro3, acs_micro4)
#rm(acs_micro1, acs_micro2, acs_micro3, acs_micro4)
#gc()

#acs_micro %<>% clean_acs_micro(gq = FALSE)

write_feather(final_df, "acs_micro.feather")

This part reads in the replicate weights and chops them down to only the observations in acs_micro. It saves the results to feather files to make them easier to read back into R. This code only needs to be run when adding new observations to the replicate weights.

# Read in the ACS microdata and select only a minimal number of identifiers
acs_micro <- read_feather("acs_micro.feather")

acs_micro %<>% select(year, SERIAL, PERNUM)


# Read all replicate weights, subset to responses in acs_micro, and write to feather files
#   for easier processing. rep1 and rep2 hold the person replicate weights;
#   rep3 and rep4 hold the household replicate weights.
rep_files <- c(rep1 = "acs_repwts/usa_00084.csv",
               rep2 = "acs_repwts/usa_00088.csv",
               rep3 = "acs_repwts/usa_00086.csv",
               rep4 = "acs_repwts/usa_00089.csv")

for (r in names(rep_files)) {
  repwt <- chunked::read_csv_chunkwise(rep_files[[r]], chunk_size = 1000000)
  repwt %<>% semi_join(acs_micro, by = c("YEAR" = "year", "SERIAL", "PERNUM"))
  repwt %<>% collect()
  write_feather(repwt, paste0("acs_repwts/", r, ".feather"))
  rm(repwt)
  gc()
}

This section cleans and organizes the replicate weights saved to feather files above. It keeps only the necessary variables and outputs feather files that can be joined to acs_micro. This code only needs to be run when adding new observations to the replicate weights.

rep1 <- read_feather("acs_repwts/rep1.feather")
rep2 <- read_feather("acs_repwts/rep2.feather")
rep3 <- read_feather("acs_repwts/rep3.feather")
rep4 <- read_feather("acs_repwts/rep4.feather")

acs_replicate_person <- bind_rows(rep1, rep2)
acs_replicate_household <- bind_rows(rep3, rep4)

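# Household replicate weights repeat once per person in a household, so dropping
# PERNUM and taking distinct() leaves one row per household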
acs_replicate_household %<>%
  select(-PERNUM) %>%
  distinct()

acs_replicate_person %<>%
  select(-REPWTP) %>%
  select(year = YEAR, SERIAL, PERNUM, everything())

acs_replicate_household %<>%
  select(-REPWT) %>%
  select(year = YEAR, SERIAL, everything())

write_feather(acs_replicate_person,    "acs_repwts/person_repwts.feather")
write_feather(acs_replicate_household, "acs_repwts/household_repwts.feather")

This section reads in the final acs_micro file and the replicate weight files, joins them, and saves the output.

MR: I create one county-level output and one MSA-level output. Alternatively, if it would make your computer happier, you could create a person-level file and a household-level file.


acs_micro                <- read_feather("acs_micro.feather")
acs_replicates_person    <- read_feather("acs_repwts/person_repwts.feather")
acs_replicates_household <- read_feather("acs_repwts/household_repwts.feather")

# Join person and household weights to acs_micro
acs_micro_repwts <- left_join(acs_micro, acs_replicates_person, by = c("year", "SERIAL", "PERNUM")) %>%
  left_join(acs_replicates_household, by = c("year", "SERIAL"))
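# Optional sanity check: the joins should be one-to-one and preserve the row count
stopifnot(nrow(acs_micro_repwts) == nrow(acs_micro))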

acs_micro_FIPS_repwts <- acs_micro_repwts %>% 
  pull_peers(geog = "FIPS")

rm(acs_replicates_person)
rm(acs_replicates_household)
gc()

#Now save the data just for Louisville

acs_micro_Lou_repwts <- pull_Lou(acs_micro_repwts, geog = "MSA")

# arrow::write_feather never writes the whole file and just crashes, so use feather::write_feather instead
feather::write_feather(acs_micro_repwts, "acs_micro_repwts.feather")
feather::write_feather(acs_micro_FIPS_repwts, "acs_micro_FIPS_repwts.feather")
feather::write_feather(acs_micro_Lou_repwts, "acs_micro_Lou_repwts.feather")

Current Population Survey

cps_micro <- read_csv("cps_micro.csv") 
cps_micro %<>% clean_cps_micro() 
feather::write_feather(cps_micro, "cps_micro.feather") 

SMART Behavioral Risk Factor Surveillance System

brfss_micro <- brfss_time("brfss") 
feather::write_feather(brfss_micro, "brfss_micro.feather") 

