data-raw/save_gss.R

library(dplyr)
library(forcats)
library(srvyr)
library(ggplot2)

# pull gss data
temp <- tempfile()
download.file("https://gss.norc.org/documents/stata/GSS_stata.zip",temp)

# if this next line errors with "No such file or directory", try
# incrementing the number after "_R"
gss_orig <- haven::read_dta(unz(temp, filename = "GSS7218_R2.DTA")) %>%
  haven::as_factor()
unlink(temp)

# select relevant columns
gss_small <- gss_orig %>%
  filter(!stringr::str_detect(sample, "blk oversamp")) %>% # this is for weighting
  select(year, age, sex, college = degree, partyid, hompop,
         hours = hrs1, income, class, finrela, weight = wtssall) %>%
  mutate_if(is.factor, ~fct_collapse(., NULL = c("IAP", "NA", "iap", "na"))) %>%
  mutate(age = age %>%
           fct_recode("89" = "89 or older",
                      NULL = "DK") %>% # truncated at 89
           as.character() %>%
           as.numeric(),
         hompop = hompop %>%
           fct_collapse(NULL = c("DK")) %>%
           as.character() %>%
           as.numeric(),
         hours = hours %>%
           fct_recode("89" = "89+ hrs",
                      NULL = "DK") %>% # truncated at 89
           as.character() %>%
           as.numeric(),
         weight = weight %>%
           as.character() %>%
           as.numeric(),
         partyid = fct_collapse(partyid,
           dem = c("strong democrat", "not str democrat"),
           rep = c("strong republican", "not str republican"),
           ind = c("ind,near dem", "independent", "ind,near rep"),
           other = "other party"
         ),
         income = factor(income, ordered = TRUE),
         college = fct_collapse(college,
           degree = c("junior college", "bachelor", "graduate"),
           "no degree"  = c("lt high school", "high school"),
           NULL = "dk" # no dks show up in the data, so drop this level
         )
         )

# sample 3k rows, first dropping NAs
set.seed(20200201)
gss <- gss_small %>%
  drop_na() %>%
  sample_n(500)

# check that the sample is similar unweighted to weighted
gss_wt <- srvyr::as_survey_design(gss, weights = weight)

unweighted <- gss %>%
  group_by(year, sex, partyid) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  group_by(year, sex) %>%
  mutate(prop = n / sum(n))

weighted <- gss_wt %>%
  group_by(year, sex, partyid) %>%
  summarize(prop = srvyr::survey_mean())

# save data into package
usethis::use_data(gss, overwrite = TRUE)

devtools::document()
tidymodels/infer documentation built on March 28, 2024, 7:02 p.m.