data-raw/lou-vax-survey.R

library(tidycensus)
library(dplyr)

library(survey)
library(srvyr)
library(svrep)

set.seed(2014)

# Load the Census API key ----
  census_api_key(
    readLines("census-api-key")
  )

# Download PUMS data for Louisville ----
  lou_pums_data <- get_pums(
    variables = c("SEX", "RAC1P", "HISP", "AGEP", "SCHL"),
    survey = "acs5",
    year = 2019,
    state = "KY",
    puma = paste0("0170", 1:6),
    recode = TRUE,
    rep_weights = "person"
  )

# Add derived variables to use for calibration ----
  lou_pums_data <- lou_pums_data |>
    mutate(
      RACE_ETHNICITY = case_when(
        HISP_label != "Not Spanish/Hispanic/Latino" ~ "Hispanic or Latino",
        !as.character(RAC1P_label) %in% c("White alone", "Black or African American alone",
                            "Hispanic or Latino") ~ "Other Race, not Hispanic or Latino",
        TRUE ~ paste0(as.character(RAC1P_label), ", not Hispanic or Latino")
      ),
      EDUC_ATTAINMENT = case_when(
        SCHL_label %in% c("Associate's degree", "Bachelor's degree", "Master's degree",
                          "Professional degree beyond a bachelor's degree",
                          "Doctorate degree") ~ "High school or beyond",
        TRUE ~ "Less than high school"
      )
    )

  lou_pums_data <- lou_pums_data |>
    mutate(CONTROL_CATEGORY = interaction(RACE_ETHNICITY, SEX_label,
                                          EDUC_ATTAINMENT, sep = "|"))

# Convert to survey design object ----
  lou_rep_design <- tidycensus::to_survey(
    df = lou_pums_data,
    type = "person"
  )

# Generate population vaccination rates ----

  population_counts <- lou_rep_design |>
    filter(AGEP >= 18) |>
    survey_count(CONTROL_CATEGORY, name = "Population_Size") |>
    mutate(categories = strsplit(x = as.character(CONTROL_CATEGORY),
                                 split = "|", fixed = TRUE),
           RACE_ETHNICITY = sapply(categories, function(x) x[1]),
           SEX = sapply(categories, function(x) x[2]),
           EDUC_ATTAINMENT = sapply(categories, function(x) x[3])) |>
    select(-categories)

  pop_vax_rates <- population_counts |>
    mutate(
      VAX_RATE = case_when(
        grepl(x = RACE_ETHNICITY, "Black or African American alone") ~ 0.53,
        grepl(x = RACE_ETHNICITY, "White alone") ~ 0.58,
        grepl(x = RACE_ETHNICITY, "^Hispanic or Latino") ~ 0.48,
        TRUE ~ 0.50
      )
    ) |>
    mutate(
      VAX_ODDS = case_when(
        SEX == "Female" ~ VAX_RATE/(1-VAX_RATE) * 1.25,
        SEX == "Male" ~ VAX_RATE/(1-VAX_RATE) * 0.75
      )
    ) |>
    mutate(
      VAX_ODDS = case_when(
        EDUC_ATTAINMENT == "High school or beyond" ~ VAX_ODDS * 1.33,
        EDUC_ATTAINMENT == "Less than high school" ~ VAX_ODDS * 0.67
      )
    ) |>
    mutate(VAX_RATE = VAX_ODDS / (1 + VAX_ODDS))

# Generate response propensity variable ----

  pop_vax_rates <- pop_vax_rates |>
    mutate(
      RESP_PROPENSITY = case_when(
        grepl(x = RACE_ETHNICITY, "Black or African American alone") ~ 0.45,
        grepl(x = RACE_ETHNICITY, "White alone") ~ 0.48,
        grepl(x = RACE_ETHNICITY, "^Hispanic or Latino") ~ 0.28,
        TRUE ~ 0.45
      )
    ) |>
    mutate(
      RESP_ODDS = case_when(
        SEX == "Female" ~ RESP_PROPENSITY/(1-RESP_PROPENSITY) * 1.07,
        SEX == "Male" ~ RESP_PROPENSITY/(1-RESP_PROPENSITY) * 0.93
      )
    ) |>
    mutate(
      RESP_ODDS = case_when(
        EDUC_ATTAINMENT == "High school or beyond" ~ RESP_ODDS * 1.5,
        EDUC_ATTAINMENT == "Less than high school" ~ RESP_ODDS * 1
      )
    ) |>
    mutate(RESP_PROPENSITY = RESP_ODDS / (1 + RESP_ODDS))

# Draw simple random sample for vaccination survey ----

  lou_vax_survey <- pop_vax_rates |>
    mutate(SAMPLING_WEIGHT = sum(Population_Size) / 1000) |>
    select(RACE_ETHNICITY, SEX, EDUC_ATTAINMENT,
           Population_Size, VAX_RATE,
           RESP_PROPENSITY, SAMPLING_WEIGHT) |>
    sample_n(size = 1000, weight = Population_Size, replace = TRUE) |>
    select(-Population_Size)

  ##_ Generate vaccination status and response status
  lou_vax_survey <- lou_vax_survey |>
    mutate(VAX_STATUS = sapply(VAX_RATE, FUN = function(vax_prob) {
      ifelse(vax_prob > runif(n = 1), "Vaccinated", "Unvaccinated")
    })) |>
    mutate(VAX_STATUS = sapply(VAX_RATE, FUN = function(vax_prob) {
      ifelse(vax_prob > runif(n = 1), "Vaccinated", "Unvaccinated")
    })) |>
    mutate(RESPONSE_STATUS = sapply(RESP_PROPENSITY, FUN = function(resp_prob) {
      ifelse(resp_prob > runif(n = 1), "Respondent", "Nonrespondent")
    })) |>
    select(-VAX_RATE, -RESP_PROPENSITY)

  lou_vax_survey[['VAX_STATUS']] <- ifelse(
    lou_vax_survey[['RESPONSE_STATUS']] == "Nonrespondent", NA_character_,
    lou_vax_survey[['VAX_STATUS']]
  )

  ##_ Rearrange columns and shuffles rows
  lou_vax_survey <- lou_vax_survey |>
    select(RESPONSE_STATUS, SAMPLING_WEIGHT, everything()) |>
    sample_n(size = 1000, replace = FALSE)

  lou_vax_survey <- lou_vax_survey[,c(setdiff(colnames(lou_vax_survey),
                                              "SAMPLING_WEIGHT"),
                                      "SAMPLING_WEIGHT")]

# Estimate control totals ----

  ##_ For post-stratification

  poststratification_totals <- lou_rep_design |>
    filter(AGEP >= 18) |>
    svytotal(x = ~ CONTROL_CATEGORY)

  vcov_poststratification_totals <- vcov(poststratification_totals) |> as.matrix()

  poststratification_totals <- coef(poststratification_totals)
  names(poststratification_totals) <- gsub(
    x = names(poststratification_totals),
    pattern = "SEX_label", replacement = "SEX"
  )
  colnames(vcov_poststratification_totals) <- rownames(vcov_poststratification_totals) <- names(
    poststratification_totals
  )

  attributes(vcov_poststratification_totals)$means <- NULL

  lou_vax_survey_poststrat_totals <- list(
    'estimates' = poststratification_totals,
    'variance-covariance' = vcov_poststratification_totals
  )

  ##_ For raking

  raking_totals <- lou_rep_design |>
    filter(AGEP >= 18) |>
    svytotal(x = ~ RACE_ETHNICITY + SEX_label + EDUC_ATTAINMENT)

  vcov_raking_totals <- vcov(raking_totals) |> as.matrix()

  raking_totals <- coef(raking_totals)

  names(raking_totals) <- gsub(
    x = names(raking_totals),
    pattern = "SEX_label", replacement = "SEX"
  )
  colnames(vcov_raking_totals) <- rownames(vcov_raking_totals) <- names(
    raking_totals
  )

  attributes(vcov_raking_totals)$means <- NULL

  lou_vax_survey_raking_totals <- list(
    'estimates' = raking_totals,
    'variance-covariance' = vcov_raking_totals
  )

  lou_vax_survey_control_totals <- list(
    'poststratification' = lou_vax_survey_poststrat_totals,
    'raking' = lou_vax_survey_raking_totals
  )

# Create minimal dataset with replicate weights to use for examples ----


  ##_ Randomly select rows to retain in the dataset,
  ##_ and upweight to represent dropped rows

  sampled_ids <- lou_rep_design$variables |>
    filter(AGEP >= 18) |>
    group_by(SEX_label, RACE_ETHNICITY, EDUC_ATTAINMENT) |>
    slice(1:5) |>
    mutate(UNIQUE_ID = paste0(SERIALNO, SPORDER)) |>
    pull("UNIQUE_ID")

  subsampled_design <- lou_rep_design |>
    mutate(IS_SUBSAMPLED = paste0(SERIALNO, SPORDER) %in% sampled_ids) |>
    filter(AGEP >= 18) |>
    svrep::redistribute_weights(reduce_if = !IS_SUBSAMPLED,
                                increase_if = IS_SUBSAMPLED,
                                by = c("SEX_label", "RACE_ETHNICITY", "EDUC_ATTAINMENT")) |>
    filter(IS_SUBSAMPLED)

  minimal_pums_data <- subsampled_design |>
    mutate(UNIQUE_ID = 1:nrow(subsampled_design)) |>
    select(UNIQUE_ID, AGE = AGEP, SEX = SEX_label, RACE_ETHNICITY, EDUC_ATTAINMENT) |>
    cbind(cbind('PWGTP' = weights(subsampled_design, type = "sampling"),
                subsampled_design$repweights))

  lou_pums_microdata <- minimal_pums_data[sample(x = seq_len(nrow(minimal_pums_data)),
                                                 size = nrow(minimal_pums_data),
                                                 replace = FALSE),] |>
    dplyr::as_tibble()


# Save the dataset(s) of interest ----

  usethis::use_data(lou_vax_survey,
                    lou_vax_survey_control_totals,
                    lou_pums_microdata,
                    overwrite = TRUE,
                    compress = "xz")
bschneidr/svrep documentation built on Feb. 11, 2025, 4:24 a.m.