knitr::opts_chunk$set( collapse = TRUE, comment = "#>", cache = FALSE, fig.path = "man/figures/README-", out.width = "100%" ) library(svysim) library(tidyverse) library(scales) library(glue)
The goal of svysim
is to facilitate creating realistic simulations of (biased) survey sampling.
Abstract: Surveys are used to teach sampling distributions and selection biases all the time, but illustrating a sampling distribution is difficult (because often we only have one survey) and most simulations are unrealistic (because they draw from a population with uncorrelated random variables, or because they assume a simple random sample from that population. This package provides some ready-made datasets from the CCES and Census-related datasets along with some customizable sampling schemes, using DeclareDesign.
remotes::install_github("kuriwaki/svysim")
Make a population dataset that expands the common sample via weights. Though the weights are not frequency weights, we treat it as such for simplicity.
pop_wtd <- pop_cces %>% mutate(weight_rounded = ceiling(weight_cumulative)) %>% tidyr::uncount(weights = weight_rounded) pop_wtd
Declare it formally as a population using the DeclareDesign
package.
popn <- DeclareDesign::declare_population(pop_cces)
Pick a propensity score. For example to oversample highly educated, the true propensity score we chose is:
pop_psc <- pop_wtd %>% mutate( psc_highed = p_highed(.data), psc_eddem = p_eddem(.data), )
pop_psc %>% select(matches("psc")) %>% pivot_longer(everything()) %>% mutate(pscore = recode_factor(name, psc_highed = "Education Oversample", psc_eddem = "Education + Dem Oversample")) %>% ggplot(aes(x = value, y = stat(width*density))) + facet_grid(~ pscore) + geom_histogram(bins = 50) + labs(x = "Propensity Score", y = "Proportion")
Take samples, e.g. a 0.1 percent sample with no replacement
stat_dem(pop_wtd) # true value samp0 <- samp_with(pop_psc, samp_srs, n = 600) stat_dem(samp0) samp1 <- samp_with(pop_psc, samp_pscore, varname = psc_highed, n = 600) stat_dem(samp1) samp2 <- samp_with(pop_psc, samp_pscore, varname = psc_eddem, n = 600) stat_dem(samp2)
Take Multiple Samples. This will take several minutes depending on how many independent samples we draw.
pop_mu <- stat_dem(pop_wtd) n_surveys <- 1000 n_samp <- 600 samps0 <- map_dbl(1:n_surveys, function(x) stat_dem(samp_with(pop_psc, samp_srs, n_samp))) samps1 <- map_dbl(1:n_surveys, function(x) stat_dem(samp_with(pop_psc, samp_pscore, varname = psc_highed, n = n_samp))) samps2 <- map_dbl(1:n_surveys, function(x) stat_dem(samp_with(pop_psc, samp_pscore, varname = psc_eddem, n = n_samp)))
Plot the Sampling Distributions
library(scales) library(glue) sampling_df <- bind_rows( tibble(method = "SRS", muhat = samps0), tibble(method = "Education Bias", muhat = samps1), tibble(method = "Ed. + PID Bias", muhat = samps2), ) %>% mutate(method = fct_inorder(method)) txt <- glue("Truth: {percent(pop_mu, accuracy = 0.1)}\n", "SRS Mean: {percent(mean(samps0), accuracy = 0.1)}\n", "Educaton Biased Mean: {percent(mean(samps1), accuracy = 0.1)}\n", "Ed. + PID Biased Mean: {percent(mean(samps2), accuracy = 0.1)}") sampling_df %>% ggplot(aes(x = muhat, fill = method)) + geom_histogram(alpha = 0.5, aes(y = stat(width*density)), position = position_identity(), bins = 25) + geom_vline(xintercept = pop_mu, linetype = "dashed") + annotate("text", x = pop_mu, y = Inf, label = txt, hjust = 0, vjust = 1.2) + theme_minimal() + scale_x_continuous(labels = percent_format(accuracy = 1)) + labs(x = "Estimated Proportion of Democrats in the Population", fill = "Sampling Method", y = "Proportion", caption = glue("{n_surveys} independent polls (each n = {n_samp}) from the same popualation"))
kuriwaki/ccesMRPprep
kuriwaki/ddi
kuriwaki/sparseregMRP
kuriwaki/synthArea
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.