#' Simulate biased survey samples
#'
#' @param n Total survey sample to draw
#' @author Lauren A. Kennedy, Jonah Gabry, and Aki Vehtari
#' @description This code is taken from https://github.com/lauken13/rstanarm, and
#' designed by the three authors listed above.
#'
#'
simulate_mrp_data <- function(n) {
# levels in male or not, eth, age, income level, state
J <- c(2, 3, 7, 3, 50)
# Columns of post-strat matrix, plus one for size
poststrat <- as.data.frame(array(NA, c(prod(J), length(J) + 1)))
colnames(poststrat) <- c("male", "eth", "age", "income", "state", "N")
count <- 0
for (i1 in 1:J[1]) {
for (i2 in 1:J[2]) {
for (i3 in 1:J[3]) {
for (i4 in 1:J[4]) {
for (i5 in 1:J[5]) {
count <- count + 1
# Fill them in so we know what category we are referring to
poststrat[count, 1:5] <- c(i1 - 1, i2, i3, i4, i5)
}
}
}
}
}
# Proportion in each sample in the population
p_male <- c(0.52, 0.48)
p_eth <- c(0.5, 0.2, 0.3)
p_age <- c(0.2, .1, 0.2, 0.2, 0.10, 0.1, 0.1)
p_income <- c(.50, .35, .15)
p_state_tmp <- runif(50, 10, 20)
p_state <- p_state_tmp / sum(p_state_tmp)
poststrat$N <- 0
for (j in 1:prod(J)) {
poststrat$N[j] <- round(250e6 * p_male[poststrat[j, 1] + 1] * p_eth[poststrat[j, 2]] *
p_age[poststrat[j, 3]] * p_income[poststrat[j, 4]] * p_state[poststrat[j, 5]]) # Adjust the N to be the number observed in each category in each group
}
# Now let's adjust for the probability of response
p_response_baseline <- 0.01
p_response_male <- c(2, 0.8) / 2.8
p_response_eth <- c(1, 1.2, 2.5) / 3.7
p_response_age <- c(1, 0.4, 1, 1.5, 3, 5, 7) / 18.9
p_response_inc <- c(1, 0.9, 0.8) / 2.7
set.seed(02138)
p_response_state <- rbeta(50, 1, 1)
p_response_state <- p_response_state / sum(p_response_state)
p_response <- rep(NA, prod(J))
for (j in 1:prod(J)) {
p_response[j] <- p_response_baseline * p_response_male[poststrat[j, 1] + 1] *
p_response_eth[poststrat[j, 2]] * p_response_age[poststrat[j, 3]] *
p_response_inc[poststrat[j, 4]] * p_response_state[poststrat[j, 5]]
}
people <- sample(prod(J), n, replace = TRUE, prob = poststrat$N * p_response)
## For respondent i, people[i] is that person's poststrat cell,
## some number between 1 and 32
n_cell <- rep(NA, prod(J))
for (j in 1:prod(J)) {
n_cell[j] <- sum(people == j)
}
coef_male <- c(0, -0.3)
coef_eth <- c(0, 0.6, 0.9)
coef_age <- c(0, -0.2, -0.3, 0.4, 0.5, 0.7, 0.8, 0.9)
coef_income <- c(0, -0.2, 0.6)
coef_state <- c(0, round(rnorm(49, 0, 1), 1))
coef_age_male <- t(cbind(
c(0, .1, .23, .3, .43, .5, .6),
c(0, -.1, -.23, -.5, -.43, -.5, -.6)
))
true.popn <- as_tibble(data.frame(poststrat[, 1:5], cat.pref = rep(NA, prod(J))))
for (j in 1:prod(J)) {
true.popn$cat.pref[j] <- plogis(coef_male[poststrat[j, 1] + 1] +
coef_eth[poststrat[j, 2]] +
coef_age[poststrat[j, 3]] +
coef_income[poststrat[j, 4]] +
coef_state[poststrat[j, 5]] +
coef_age_male[poststrat[j, 1] + 1, poststrat[j, 3]])
}
# male or not, eth, age, income level, state, city
y <- rbinom(n, 1, true.popn$cat.pref[people])
male <- poststrat[people, 1]
eth <- poststrat[people, 2]
age <- poststrat[people, 3]
income <- poststrat[people, 4]
state <- poststrat[people, 5]
sample <- tibble(cat.pref = y, male, age, eth, income, state, id = 1:length(people))
# make sure all variables are numerics..
return(list(sample = sample,
cell_count = as_tibble(poststrat),
cell_outcome = true.popn))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.