Nothing
## ---- SETTINGS-knitr, include=FALSE-------------------------------------------
stopifnot(require(knitr))
opts_chunk$set(
comment=NA,
message = FALSE,
warning = FALSE,
eval = identical(Sys.getenv("NOT_CRAN"), "true"),
dev = "png",
dpi = 150,
fig.asp = 0.618,
fig.width = 5,
out.width = "60%",
fig.align = "center"
)
## ----packages-1, message=FALSE------------------------------------------------
library(rstanarm)
library(ggplot2)
library(bayesplot)
theme_set(bayesplot::theme_default())
# options(mc.cores = 4)
## ----packages-2, eval=FALSE, message=FALSE------------------------------------
# library(dplyr)
# library(tidyr)
## ---- include=FALSE, collapse=TRUE--------------------------------------------
simulate_mrp_data <- function(n) {
J <- c(2, 3, 7, 3, 50) # male or not, eth, age, income level, state
poststrat <- as.data.frame(array(NA, c(prod(J), length(J)+1))) # Columns of post-strat matrix, plus one for size
colnames(poststrat) <- c("male", "eth", "age","income", "state",'N')
count <- 0
for (i1 in 1:J[1]){
for (i2 in 1:J[2]){
for (i3 in 1:J[3]){
for (i4 in 1:J[4]){
for (i5 in 1:J[5]){
count <- count + 1
# Fill them in so we know what category we are referring to
poststrat[count, 1:5] <- c(i1-1, i2, i3,i4,i5)
}
}
}
}
}
# Proportion in each sample in the population
p_male <- c(0.52, 0.48)
p_eth <- c(0.5, 0.2, 0.3)
p_age <- c(0.2,.1,0.2,0.2, 0.10, 0.1, 0.1)
p_income<-c(.50,.35,.15)
p_state_tmp<-runif(50,10,20)
p_state<-p_state_tmp/sum(p_state_tmp)
poststrat$N<-0
for (j in 1:prod(J)){
poststrat$N[j] <- round(250e6 * p_male[poststrat[j,1]+1] * p_eth[poststrat[j,2]] *
p_age[poststrat[j,3]]*p_income[poststrat[j,4]]*p_state[poststrat[j,5]]) #Adjust the N to be the number observed in each category in each group
}
# Now let's adjust for the probability of response
p_response_baseline <- 0.01
p_response_male <- c(2, 0.8) / 2.8
p_response_eth <- c(1, 1.2, 2.5) / 4.7
p_response_age <- c(1, 0.4, 1, 1.5, 3, 5, 7) / 18.9
p_response_inc <- c(1, 0.9, 0.8) / 2.7
p_response_state <- rbeta(50, 1, 1)
p_response_state <- p_response_state / sum(p_response_state)
p_response <- rep(NA, prod(J))
for (j in 1:prod(J)) {
p_response[j] <-
p_response_baseline * p_response_male[poststrat[j, 1] + 1] *
p_response_eth[poststrat[j, 2]] * p_response_age[poststrat[j, 3]] *
p_response_inc[poststrat[j, 4]] * p_response_state[poststrat[j, 5]]
}
people <- sample(prod(J), n, replace = TRUE, prob = poststrat$N * p_response)
## For respondent i, people[i] is that person's poststrat cell,
## some number between 1 and 32
n_cell <- rep(NA, prod(J))
for (j in 1:prod(J)) {
n_cell[j] <- sum(people == j)
}
coef_male <- c(0,-0.3)
coef_eth <- c(0, 0.6, 0.9)
coef_age <- c(0,-0.2,-0.3, 0.4, 0.5, 0.7, 0.8, 0.9)
coef_income <- c(0,-0.2, 0.6)
coef_state <- c(0, round(rnorm(49, 0, 1), 1))
coef_age_male <- t(cbind(c(0, .1, .23, .3, .43, .5, .6),
c(0, -.1, -.23, -.5, -.43, -.5, -.6)))
true_popn <- data.frame(poststrat[, 1:5], cat_pref = rep(NA, prod(J)))
for (j in 1:prod(J)) {
true_popn$cat_pref[j] <- plogis(
coef_male[poststrat[j, 1] + 1] +
coef_eth[poststrat[j, 2]] + coef_age[poststrat[j, 3]] +
coef_income[poststrat[j, 4]] + coef_state[poststrat[j, 5]] +
coef_age_male[poststrat[j, 1] + 1, poststrat[j, 3]]
)
}
#male or not, eth, age, income level, state, city
y <- rbinom(n, 1, true_popn$cat_pref[people])
male <- poststrat[people, 1]
eth <- poststrat[people, 2]
age <- poststrat[people, 3]
income <- poststrat[people, 4]
state <- poststrat[people, 5]
sample <- data.frame(cat_pref = y,
male, age, eth, income, state,
id = 1:length(people))
#Make all numeric:
for (i in 1:ncol(poststrat)) {
poststrat[, i] <- as.numeric(poststrat[, i])
}
for (i in 1:ncol(true_popn)) {
true_popn[, i] <- as.numeric(true_popn[, i])
}
for (i in 1:ncol(sample)) {
sample[, i] <- as.numeric(sample[, i])
}
list(
sample = sample,
poststrat = poststrat,
true_popn = true_popn
)
}
## ----include=FALSE, eval=FALSE------------------------------------------------
# mrp_sim <- simulate_mrp_data(n=1200)
# save(mrp_sim, file = "mrp-files/mrp_sim.rda", version = 2)
## ----eval=FALSE---------------------------------------------------------------
# mrp_sim <- simulate_mrp_data(n=1200)
# str(mrp_sim)
## ---- echo=FALSE--------------------------------------------------------------
load("mrp-files/mrp_sim.rda")
str(mrp_sim)
## ---- message=FALSE-----------------------------------------------------------
sample <- mrp_sim[["sample"]]
rbind(head(sample), tail(sample))
## ----message=FALSE------------------------------------------------------------
poststrat <- mrp_sim[["poststrat"]]
rbind(head(poststrat), tail(poststrat))
## ----message=FALSE------------------------------------------------------------
true_popn <- mrp_sim[["true_popn"]]
rbind(head(true_popn), tail(true_popn))
## ----order-states-------------------------------------------------------------
sample$state <- factor(sample$state, levels=1:50)
sample$state <- with(sample, factor(state, levels=order(table(state))))
true_popn$state <- factor(true_popn$state,levels = levels(sample$state))
poststrat$state <- factor(poststrat$state,levels = levels(sample$state))
## ----state-and-pop-data-for-plots, eval=FALSE, include=FALSE------------------
# # not evaluated to avoid tidyverse dependency
# income_popn <- poststrat %>%
# group_by(income) %>%
# summarize(Num=sum(N)) %>%
# mutate(PROP=Num/sum(Num),TYPE='Popn',VAR='Income',CAT=income) %>%
# ungroup()
# income_data <- sample %>%
# group_by(income) %>%
# summarise(Num=n()) %>%
# mutate(PROP=Num/sum(Num),TYPE='Sample',VAR='Income',CAT=income) %>%
# ungroup()
# income<-rbind(income_data[,2:6],income_popn[,2:6])
#
# age_popn <- poststrat%>%
# group_by(age)%>%
# summarize(Num=sum(N))%>%
# mutate(PROP=Num/sum(Num),TYPE='Popn',VAR='Age',CAT=age)%>%
# ungroup()
# age_data <- sample%>%
# group_by(age)%>%
# summarise(Num=n())%>%
# mutate(PROP=Num/sum(Num),TYPE='Sample',VAR='Age',CAT=age)%>%
# ungroup()
# age <- rbind(age_data[,2:6],age_popn[,2:6] )
#
# eth_popn <- poststrat%>%
# group_by(eth)%>%
# summarize(Num=sum(N))%>%
# mutate(PROP=Num/sum(Num),TYPE='Popn',VAR='Ethnicity',CAT=eth)%>%
# ungroup()
# eth_data <- sample%>%
# group_by(eth)%>%
# summarise(Num=n())%>%
# mutate(PROP=Num/sum(Num),TYPE='Sample',VAR='Ethnicity',CAT=eth)%>%
# ungroup()
# eth<-rbind(eth_data[,2:6],eth_popn[,2:6])
#
# male_popn <- poststrat%>%
# group_by(male)%>%
# summarize(Num=sum(N))%>%
# mutate(PROP=Num/sum(Num),TYPE='Popn',VAR='Male',CAT=male)%>%
# ungroup()
# male_data <- sample%>%
# group_by(male)%>%
# summarise(Num=n())%>%
# mutate(PROP=Num/sum(Num),TYPE='Sample',VAR='Male',CAT=male)%>%
# ungroup()
# male <- rbind(male_data[,2:6],male_popn[,2:6])
#
# state_popn <- poststrat%>%
# group_by(state)%>%
# summarize(Num=sum(N))%>%
# mutate(PROP=Num/sum(poststrat$N),TYPE='Popn',VAR='State',CAT=state)%>%
# ungroup()
#
# state_plot_data <- sample%>%
# group_by(state)%>%
# summarise(Num=n())%>%
# mutate(PROP=Num/nrow(sample),TYPE='Sample',VAR='State',CAT=state)%>%
# ungroup()
#
# state_plot_data <- rbind(state_plot_data[,2:6],state_popn[,2:6])
# state_plot_data$TYPE <- factor(state_plot_data$TYPE, levels = c("Sample","Popn"))
#
# plot_data <- rbind(male,eth,age,income)
# plot_data$TYPE <- factor(plot_data$TYPE, levels = c("Sample","Popn"))
#
# save(state_plot_data, file = "mrp-files/state_plot_data.rda", version = 2)
# save(plot_data, file = "mrp-files/plot_data.rda", version = 2)
## ----plot-data, echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"----
load("mrp-files/plot_data.rda") # created in previous chunk
ggplot(data=plot_data, aes(x=as.factor(CAT), y=PROP, group=as.factor(TYPE), linetype=as.factor(TYPE))) +
geom_point(stat="identity",colour='black')+
geom_line()+
facet_wrap( ~ VAR, scales = "free",nrow=1,ncol=5)+
theme_bw()+
scale_fill_manual(values=c('#1f78b4','#33a02c',
'#e31a1c','#ff7f00','#8856a7'),guide=FALSE)+
scale_y_continuous(breaks=c(0,.25,.5,.75,1), labels=c('0%','25%',"50%","75%","100%"))+
scale_alpha_manual(values=c(1, .3))+
ylab('Proportion')+
labs(alpha='')+
theme(legend.position="bottom",
axis.title.y=element_blank(),
axis.title.x=element_blank(),
legend.title=element_blank(),
legend.text=element_text(size=10),
axis.text=element_text(size=10),
strip.text=element_text(size=10),
strip.background = element_rect(fill='grey92'))
load("mrp-files/state_plot_data.rda") # created in previous chunk
ggplot(data=state_plot_data, aes(x=as.factor(CAT), y=PROP, group=as.factor(TYPE), linetype=as.factor(TYPE))) +
geom_point(stat="identity",colour='black')+
geom_line()+
facet_wrap( ~ VAR)+
theme_bw()+
scale_fill_manual(values=c('#1f78b4','#33a02c',
'#e31a1c','#ff7f00','#8856a7'),guide=FALSE)+
scale_y_continuous(breaks=c(0,.025,.05,1), labels=c('0%','2.5%',"5%","100%"),expand=c(0,0),limits=c(0,.06))+
scale_alpha_manual(values=c(1, .3))+
ylab('Proportion')+
labs(alpha='')+
theme(legend.position="bottom",
axis.title.y=element_blank(),
axis.title.x=element_blank(),
legend.title=element_blank(),
legend.text=element_text(size=10),
axis.text.y=element_text(size=10),
axis.text.x=element_text(size=8,angle=90),
strip.text=element_text(size=10),
strip.background = element_rect(fill='grey92'))
## ---- eval=FALSE, echo=FALSE--------------------------------------------------
# # not evaluated to avoid dependency on tidyverse
#
# #Summarise
# summary_by_poststrat_var <- sample %>%
# gather(variable,category,c("income","eth","age","male")) %>%
# group_by(variable,category) %>%
# #Wald confidence interval
# summarise(y_mean=mean(cat_pref),y_sd=sqrt(mean(cat_pref)*(1-mean(cat_pref))/n())) %>%
# ungroup()
# summary_by_poststrat_var$variable <- as.factor(summary_by_poststrat_var$variable)
# levels(summary_by_poststrat_var$variable) <- list('Age'='age','Ethnicity'='eth','Income'='income','Male'='male')
#
# save(summary_by_poststrat_var, file = "mrp-files/summary_by_poststrat_var.rda",
# version = 2)
## ----plot-summary-by-poststrat-var, echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"----
load("mrp-files/summary_by_poststrat_var.rda") # created in previous chunk
ggplot(data=summary_by_poststrat_var, aes(x=as.factor(category), y=y_mean,group=1)) +
geom_errorbar(aes(ymin=y_mean-y_sd, ymax=y_mean+y_sd), width=0)+
geom_line()+
geom_point()+
scale_colour_manual(values=c('#1f78b4','#33a02c','#e31a1c','#ff7f00',
'#8856a7'))+theme_bw()+
facet_wrap(~variable,scales = "free_x",nrow=1,ncol=5)+
scale_y_continuous(breaks=c(.5,.75,1), labels=c("50%","75%",
"100%"), limits=c(0.4-.4*.05,.9),expand = c(0,0))+
labs(x="",y="Cat preference")+
theme(legend.position="none",
axis.title.y=element_text(size=10),
axis.title.x=element_blank(),
axis.text=element_text(size=10),
strip.text=element_text(size=10),
strip.background = element_rect(fill='grey92'))
## ----interaction-summary, eval=FALSE, echo=FALSE------------------------------
# # not evaluated to avoid dependency on tidyverse
#
# #Summarise
# interaction <- sample %>%
# gather(variable, category, c("age", "eth")) %>%
# group_by(variable, category, male) %>%
# summarise(y_mean = mean(cat_pref),
# y_sd = sqrt(mean(cat_pref) * (1 - mean(cat_pref)) / n())) %>%
# ungroup()
#
# #Tidy for nice facet labels
# interaction$variable <- as.factor(interaction$variable)
# levels(interaction$variable) <- list('Ethnicity' = 'eth', 'Age' = 'age')
# save(interaction, file = "mrp-files/interaction.rda", version = 2)
## ----plot-interaction, echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"----
load("mrp-files/interaction.rda") # created in previous chunk
ggplot(data=interaction, aes(x=as.factor(category), y=y_mean, colour=as.factor(male),group=as.factor(male))) +
geom_errorbar(aes(ymin=y_mean-y_sd, ymax=y_mean+y_sd),width=0 )+
geom_line(aes(x=as.factor(category), y=y_mean,colour=as.factor(male)))+
geom_point()+
facet_wrap(~variable,scales = "free_x",nrow=1,ncol=2)+
labs(x="",y="Cat preference",colour='Gender')+
scale_y_continuous(breaks=c(0,.25,.5,.75,1), labels=c("0%",'25%',"50%","75%",
"100%"), limits=c(0,1),expand=c(0,0))+
scale_colour_manual(values=c('#4575b4','#d73027'))+theme_bw()+
theme(axis.title=element_text(size=10),
axis.text=element_text(size=10),
legend.position='none',
strip.text=element_text(size=10),
strip.background = element_rect(fill='grey92'))
## ---- eval=FALSE, echo=FALSE--------------------------------------------------
# # not evaluated to avoid dependency on tidyverse
#
# #Summarise by state
# preference_by_state <- sample %>%
# group_by(state) %>%
# summarise(y_mean = mean(cat_pref),
# y_sd = sqrt(mean(cat_pref) * (1 - mean(cat_pref)) / n())) %>%
# ungroup()
#
# save(preference_by_state, file = "mrp-files/preference_by_state.rda", version = 2)
## ---- echo=FALSE, fig.height = 4, fig.width = 8, fig.align = "center"---------
load("mrp-files/preference_by_state.rda")
compare <- ggplot(data=preference_by_state, aes(x=state, y=y_mean,group=1)) +
geom_ribbon(aes(ymin=y_mean-y_sd,ymax=y_mean+y_sd,x=state),fill='lightgrey',alpha=.7)+
geom_line(aes(x=state, y=y_mean))+
geom_point()+
scale_y_continuous(breaks=c(0,.25,.5,.75,1),
labels=c("0%","25%","50%","75%","100%"),
limits=c(0,1), expand=c(0,0))+
scale_x_discrete(drop=FALSE)+
scale_colour_manual(values=c('#1f78b4','#33a02c','#e31a1c','#ff7f00',
'#8856a7'))+
theme_bw()+
labs(x="States",y="Cat preference")+
theme(legend.position="none",
axis.title=element_text(size=10),
axis.text.y=element_text(size=10),
axis.text.x=element_text(angle=90,size=8),
legend.title=element_text(size=10),
legend.text=element_text(size=10))
compare2 <- ggplot()+
geom_hline(yintercept = mean(sample$cat_pref),size=.8)+
geom_text(aes(x = 5.2, y = mean(sample$cat_pref)+.025, label = "Sample"))+
scale_y_continuous(breaks=c(0,.25,.5,.75,1),
labels=c("0%","25%","50%","75%","100%"),
limits=c(-0.25,1.25),expand=c(0,0))+
theme_bw()+
labs(x="Popn",y="")+
theme(legend.position="none",
axis.title.y=element_blank(),
axis.title.x=element_text(size=10),
axis.text=element_blank(),
axis.ticks=element_blank(),
legend.title=element_text(size=10),
legend.text=element_text(size=10))
bayesplot_grid(compare,compare2,
grid_args = list(nrow=1, widths = c(8,1)))
## ---- message=FALSE, warning=FALSE, results='hide'----------------------------
fit <- stan_glmer(
cat_pref ~ factor(male) + factor(male) * factor(age) +
(1 | state) + (1 | age) + (1 | eth) + (1 | income),
family = binomial(link = "logit"),
data = sample
)
## -----------------------------------------------------------------------------
print(fit)
## ---- message=FALSE-----------------------------------------------------------
posterior_prob <- posterior_linpred(fit, transform = TRUE, newdata = poststrat)
poststrat_prob <- posterior_prob %*% poststrat$N / sum(poststrat$N)
model_popn_pref <- c(mean = mean(poststrat_prob), sd = sd(poststrat_prob))
round(model_popn_pref, 3)
## ---- message=FALSE-----------------------------------------------------------
sample_popn_pref <- mean(sample$cat_pref)
round(sample_popn_pref, 3)
## ---- message=FALSE,fig.height = 4, fig.width = 8, fig.align = "center"-------
compare2 <- compare2 +
geom_hline(yintercept = model_popn_pref[1], colour = '#2ca25f', size = 1) +
geom_text(aes(x = 5.2, y = model_popn_pref[1] + .025), label = "MRP", colour = '#2ca25f')
bayesplot_grid(compare, compare2,
grid_args = list(nrow = 1, widths = c(8, 1)))
## ---- message=FALSE-----------------------------------------------------------
true_popn_pref <- sum(true_popn$cat_pref * poststrat$N) / sum(poststrat$N)
round(true_popn_pref, 3)
## ---- echo=FALSE, message=FALSE,fig.height = 4, fig.width = 8, fig.align = "center"----
compare2 <- compare2 +
geom_hline(yintercept = mean(true_popn_pref), linetype = 'dashed', size = .8) +
geom_text(aes(x = 5.2, y = mean(true_popn_pref) - .025), label = "True")
bayesplot_grid(compare, compare2,
grid_args = list(nrow = 1, widths = c(8, 1)))
## ---- message=FALSE-----------------------------------------------------------
state_df <- data.frame(
State = 1:50,
model_state_sd = rep(-1, 50),
model_state_pref = rep(-1, 50),
sample_state_pref = rep(-1, 50),
true_state_pref = rep(-1, 50),
N = rep(-1, 50)
)
for(i in 1:length(levels(as.factor(poststrat$state)))) {
poststrat_state <- poststrat[poststrat$state == i, ]
posterior_prob_state <- posterior_linpred(
fit,
transform = TRUE,
draws = 1000,
newdata = as.data.frame(poststrat_state)
)
poststrat_prob_state <- (posterior_prob_state %*% poststrat_state$N) / sum(poststrat_state$N)
#This is the estimate for popn in state:
state_df$model_state_pref[i] <- round(mean(poststrat_prob_state), 4)
state_df$model_state_sd[i] <- round(sd(poststrat_prob_state), 4)
#This is the estimate for sample
state_df$sample_state_pref[i] <- round(mean(sample$cat_pref[sample$state == i]), 4)
#And what is the actual popn?
state_df$true_state_pref[i] <-
round(sum(true_popn$cat_pref[true_popn$state == i] * poststrat_state$N) /
sum(poststrat_state$N), digits = 4)
state_df$N[i] <- length(sample$cat_pref[sample$state == i])
}
state_df[c(1,3:6)]
state_df$State <- factor(state_df$State, levels = levels(sample$state))
## -----------------------------------------------------------------------------
round(100 * c(
mean = mean(abs(state_df$sample_state_pref-state_df$true_state_pref), na.rm = TRUE),
max = max(abs(state_df$sample_state_pref-state_df$true_state_pref), na.rm = TRUE)
))
## -----------------------------------------------------------------------------
round(100 * c(
mean = mean(abs(state_df$model_state_pref-state_df$true_state_pref)),
max = max(abs(state_df$model_state_pref-state_df$true_state_pref))
))
## ---- message=FALSE, echo=FALSE, fig.height = 4, fig.width = 8, fig.align = "center",warning=FALSE, fig.align = "center"----
#Summarise by state
compare <- compare +
geom_point(data=state_df, mapping=aes(x=State, y=model_state_pref),
inherit.aes=TRUE,colour='#238b45')+
geom_line(data=state_df, mapping=aes(x=State, y=model_state_pref,group=1),
inherit.aes=TRUE,colour='#238b45')+
geom_ribbon(data=state_df,mapping=aes(x=State,ymin=model_state_pref-model_state_sd,
ymax=model_state_pref+model_state_sd,group=1),
inherit.aes=FALSE,fill='#2ca25f',alpha=.3)+
geom_point(data=state_df, mapping=aes(x=State, y=true_state_pref),
alpha=.5,inherit.aes=TRUE)+
geom_line(data=state_df, mapping=aes(x=State, y=true_state_pref),
inherit.aes = TRUE,linetype='dashed')
bayesplot_grid(compare, compare2,
grid_args = list(nrow = 1, widths = c(8, 1)))
## ---- eval=FALSE--------------------------------------------------------------
# # not evaluated to avoid dependency on tidyverse
# sample_alt <- sample %>%
# group_by(male, age, income, state, eth) %>%
# summarise(N_cat_pref = sum(cat_pref), N = n()) %>%
# ungroup()
## ---- include=FALSE-----------------------------------------------------------
load("mrp-files/sample_alt.rda")
## ---- message=FALSE, warning=FALSE, results='hide'----------------------------
fit2 <- stan_glmer(
cbind(N_cat_pref, N - N_cat_pref) ~ factor(male) + factor(male) * factor(age) +
(1 | state) + (1 | age) + (1 | eth) + (1 | income),
family = binomial("logit"),
data = sample_alt,
refresh = 0
)
## -----------------------------------------------------------------------------
print(fit2)
## ---- message=FALSE-----------------------------------------------------------
posterior_prob_alt <- posterior_linpred(fit2, transform = TRUE, newdata = poststrat)
poststrat_prob_alt <- posterior_prob_alt %*% poststrat$N / sum(poststrat$N)
model_popn_pref_alt <- c(mean = mean(poststrat_prob_alt), sd = sd(poststrat_prob_alt))
round(model_popn_pref_alt, 3)
## -----------------------------------------------------------------------------
print(simulate_mrp_data)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.