knitr::opts_chunk$set(echo = TRUE) options(scipen = 999) library(proftools) library(lemur.pack) library(extraDistr) library(dplyr) library(ggplot2) library(gridExtra)
We will use built-in data from our package.
data("ex1pop") data("ex1samp") pop <- ex1pop %>% select(HousePrice, HousePriceCat) samp <- ex1samp %>% select(Income, IncomeCat, HousePriceCat)
We observe the housing prices from the full population, but we only get raw incomes from a simple random sample. The SRS also contains binned housing price data.
countDf <- pop %>% group_by(HousePriceCat) %>% summarize(Count = n(), .groups = "drop") %>% mutate(Location = c(40000, 110000, 180000, 290000, 420000)) g1 <- ggplot(data = pop, mapping = aes(x = HousePrice)) + geom_histogram(breaks = seq(0, 560000, by = 20000), fill = "darkgreen", alpha = 0.5) + labs(x = "Housing Price", y = "Count") + geom_vline(xintercept = c(80000, 140000, 220000, 360000), linetype = "dashed") + geom_label(data = countDf, mapping = aes(x = Location, label = Count), y = 100) countDf <- samp %>% group_by(IncomeCat, HousePriceCat, .drop = FALSE) %>% summarize(Count = n(), .groups = "drop") g2 <- ggplot(data = countDf, mapping = aes(x = HousePriceCat, y = IncomeCat, fill = Count)) + geom_tile() + geom_label(data = countDf[countDf$Count > 0, ], mapping = aes(label = Count), fill = "white") + scale_fill_gradient(low = "white", high = "black") + scale_y_discrete(limits = rev(levels(countDf$IncomeCat))) + labs(x = "Housing Price", y = "Income") grid.arrange(g1, g2, nrow = 2, ncol = 1)
Our goal is to create synthetic populations using the observed raw data, in a way that respects the observed sample. Since we know the housing prices for the entire population, we will use those.
First, we will use a Normal distribution assumption for the incomes, and we will assume they are independent of housing prices.
\begin{align} X_c &= (I_c, H_c) \ X &= (I, H) \ H &\sim F(h) \ I &\sim Norm(\mu, \sigma^2) \end{align}
The following is a relatively straight-froward MCMC algorithm which will sample from the posterior representing this scenario.
indNormalMCMC <- function(sample, population, iterations = 1000) { ranges <- list( mins = list( Income = c(0, 40000, 65000, 110000), HousePrice = c(0, 80000, 140000, 220000, 360000) ), maxs = list( Income = c(39999, 64999, 109999, Inf), HousePrice = c(79999, 139999, 219999, 359999, Inf) ) ) modTable <- function(tabs, old, new) { oldElem <- matrix(0, nrow = 1, ncol = length(old)) for(i in 1:length(oldElem)) { oldElem[1, i] <- findInterval(old[i], vec = c(ranges$mins[[i]], Inf), all.inside = TRUE) } newElem <- matrix(0, nrow = 1, ncol = length(new)) for(i in 1:length(newElem)) { newElem[1, i] <- findInterval(new[i], vec = c(ranges$mins[[i]], Inf), all.inside = TRUE) } tabs[oldElem] <- tabs[oldElem] - 1 tabs[newElem] <- tabs[newElem] + 1 return(tabs) } toLLik <- function(tabs) { out <- 0 for(i in 1:nCatsH) { out <- out + suppressWarnings(dmvhyper(tab[, i], tabs[, i], tal[i], log = TRUE)) } return(out) } l1 <- levels(sample$IncomeCat) l2 <- levels(sample$HousePriceCat) tab <- table(sample[, -1], exclude = FALSE) mar <- as.numeric(table(population$HousePriceCat, exclude = FALSE)) tal <- as.numeric(table(sample$HousePriceCat, exclude = FALSE)) nCatsI <- length(l1) nCatsH <- length(l2) N <- nrow(population) mu <- 100000 sig <- 30000 step <- 30000 current <- data.frame( Income = rnorm(N, mu, sig), HousePrice = sample(population$HousePrice) ) currentMat <- matrix(0, nrow = nCatsI, ncol = nCatsH) while(!is.finite(toLLik(currentMat))) { current$Income <- rnorm(N, mu, sig) currentMat <- current %>% mutate( IncomeCat = case_when( Income < 40000 ~ "<40000", Income < 65000 ~ "40000-64999", Income < 110000 ~ "65000-109999", TRUE ~ ">=110000" ), HousePriceCat = case_when( HousePrice < 80000 ~ "<80000", HousePrice < 140000 ~ "80000-139999", HousePrice < 220000 ~ "140000-219999", HousePrice < 360000 ~ "220000-359999", TRUE ~ ">=360000" ) ) %>% transmute( IncomeCat = factor(IncomeCat, levels = c("<40000", "40000-64999", "65000-109999", ">=110000")), HousePriceCat = factor(HousePriceCat, levels = c("<80000", "80000-139999", "140000-219999", "220000-359999", ">=360000")) ) %>% table(., exclude = FALSE) %>% matrix(., ncol = nCatsH) } out <- list() for(i in 1:iterations) { propDf <- current for(k in 1:N) { proposal <- rtnorm(1, current[k, 1], step, 0, Inf) propDf[k, 1] <- proposal propMat <- modTable(currentMat, current[k, ], propDf[k, ]) # compare alpha <- toLLik(propMat) + dnorm(proposal, mu, sig, log = TRUE) - toLLik(currentMat) - dnorm(current[k, 1], mu, sig, log = TRUE) + dtnorm(current[k, 1], proposal, step, 0, Inf, log = TRUE) - dtnorm(proposal, current[k, 1], step, 0, Inf, log = TRUE) if(is.finite(alpha) & log(runif(1)) < alpha) { # Acceptance current <- propDf currentMat <- propMat } else { # Rejection propDf <- current } } out[[i]] <- current } return(out) }
prof <- profileExpr({ test <- indNormalMCMC(samp, pop, 10) }) # test <- indNormalMCMC(sample, population, 60000) # # synthPops <- test[50001:60000] # # save(synthPops, file = paste0(getwd(), "/synthPops.rda")) # # funSummary(prof) # # test[[length(test) - 1]] %>% # mutate( # IncomeCat = case_when( # Income < 40000 ~ "<40000", # Income < 65000 ~ "40000-64999", # Income < 110000 ~ "65000-109999", # TRUE ~ ">=110000" # ), # HousePriceCat = case_when( # HousePrice < 80000 ~ "<80000", # HousePrice < 140000 ~ "80000-139999", # HousePrice < 220000 ~ "140000-219999", # HousePrice < 360000 ~ "220000-359999", # TRUE ~ ">=360000" # ) # ) %>% # transmute( # IncomeCat = factor(IncomeCat, levels = c("<40000", "40000-64999", "65000-109999", ">=110000")), # HousePriceCat = factor(HousePriceCat, levels = c("<80000", "80000-139999", "140000-219999", "220000-359999", ">=360000")) # ) %>% # table(., exclude = FALSE)
load(file = "synthPops.rda") dfPops <- bind_rows(synthPops) avgPop <- dfPops %>% mutate( IncomeCat = case_when( Income < 40000 ~ "<40000", Income < 65000 ~ "40000-64999", Income < 110000 ~ "65000-109999", TRUE ~ ">=110000" ), HousePriceCat = case_when( HousePrice < 80000 ~ "<80000", HousePrice < 140000 ~ "80000-139999", HousePrice < 220000 ~ "140000-219999", HousePrice < 360000 ~ "220000-359999", TRUE ~ ">=360000" ) ) %>% transmute( IncomeCat = factor(IncomeCat, levels = c("<40000", "40000-64999", "65000-109999", ">=110000")), HousePriceCat = factor(HousePriceCat, levels = c("<80000", "80000-139999", "140000-219999", "220000-359999", ">=360000")) ) %>% group_by(IncomeCat, HousePriceCat, .drop = FALSE) %>% summarize(Count = n() / 10000, .groups = "drop") g2 <- ggplot(data = avgPop, mapping = aes(x = HousePriceCat, y = IncomeCat, fill = Count)) + geom_tile() + geom_label(data = avgPop[avgPop$Count > 0, ], mapping = aes(label = Count), fill = "white") + scale_fill_gradient(low = "white", high = "black") + scale_y_discrete(limits = rev(levels(avgPop$IncomeCat))) + labs(x = "Housing Price", y = "Income") g2
We are using a proper Normal prior for the incomes; in this case, we chose a mean of 100000 and standard deviation of 30000. If we look at our incomes by themselves, and compare this to random draws from our prior, we can see that
randDf <- data.frame(Income = rnorm(100000000, 100000, 30000)) %>% mutate( IncomeCat = case_when( Income < 40000 ~ "<40000", Income < 65000 ~ "40000-64999", Income < 110000 ~ "65000-109999", TRUE ~ ">=110000" ) ) %>% transmute( IncomeCat = factor(IncomeCat, levels = c("<40000", "40000-64999", "65000-109999", ">=110000")) ) %>% group_by(IncomeCat, .drop = FALSE) %>% summarize(Count = n() / 100000, .groups = "drop") ggplot(data = avgPop, mapping = aes(y = IncomeCat, x = Count, fill = "Posterior")) + geom_bar(stat = "identity", alpha = 0.5) + geom_bar(data = randDf, inherit.aes = FALSE, mapping = aes(y = IncomeCat, x = Count, fill = "Prior"), stat = "identity", alpha = 0.5) + scale_fill_manual(values = c("Prior" = "green", "Posterior" = "Blue")) + labs(fill = "Distribution")
Means...
plot(density(sapply(synthPops, function(x) mean(x[, 1]))), main = "Density of Synthetic Population Mean Incomes", xlab = "Mean Income")
plot(1:length(synthPops), sapply(synthPops, function(x) mean(x[, 1])), main = "Trace Plot of Populations Means", xlab = "Mean Income", type = "l")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.