In ctgrubb/lemur.pack: Package for Team Lemur functional codebase

knitr::opts_chunk$set(echo = TRUE)

options(scipen = 999)

library(proftools)
library(lemur.pack)
library(extraDistr)
library(dplyr)
library(ggplot2)
library(gridExtra)

Introduction

Data

We will use built-in data from our package.

data("ex1pop")
data("ex1samp")

pop <- ex1pop %>%
  select(HousePrice, HousePriceCat)

samp <- ex1samp %>%
  select(Income, IncomeCat, HousePriceCat)

We observe the housing prices from the full population, but we only get raw incomes from a simple random sample. The SRS also contains binned housing price data.

countDf <- pop %>% 
  group_by(HousePriceCat) %>%
  summarize(Count = n(), .groups = "drop") %>%
  mutate(Location = c(40000, 110000, 180000, 290000, 420000))

g1 <- ggplot(data = pop, mapping = aes(x = HousePrice)) + 
  geom_histogram(breaks = seq(0, 560000, by = 20000), fill = "darkgreen", alpha = 0.5) + 
  labs(x = "Housing Price", y = "Count") + 
  geom_vline(xintercept = c(80000, 140000, 220000, 360000), linetype = "dashed") + 
  geom_label(data = countDf, mapping = aes(x = Location, label = Count), y = 100)

countDf <- samp %>%
  group_by(IncomeCat, HousePriceCat, .drop = FALSE) %>%
  summarize(Count = n(), .groups = "drop")

g2 <- ggplot(data = countDf, mapping = aes(x = HousePriceCat, y = IncomeCat, fill = Count)) + 
  geom_tile() + 
  geom_label(data = countDf[countDf$Count > 0, ], mapping = aes(label = Count), fill = "white") + 
  scale_fill_gradient(low = "white", high = "black") + 
  scale_y_discrete(limits = rev(levels(countDf$IncomeCat))) + 
  labs(x = "Housing Price", y = "Income")

grid.arrange(g1, g2, nrow = 2, ncol = 1)

Goal

Our goal is to create synthetic populations using the observed raw data, in a way that respects the observed sample. Since we know the housing prices for the entire population, we will use those.

Methods

Basic (Independent Normal, Hypergeometric)

First, we will use a Normal distribution assumption for the incomes, and we will assume they are independent of housing prices.

\begin{align} X_c &= (I_c, H_c) \ X &= (I, H) \ H &\sim F(h) \ I &\sim Norm(\mu, \sigma^2) \end{align}

The following is a relatively straight-froward MCMC algorithm which will sample from the posterior representing this scenario.

indNormalMCMC <- function(sample, population, iterations = 1000) {

  ranges <- list(
    mins = list(
      Income = c(0, 40000, 65000, 110000),
      HousePrice = c(0, 80000, 140000, 220000, 360000)
    ),
    maxs = list(
      Income = c(39999, 64999, 109999, Inf),
      HousePrice = c(79999, 139999, 219999, 359999, Inf)
    )
  )

  modTable <- function(tabs, old, new) {
    oldElem <- matrix(0, nrow = 1, ncol = length(old))
    for(i in 1:length(oldElem)) {
      oldElem[1, i] <- findInterval(old[i], vec = c(ranges$mins[[i]], Inf), all.inside = TRUE)
    }

    newElem <- matrix(0, nrow = 1, ncol = length(new))
    for(i in 1:length(newElem)) {
      newElem[1, i] <- findInterval(new[i], vec = c(ranges$mins[[i]], Inf), all.inside = TRUE)
    }

    tabs[oldElem] <- tabs[oldElem] - 1
    tabs[newElem] <- tabs[newElem] + 1

    return(tabs)
  }

  toLLik <- function(tabs) {
    out <- 0
    for(i in 1:nCatsH) {
      out <- out + suppressWarnings(dmvhyper(tab[, i], tabs[, i], tal[i], log = TRUE))
    }
    return(out)
  }

  l1 <- levels(sample$IncomeCat)
  l2 <- levels(sample$HousePriceCat)

  tab <- table(sample[, -1], exclude = FALSE)
  mar <- as.numeric(table(population$HousePriceCat, exclude = FALSE))
  tal <- as.numeric(table(sample$HousePriceCat, exclude = FALSE))

  nCatsI <- length(l1)
  nCatsH <- length(l2)

  N <- nrow(population)
  mu <- 100000
  sig <- 30000
  step <- 30000

  current <- data.frame(
    Income = rnorm(N, mu, sig),
    HousePrice = sample(population$HousePrice)
  )

  currentMat <- matrix(0, nrow = nCatsI, ncol = nCatsH)

  while(!is.finite(toLLik(currentMat))) {
    current$Income <- rnorm(N, mu, sig)

    currentMat <- current %>%
    mutate(
      IncomeCat = case_when(
        Income < 40000 ~ "<40000",
        Income < 65000 ~ "40000-64999",
        Income < 110000 ~ "65000-109999",
        TRUE ~ ">=110000"
      ),
      HousePriceCat = case_when(
        HousePrice < 80000 ~ "<80000",
        HousePrice < 140000 ~ "80000-139999",
        HousePrice < 220000 ~ "140000-219999",
        HousePrice < 360000 ~ "220000-359999",
        TRUE ~ ">=360000"
      )
    ) %>%
    transmute(
      IncomeCat = factor(IncomeCat, levels = c("<40000", "40000-64999", "65000-109999", ">=110000")),
      HousePriceCat = factor(HousePriceCat, levels = c("<80000", "80000-139999", "140000-219999", "220000-359999", ">=360000"))
    ) %>%
    table(., exclude = FALSE) %>%
    matrix(., ncol = nCatsH)
  }

  out <- list()

  for(i in 1:iterations) {
    propDf <- current
    for(k in 1:N) {
      proposal <- rtnorm(1, current[k, 1], step, 0, Inf)
      propDf[k, 1] <- proposal
      propMat <- modTable(currentMat, current[k, ], propDf[k, ])

      # compare
      alpha <- toLLik(propMat) + dnorm(proposal, mu, sig, log = TRUE) - 
        toLLik(currentMat) - dnorm(current[k, 1], mu, sig, log = TRUE) + 
        dtnorm(current[k, 1], proposal, step, 0, Inf, log = TRUE) - dtnorm(proposal, current[k, 1], step, 0, Inf, log = TRUE)
      if(is.finite(alpha) & log(runif(1)) < alpha) {
        # Acceptance
        current <- propDf
        currentMat <- propMat
      } else {
        # Rejection
        propDf <- current
      }
    }
    out[[i]] <- current
  }
  return(out)
}

prof <- profileExpr({
  test <- indNormalMCMC(samp, pop, 10)
})

# test <- indNormalMCMC(sample, population, 60000)
# 
# synthPops <- test[50001:60000]
# 
# save(synthPops, file = paste0(getwd(), "/synthPops.rda"))
# 
# funSummary(prof)
# 
# test[[length(test) - 1]] %>%
#   mutate(
#       IncomeCat = case_when(
#         Income < 40000 ~ "<40000",
#         Income < 65000 ~ "40000-64999",
#         Income < 110000 ~ "65000-109999",
#         TRUE ~ ">=110000"
#       ),
#       HousePriceCat = case_when(
#         HousePrice < 80000 ~ "<80000",
#         HousePrice < 140000 ~ "80000-139999",
#         HousePrice < 220000 ~ "140000-219999",
#         HousePrice < 360000 ~ "220000-359999",
#         TRUE ~ ">=360000"
#       )
#     ) %>%
#     transmute(
#       IncomeCat = factor(IncomeCat, levels = c("<40000", "40000-64999", "65000-109999", ">=110000")),
#       HousePriceCat = factor(HousePriceCat, levels = c("<80000", "80000-139999", "140000-219999", "220000-359999", ">=360000"))
#     ) %>%
#     table(., exclude = FALSE)

Results

load(file = "synthPops.rda")
dfPops <- bind_rows(synthPops)

avgPop <- dfPops %>%
  mutate(
    IncomeCat = case_when(
      Income < 40000 ~ "<40000",
      Income < 65000 ~ "40000-64999",
      Income < 110000 ~ "65000-109999",
      TRUE ~ ">=110000"
    ),
    HousePriceCat = case_when(
      HousePrice < 80000 ~ "<80000",
      HousePrice < 140000 ~ "80000-139999",
      HousePrice < 220000 ~ "140000-219999",
      HousePrice < 360000 ~ "220000-359999",
      TRUE ~ ">=360000"
    )
  ) %>%
  transmute(
    IncomeCat = factor(IncomeCat, levels = c("<40000", "40000-64999", "65000-109999", ">=110000")),
    HousePriceCat = factor(HousePriceCat, levels = c("<80000", "80000-139999", "140000-219999", "220000-359999", ">=360000"))
  ) %>%
  group_by(IncomeCat, HousePriceCat, .drop = FALSE) %>%
  summarize(Count = n() / 10000, .groups = "drop")

g2 <- ggplot(data = avgPop, mapping = aes(x = HousePriceCat, y = IncomeCat, fill = Count)) + 
  geom_tile() + 
  geom_label(data = avgPop[avgPop$Count > 0, ], mapping = aes(label = Count), fill = "white") + 
  scale_fill_gradient(low = "white", high = "black") + 
  scale_y_discrete(limits = rev(levels(avgPop$IncomeCat))) + 
  labs(x = "Housing Price", y = "Income")

g2

Remarks

We are using a proper Normal prior for the incomes; in this case, we chose a mean of 100000 and standard deviation of 30000. If we look at our incomes by themselves, and compare this to random draws from our prior, we can see that

randDf <- data.frame(Income = rnorm(100000000, 100000, 30000)) %>%
  mutate(
    IncomeCat = case_when(
      Income < 40000 ~ "<40000",
      Income < 65000 ~ "40000-64999",
      Income < 110000 ~ "65000-109999",
      TRUE ~ ">=110000"
    )
  ) %>%
  transmute(
    IncomeCat = factor(IncomeCat, levels = c("<40000", "40000-64999", "65000-109999", ">=110000"))
  ) %>%
  group_by(IncomeCat, .drop = FALSE) %>%
  summarize(Count = n() / 100000, .groups = "drop")

ggplot(data = avgPop, mapping = aes(y = IncomeCat, x = Count, fill = "Posterior")) + 
  geom_bar(stat = "identity", alpha = 0.5) + 
  geom_bar(data = randDf, inherit.aes = FALSE, mapping = aes(y = IncomeCat, x = Count, fill = "Prior"), 
           stat = "identity", alpha = 0.5) + 
  scale_fill_manual(values = c("Prior" = "green", "Posterior" = "Blue")) + 
  labs(fill = "Distribution")

Means...

plot(density(sapply(synthPops, function(x) mean(x[, 1]))), main = "Density of Synthetic Population Mean Incomes",
     xlab = "Mean Income")

plot(1:length(synthPops), sapply(synthPops, function(x) mean(x[, 1])), main = "Trace Plot of Populations Means",
     xlab = "Mean Income", type = "l")