knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = TRUE
)

quick_eval <- FALSE
if(quick_eval) {
  samples <- 1000
} else {
  samples <- 75000
}

set.seed(1234)
library(extraDistr)
library(knitr)
library(kableExtra)
library(proftools)
library(tidyverse)
library(gridExtra)

Intro

Text here.

Samplers

Here we will compare three samplers:

Importance Sampling

importanceSampler <- function(x, N, samples = 1000, saturation = 10, nEff = length(sample), 
                              alpha = 0.5, cutoff = 0.5) {
  n <- length(x)
  nLow <- sum(x < cutoff)

  priorDrawsRaw <- rdirichlet(samples * saturation, (nEff + alpha) / (N + 1) * rep(1, N + 1))
  priorDraws <- t(apply(matrix(priorDrawsRaw, nrow = samples * saturation)
                        [, -(N + 1), drop = FALSE], 1, cumsum))
  NLow <- apply(priorDraws, 1, function(x) sum(x < cutoff))
  weights <- dhyper(nLow, NLow, N - NLow, n)

  draws <- sample(1:(samples * saturation), size = samples, replace = TRUE, prob = weights)

  return(priorDraws[draws, ])
}

Block Sampling MCMC

Let $x_1, x_2, \dots, x_N$ represent the ordered population of size N. We denote $s_1 = x_1$, $s_2 = x_2 - x_1, \dots, s_{N-1} = x_N - x_{N-1}$, $s_N = 1 - x_N$ as the spacing of the ordered population.

From our formulation of the posterior, $\sum s_i = 1$ and $(s_1, \dots, s_{N+1}) \sim Dir(\frac{N_E + \alpha}{N+1}, \dots, \frac{N_E + \alpha}{N+1})$. We can use this to figure out the conditional prior for a set of spacings $s_{(m)}$.

For example, suppose we want the distribution of $x_{(1)} | x_{(2)}$, where $x_{(1)}, x_{(2)}$ partition $x$. Fortunately, this is relatively straightforward, and we end up with

$$ \frac{1}{1 - \mathbf{1}^Tx_{(2)}} X_{(1)} | X_{(2)} = x_{(2)} \sim Dir(\frac{N_E + \alpha}{N+1}, \dots, \frac{N_E + \alpha}{N+1})$$

Essentially, nothing changes except we now know the spacings have to add up to $1 - \mathbf{1}^Tx_{(2)}$. We can use this to our advantage, using this conditional prior as our proposal. For simplicity, we let $N_E\ |\ N$, and consider proposal chunks of size $N / N_E$.

mcmcPriorBlock <- function(x, N, samples = 1000, nEff = length(sample), alpha = 0.5, cutoff = 0.5) {

  lpost <- function(nLow, pop, diffs, N, nEff, alpha, cutoff) {
    NLow <- sum(pop < cutoff)
    lprior <- ddirichlet(diffs, (nEff + alpha) / (N + 1) * rep(1, N + 1), log = TRUE)
    llik <- dhyper(nLow, NLow, N - NLow, n, log = TRUE)
    return(llik + lprior)
  }

  priorProposal <- function(x) {
    as.numeric(rdirichlet(1, rep((nEff + alpha) / (N + 1), length(x))) * sum(x))
  }

  priorDensity <- function(x) {
    ddirichlet(x / sum(x), rep((nEff + alpha) / (N + 1), length(x)), log = TRUE)
  }

  spacings <- matrix(NA, nrow = samples, ncol = N + 1)
  data <- matrix(NA, nrow = samples, ncol = N)

  chunks <- split(1:(N + 1), cut(seq_along(1:(N + 1)), nEff * 2, labels = FALSE))

  chunks2 <- list()
  for(i in 1:(length(chunks) - 1)) {
    chunks2[[i]] <- c(chunks[[i]], chunks[[i + 1]])
  }


  n <- length(x)
  nLow <- sum(x < cutoff)

  data[1, ] <- sort(c(runif(ceiling(N * nLow / n), 0, cutoff), 
                      runif(floor(N * (n - nLow) / n), cutoff, 1)))
  spacings[1, ] <- diff(c(0, data[1, ], 1))


  lp0 <- lpost(nLow, data[1, ], spacings[1, ], N, nEff, alpha, cutoff)

  for(i in 2:samples) {
    currentSpacings <- spacings[i - 1,]
    currentData <- data[i - 1, ]
    for(k in 1:length(chunks2)) {
      proposeSpacings <- currentSpacings
      proposeSpacings[chunks2[[k]]] <- priorProposal(proposeSpacings[chunks2[[k]]])
      proposeData <- cumsum(proposeSpacings)[-(N + 1)]
      lp1 <- lpost(nLow, proposeData, proposeSpacings, N, nEff, alpha, cutoff)

      u <- runif(1)
      if(is.finite(lp1) && log(u) < (lp1 - priorDensity(proposeSpacings)) - 
         (lp0 - priorDensity(currentSpacings))) {
        currentSpacings <- proposeSpacings
        currentData <- proposeData
        lp0 <- lp1
      }
    }
    spacings[i, ] <- currentSpacings
    data[i, ] <- currentData
  }

  out <- list(
    data = data,
    spacings = spacings,
    x = x,
    N = n,
    samples = samples,
    nEff = nEff,
    alpha = alpha,
    cutoff = cutoff
  )

  return(out)
}

MCMC

mcmcPrior <- function(x, N, samples = 1000, nEff = length(sample), alpha = 0.5, cutoff = 0.5) {

  lpost <- function(nLow, pop, diffs, N, nEff, alpha, cutoff) {
    NLow <- sum(pop < cutoff)
    lprior <- ddirichlet(diffs, (nEff + alpha) / (N + 1) * rep(1, N + 1), log = TRUE)
    llik <- dhyper(nLow, NLow, N - NLow, n, log = TRUE)
    return(llik + lprior)
  }

  priorProposal <- function(x) {
    as.numeric(rdirichlet(1, rep((nEff + alpha) / (N + 1), length(x))) * sum(x))
  }

  priorDensity <- function(x) {
    ddirichlet(x / sum(x), rep((nEff + alpha) / (N + 1), length(x)), log = TRUE)
  }

  spacings <- matrix(NA, nrow = samples, ncol = N + 1)
  data <- matrix(NA, nrow = samples, ncol = N)

  chunks <- split(1:(N + 1), 1:(N + 1))

  chunks2 <- list()
  for(i in 1:(length(chunks) - 1)) {
    chunks2[[i]] <- c(chunks[[i]], chunks[[i + 1]])
  }


  n <- length(x)
  nLow <- sum(x < cutoff)

  data[1, ] <- sort(c(runif(ceiling(N * nLow / n), 0, cutoff), 
                      runif(floor(N * (n - nLow) / n), cutoff, 1)))
  spacings[1, ] <- diff(c(0, data[1, ], 1))


  lp0 <- lpost(nLow, data[1, ], spacings[1, ], N, nEff, alpha, cutoff)

  for(i in 2:samples) {
    currentSpacings <- spacings[i - 1,]
    currentData <- data[i - 1, ]
    for(k in 1:length(chunks2)) {
      proposeSpacings <- currentSpacings
      proposeSpacings[chunks2[[k]]] <- priorProposal(proposeSpacings[chunks2[[k]]])
      proposeData <- cumsum(proposeSpacings)[-(N + 1)]
      lp1 <- lpost(nLow, proposeData, proposeSpacings, N, nEff, alpha, cutoff)

      u <- runif(1)
      if(is.finite(lp1) && log(u) < (lp1 - priorDensity(proposeSpacings)) - 
         (lp0 - priorDensity(currentSpacings))) {
        currentSpacings <- proposeSpacings
        currentData <- proposeData
        lp0 <- lp1
      }
    }
    spacings[i, ] <- currentSpacings
    data[i, ] <- currentData
  }

  out <- list(
    data = data,
    spacings = spacings,
    x = x,
    N = n,
    samples = samples,
    nEff = nEff,
    alpha = alpha,
    cutoff = cutoff
  )

  return(out)
}

Comparisons

Text here.

x1 <- c(0.11, 0.19, 0.27, 0.39, 0.42, 0.48, 0.59, 0.71, 0.84, 0.92)
x2 <- c(0.07, 0.12, 0.19, 0.26, 0.31, 0.36, 0.41, 0.49, 0.67, 0.89)
N <- 39
nEff <- 4
alpha <- 0.5
cutoff <- 0.5
initRun <- importanceSampler(x = x1, N = N, samples = 100, nEff = 4)
initRun <- mcmcPrior(x = x1, N = N, samples = 100, nEff = 4)
initRun <- mcmcPriorBlock(x = x1, N = N, samples = 100, nEff = 4)

initRun <- importanceSampler(x = x2, N = N, samples = 100, nEff = 4)
initRun <- mcmcPrior(x = x2, N = N, samples = 100, nEff = 4)
initRun <- mcmcPriorBlock(x = x2, N = N, samples = 100, nEff = 4)
time_1 <- profileExpr(pops1_1 <- importanceSampler(x = x1, N = N, samples = samples, nEff = 4))
time_2 <- profileExpr(pops1_2 <- mcmcPrior(x = x1, N = N, samples = samples, nEff = 4))
time_3 <- profileExpr(pops1_3 <- mcmcPriorBlock(x = x1, N = N, samples = samples, nEff = 4))

pops2_1 <- importanceSampler(x = x2, N = N, samples = samples, nEff = 4)
pops2_2 <- mcmcPrior(x = x2, N = N, samples = samples, nEff = 4)
pops2_3 <- mcmcPriorBlock(x = x2, N = N, samples = samples, nEff = 4)

prof_1 <- slice_max(data.frame(flatProfile(time_1, byTotal = TRUE, GC = FALSE)), total.time, n = 10, with_ties = FALSE)
prof_2 <- slice_max(data.frame(flatProfile(time_2, byTotal = TRUE, GC = FALSE)), total.time, n = 10, with_ties = FALSE)
prof_3 <- slice_max(data.frame(flatProfile(time_3, byTotal = TRUE, GC = FALSE)), total.time, n = 10, with_ties = FALSE)

Location

Means

par(mfrow = c(1, 2), mar = c(3, 3, 0, 0.5))

plot(density(rowMeans(pops1_1)), col = "red", main = "", xlab = "Mean of Population", 
     ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "")
axis(1, 2:8 / 10)
axis(2, 0:7)
lines(density(rowMeans(pops1_2$data)), col = "blue")
lines(density(rowMeans(pops1_3$data)), col = "green")

par(mar = c(3, 0.5, 0, 3))

plot(density(rowMeans(pops2_1)), col = "red", main = "", xlab = "Mean of Population", 
     ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "")
axis(1, 2:8 / 10)
axis(4, 0:7)
lines(density(rowMeans(pops2_2$data)), col = "blue")
lines(density(rowMeans(pops2_3$data)), col = "green")

Medians

par(mfrow = c(1, 2), mar = c(3, 3, 0, 0.5))

plot(density(apply(pops1_1, 1, median)), col = "red", main = "", ylim = c(0, 7), xaxt = "n", 
     yaxt = "n", ylab = "")
axis(1, 2:8 / 10)
axis(2, 0:7)
lines(density(apply(pops1_2$data, 1, median)), col = "blue")
lines(density(apply(pops1_3$data, 1, median)), col = "green")

par(mar = c(3, 0.5, 0, 3))

plot(density(apply(pops2_1, 1, median)), col = "red", main = "", ylim = c(0, 7), xaxt = "n", 
     yaxt = "n", ylab = "")
axis(1, 2:8 / 10)
axis(4, 0:7)
lines(density(apply(pops2_2$data, 1, median)), col = "blue")
lines(density(apply(pops2_3$data, 1, median)), col = "green")

Empirical CDFs

par(mfrow = c(1, 3), mar = c(3, 0.5, 0, 0.5))

pops1_1_sort = t(apply(pops1_1, 1, sort))
plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', 
     yaxt = 'n')
matlines(t(pops1_1_sort), cumsum(rep(1/N, N)), lty = 1, type = 's')
axis(1, 1:9 / 10)
mtext("Importance Sampling")

pops1_2_sort = t(apply(pops1_2$data, 1, sort))
plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', 
     yaxt = 'n', ann = FALSE)
matlines(t(pops1_2_sort), cumsum(rep(1/N, N)), lty = 1, type = 's')
axis(1, 1:9 / 10)
mtext("MCMC\n(Prior Proposal)")

pops1_3_sort = t(apply(pops1_3$data, 1, sort))
plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', 
     yaxt = 'n', ann = FALSE)
matlines(t(pops1_3_sort), cumsum(rep(1/N, N)), lty = 1, type = 's')
axis(1, 1:9 / 10)
mtext("Block MCMC\n(Prior Proposal)")
par(mfrow = c(1, 3), mar = c(3, 0.5, 0, 0.5))

pops2_1_sort = t(apply(pops2_1, 1, sort))
plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', 
     yaxt = 'n')
matlines(t(pops2_1_sort), cumsum(rep(1/N, N)), lty = 1, type = 's')
axis(1, 1:9 / 10)
mtext("Importance Sampling")

pops2_2_sort = t(apply(pops2_2$data, 1, sort))
plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n',
     yaxt = 'n', ann = FALSE)
matlines(t(pops2_2_sort), cumsum(rep(1/N, N)), lty = 1, type = 's')
axis(1, 1:9 / 10)
mtext("MCMC\n(Prior Proposal)")

pops2_3_sort = t(apply(pops2_3$data, 1, sort))
plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n',
     yaxt = 'n', ann = FALSE)
matlines(t(pops2_3_sort), cumsum(rep(1/N, N)), lty = 1, type = 's')
axis(1, 1:9 / 10)
mtext("Block MCMC\n(Prior Proposal)")

F(0.5)

par(mfrow = c(1, 2), mar = c(3, 3, 0, 0.5))

plot(density(apply(pops1_1, 1, function(x) {sum(x < 0.5) / length(x)})), col = "red", 
     main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "")
axis(1, 2:8 / 10)
axis(2, 0:7)
lines(density(apply(pops1_2$data, 1, function(x) {sum(x < 0.5) / length(x)})), col = "blue")
lines(density(apply(pops1_3$data, 1, function(x) {sum(x < 0.5) / length(x)})), col = "green")

par(mar = c(3, 0.5, 0, 3))

plot(density(apply(pops2_1, 1, function(x) {sum(x < 0.5) / length(x)})), col = "red",
     main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "")
axis(1, 2:8 / 10)
axis(4, 0:7)
lines(density(apply(pops2_2$data, 1, function(x) {sum(x < 0.5) / length(x)})), col = "blue")
lines(density(apply(pops2_3$data, 1, function(x) {sum(x < 0.5) / length(x)})), col = "green")

F(0.25)

par(mfrow = c(1, 2), mar = c(3, 3, 0, 0.5))

plot(density(apply(pops1_1, 1, function(x) {sum(x < 0.25) / length(x)})), col = "red",
     main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "")
axis(1, 2:8 / 10)
axis(2, 0:7)
lines(density(apply(pops1_2$data, 1, function(x) {sum(x < 0.25) / length(x)})), col = "blue")
lines(density(apply(pops1_3$data, 1, function(x) {sum(x < 0.25) / length(x)})), col = "green")

par(mar = c(3, 0.5, 0, 3))

plot(density(apply(pops2_1, 1, function(x) {sum(x < 0.25) / length(x)})), col = "red",
     main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "")
axis(1, 2:8 / 10)
axis(4, 0:7)
lines(density(apply(pops2_2$data, 1, function(x) {sum(x < 0.25) / length(x)})), col = "blue")
lines(density(apply(pops2_3$data, 1, function(x) {sum(x < 0.25) / length(x)})), col = "green")

Time

kables(
  list(
    kable(prof_1, col.names = c("Total %", "Total Time", "Self %", "Self Time")) %>%
      kable_styling(full_width = FALSE, position = "float_left"),
    kable(prof_2, col.names = c("Total %", "Total Time", "Self %", "Self Time")) %>%
      kable_styling(full_width = FALSE, position = "center"),
    kable(prof_3, col.names = c("Total %", "Total Time", "Self %", "Self Time")) %>%
      kable_styling(full_width = FALSE, position = "float_left")
  ),
  caption = "Profile for Importance Sampling (left), Prior Proposal MCMC (center), and Prior Proposal Blocked MCMC (right)"
) %>%
  kable_styling(full_width = TRUE, font_size = 12)

Characteristics

CDF

data.frame(pops1_1_sort) %>%
  mutate_all(sort) %>%
  slice(round(seq(1, samples, length.out = 100))) %>%
  pivot_longer(everything(), names_to = "Q", values_to = "X") %>%
  mutate(Q = as.numeric(substring(Q, 2)) / N) %>%
  ggplot(mapping = aes(x = X, y = Q)) + 
  geom_point() + 
  labs(y = "F(x)")

Medians

g1 <- data.frame(X = apply(pops1_1_sort, 1, median)) %>%
  ggplot(mapping = aes(x = X)) + 
  geom_histogram(binwidth = 0.01) + 
  lims(x = c(0, 1)) + 
  labs(y = "Count")

g2 <- data.frame(X = apply(pops2_1_sort, 1, median)) %>%
  ggplot(mapping = aes(x = X)) + 
  geom_histogram(binwidth = 0.01) + 
  lims(x = c(0, 1)) + 
  labs(y = "Count")

grid.arrange(g1, g2, nrow = 1, ncol = 2)

F(0.5)

g1 <- data.frame(X = apply(pops1_1_sort, 1, function(x) {sum(x < 0.5) / length(x)})) %>%
  ggplot(mapping = aes(x = X)) + 
  geom_histogram(binwidth = 1/39) + 
  lims(x = c(0, 1)) + 
  labs(y = "Count")

g2 <- data.frame(X = apply(pops2_1_sort, 1, function(x) {sum(x < 0.5) / length(x)})) %>%
  ggplot(mapping = aes(x = X)) + 
  geom_histogram(binwidth = 1/39) + 
  lims(x = c(0, 1)) + 
  labs(y = "Count")

grid.arrange(g1, g2, nrow = 1, ncol = 2)

Changing the Cutoff

pops1_1_2 <- importanceSampler(x = x1, N = N, samples = samples, nEff = 4, cutoff = 0.6)
pops2_1_2 <- importanceSampler(x = x2, N = N, samples = samples, nEff = 4, cutoff = 0.6)
g1 <- data.frame(X = apply(pops1_1_2, 1, mean)) %>%
  ggplot(mapping = aes(x = X)) + 
  geom_histogram(binwidth = 0.01) + 
  lims(x = c(0, 1)) + 
  labs(y = "Count")

g2 <- data.frame(X = apply(pops2_1_2, 1, mean)) %>%
  ggplot(mapping = aes(x = X)) + 
  geom_histogram(binwidth = 0.01) + 
  lims(x = c(0, 1)) + 
  labs(y = "Count")

g3 <- data.frame(X = apply(pops1_1_2, 1, median)) %>%
  ggplot(mapping = aes(x = X)) + 
  geom_histogram(binwidth = 0.01) + 
  lims(x = c(0, 1)) + 
  labs(y = "Count")

g4 <- data.frame(X = apply(pops2_1_2, 1, median)) %>%
  ggplot(mapping = aes(x = X)) + 
  geom_histogram(binwidth = 0.01) + 
  lims(x = c(0, 1)) + 
  labs(y = "Count")

grid.arrange(g1, g2, g3, g4, nrow = 2, ncol = 2)

Diagnostics

Trace Plots (Mean)

par(mfrow = c(2, 1), mar = c(3, 3, 0.5, 0.5))

plot(1:samples, rowMeans(pops1_2$data), type = "l", col = "red", main = "", 
     xlab = "Mean of Population", ylim = c(0, 1), ylab = "", yaxt = "n")
axis(2, 0:5/5)
lines(1:samples, rowMeans(pops1_3$data), col = "blue")

plot(1:samples, rowMeans(pops2_2$data), type = "l", col = "red", main = "", 
     xlab = "Mean of Population", ylim = c(0, 1), ylab = "", yaxt = "n")
axis(2, 0:5/5)
lines(1:samples, rowMeans(pops2_3$data), col = "blue")

Trace Plots (Median)

par(mfrow = c(2, 1), mar = c(3, 3, 0.5, 0.5))

plot(1:samples, apply(pops1_2$data, 1, median), type = "l", col = "red", main = "", 
     xlab = "Mean of Population", ylim = c(0, 1), ylab = "", yaxt = "n")
axis(2, 0:5/5)
lines(1:samples, apply(pops1_3$data, 1, median), col = "blue")

plot(1:samples, apply(pops2_2$data, 1, median), type = "l", col = "red", main = "", 
     xlab = "Mean of Population", ylim = c(0, 1), ylab = "", yaxt = "n")
axis(2, 0:5/5)
lines(1:samples, apply(pops2_3$data, 1, median), col = "blue")

Converting to Normal

Lets first create a new sample. We will use the same values from the previous examples for $N$, $nEff$, $\alpha$, and the cutoff.

For now, we will only use the importance sampler.

x3 <- sort(rnorm(10, mean = pi, sd = 1))
x3

Now we can convert them to $[0, 1]$ using the CDF.

x3Unif <- pnorm(x3, mean = mean(x3), sd = sd(x3))
x3Unif
sum(x3Unif < 0.5)

And now we can use our importance sampler, and then back-transform using the inverse CDF. We do get a few infinite values (negative and positive infinity), because our Dirichlet can get very close to 0 or 1. We will just discard these.

unifPops <- importanceSampler(x = x3Unif, N = N, samples = samples, nEff = 4)
normPops <- t(apply(unifPops, 1, qnorm, mean = mean(x3), sd = sd(x3)))
normPopsFinite <- normPops[complete.cases(normPops), ]

And now we can plot the distribution of the means and CDFs.

par(mfrow = c(1, 2))

plot(density(rowMeans(normPopsFinite)), col = "red", main = "", xlab = "Mean of Population", ylab = "Density")
abline(v = mean(x3))
abline(v = mean(x3) - sd(x3), lty = 2)
abline(v = mean(x3) + sd(x3), lty = 2)

unifPops_sort = apply(unifPops, 2, sort)
plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities')
matlines(t(unifPops_sort), cumsum(rep(1/N, N)), lty = 1, type = 's')


ctgrubb/lemur.pack documentation built on May 7, 2023, 4:13 a.m.