knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = TRUE ) quick_eval <- FALSE
if(quick_eval) { samples <- 1000 } else { samples <- 75000 } set.seed(1234)
library(extraDistr) library(knitr) library(kableExtra) library(proftools) library(tidyverse) library(gridExtra)
Text here.
Here we will compare three samplers:
importanceSampler <- function(x, N, samples = 1000, saturation = 10, nEff = length(sample), alpha = 0.5, cutoff = 0.5) { n <- length(x) nLow <- sum(x < cutoff) priorDrawsRaw <- rdirichlet(samples * saturation, (nEff + alpha) / (N + 1) * rep(1, N + 1)) priorDraws <- t(apply(matrix(priorDrawsRaw, nrow = samples * saturation) [, -(N + 1), drop = FALSE], 1, cumsum)) NLow <- apply(priorDraws, 1, function(x) sum(x < cutoff)) weights <- dhyper(nLow, NLow, N - NLow, n) draws <- sample(1:(samples * saturation), size = samples, replace = TRUE, prob = weights) return(priorDraws[draws, ]) }
Let $x_1, x_2, \dots, x_N$ represent the ordered population of size N. We denote $s_1 = x_1$, $s_2 = x_2 - x_1, \dots, s_{N-1} = x_N - x_{N-1}$, $s_N = 1 - x_N$ as the spacing of the ordered population.
From our formulation of the posterior, $\sum s_i = 1$ and $(s_1, \dots, s_{N+1}) \sim Dir(\frac{N_E + \alpha}{N+1}, \dots, \frac{N_E + \alpha}{N+1})$. We can use this to figure out the conditional prior for a set of spacings $s_{(m)}$.
For example, suppose we want the distribution of $x_{(1)} | x_{(2)}$, where $x_{(1)}, x_{(2)}$ partition $x$. Fortunately, this is relatively straightforward, and we end up with
$$ \frac{1}{1 - \mathbf{1}^Tx_{(2)}} X_{(1)} | X_{(2)} = x_{(2)} \sim Dir(\frac{N_E + \alpha}{N+1}, \dots, \frac{N_E + \alpha}{N+1})$$
Essentially, nothing changes except we now know the spacings have to add up to $1 - \mathbf{1}^Tx_{(2)}$. We can use this to our advantage, using this conditional prior as our proposal. For simplicity, we let $N_E\ |\ N$, and consider proposal chunks of size $N / N_E$.
mcmcPriorBlock <- function(x, N, samples = 1000, nEff = length(sample), alpha = 0.5, cutoff = 0.5) { lpost <- function(nLow, pop, diffs, N, nEff, alpha, cutoff) { NLow <- sum(pop < cutoff) lprior <- ddirichlet(diffs, (nEff + alpha) / (N + 1) * rep(1, N + 1), log = TRUE) llik <- dhyper(nLow, NLow, N - NLow, n, log = TRUE) return(llik + lprior) } priorProposal <- function(x) { as.numeric(rdirichlet(1, rep((nEff + alpha) / (N + 1), length(x))) * sum(x)) } priorDensity <- function(x) { ddirichlet(x / sum(x), rep((nEff + alpha) / (N + 1), length(x)), log = TRUE) } spacings <- matrix(NA, nrow = samples, ncol = N + 1) data <- matrix(NA, nrow = samples, ncol = N) chunks <- split(1:(N + 1), cut(seq_along(1:(N + 1)), nEff * 2, labels = FALSE)) chunks2 <- list() for(i in 1:(length(chunks) - 1)) { chunks2[[i]] <- c(chunks[[i]], chunks[[i + 1]]) } n <- length(x) nLow <- sum(x < cutoff) data[1, ] <- sort(c(runif(ceiling(N * nLow / n), 0, cutoff), runif(floor(N * (n - nLow) / n), cutoff, 1))) spacings[1, ] <- diff(c(0, data[1, ], 1)) lp0 <- lpost(nLow, data[1, ], spacings[1, ], N, nEff, alpha, cutoff) for(i in 2:samples) { currentSpacings <- spacings[i - 1,] currentData <- data[i - 1, ] for(k in 1:length(chunks2)) { proposeSpacings <- currentSpacings proposeSpacings[chunks2[[k]]] <- priorProposal(proposeSpacings[chunks2[[k]]]) proposeData <- cumsum(proposeSpacings)[-(N + 1)] lp1 <- lpost(nLow, proposeData, proposeSpacings, N, nEff, alpha, cutoff) u <- runif(1) if(is.finite(lp1) && log(u) < (lp1 - priorDensity(proposeSpacings)) - (lp0 - priorDensity(currentSpacings))) { currentSpacings <- proposeSpacings currentData <- proposeData lp0 <- lp1 } } spacings[i, ] <- currentSpacings data[i, ] <- currentData } out <- list( data = data, spacings = spacings, x = x, N = n, samples = samples, nEff = nEff, alpha = alpha, cutoff = cutoff ) return(out) }
mcmcPrior <- function(x, N, samples = 1000, nEff = length(sample), alpha = 0.5, cutoff = 0.5) { lpost <- function(nLow, pop, diffs, N, nEff, alpha, cutoff) { NLow <- sum(pop < cutoff) lprior <- ddirichlet(diffs, (nEff + alpha) / (N + 1) * rep(1, N + 1), log = TRUE) llik <- dhyper(nLow, NLow, N - NLow, n, log = TRUE) return(llik + lprior) } priorProposal <- function(x) { as.numeric(rdirichlet(1, rep((nEff + alpha) / (N + 1), length(x))) * sum(x)) } priorDensity <- function(x) { ddirichlet(x / sum(x), rep((nEff + alpha) / (N + 1), length(x)), log = TRUE) } spacings <- matrix(NA, nrow = samples, ncol = N + 1) data <- matrix(NA, nrow = samples, ncol = N) chunks <- split(1:(N + 1), 1:(N + 1)) chunks2 <- list() for(i in 1:(length(chunks) - 1)) { chunks2[[i]] <- c(chunks[[i]], chunks[[i + 1]]) } n <- length(x) nLow <- sum(x < cutoff) data[1, ] <- sort(c(runif(ceiling(N * nLow / n), 0, cutoff), runif(floor(N * (n - nLow) / n), cutoff, 1))) spacings[1, ] <- diff(c(0, data[1, ], 1)) lp0 <- lpost(nLow, data[1, ], spacings[1, ], N, nEff, alpha, cutoff) for(i in 2:samples) { currentSpacings <- spacings[i - 1,] currentData <- data[i - 1, ] for(k in 1:length(chunks2)) { proposeSpacings <- currentSpacings proposeSpacings[chunks2[[k]]] <- priorProposal(proposeSpacings[chunks2[[k]]]) proposeData <- cumsum(proposeSpacings)[-(N + 1)] lp1 <- lpost(nLow, proposeData, proposeSpacings, N, nEff, alpha, cutoff) u <- runif(1) if(is.finite(lp1) && log(u) < (lp1 - priorDensity(proposeSpacings)) - (lp0 - priorDensity(currentSpacings))) { currentSpacings <- proposeSpacings currentData <- proposeData lp0 <- lp1 } } spacings[i, ] <- currentSpacings data[i, ] <- currentData } out <- list( data = data, spacings = spacings, x = x, N = n, samples = samples, nEff = nEff, alpha = alpha, cutoff = cutoff ) return(out) }
Text here.
x1 <- c(0.11, 0.19, 0.27, 0.39, 0.42, 0.48, 0.59, 0.71, 0.84, 0.92) x2 <- c(0.07, 0.12, 0.19, 0.26, 0.31, 0.36, 0.41, 0.49, 0.67, 0.89)
N <- 39 nEff <- 4 alpha <- 0.5 cutoff <- 0.5
initRun <- importanceSampler(x = x1, N = N, samples = 100, nEff = 4) initRun <- mcmcPrior(x = x1, N = N, samples = 100, nEff = 4) initRun <- mcmcPriorBlock(x = x1, N = N, samples = 100, nEff = 4) initRun <- importanceSampler(x = x2, N = N, samples = 100, nEff = 4) initRun <- mcmcPrior(x = x2, N = N, samples = 100, nEff = 4) initRun <- mcmcPriorBlock(x = x2, N = N, samples = 100, nEff = 4)
time_1 <- profileExpr(pops1_1 <- importanceSampler(x = x1, N = N, samples = samples, nEff = 4)) time_2 <- profileExpr(pops1_2 <- mcmcPrior(x = x1, N = N, samples = samples, nEff = 4)) time_3 <- profileExpr(pops1_3 <- mcmcPriorBlock(x = x1, N = N, samples = samples, nEff = 4)) pops2_1 <- importanceSampler(x = x2, N = N, samples = samples, nEff = 4) pops2_2 <- mcmcPrior(x = x2, N = N, samples = samples, nEff = 4) pops2_3 <- mcmcPriorBlock(x = x2, N = N, samples = samples, nEff = 4) prof_1 <- slice_max(data.frame(flatProfile(time_1, byTotal = TRUE, GC = FALSE)), total.time, n = 10, with_ties = FALSE) prof_2 <- slice_max(data.frame(flatProfile(time_2, byTotal = TRUE, GC = FALSE)), total.time, n = 10, with_ties = FALSE) prof_3 <- slice_max(data.frame(flatProfile(time_3, byTotal = TRUE, GC = FALSE)), total.time, n = 10, with_ties = FALSE)
par(mfrow = c(1, 2), mar = c(3, 3, 0, 0.5)) plot(density(rowMeans(pops1_1)), col = "red", main = "", xlab = "Mean of Population", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "") axis(1, 2:8 / 10) axis(2, 0:7) lines(density(rowMeans(pops1_2$data)), col = "blue") lines(density(rowMeans(pops1_3$data)), col = "green") par(mar = c(3, 0.5, 0, 3)) plot(density(rowMeans(pops2_1)), col = "red", main = "", xlab = "Mean of Population", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "") axis(1, 2:8 / 10) axis(4, 0:7) lines(density(rowMeans(pops2_2$data)), col = "blue") lines(density(rowMeans(pops2_3$data)), col = "green")
par(mfrow = c(1, 2), mar = c(3, 3, 0, 0.5)) plot(density(apply(pops1_1, 1, median)), col = "red", main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "") axis(1, 2:8 / 10) axis(2, 0:7) lines(density(apply(pops1_2$data, 1, median)), col = "blue") lines(density(apply(pops1_3$data, 1, median)), col = "green") par(mar = c(3, 0.5, 0, 3)) plot(density(apply(pops2_1, 1, median)), col = "red", main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "") axis(1, 2:8 / 10) axis(4, 0:7) lines(density(apply(pops2_2$data, 1, median)), col = "blue") lines(density(apply(pops2_3$data, 1, median)), col = "green")
par(mfrow = c(1, 3), mar = c(3, 0.5, 0, 0.5)) pops1_1_sort = t(apply(pops1_1, 1, sort)) plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', yaxt = 'n') matlines(t(pops1_1_sort), cumsum(rep(1/N, N)), lty = 1, type = 's') axis(1, 1:9 / 10) mtext("Importance Sampling") pops1_2_sort = t(apply(pops1_2$data, 1, sort)) plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', yaxt = 'n', ann = FALSE) matlines(t(pops1_2_sort), cumsum(rep(1/N, N)), lty = 1, type = 's') axis(1, 1:9 / 10) mtext("MCMC\n(Prior Proposal)") pops1_3_sort = t(apply(pops1_3$data, 1, sort)) plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', yaxt = 'n', ann = FALSE) matlines(t(pops1_3_sort), cumsum(rep(1/N, N)), lty = 1, type = 's') axis(1, 1:9 / 10) mtext("Block MCMC\n(Prior Proposal)")
par(mfrow = c(1, 3), mar = c(3, 0.5, 0, 0.5)) pops2_1_sort = t(apply(pops2_1, 1, sort)) plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', yaxt = 'n') matlines(t(pops2_1_sort), cumsum(rep(1/N, N)), lty = 1, type = 's') axis(1, 1:9 / 10) mtext("Importance Sampling") pops2_2_sort = t(apply(pops2_2$data, 1, sort)) plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', yaxt = 'n', ann = FALSE) matlines(t(pops2_2_sort), cumsum(rep(1/N, N)), lty = 1, type = 's') axis(1, 1:9 / 10) mtext("MCMC\n(Prior Proposal)") pops2_3_sort = t(apply(pops2_3$data, 1, sort)) plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities', xaxt = 'n', yaxt = 'n', ann = FALSE) matlines(t(pops2_3_sort), cumsum(rep(1/N, N)), lty = 1, type = 's') axis(1, 1:9 / 10) mtext("Block MCMC\n(Prior Proposal)")
par(mfrow = c(1, 2), mar = c(3, 3, 0, 0.5)) plot(density(apply(pops1_1, 1, function(x) {sum(x < 0.5) / length(x)})), col = "red", main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "") axis(1, 2:8 / 10) axis(2, 0:7) lines(density(apply(pops1_2$data, 1, function(x) {sum(x < 0.5) / length(x)})), col = "blue") lines(density(apply(pops1_3$data, 1, function(x) {sum(x < 0.5) / length(x)})), col = "green") par(mar = c(3, 0.5, 0, 3)) plot(density(apply(pops2_1, 1, function(x) {sum(x < 0.5) / length(x)})), col = "red", main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "") axis(1, 2:8 / 10) axis(4, 0:7) lines(density(apply(pops2_2$data, 1, function(x) {sum(x < 0.5) / length(x)})), col = "blue") lines(density(apply(pops2_3$data, 1, function(x) {sum(x < 0.5) / length(x)})), col = "green")
par(mfrow = c(1, 2), mar = c(3, 3, 0, 0.5)) plot(density(apply(pops1_1, 1, function(x) {sum(x < 0.25) / length(x)})), col = "red", main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "") axis(1, 2:8 / 10) axis(2, 0:7) lines(density(apply(pops1_2$data, 1, function(x) {sum(x < 0.25) / length(x)})), col = "blue") lines(density(apply(pops1_3$data, 1, function(x) {sum(x < 0.25) / length(x)})), col = "green") par(mar = c(3, 0.5, 0, 3)) plot(density(apply(pops2_1, 1, function(x) {sum(x < 0.25) / length(x)})), col = "red", main = "", ylim = c(0, 7), xaxt = "n", yaxt = "n", ylab = "") axis(1, 2:8 / 10) axis(4, 0:7) lines(density(apply(pops2_2$data, 1, function(x) {sum(x < 0.25) / length(x)})), col = "blue") lines(density(apply(pops2_3$data, 1, function(x) {sum(x < 0.25) / length(x)})), col = "green")
kables( list( kable(prof_1, col.names = c("Total %", "Total Time", "Self %", "Self Time")) %>% kable_styling(full_width = FALSE, position = "float_left"), kable(prof_2, col.names = c("Total %", "Total Time", "Self %", "Self Time")) %>% kable_styling(full_width = FALSE, position = "center"), kable(prof_3, col.names = c("Total %", "Total Time", "Self %", "Self Time")) %>% kable_styling(full_width = FALSE, position = "float_left") ), caption = "Profile for Importance Sampling (left), Prior Proposal MCMC (center), and Prior Proposal Blocked MCMC (right)" ) %>% kable_styling(full_width = TRUE, font_size = 12)
data.frame(pops1_1_sort) %>% mutate_all(sort) %>% slice(round(seq(1, samples, length.out = 100))) %>% pivot_longer(everything(), names_to = "Q", values_to = "X") %>% mutate(Q = as.numeric(substring(Q, 2)) / N) %>% ggplot(mapping = aes(x = X, y = Q)) + geom_point() + labs(y = "F(x)")
g1 <- data.frame(X = apply(pops1_1_sort, 1, median)) %>% ggplot(mapping = aes(x = X)) + geom_histogram(binwidth = 0.01) + lims(x = c(0, 1)) + labs(y = "Count") g2 <- data.frame(X = apply(pops2_1_sort, 1, median)) %>% ggplot(mapping = aes(x = X)) + geom_histogram(binwidth = 0.01) + lims(x = c(0, 1)) + labs(y = "Count") grid.arrange(g1, g2, nrow = 1, ncol = 2)
g1 <- data.frame(X = apply(pops1_1_sort, 1, function(x) {sum(x < 0.5) / length(x)})) %>% ggplot(mapping = aes(x = X)) + geom_histogram(binwidth = 1/39) + lims(x = c(0, 1)) + labs(y = "Count") g2 <- data.frame(X = apply(pops2_1_sort, 1, function(x) {sum(x < 0.5) / length(x)})) %>% ggplot(mapping = aes(x = X)) + geom_histogram(binwidth = 1/39) + lims(x = c(0, 1)) + labs(y = "Count") grid.arrange(g1, g2, nrow = 1, ncol = 2)
pops1_1_2 <- importanceSampler(x = x1, N = N, samples = samples, nEff = 4, cutoff = 0.6) pops2_1_2 <- importanceSampler(x = x2, N = N, samples = samples, nEff = 4, cutoff = 0.6)
g1 <- data.frame(X = apply(pops1_1_2, 1, mean)) %>% ggplot(mapping = aes(x = X)) + geom_histogram(binwidth = 0.01) + lims(x = c(0, 1)) + labs(y = "Count") g2 <- data.frame(X = apply(pops2_1_2, 1, mean)) %>% ggplot(mapping = aes(x = X)) + geom_histogram(binwidth = 0.01) + lims(x = c(0, 1)) + labs(y = "Count") g3 <- data.frame(X = apply(pops1_1_2, 1, median)) %>% ggplot(mapping = aes(x = X)) + geom_histogram(binwidth = 0.01) + lims(x = c(0, 1)) + labs(y = "Count") g4 <- data.frame(X = apply(pops2_1_2, 1, median)) %>% ggplot(mapping = aes(x = X)) + geom_histogram(binwidth = 0.01) + lims(x = c(0, 1)) + labs(y = "Count") grid.arrange(g1, g2, g3, g4, nrow = 2, ncol = 2)
par(mfrow = c(2, 1), mar = c(3, 3, 0.5, 0.5)) plot(1:samples, rowMeans(pops1_2$data), type = "l", col = "red", main = "", xlab = "Mean of Population", ylim = c(0, 1), ylab = "", yaxt = "n") axis(2, 0:5/5) lines(1:samples, rowMeans(pops1_3$data), col = "blue") plot(1:samples, rowMeans(pops2_2$data), type = "l", col = "red", main = "", xlab = "Mean of Population", ylim = c(0, 1), ylab = "", yaxt = "n") axis(2, 0:5/5) lines(1:samples, rowMeans(pops2_3$data), col = "blue")
par(mfrow = c(2, 1), mar = c(3, 3, 0.5, 0.5)) plot(1:samples, apply(pops1_2$data, 1, median), type = "l", col = "red", main = "", xlab = "Mean of Population", ylim = c(0, 1), ylab = "", yaxt = "n") axis(2, 0:5/5) lines(1:samples, apply(pops1_3$data, 1, median), col = "blue") plot(1:samples, apply(pops2_2$data, 1, median), type = "l", col = "red", main = "", xlab = "Mean of Population", ylim = c(0, 1), ylab = "", yaxt = "n") axis(2, 0:5/5) lines(1:samples, apply(pops2_3$data, 1, median), col = "blue")
Lets first create a new sample. We will use the same values from the previous examples for $N$, $nEff$, $\alpha$, and the cutoff.
For now, we will only use the importance sampler.
x3 <- sort(rnorm(10, mean = pi, sd = 1)) x3
Now we can convert them to $[0, 1]$ using the CDF.
x3Unif <- pnorm(x3, mean = mean(x3), sd = sd(x3)) x3Unif sum(x3Unif < 0.5)
And now we can use our importance sampler, and then back-transform using the inverse CDF. We do get a few infinite values (negative and positive infinity), because our Dirichlet can get very close to 0 or 1. We will just discard these.
unifPops <- importanceSampler(x = x3Unif, N = N, samples = samples, nEff = 4) normPops <- t(apply(unifPops, 1, qnorm, mean = mean(x3), sd = sd(x3))) normPopsFinite <- normPops[complete.cases(normPops), ]
And now we can plot the distribution of the means and CDFs.
par(mfrow = c(1, 2)) plot(density(rowMeans(normPopsFinite)), col = "red", main = "", xlab = "Mean of Population", ylab = "Density") abline(v = mean(x3)) abline(v = mean(x3) - sd(x3), lty = 2) abline(v = mean(x3) + sd(x3), lty = 2) unifPops_sort = apply(unifPops, 2, sort) plot(c(0, 1), c(0, 1), type = 'l', xlab = 'U', ylab = 'Cumulative Probabilities') matlines(t(unifPops_sort), cumsum(rep(1/N, N)), lty = 1, type = 's')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.