```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "##",
  eval = TRUE,
  cache = TRUE
)
quick_eval <- FALSE
options(scipen = 6)
set.seed(318937291)
```
```r
library(extraDistr)
library(lemur.pack)
library(MASS)
library(mvtnorm)
library(latex2exp)
library(microbenchmark)
library(tidyverse)
library(gridExtra)
library(cowplot)
```
Let $X_1, X_2, X_3, \dots$ be a sequence of i.i.d. random variables with mean 0 and variance 1. Let $S_n := \sum_{i=1}^n X_i$. The stochastic process $S := (S_n)_{n \in \mathbb{N}}$ is known as a random walk.
```r
n <- 100
X <- rnorm(n)
S <- cumsum(X)
```
```r
par(mar = c(2.5, 2.5, 1, 1))
plot(1:n, S, type = "l", xlab = "n")
```
Now, let
\[ W^{(n)}(t) := \frac{S_{\lfloor nt \rfloor}}{\sqrt{n}}, \qquad t \in [0, 1]. \]
The central limit theorem tells us that the rescaled endpoint $W^{(n)}(1)$ converges in distribution to a standard Gaussian random variable as $n \to \infty$.
```r
t <- seq(0, 1, length = n + 1)[-1]
W <- S[floor(n * t)] / sqrt(n)
```
```r
par(mar = c(2.5, 2.5, 1, 1))
plot(t, W, type = "l", xlab = "t")
```
If we rerun this entire process $k$ times, we can create a QQ plot of $W^{(n)}(1)$.
```r
k <- 1000
Wn <- rep(0, k)

for(i in 1:k) {
  X <- rnorm(n)
  S <- cumsum(X)
  Wn[i] <- S[n] / sqrt(n)
}
```
```r
par(mar = c(2.5, 2.5, 1, 1))
qqnorm(Wn, main = "")
```
It should come as no surprise that $W^{(n)}(1)$ is in fact distributed standard normal, as

\[ W^{(n)}(1) := \frac{1}{\sqrt{n}} \sum_{i=1}^{n} X_i. \]

# Brownian Bridges

## Construction

A Brownian bridge is the stochastic process $B(t)$ whose probability distribution is the conditional distribution of a standard Wiener process $W(t)$ (also known as Brownian motion) subject to $W(T) = 0$. Mathematically, there are a number of ways to arrive at this process. One way is to start from a Wiener process $W(t)$ as defined above. Then,

\[ B(t) := W(t) - \frac{t}{T} W(T). \]

It is relatively simple to extend this to a generalized Brownian bridge, where $B(t_1) = a$ and $B(t_2) = b$.

We can simulate Brownian bridges using the above method.

```r
n <- 5
k <- 100
storage <- matrix(NA, nrow = k, ncol = n)
times <- seq(0, 1, length.out = n + 2)[2:(n + 1)]
target <- 0

for(i in 1:k) {
  dW <- rnorm(n + 1) / sqrt(n)
  W <- cumsum(dW)
  storage[i, ] <- W[1:n] + times * (target - W[n + 1])
}
```
```r
par(mar = c(2.5, 2.5, 1, 1))
plot(NA, NA, xlim = c(0, 1), ylim = c(-2, 2), xlab = "t (discrete)", ylab = "B(t)")
for(i in 1:k) {
  lines(0:6 / 6, c(0, storage[i, ], 0), type = "l")
}
```
Alternatively, we can return to $S$, as defined above. We can write $S = AX$ where
\[ A = \begin{bmatrix} 1 & 0 & 0 & 0 & \cdots & 0 \\ 1 & 1 & 0 & 0 & \cdots & 0 \\ 1 & 1 & 1 & 0 & \cdots & 0 \\ 1 & 1 & 1 & 1 & \cdots & 0 \\ \vdots & \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & 1 & 1 & 1 & \cdots & 1 \end{bmatrix} \]
Thus, $Var(S) = A \Sigma A^T$. In our case, $\Sigma = I$, so $Var(S) = AA^T$, which has the form
\[ Var(S) = \begin{bmatrix} 1 & 1 & 1 & 1 & \cdots & 1 \\ 1 & 2 & 2 & 2 & \cdots & 2 \\ 1 & 2 & 3 & 3 & \cdots & 3 \\ 1 & 2 & 3 & 4 & \cdots & 4 \\ \vdots & \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & 2 & 3 & 4 & \cdots & n \end{bmatrix} \]
The inverse of $AA^T$ is the precision matrix $\Omega$, which also has a distinct form.
\[ \Omega = \begin{bmatrix} 2 & -1 & 0 & 0 & \cdots & 0 \\ -1 & 2 & -1 & 0 & \cdots & 0 \\ 0 & -1 & 2 & -1 & \cdots & 0 \\ 0 & 0 & -1 & 2 & \cdots & 0 \\ \vdots & \vdots & \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & 0 & 0 & \cdots & 1 \end{bmatrix} \]
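As a quick numerical check (a sketch, not part of the original derivation), we can confirm that the inverse of $AA^T$ has this tridiagonal form. The size $m = 6$ is arbitrary, and `m` is used instead of `n` so later chunks are unaffected.

```r
# Verify that (A A^T)^{-1} matches the tridiagonal form above (m arbitrary).
m <- 6
A <- matrix(0, m, m)
A[lower.tri(A, diag = TRUE)] <- 1  # A maps increments X to partial sums S
round(solve(A %*% t(A)), 10)       # 2s on the diagonal, -1s off, 1 in the corner
```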
If we pass this $\Omega$ into the kernel of a Gaussian distribution, the resulting quantity $\exp\left\{-\frac{1}{2} S^T \Omega S\right\}$ simplifies.
\begin{align}
S^T \Omega S &= \begin{bmatrix} S_1 \\ S_2 \\ S_3 \\ S_4 \\ \vdots \\ S_n \end{bmatrix}^T \begin{bmatrix} 2 & -1 & 0 & 0 & \cdots & 0 \\ -1 & 2 & -1 & 0 & \cdots & 0 \\ 0 & -1 & 2 & -1 & \cdots & 0 \\ 0 & 0 & -1 & 2 & \cdots & 0 \\ \vdots & \vdots & \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & 0 & 0 & \cdots & 1 \end{bmatrix} \begin{bmatrix} S_1 \\ S_2 \\ S_3 \\ S_4 \\ \vdots \\ S_n \end{bmatrix} \\
&= \begin{bmatrix} 2 S_1 - S_2 \\ -S_1 + 2 S_2 - S_3 \\ -S_2 + 2 S_3 - S_4 \\ -S_3 + 2 S_4 - S_5 \\ \vdots \\ -S_{n-1} + S_n \end{bmatrix}^T \begin{bmatrix} S_1 \\ S_2 \\ S_3 \\ S_4 \\ \vdots \\ S_n \end{bmatrix} \\
&= 2 S_1^2 - 2 S_1 S_2 + 2 S_2^2 - 2 S_2 S_3 + 2 S_3^2 - 2 S_3 S_4 + \dots - 2 S_{n-1} S_n + S_n^2 \\
&= (S_1 - 0)^2 + (S_2 - S_1)^2 + (S_3 - S_2)^2 + (S_4 - S_3)^2 + \dots + (S_n - S_{n-1})^2 (\#eq:omega-simplification)
\end{align}
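Before moving on, here is a minimal numerical check of \@ref(eq:omega-simplification); the walk length $m = 10$ and the `*.check` variable names are arbitrary choices, used to leave later chunks untouched.

```r
# Check: S^T Omega S equals the sum of squared increments of c(0, S).
m <- 10
omega.check <- diag(2, m)
omega.check[abs(row(omega.check) - col(omega.check)) == 1] <- -1
omega.check[m, m] <- 1                 # right endpoint unconstrained
S.check <- cumsum(rnorm(m))
drop(t(S.check) %*% omega.check %*% S.check)
sum(diff(c(0, S.check)) ** 2)          # should match the line above
```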
If we want to tie $S_n$ to 0, is it as simple as adding a $(0 - S_n)^2$ term? It is: this is equivalent to changing the $\Omega_{n,n}$ element from 1 to 2. Also, we can't forget our constant multiplier; we have been working with $S$, but we actually need to work with $W$, so there is an extra $\frac{1}{\sqrt{n}}$ factor inside our $A$ matrix. Let's look at an example with $n = 5$. First, we construct $\Omega$.
```r
omega <- diag(2, n)
omega[abs(row(omega) - col(omega)) == 1] <- -1
```
Now we can construct samples, and plot them.
```r
sigma <- 1 / n * chol2inv(chol(omega))
storage <- rmvnorm(k, sigma = sigma)
```
```r
par(mar = c(2.5, 2.5, 1, 1))
plot(NA, NA, xlim = c(0, 1), ylim = c(-2, 2), xlab = "t (discrete)", ylab = "B(t)")
for(i in 1:k) {
  lines(0:6 / 6, c(0, storage[i, ], 0), type = "l")
}
```
To confirm that these two methods are identical, we can compare the empirical variances at each discrete $t$ value.
```r
n <- 5
k <- 100000
storage <- matrix(NA, nrow = k, ncol = n)
times <- seq(0, 1, length.out = n + 2)[2:(n + 1)]
target <- 0

for(i in 1:k) {
  dW <- rnorm(n + 1) / sqrt(n)
  W <- cumsum(dW)
  storage[i, ] <- W[1:n] + times * (target - W[n + 1])
}

omega <- diag(2, n)
omega[abs(row(omega) - col(omega)) == 1] <- -1
sigma <- 1 / n * chol2inv(chol(omega))
storage2 <- rmvnorm(k, sigma = sigma)

apply(storage, 2, var)
apply(storage2, 2, var)
```
These are very close to each other, so it is not unreasonable to conclude the variances are identical. Since the means are clearly zero, and both methods produce normally distributed values (the first method is a linear combination of normals, which is normal), we can conclude they are identically distributed.
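As one more informal check (assuming `storage` and `storage2` from the chunk above are still in scope), a QQ plot of the middle coordinate from each method should hug the identity line if the two distributions agree.

```r
# Compare the middle coordinate (n = 5, so column 3) across the two methods.
par(mar = c(2.5, 2.5, 1, 1))
qqplot(storage[, 3], storage2[, 3], xlab = "Method 1", ylab = "Method 2")
abline(0, 1, col = "blue")
```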
Let $B(t)$ denote a Brownian bridge with $t = 1, 2, \dots, T$ discrete steps. Suppose we want the marginal distribution of $B(v)$ for a subset of time points $v \subset t$.
For example, let's look at $k = 100000$ bridges with $4$ intermediate points, and focus on $B(v)$ with $v = 1, 3, 4$. We know the expected value will always be zero, so we focus on the covariance matrix. Empirically, we get
```r
n <- 4
k <- 100000
times <- seq(0, 1, length.out = n + 2)[2:(n + 1)]
target <- 0
omega <- diag(2, n)
omega[abs(row(omega) - col(omega)) == 1] <- -1
sigma <- 1 / n * chol2inv(chol(omega))
storage <- rmvnorm(k, sigma = sigma)
var(storage[, c(1, 3, 4)])
```
And theoretically, we should get
```r
fractions(sigma[c(1, 3, 4), c(1, 3, 4)])
```
If we look at the precision matrix instead of the covariance matrix (we will also remove the factor of $n$ from now on), along with its determinant, we get
```r
fractions(solve(n * sigma[c(1, 3, 4), c(1, 3, 4)]))
```
and
```r
fractions(det(solve(n * sigma[c(1, 3, 4), c(1, 3, 4)])))
```
Our quadratic form is thus
\begin{align}
x^T \Omega x &= {x_1}^2 + \frac{1}{2} {x_1}^2 - x_1 x_3 + \frac{3}{2} {x_3}^2 - 2 x_3 x_4 + {x_4}^2 + {x_4}^2 \\
&= (x_1 - 0)^2 + \frac{1}{2} (x_3 - x_1)^2 + (x_4 - x_3)^2 + (0 - x_4)^2.
\end{align}
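As a sketch (the vector `x` and the names `v` and `omega.v` here are illustrative, not part of the original code), we can confirm this completed-square identity numerically against the precision matrix:

```r
# Compare the quadratic form with the completed-square expression for v = (1, 3, 4).
v <- c(1, 3, 4)
omega.v <- solve(n * sigma[v, v])
x <- rnorm(3)  # x = (x_1, x_3, x_4)
drop(t(x) %*% omega.v %*% x)
x[1] ** 2 + 1 / 2 * (x[2] - x[1]) ** 2 + (x[3] - x[2]) ** 2 + x[3] ** 2
```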
If we instead look at $v = 1, 4$, we get
```r
fractions(solve(n * sigma[c(1, 4), c(1, 4)]))
```
and
```r
fractions(det(solve(n * sigma[c(1, 4), c(1, 4)])))
```
and our quadratic form is
\begin{align}
x^T \Omega x &= {x_1}^2 + \frac{1}{3} {x_1}^2 - \frac{2}{3} x_1 x_4 + \frac{1}{3} {x_4}^2 + {x_4}^2 \\
&= (x_1 - 0)^2 + \frac{1}{3} (x_4 - x_1)^2 + (0 - x_4)^2.
\end{align}
It looks like there is a pattern emerging. To confirm, let $n = 6$, and $v = 1, 3, 6$. If we look at the precision matrix and determinant (also removing the factor of $n$), we get
```r
n <- 6
k <- 100000
times <- seq(0, 1, length.out = n + 2)[2:(n + 1)]
target <- 0
omega <- diag(2, n)
omega[abs(row(omega) - col(omega)) == 1] <- -1
sigma <- 1 / n * chol2inv(chol(omega))
storage <- rmvnorm(k, sigma = sigma)
```
```r
fractions(solve(n * sigma[c(1, 3, 6), c(1, 3, 6)]))
```
and
```r
fractions(det(solve(n * sigma[c(1, 3, 6), c(1, 3, 6)])))
```
and our quadratic form is
\begin{align}
x^T \Omega x &= {x_1}^2 + \frac{1}{2} {x_1}^2 - x_1 x_3 + \frac{5}{6} {x_3}^2 - \frac{2}{3} x_3 x_6 + \frac{1}{3} {x_6}^2 + {x_6}^2 \\
&= (x_1 - 0)^2 + \frac{1}{2} (x_3 - x_1)^2 + \frac{1}{3} (x_6 - x_3)^2 + (0 - x_6)^2.
\end{align}
The pattern is now clear. Note the coefficients are determined by the distance between observations. For example, the coefficient in front of $(x_3 - x_1)^2$ is $\frac{1}{2}$ because the distance between $x_3$ and $x_1$ is $2$. Likewise, the coefficient for $(x_6 - x_3)^2$ is $\frac{1}{3}$ because the distance between them is $3$. Also, the determinant of the precision matrix is always the product of these inverse distances, multiplied by an additional factor of $n + 1$. We can use these facts to calculate the marginal likelihood without ever constructing a matrix.
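To make the pattern concrete, here is a small helper (a sketch; `marg.det` is not part of the original code) that predicts the determinant from the gaps alone, compared against the direct matrix computation for $v = 1, 3, 6$ with $n = 6$:

```r
# Predicted determinant: (n + 1) divided by the product of gaps between
# the observed indices, with endpoints 0 and n + 1 included.
marg.det <- function(v, n) (n + 1) / prod(diff(c(0, v, n + 1)))
marg.det(c(1, 3, 6), 6)
det(solve(n * sigma[c(1, 3, 6), c(1, 3, 6)]))  # should agree
```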
Let $b$ be a potential Brownian bridge. If we want to evaluate the log density at $B = b$, the most basic way would be to calculate $\Sigma = \Omega^{-1}$ and feed $b$ and $\Sigma$ into a multivariate normal density function such as `dmvnorm`.
```r
denseval.sigma <- function(b) {
  n <- length(b)
  omega <- diag(2, n)
  omega[abs(row(omega) - col(omega)) == 1] <- -1
  sigma <- 1 / n * chol2inv(chol(omega))
  dmvnorm(b, sigma = sigma, log = TRUE)
}
```
If we are computing the density for many potential bridges, we could calculate $\Sigma$ once, so we should exclude this computation from any comparisons.
```r
denseval.slow <- function(b, sigma) {
  dmvnorm(b, sigma = sigma, log = TRUE)
}
```
We can make the calculation faster by exploiting \@ref(eq:omega-simplification). We can also use the fact that $\det(\Omega_{n \times n}) = n + 1$. Thus,
\[ \log f(\mathbf{b} \mid \mathbf{0}, \mathbf{\Omega}) = \frac{1}{2}\bigg[\log(n + 1) + n \log(n) - n \log(2\pi) - \mathbf{b}^T \mathbf{\Omega} \mathbf{b}\bigg] \]
```r
denseval.fast <- function(b) {
  n <- length(b)
  d2 <- diff(c(0, b, 0)) ** 2
  1 / 2 * (log(n + 1) + n * log(n) - n * log(2 * pi) - n * sum(d2))
}
```
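As a quick check of the determinant fact used here (a sketch; the sizes are chosen arbitrarily), the tridiagonal bridge precision with both endpoints tied should satisfy $\det(\Omega_{m \times m}) = m + 1$:

```r
# Determinants for m = 2, ..., 6; each should equal m + 1.
sapply(2:6, function(m) {
  om <- diag(2, m)
  om[abs(row(om) - col(om)) == 1] <- -1
  det(om)
})
```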
We can confirm that all versions give the same result.
```r
n <- 23
omega <- diag(2, n)
omega[abs(row(omega) - col(omega)) == 1] <- -1
sigma <- 1 / n * chol2inv(chol(omega))
b <- rmvnorm(1, sigma = sigma)

denseval.slow(b, sigma)
denseval.fast(b)
```
And we can compare the time it takes for each method. Ignoring the outliers (the way R works often causes the first several evaluations of a function to be much slower), we see that the fast version runs much faster, as expected.
```r
par(mar = c(2.5, 4.5, 1, 1))
boxplot(microbenchmark(denseval.slow(b, sigma), denseval.fast(b), times = 1000),
        log = TRUE)
```
Using what we learned above, we can evaluate the marginal density without constructing any matrices.
```r
denseval.marginal <- function(b, d, n) {
  # b: bridge values at the observed indices d
  bb <- c(0, b, 0)
  b2 <- diff(bb) ** 2
  # Right endpoint placed at max(d) + 1 (fine when max(d) = n)
  dd <- c(0, d, max(d) + 1)
  d2 <- diff(dd)                  # gaps between observed indices
  determ <- (n + 1) / prod(d2)    # determinant from the gap pattern
  wp <- b2 / d2                   # gap-weighted squared increments
  1 / 2 * (log(determ) + length(d) * log(n) - length(d) * log(2 * pi) - n * sum(wp))
}
```
We can also evaluate the marginal distribution of a subset of points. Let $n = 10$, and we will observe $B(v)$ with $v = 1, 3, 6, 7, 10$. First, let's look empirically at these observations.
```r
n <- 10
k <- 10000
d <- c(1, 3, 6, 7, 10)
times <- seq(0, 1, length.out = n + 2)[2:(n + 1)]
target <- 0
omega <- diag(2, n)
omega[abs(row(omega) - col(omega)) == 1] <- -1
sigma <- 1 / n * chol2inv(chol(omega))
storage <- rmvnorm(k, sigma = sigma)
```
Empirically, if we look at the precision matrix (the inverse of the empirical covariance) of our selected observations and its determinant, we get
```r
solve(var(storage[, d]))
```
and
```r
det(solve(var(storage[, d])))
```
Calculating the determinant the way our fast function does, we get
```r
n ** length(d) * (n + 1) / prod(diff(c(0, d, n + 1)))
```
We can look at the distribution of log densities, using both the slow way and our new function.
```r
new.sigma <- sigma[d, d]
logdens <- apply(storage[, d], 1, denseval.marginal, d = d, n = 10)

par(mar = c(2.5, 4.5, 1, 1), mfrow = c(1, 2))
plot(density(apply(storage[, d], 1, dmvnorm, sigma = new.sigma, log = TRUE)),
     main = "", xlab = "")
plot(density(logdens), main = "", xlab = "")
```
If we look at the quantity $2(\ell_{\max} - \ell)$, it should be distributed $\chi_k^2$, where $k$ is the dimensionality. In this example, we are looking at the marginal at $k = 5$ points, which is a $5$-dimensional normal.
```r
m <- max(logdens)
t.logdens <- 2 * (m - logdens)

par(mar = c(2.5, 4.5, 1, 1), mfrow = c(1, 1))
hist(t.logdens, main = "", xlab = "", prob = TRUE, breaks = 20)
curve(dchisq(x, df = 5), add = TRUE, col = 'blue')
```
## Testing vectorization

One drawback of **denseval.marginal** is that if we are evaluating the marginal density of many potential bridges, it will repeat some calculations unnecessarily. Only the actual bridge values change; other quantities such as $d$ and $n$ may be the same for every potential bridge. One way of dealing with this is to vectorize our function.

```r
v.denseval.marginal <- function(b, d, n) {
  k <- length(d)
  b2 <- apply(b, 1, function(x) diff(c(0, x, 0)) ** 2)
  d2 <- diff(c(0, d, n + 1))
  determ <- (n + 1) / prod(d2)
  wp <- b2 / d2
  1 / 2 * (log(determ) + k * log(n) - k * log(2 * pi) - n * colSums(wp))
}
```
Is this actually faster?
```r
par(mar = c(2.5, 4.5, 1, 1))
boxplot(microbenchmark(apply(storage[, d], 1, denseval.marginal, d = d, n = 10),
                       v.denseval.marginal(storage[, d], d, n),
                       times = 10),
        log = TRUE, names = c("denseval.marginal", "v.denseval.marginal"))
```
And are the results the same?
```r
logdens <- apply(storage[, d], 1, denseval.marginal, d = d, n = 10)
v.logdens <- v.denseval.marginal(storage[, d], d, n)

par(mar = c(2.5, 4.5, 1, 1), mfrow = c(1, 2))
plot(density(logdens), main = "", xlab = "")
plot(density(v.logdens), main = "", xlab = "")
```
# Donsker's Theorem

Let $F_n$ be the empirical distribution function of i.i.d. random variables $X_1, X_2, X_3, \dots$ with distribution function $F$. Define the centered and scaled version of $F_n$ by
\[ G_n(x) = \sqrt{n}(F_n(x) - F(x)) (\#eq:donskerseq) \]
indexed by $x \in \mathbb{R}$. From the classical central limit theorem, we know that for fixed $x$ the random variable $G_n(x)$ converges in distribution to a Gaussian random variable with mean zero and variance $F(x)(1 - F(x))$ as the sample size $n$ grows. Donsker's theorem strengthens this: $G_n$ converges in distribution to a Gaussian process with mean zero and covariance $K(G(s), G(t))$ given by
\[ K(G(s), G(t)) = E[G(s)G(t)] = \min\{F(s), F(t)\} - F(s)F(t) (\#eq:donskerkernel) \]
If the population is $1, 2, 3, 4, 5$, the inverse of our covariance matrix is
```r
X <- 1:5
FX <- 1:5 / 5
sigma <- matrix(NA, nrow = 5, ncol = 5)
for(i in 1:5) {
  for(j in 1:5) {
    sigma[i, j] <- min(i, j) / 5 - i * j / 5 ** 2
  }
}
solve(sigma[-5, -5])
```
Note that this is $n \Omega$, the Brownian bridge precision matrix, i.e.,
\[ \begin{bmatrix} 10 & -5 & 0 & 0 \\ -5 & 10 & -5 & 0 \\ 0 & -5 & 10 & -5 \\ 0 & 0 & -5 & 10 \end{bmatrix} = 5 \begin{bmatrix} 2 & -1 & 0 & 0 \\ -1 & 2 & -1 & 0 \\ 0 & -1 & 2 & -1 \\ 0 & 0 & -1 & 2 \end{bmatrix} \]
Let $X \sim N(0, 1)$. First, let's look at some potential empirical CDFs from samples of size $n = 1000$.
```r
N <- 10
n <- 1000
storage <- matrix(rnorm(N * n), nrow = N, ncol = n)
colors <- sample(colors()[-1], N)

plot(NA, NA, xlim = c(-3, 3), ylim = c(-0.05, 1.05), xlab = "x", ylab = "F(x)")
for(i in 1:N) {
  lines(sort(storage[i, ]), (1:n) / n, type = "s", col = colors[i])
}
```
Next, let's transform these $F_n(x)$, using Donsker's theorem.
```r
storage2 <- matrix(NA, nrow = N, ncol = n)
for(i in 1:N) {
  storage2[i, ] <- sqrt(n) * (sapply(storage[i, ], function(x) sum(storage[i, ] <= x)) / n -
                                pnorm(storage[i, ]))
}
```
And now we can visualize them.
```r
plot(NA, NA, xlim = c(-3, 3), ylim = c(-1.5, 1.5), xlab = "x",
     ylab = TeX("$\\sqrt{n}(F_n(x) - F(x))$"))
for(i in 1:N) {
  ord <- order(storage[i, ])
  lines(storage[i, ord], storage2[i, ord], type = "l", col = colors[i])
}
```
Finally, we can extract log densities under this covariance kernel.
```r
covMat <- function(p) {
  outer(p, p, FUN = "pmin") - outer(p, p)
}

denStore <- rep(NA, N)
for(i in 1:N) {
  ord <- order(storage[i, ])
  # Drop the last point, where the covariance matrix is singular
  K <- covMat(pnorm(storage[i, ord]))[-n, -n]
  denStore[i] <- dmvnorm(sqrt(n) * (sapply(storage[i, ord], function(x) sum(storage[i, ] <= x)) / n -
                                      pnorm(storage[i, ord]))[-n],
                         sigma = K, log = TRUE)
}
denStore
```
Let's draw a population from $\text{Unif}(\{1, 2, 3, \dots, 100\})$, of size $N = 10000$.
```r
N <- 10000
pop <- sample(1:100, size = N, replace = TRUE)
FN <- cumsum(as.numeric(table(pop))) / N
```
If we pull large samples, say $n = 1000$, it is almost guaranteed that every potential outcome receives some mass, so we will start with this easy example.
```r
k <- 10
n <- 1000
samples <- matrix(sample(pop, size = k * n, replace = TRUE), nrow = k, ncol = n)
```
Now we can calculate our $F_n(x)$ for $x = 1, 2, 3, \dots, 100$.
```r
Fn <- t(apply(samples, 1, function(x) cumsum(as.numeric(table(factor(x, levels = 1:100)))) / n))
```
And we can visualize them.
```r
colors <- sample(colors()[-1], k)

plot(NA, NA, xlim = c(0, 101), ylim = c(-0.05, 1.05), xlab = "x", ylab = TeX("$F_n(x)$"))
for(i in 1:k) {
  lines(1:100, Fn[i, ], type = "s", col = colors[i])
}
```
The next step is to Donskerize these $F_n$ values by applying the transformation $G_n(x) = \sqrt{n}(F_n(x) - F(x))$.
```r
Gn <- t(apply(Fn, 1, function(x) sqrt(n) * (x - FN)))
```
And now we can visualize them again.
```r
plot(NA, NA, xlim = c(0, 101), ylim = c(-1.5, 1.5), xlab = "x", ylab = TeX("$G_n(x)$"))
for(i in 1:k) {
  lines(1:100, Gn[i, ], type = "l", col = colors[i])
}
```
And we can get the log densities.
```r
apply(Gn, 1, denseval.fast)
```
Let our population be discrete uniform on $\{1, 2, 3, \dots, 100\}$. Suppose we pull a sample of size $n = 20$ that looks like
\[ 1 \quad 3 \quad 7 \quad 8 \quad 12 \quad 14 \quad 14 \quad 21 \quad 28 \quad 31 \quad 34 \quad 41 \quad 49 \quad 57 \quad 63 \quad 68 \quad 74 \quad 83 \quad 91 \quad 100 \]
```r
x <- c(1, 3, 7, 8, 12, 14, 14, 21, 28, 31, 34, 41, 49, 57, 63, 68, 74, 83, 91, 100)
xu <- unique(x)

df <- data.frame(
  x = 1:100,
  F = 1:100 / 100,
  Fn = sapply(1:100, function(y) sum(x <= y)) / 20
)
df$Gn <- sqrt(20) * (df$Fn - df$F)
```
```r
denseval.fast(df$Gn)
denseval.marginal(df$Gn[xu], xu, 20)
```
Let's create a discrete (since finite), but continuous-valued population of size $N = 100$.
```r
N <- 100
df <- data.frame(
  y = sort(rnorm(N)),
  F = 1:100 / 100
)
```
For any i.i.d. sample (we use size $n = 20$), we can construct the needed $F_n$ and $G_n$.
```r
n <- 20
xi <- sort(sample(1:N, size = n, replace = TRUE))
x <- df$y[xi]
df$Fn <- sapply(df$y, function(z) sum(x <= z) / 20)
df$Gn <- sqrt(20) * (df$Fn - df$F)
```
Since we are no longer working with integers, we need to modify our marginal density function slightly.
```r
denseval.marginal.fixed <- function(b, d, n, N) {
  bb <- c(0, b, 0)
  b2 <- diff(bb) ** 2
  dd <- c(0, d, N + 1)
  d2 <- diff(dd)
  logdeterm <- log(n + 1) - sum(log(d2))
  wp <- b2 / d2
  1 / 2 * (logdeterm + length(d) * log(n) - length(d) * log(2 * pi) - n * sum(wp))
}

denseval.marginal.fixed.id <- function(b, d, n, N) {
  bb <- c(0, b, 0)
  b2 <- diff(bb) ** 2
  dd <- c(0, d, N + 1)
  d2 <- diff(dd)
  wp <- b2 / d2
  - 1 / 2 * (n * sum(wp))
}
```
And we can evaluate joint and marginal densities.
```r
denseval.fast(df$Gn)
denseval.marginal.fixed(df$Gn[unique(xi)], unique(xi), n, N)
```
Now let's do this with many samples.
```r
k <- 100
xi.storage <- t(apply(matrix(sample(1:N, size = n * k, replace = TRUE), nrow = k), 1, sort))
x.storage <- matrix(df$y[as.numeric(xi.storage)], nrow = k)

j.densities <- rep(0, k)
m.id.densities <- rep(0, k)
m.densities <- rep(0, k)

for(i in 1:k) {
  xi <- xi.storage[i, ]
  x <- x.storage[i, ]
  df$Fn <- sapply(df$y, function(z) sum(x <= z) / 20)
  df$Gn <- sqrt(20) * (df$Fn - df$F)
  j.densities[i] <- denseval.fast(df$Gn)
  m.id.densities[i] <- denseval.marginal.fixed.id(df$Gn[unique(xi)], unique(xi), n, N)
  m.densities[i] <- denseval.marginal.fixed(df$Gn[unique(xi)], unique(xi), n, N)
}
```
And we can visualize these densities.
```r
par(mar = c(2.5, 4.5, 1, 1), mfrow = c(1, 2))
plot(1:k, j.densities, type = "p", ylab = "Log density")
plot(1:k, m.densities, type = "p", ylab = "Log density")
```
Which samples look the best? The indices from the sample with the highest and lowest marginal density are
```r
xi.storage[which.max(m.densities), ]
xi.storage[which.min(m.densities), ]
```
And the spacings are
```r
diff(c(0, xi.storage[which.max(m.densities), ], 100))
diff(c(0, xi.storage[which.min(m.densities), ], 100))
```
The joint densities over the entire population for these two samples are
```r
j.densities[which.max(m.densities)]
j.densities[which.min(m.densities)]
```
This doesn't really make sense; let's look at these samples. I will also add what should be the theoretically best sample.
```r
x.ideal <- df$y[round(1:20 * 100 / 21)]

par(mar = c(4.5, 4.5, 1, 1), mfrow = c(1, 1))
plot(df$y, df$F, type = "s", xlab = "X", ylab = "F")
lines(x.storage[which.max(m.densities), ], 1:n / n, type = "s", col = "blue")
lines(x.storage[which.min(m.densities), ], 1:n / n, type = "s", col = "red")
lines(x.ideal, 1:n / n, type = "s", col = "darkgreen")
```
What is the marginal density of this ideal sample?
```r
xi <- round(1:20 * 100 / 21)
x <- df$y[xi]
df$Fn <- sapply(df$y, function(z) sum(x <= z) / 20)
df$Gn <- sqrt(20) * (df$Fn - df$F)

denseval.fast(df$Gn)
denseval.marginal.fixed.id(df$Gn[unique(xi)], unique(xi), n, N)
denseval.marginal.fixed(df$Gn[unique(xi)], unique(xi), n, N)
```
We can also interpolate $F_n$ across the whole population and evaluate the density at a fixed set of interpolation points.

```r
denseval.interp <- function(samp, pop, n, N, interp.n = round(N / 5) - 1) {
  samp <- sort(samp)
  ind <- which(pop %in% samp)
  unq.samp <- unique(samp)
  pop <- sort(pop)

  # Equally spaced interpolation points along the population index
  interp.points <- seq(0, N + 1, length.out = interp.n + 2)
  interp.points <- round(interp.points[-c(1, length(interp.points))])

  df <- data.frame(y = pop, F = 1:N / N, Fn = NA)
  df$Fn[ind] <- sapply(unq.samp, function(x) sum(samp <= x)) / n

  # Linearly interpolate Fn between observed sample values
  df$FnNew <- df$Fn
  df$FnNew[pop < min(samp)] <- 0
  df$FnNew[pop > max(samp)] <- 1
  if(any(is.na(df$FnNew))) {
    for(i in 1:(length(ind) - 1)) {
      interp.interm <- pop > unq.samp[i] & pop < unq.samp[i + 1]
      df$FnNew[pop >= unq.samp[i] & pop <= unq.samp[i + 1]] <-
        seq(df$Fn[ind[i]], df$Fn[ind[i + 1]], length.out = sum(interp.interm) + 2)
    }
  }

  df$Gn <- sqrt(n) * (df$FnNew - df$F)

  # Marginal density evaluated at the interpolation points only
  bb <- c(0, df$Gn[interp.points], 0)
  b2 <- diff(bb) ** 2
  dd <- c(0, interp.points, N)
  d2 <- diff(dd)
  logdeterm <- log(n + 1) - sum(log(d2))
  wp <- b2 / d2
  1 / 2 * (logdeterm + length(unq.samp) * log(n) - length(unq.samp) * log(2 * pi) - n * sum(wp))
}
```
```r
k <- 50000
N <- 100
n <- 20
pop <- sort(rnorm(100))
samps <- matrix(sample(pop, k * n, replace = TRUE), nrow = k, ncol = n)

interp.densities <- rep(0, k)
for(i in 1:k) {
  interp.densities[i] <- denseval.interp(samp = samps[i, ], pop = pop, n = n, N = N,
                                         interp.n = 19)
}

max(interp.densities)
which.max(interp.densities)
samps[which.max(interp.densities), ]
which(pop %in% samps[which.max(interp.densities), ])
diff(c(0, which(pop %in% samps[which.max(interp.densities), ]), N))

# Best sample?
samp <- pop[round(1:n * (N / 21))]
denseval.interp(samp = samp, pop = pop, n = n, N = N, interp.n = 19)
```
Here is a very small test for understanding. Let $N = 5$, $n = 2$. Note that $F = [0.2, 0.4, 0.6, 0.8, 1.0]$.
```r
N <- 5
n <- 2
k <- 100
df <- data.frame(
  y = sort(rnorm(N)),
  F = 1:N / N
)
xi.storage <- t(apply(matrix(sample(1:N, size = n * k, replace = TRUE), nrow = k), 1, sort))
x.storage <- matrix(df$y[as.numeric(xi.storage)], nrow = k)

j.densities <- rep(0, k)
m.densities <- rep(0, k)
i.densities <- rep(0, k)

for(i in 1:k) {
  xi <- xi.storage[i, ]
  x <- x.storage[i, ]
  df$Fn <- sapply(df$y, function(z) sum(x <= z) / n)
  df$Gn <- sqrt(n) * (df$Fn - df$F)
  j.densities[i] <- denseval.fast(df$Gn)
  m.densities[i] <- denseval.marginal.fixed(df$Gn[unique(xi)], unique(xi), n, N)
  i.densities[i] <- denseval.interp(x, df$y, n, N, interp.n = 2)
}

storage <- data.frame(xi.storage, j.densities, m.densities, i.densities)
```
Let $x_1 = [y_3, y_5]$. Thus,
\begin{align}
F_N &= [0, 0, 0.5, 0.5, 1] \\
G_n &= \sqrt{2} [-0.2, -0.4, -0.1, -0.3, 0] \\
f_y(x) &\propto 0.2^2 + 0.2^2 + 0.3^2 + 0.2^2 + 0.3^2 + 0^2 \\
f_x(x) &\propto \frac{1}{3} 0.1^2 + \frac{1}{2} 0.1^2 + 0^2
\end{align}
Let $x_1 = [y_3, y_4]$. Thus,
\begin{align}
F_N &= [0, 0, 0.5, 1, 1] \\
G_n &= \sqrt{2} [-0.2, -0.4, -0.1, 0.2, 0] \\
f_y(x) &\propto 0.2^2 + 0.2^2 + 0.3^2 + 0.3^2 + 0.2^2 + 0^2 \\
f_x(x) &\propto \frac{1}{3} 0.1^2 + 0.3^2 + \frac{1}{2} 0.2^2
\end{align}
Let $x_1 = [y_2, y_4]$. Thus,
\begin{align}
F_N &= [0, 0.5, 0.5, 1, 1] \\
G_n &= \sqrt{2} [-0.2, 0.1, -0.1, 0.2, 0] \\
f_y(x) &\propto 0.2^2 + 0.3^2 + 0.2^2 + 0.3^2 + 0.2^2 + 0^2 \\
f_x(x) &\propto \frac{1}{2} 0.1^2 + \frac{1}{2} 0.1^2 + \frac{1}{2} 0.2^2
\end{align}
```r
N <- 5
n <- 100
k <- 100
df <- data.frame(
  y = sort(rnorm(N)),
  F = 1:N / N
)
xi.storage <- t(apply(matrix(sample(1:N, size = n * k, replace = TRUE), nrow = k), 1, sort))
x.storage <- matrix(df$y[as.numeric(xi.storage)], nrow = k)

j.densities <- rep(0, k)
m.densities <- rep(0, k)
i.densities <- rep(0, k)

for(i in 1:k) {
  xi <- xi.storage[i, ]
  x <- x.storage[i, ]
  df$Fn <- sapply(df$y, function(z) sum(x <= z) / n)
  df$Gn <- sqrt(n) * (df$Fn - df$F)
  j.densities[i] <- denseval.fast(df$Gn)
  m.densities[i] <- denseval.marginal.fixed(df$Gn[unique(xi)], unique(xi), n, N)
  i.densities[i] <- denseval.interp(x, df$y, n, N, interp.n = 2)
}

storage <- data.frame(t(apply(xi.storage, 1, table)), j.densities, m.densities, i.densities)
```
When the variable takes on a small number of unique values compared to the sample and population size, the marginal and full distributions of $G_n$ work the way we expect them to. The interpolated version doesn't perform as well, since we lose information.
```r
N <- 100
n <- 100
k <- 100000
interp.n <- 11

df <- data.frame(
  y = sort(rnorm(N)),
  F = 1:N / N,
  Fn = NA
)
pop <- df$y

xi.storage <- t(apply(matrix(sample(1:N, size = n * k, replace = TRUE), nrow = k), 1, sort))
x.storage <- matrix(df$y[as.numeric(xi.storage)], nrow = k)
interp.Gn <- matrix(nrow = k, ncol = interp.n)

for(j in 1:k) {
  xi <- xi.storage[j, ]
  samp <- x.storage[j, ]
  ind <- which(pop %in% samp)
  samp <- sort(samp)
  unq.samp <- unique(samp)
  pop <- sort(pop)
  interp.points <- round(seq(0, N + 1, length.out = interp.n + 2))
  interp.points <- interp.points[-c(1, length(interp.points))]
  df$Fn[ind] <- sapply(unq.samp, function(x) sum(samp <= x)) / n
  df$FnNew <- df$Fn
  df$FnNew[pop < min(samp)] <- 0
  df$FnNew[pop > max(samp)] <- 1
  if(any(is.na(df$FnNew))) {
    for(i in 1:(length(ind) - 1)) {
      interp.interm <- pop > unq.samp[i] & pop < unq.samp[i + 1]
      df$FnNew[pop >= unq.samp[i] & pop <= unq.samp[i + 1]] <-
        seq(df$Fn[ind[i]], df$Fn[ind[i + 1]], length.out = sum(interp.interm) + 2)
    }
  }
  df$Gn <- sqrt(n) * (df$FnNew - df$F)
  interp.Gn[j, ] <- df$Gn[interp.points]
}

colMeans(interp.Gn)
solve(var(interp.Gn))
```
```r
n <- 5
N <- 20
k <- 12
K <- 3
samp.df <- data.frame(X = seq(-1.8, 1.8, length.out = n))
pop.df <- data.frame(
  Pop = rep(LETTERS[1:k], each = N),
  X = rnorm(N * k)
)
dens.df <- data.frame(
  Pop = rep(LETTERS[1:k], K),
  A = dchisq(seq(0.1, 4, length.out = k), df = 3, log = TRUE),
  B = dchisq(seq(0.1, 4, length.out = k), df = 4, log = TRUE),
  C = dchisq(seq(0.1, 4, length.out = k), df = 5, log = TRUE)
) %>%
  pivot_longer(A:C, names_to = "Method", values_to = "logDensity")
```
```r
popDens.plot <- function(samp.df, pop.df, dens.df) {
  dens.plot <- ggplot(data = dens.df,
                      mapping = aes(x = Pop, y = logDensity, color = Method, group = Method)) +
    geom_point() +
    geom_line() +
    theme(
      legend.position = "top",
      axis.title.x = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      plot.margin = unit(c(0.5, 0.5, 0, 0.5), "cm")
    ) +
    labs(y = "Log Density")

  pop.plot <- ggplot(data = pop.df, mapping = aes(x = Pop, y = X)) +
    geom_point(size = 1) +
    geom_hline(data = samp.df, mapping = aes(yintercept = X)) +
    theme(
      plot.margin = unit(c(0, 0.5, 0.5, 0.5), "cm"),
      axis.title.x = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank()
    ) +
    labs(x = "")

  plot_grid(dens.plot, pop.plot, ncol = 1, align = "v", axis = "l")
}
```
```r
popDens.plot(samp.df, pop.df, dens.df)
```
```r
# denseval.fast redefined to take n explicitly, since the sample size no
# longer equals the number of evaluation points.
denseval.fast <- function(b, n) {
  b2 <- diff(c(0, b, 0)) ** 2
  1 / 2 * (log(n + 1) + length(b) * log(n) - length(b) * log(2 * pi) - n * sum(b2))
}
```
```r
denseval.dirichlet <- function(samp, pop, n, N) {
  x <- tabulate(findInterval(pop, samp, left.open = TRUE) + 1, nbin = n + 1)
  alpha <- x / N * (n + 1)
  xval <- c(.5 / n, rep(1 / n, n - 1), .5 / n)
  return(ddirichlet(xval, alpha, log = TRUE))
}

denseval.dirichlet2 <- function(samp, pop, n, N) {
  x <- tabulate(findInterval(pop, samp, left.open = TRUE) + 1, nbin = n + 1)
  alpha <- x / N * (n + 1)
  xval <- rep(1 / (n + 1), n + 1)
  return(ddirichlet(xval, alpha, log = TRUE))
}
```
```r
n <- 5
N <- 20
k <- 20
samp.df <- data.frame(X = seq(-4, 4, length.out = n))
pop.df <- data.frame(
  Pop = character(),
  X = numeric()
)
dens.df <- data.frame(
  Pop = character(),
  Method = character(),
  logDensity = numeric()
)
endpoints <- seq(2.2, 12.2, length.out = k)

for(i in 1:k) {
  X <- seq(-endpoints[i], endpoints[i], length.out = N)
  pop.df <- bind_rows(pop.df, data.frame(Pop = LETTERS[i], X = X))
  full.pop <- sort(c(X, samp.df$X))
  samp.indices <- which(full.pop %in% samp.df$X)
  Fn <- sapply(full.pop, function(x) sum(samp.df$X <= x)) / n
  FN <- 1:(N + n) / (N + n)
  Gn <- sqrt(n) * (Fn - FN)
  add.df <- data.frame(
    Pop = LETTERS[i],
    Method = c("Marginal Density", "Interp Density (k = 5)",
               "Dirichlet Density (unequal)", "Dirichlet Density (equal)"),
    logDensity = c(
      denseval.marginal(Gn[samp.indices], samp.indices, n),
      denseval.interp(samp.df$X, full.pop, n, N + n, interp.n = 5),
      denseval.dirichlet(samp.df$X, full.pop, n, N + n),
      denseval.dirichlet2(samp.df$X, full.pop, n, N + n)
    )
  )
  dens.df <- bind_rows(dens.df, add.df)
}

dens.df <- dens.df %>%
  group_by(Method) %>%
  mutate(Density = exp(logDensity)) %>%
  mutate(Density = Density / sum(Density, na.rm = TRUE)) %>%
  mutate(logDensity = log(Density))

popDens.plot(samp.df, pop.df, dens.df)
```
```r
n <- 5
N <- 20
k <- 12
samp.df <- data.frame(X = seq(-4, 4, length.out = n))
pop.df <- data.frame(
  Pop = character(),
  X = numeric()
)
dens.df <- data.frame(
  Pop = character(),
  Method = character(),
  logDensity = numeric()
)
endpoints <- seq(2.2, 12.2, length.out = k)

for(i in 1:k) {
  X <- seq(-5, endpoints[i], length.out = N)
  pop.df <- bind_rows(pop.df, data.frame(Pop = LETTERS[i], X = X))
  full.pop <- sort(c(X, samp.df$X))
  samp.indices <- which(full.pop %in% samp.df$X)
  Fn <- sapply(full.pop, function(x) sum(samp.df$X <= x)) / n
  FN <- 1:(N + n) / (N + n)
  Gn <- sqrt(n) * (Fn - FN)
  add.df <- data.frame(
    Pop = LETTERS[i],
    Method = c("Marginal Density", "Interp Density (k = 5)",
               "Dirichlet Density (unequal)", "Dirichlet Density (equal)"),
    logDensity = c(
      denseval.marginal(Gn[samp.indices], samp.indices, n),
      denseval.interp(samp.df$X, full.pop, n, N + n, interp.n = 5),
      denseval.dirichlet(samp.df$X, full.pop, n, N + n),
      denseval.dirichlet2(samp.df$X, full.pop, n, N + n)
    )
  )
  dens.df <- bind_rows(dens.df, add.df)
}

dens.df <- dens.df %>%
  group_by(Method) %>%
  mutate(Density = exp(logDensity)) %>%
  mutate(Density = Density / sum(Density, na.rm = TRUE)) %>%
  mutate(logDensity = log(Density))

popDens.plot(samp.df, pop.df, dens.df)
```
```r
n <- 5
N <- 20
k <- 12
samp.df <- data.frame(X = seq(-4, 4, length.out = n))
pop.df <- data.frame(
  Pop = character(),
  X = numeric()
)
dens.df <- data.frame(
  Pop = character(),
  Method = character(),
  logDensity = numeric()
)
endpoints <- seq(2.2, 12.2, length.out = k)

for(i in 1:k) {
  X <- seq(-endpoints[i], 5, length.out = N)
  pop.df <- bind_rows(pop.df, data.frame(Pop = LETTERS[i], X = X))
  full.pop <- sort(c(X, samp.df$X))
  samp.indices <- which(full.pop %in% samp.df$X)
  Fn <- sapply(full.pop, function(x) sum(samp.df$X <= x)) / n
  FN <- 1:(N + n) / (N + n)
  Gn <- sqrt(n) * (Fn - FN)
  add.df <- data.frame(
    Pop = LETTERS[i],
    Method = c("Marginal Density", "Interp Density (k = 5)",
               "Dirichlet Density (unequal)", "Dirichlet Density (equal)"),
    logDensity = c(
      denseval.marginal(Gn[samp.indices], samp.indices, n),
      denseval.interp(samp.df$X, full.pop, n, N + n, interp.n = 5),
      denseval.dirichlet(samp.df$X, full.pop, n, N + n),
      denseval.dirichlet2(samp.df$X, full.pop, n, N + n)
    )
  )
  dens.df <- bind_rows(dens.df, add.df)
}

dens.df <- dens.df %>%
  group_by(Method) %>%
  mutate(Density = exp(logDensity)) %>%
  mutate(Density = Density / sum(Density, na.rm = TRUE)) %>%
  mutate(logDensity = log(Density))

popDens.plot(samp.df, pop.df, dens.df)
```
```r
n <- 5
N <- 20
k <- 12
samp.df <- data.frame(X = seq(-4, 4, length.out = n))
pop.df <- data.frame(
  Pop = character(),
  X = numeric()
)
dens.df <- data.frame(
  Pop = character(),
  Method = character(),
  logDensity = numeric()
)
endpoints <- seq(5.55, 5.55, length.out = k)

for(i in 1:k) {
  X <- seq(-endpoints[i], endpoints[i], length.out = N) + 0.35 * (i - 5)
  pop.df <- bind_rows(pop.df, data.frame(Pop = LETTERS[i], X = X))
  full.pop <- sort(c(X, samp.df$X))
  samp.indices <- which(full.pop %in% samp.df$X)
  Fn <- sapply(full.pop, function(x) sum(samp.df$X <= x)) / n
  FN <- 1:(N + n) / (N + n)
  Gn <- sqrt(n) * (Fn - FN)
  add.df <- data.frame(
    Pop = LETTERS[i],
    Method = c("Marginal Density", "Interp Density (k = 5)",
               "Dirichlet Density (unequal)", "Dirichlet Density (equal)"),
    logDensity = c(
      denseval.marginal(Gn[samp.indices], samp.indices, n),
      denseval.interp(samp.df$X, full.pop, n, N + n, interp.n = 5),
      denseval.dirichlet(samp.df$X, full.pop, n, N + n),
      denseval.dirichlet2(samp.df$X, full.pop, n, N + n)
    )
  )
  dens.df <- bind_rows(dens.df, add.df)
}

dens.df <- dens.df %>%
  group_by(Method) %>%
  mutate(Density = exp(logDensity)) %>%
  mutate(Density = Density / sum(Density, na.rm = TRUE)) %>%
  mutate(logDensity = log(Density))

popDens.plot(samp.df, pop.df, dens.df)
```
```r
denseval.dirichlet <- function(samp, pop, n, N) {
  x <- tabulate(findInterval(pop, samp, left.open = TRUE) + 1, nbin = n + 1)
  alpha <- x / N * (n + 1)
  xval <- c(.5 / n, rep(1 / n, n - 1), .5 / n)
  return(ddirichlet(xval, alpha, log = TRUE))
}

# Placeholder: sampling a population under the Dirichlet model is unfinished.
randpop.dirichlet <- function(n, N) {
  "hmm"
}
```
Let $k_i$ be the number of population members in the $i^{\text{th}}$ interval determined by the breakpoints $-\infty, \operatorname{sort}(x), \infty$.
\begin{align}
f(x) &= \frac{1}{B\left(\frac{(n+1)\mathbf{k}}{N}\right)} \left(\frac{1}{2n}\right)^{(n + 1) \frac{k_1}{N} - 1} \left(\frac{1}{n}\right)^{(n + 1) \frac{k_2}{N} - 1} \cdots \left(\frac{1}{n}\right)^{(n + 1) \frac{k_n}{N} - 1} \left(\frac{1}{2n}\right)^{(n + 1) \frac{k_{n+1}}{N} - 1}
\end{align}
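To connect the formula to the code, here is a sketch that evaluates this density directly from the counts $k_i$ and compares it against `ddirichlet`; `samp` and `pop` are small hypothetical inputs, not data from above.

```r
samp <- c(-0.5, 0.4)                   # hypothetical sample, n = 2
pop <- seq(-2, 2, length.out = 10)     # hypothetical population, N = 10
n <- length(samp); N <- length(pop)

k.counts <- tabulate(findInterval(pop, sort(samp), left.open = TRUE) + 1, nbins = n + 1)
alpha <- k.counts / N * (n + 1)
xval <- c(0.5 / n, rep(1 / n, n - 1), 0.5 / n)

# Dirichlet log density assembled by hand from the formula above
log.beta <- sum(lgamma(alpha)) - lgamma(sum(alpha))
sum((alpha - 1) * log(xval)) - log.beta
ddirichlet(xval, alpha, log = TRUE)    # should agree
```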