library(dplyr) library(tidyr) library(ggplot2) library(gridExtra) knitr::opts_chunk$set( cache = TRUE, collapse = TRUE, comment = "#>", eval = TRUE ) quick_eval <- FALSE
If $X_n$ is an ordered sample from a population $Y_N$ with $n < N$, then the average behavior of $Y_N \setminus X_n$ can be described by $Pr[y_{k} \in (z_i, z_{i+1})]\ = \frac{1}{n+1} \forall\ k \in n+1, \dots, N\ \wedge\ i \in 1,\dots,n+1$ where $Z = {-\infty, x_1, \dots, x_n, \infty}$.
iters <- 50000 N <- 500 n <- 20 orig.pop <- runif(N) storage <- matrix(NA, nrow = iters, ncol = n + 1) for(i in 1:iters) { ind <- sample(1:N, n) samp <- sort(orig.pop[ind]) others <- orig.pop[-ind] tabs <- table(findInterval(orig.pop, c(-Inf, samp, Inf), left.open = TRUE)) add1 <- as.numeric(tabs) names(add1) <- names(tabs) add2 <- rep(0, n + 1) names(add2) <- 1:(n + 1) add <- list(add1, add2) tab <- tapply(unlist(add), names(unlist(add)), sum) storage[i, ] <- tab[order(factor(names(tab), levels = 1:(length(samp) + 1)))] }
g1 <- data.frame( "Window" = 1:(n + 1), "Membership" = storage[1, ] ) %>% ggplot(mapping = aes(y = as.factor(Window), x = Membership)) + geom_bar(stat = "identity") + labs(x = "Membership", y = "Window") + theme_bw() g2 <- data.frame( "Window" = 1:(n + 1), "Mean" = colMeans(storage), "Q10" = apply(storage, 2, quantile, probs = 0.1), "Q90" = apply(storage, 2, quantile, probs = 0.9) ) %>% ggplot(mapping = aes(y = as.factor(Window), x = Mean)) + geom_bar(stat = "identity") + geom_errorbar(mapping = aes(xmin = Q10, xmax = Q90), color = "red") + labs(x = "Mean Membership", y = "Window") + theme_bw() grid.arrange(g1, g2, nrow = 1)
If we instead fix our sample, and create synthetic populations using the same distribution that generated our original population, does this property apply to the synthetic populations?
iters <- 50000 N <- 500 n <- 20 ind <- sample(1:N, n) samp <- sort(orig.pop[ind]) storage2 <- matrix(NA, nrow = iters + 1, ncol = n + 1) for(i in 1:iters) { pop <- runif(N - n) tabs <- table(findInterval(pop, c(-Inf, samp, Inf), left.open = TRUE)) add1 <- as.numeric(tabs) names(add1) <- names(tabs) add2 <- rep(0, n + 1) names(add2) <- 1:(n + 1) add <- list(add1, add2) tab <- tapply(unlist(add), names(unlist(add)), sum) storage2[i, ] <- tab[order(factor(names(tab), levels = 1:(length(samp) + 1)))] } pop <- orig.pop[-ind] tabs <- table(findInterval(pop, c(-Inf, samp, Inf), left.open = TRUE)) add1 <- as.numeric(tabs) names(add1) <- names(tabs) add2 <- rep(0, n + 1) names(add2) <- 1:(n + 1) add <- list(add1, add2) tab <- tapply(unlist(add), names(unlist(add)), sum) storage2[iters + 1, ] <- tab[order(factor(names(tab), levels = 1:(length(samp) + 1)))]
g1 <- data.frame( "Window" = 1:(n + 1), "Truth" = storage2[iters + 1, ], "Q10" = apply(storage2, 2, quantile, probs = 0.1), "Q90" = apply(storage2, 2, quantile, probs = 0.9), "Min" = apply(storage2, 2, min), "Max" = apply(storage2, 2, max) ) %>% ggplot(mapping = aes(y = as.factor(Window), x = Truth)) + geom_point(stat = "identity", mapping = aes(color = "Truth")) + geom_errorbar(mapping = aes(xmin = Min, xmax = Max, color = "Min, Max")) + geom_errorbar(mapping = aes(xmin = Q10, xmax = Q90, color = "Q10, Q90")) + scale_color_manual(name = NULL, values = c("grey", "red", "black")) + labs(x = "Membership", y = "Window") + theme_bw() grid.arrange(g1, nrow = 1)
data.frame( Y = sort(orig.pop), E = seq(1, N) / N ) %>% ggplot(mapping = aes(x = Y)) + stat_ecdf(pad = FALSE) + labs(y = "Cumulative Probability") + theme_bw()
Instead of synthesizing $Y_N \setminus X_n$, if we instead just use the true population distribution to create fully synthetic populations, how far do our empirical CDFs deviate from the truth?
nSynth <- 10000 storage3 <- matrix(runif(nSynth * N), nrow = nSynth, ncol = N) storage3 <- t(apply(storage3, 1, sort)) data.frame( "Min" = apply(storage3, 2, min), "Q10" = apply(storage3, 2, quantile, probs = 0.1), "Q20" = apply(storage3, 2, quantile, probs = 0.2), "Q30" = apply(storage3, 2, quantile, probs = 0.3), "Q40" = apply(storage3, 2, quantile, probs = 0.4), "Q50" = apply(storage3, 2, quantile, probs = 0.5), "Q60" = apply(storage3, 2, quantile, probs = 0.6), "Q70" = apply(storage3, 2, quantile, probs = 0.7), "Q80" = apply(storage3, 2, quantile, probs = 0.8), "Q90" = apply(storage3, 2, quantile, probs = 0.9), "Max" = apply(storage3, 2, max) ) %>% pivot_longer(Min:Max, names_to = "Quantile", values_to = "Value") %>% mutate(Quantile = factor(Quantile, levels = c("Min", paste0("Q", 1:9, "0"), "Max"))) %>% ggplot(mapping = aes(group = Quantile, x = Value, color = Quantile)) + stat_ecdf(pad = FALSE) + scale_color_discrete(name = NULL) + labs(x = "Y", y = "Cumulative Probability") + theme_bw()
For a different distribution (Normal),
nSynth <- 10000 storage4 <- matrix(rnorm(nSynth * N), nrow = nSynth, ncol = N) storage4 <- t(apply(storage3, 1, sort)) data.frame( "Min" = apply(storage4, 2, min), "Q10" = apply(storage4, 2, quantile, probs = 0.1), "Q20" = apply(storage4, 2, quantile, probs = 0.2), "Q30" = apply(storage4, 2, quantile, probs = 0.3), "Q40" = apply(storage4, 2, quantile, probs = 0.4), "Q50" = apply(storage4, 2, quantile, probs = 0.5), "Q60" = apply(storage4, 2, quantile, probs = 0.6), "Q70" = apply(storage4, 2, quantile, probs = 0.7), "Q80" = apply(storage4, 2, quantile, probs = 0.8), "Q90" = apply(storage4, 2, quantile, probs = 0.9), "Max" = apply(storage4, 2, max) ) %>% pivot_longer(Min:Max, names_to = "Quantile", values_to = "Value") %>% mutate(Quantile = factor(Quantile, levels = c("Min", paste0("Q", 1:9, "0"), "Max"))) %>% ggplot(mapping = aes(group = Quantile, x = Value, color = Quantile)) + stat_ecdf(pad = FALSE) + scale_color_discrete(name = NULL) + labs(x = "Y", y = "Cumulative Probability") + theme_bw()
We want to see similar spread, but we only have a sample to work with.
bounds <- data.frame( "Min" = apply(storage3, 2, min), "Max" = apply(storage3, 2, max) ) %>% pivot_longer(Min:Max, names_to = "Quantile", values_to = "Value") %>% mutate(Quantile = factor(Quantile, levels = c("Min", "Max"))) data.frame(Sample = samp) %>% ggplot(mapping = aes(x = Sample)) + stat_ecdf(pad = FALSE) + stat_ecdf(data = bounds, mapping = aes(group = Quantile, x = Value), color = "red", pad = FALSE)
Let $Y_N$ represent the population and $X_n$ represent a sample.
One option to gauge how representative a sample is to look at the membership within the population between each observed sample. Specifically, we will look at the quantity $P(X_{(n)} \le x_{(n)}, X_{(n-1)} \le x_{(n-1)}, \dots, X_{(1)} \le x_{(1)})$.
From the laws of conditional probability, we know that
\begin{align} P(X_{(n)} \le x_{(n)}, \dots, X_{(1)} \le x_{(1)}) &= P(X_{(n)} \le x_{(n)}, \dots, X_{(2)} \le x_{(2)} | X_{(1)} \le x_{(1)}) P(X_{(1)} \le x_{(1)}) \ &= P(X_{(n)} \le x_{(n)} | X_{(n-1)} \le x_{(n-1)}, \dots, X_{(1)} \le x_{(1)}) \cdots \ &\hspace{1cm} P(X_{(2)} \le x_{(2)} | X_{(1)} \le x_{(1)}) P(X_{(1)} \le x_{(1)}) \end{align}
Problem. This is maximized with a high sample (i.e., $x_{(1)} = , X_{(N-n+1)}\dots, x_{(n)} = X_{(N)}$).
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.