library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)

knitr::opts_chunk$set(
  cache = TRUE,
  collapse = TRUE,
  comment = "#>",
  eval = TRUE
)

quick_eval <- FALSE

Assumptions

Membership in windows is uniform

If $X_n$ is an ordered sample of size $n$ from a population $Y_N$ of size $N$ with $n < N$, then the average behavior of $Y_N \setminus X_n$ can be described by $\Pr[y_k \in (z_i, z_{i+1})] = \frac{1}{n+1}$ for all $k \in \{n+1, \dots, N\}$ and $i \in \{1, \dots, n+1\}$, where $Z = \{-\infty, x_1, \dots, x_n, \infty\}$.
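
An immediate consequence worth keeping in mind for the simulations below: each of the $N - n$ remaining points lands in any given window with probability $\frac{1}{n+1}$, so the expected membership of every window is $\frac{N - n}{n + 1}$. With the $N = 500$ and $n = 20$ used below, that is $480 / 21 \approx 22.86$.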

Simulation

iters <- 50000
N <- 500
n <- 20

orig.pop <- runif(N)

storage <- matrix(NA, nrow = iters, ncol = n + 1)

for(i in 1:iters) {
  ind <- sample(1:N, n)
  samp <- sort(orig.pop[ind])
  others <- orig.pop[-ind]

  # Count the non-sampled points (not the full population) in each of the
  # n + 1 windows; tabulate() keeps empty windows as zeros
  windows <- findInterval(others, c(-Inf, samp, Inf), left.open = TRUE)
  storage[i, ] <- tabulate(windows, nbins = n + 1)
}
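
As a quick sanity check (a sketch; assumes the chunk above has run), every row of storage should account for all $N - n$ remaining points, and the column means should sit near $(N - n) / (n + 1) \approx 22.86$:

stopifnot(all(rowSums(storage) == N - n))  # every point lands in exactly one window
round(colMeans(storage), 2)                # should hover around 22.86
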
g1 <- data.frame(
  "Window" = 1:(n + 1),
  "Membership" = storage[1, ]
) %>%
  ggplot(mapping = aes(y = as.factor(Window), x = Membership)) + 
  geom_bar(stat = "identity") + 
  labs(x = "Membership", y = "Window") + 
  theme_bw()

g2 <- data.frame(
  "Window" = 1:(n + 1),
  "Mean" = colMeans(storage),
  "Q10" = apply(storage, 2, quantile, probs = 0.1),
  "Q90" = apply(storage, 2, quantile, probs = 0.9)
) %>%
  ggplot(mapping = aes(y = as.factor(Window), x = Mean)) + 
  geom_bar(stat = "identity") + 
  geom_errorbar(mapping = aes(xmin = Q10, xmax = Q90), color = "red") + 
  labs(x = "Mean Membership", y = "Window") + 
  theme_bw()

grid.arrange(g1, g2, nrow = 1)

Reversibility

If we instead fix our sample and create synthetic populations using the same distribution that generated the original population, does this property carry over to the synthetic populations?

iters <- 50000
N <- 500
n <- 20

ind <- sample(1:N, n)
samp <- sort(orig.pop[ind])

storage2 <- matrix(NA, nrow = iters + 1, ncol = n + 1)

for(i in 1:iters) {
  pop <- runif(N - n)

  # Count the synthetic points in each of the n + 1 windows defined by
  # the fixed sample; tabulate() keeps empty windows as zeros
  windows <- findInterval(pop, c(-Inf, samp, Inf), left.open = TRUE)
  storage2[i, ] <- tabulate(windows, nbins = n + 1)
}

pop <- orig.pop[-ind]

# The "truth": window membership of the actual remaining population
windows <- findInterval(pop, c(-Inf, samp, Inf), left.open = TRUE)
storage2[iters + 1, ] <- tabulate(windows, nbins = n + 1)
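
Before plotting, one more quick check (a sketch; reuses the objects above): for each window, the fraction of synthetic populations whose membership falls strictly below the truth. Fractions near 0 or 1 would flag windows that the synthetic populations cover poorly.

# Per-window fraction of synthetic memberships strictly below the true membership
colMeans(sweep(storage2[1:iters, ], 2, storage2[iters + 1, ], `<`))
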
g1 <- data.frame(
  "Window" = 1:(n + 1),
  "Truth" = storage2[iters + 1, ],
  "Q10" = apply(storage2, 2, quantile, probs = 0.1),
  "Q90" = apply(storage2, 2, quantile, probs = 0.9),
  "Min" = apply(storage2, 2, min),
  "Max" = apply(storage2, 2, max)
) %>%
  ggplot(mapping = aes(y = as.factor(Window), x = Truth)) + 
  geom_point(stat = "identity", mapping = aes(color = "Truth")) + 
  geom_errorbar(mapping = aes(xmin = Min, xmax = Max, color = "Min, Max")) + 
  geom_errorbar(mapping = aes(xmin = Q10, xmax = Q90, color = "Q10, Q90")) + 
  scale_color_manual(name = NULL, values = c("grey", "red", "black")) + 
  labs(x = "Membership", y = "Window") + 
  theme_bw()

g1

Empirical CDFs of synthetic populations should be close to the true population's CDF

data.frame(Y = sort(orig.pop)) %>%
  ggplot(mapping = aes(x = Y)) + 
  stat_ecdf(pad = FALSE) + 
  labs(y = "Cumulative Probability") + 
  theme_bw()

Instead of synthesizing $Y_N \setminus X_n$, suppose we use the true population distribution to create fully synthetic populations. How far do the empirical CDFs deviate from the truth?

nSynth <- 10000

storage3 <- matrix(runif(nSynth * N), nrow = nSynth, ncol = N)
storage3 <- t(apply(storage3, 1, sort))

data.frame(
  "Min" = apply(storage3, 2, min),
  "Q10" = apply(storage3, 2, quantile, probs = 0.1),
  "Q20" = apply(storage3, 2, quantile, probs = 0.2),
  "Q30" = apply(storage3, 2, quantile, probs = 0.3),
  "Q40" = apply(storage3, 2, quantile, probs = 0.4),
  "Q50" = apply(storage3, 2, quantile, probs = 0.5),
  "Q60" = apply(storage3, 2, quantile, probs = 0.6),
  "Q70" = apply(storage3, 2, quantile, probs = 0.7),
  "Q80" = apply(storage3, 2, quantile, probs = 0.8),
  "Q90" = apply(storage3, 2, quantile, probs = 0.9),
  "Max" = apply(storage3, 2, max)
) %>%
  pivot_longer(Min:Max, names_to = "Quantile", values_to = "Value") %>%
  mutate(Quantile = factor(Quantile, levels = c("Min", paste0("Q", 1:9, "0"), "Max"))) %>%
  ggplot(mapping = aes(group = Quantile, x = Value, color = Quantile)) + 
  stat_ecdf(pad = FALSE) + 
  scale_color_discrete(name = NULL) + 
  labs(x = "Y", y = "Cumulative Probability") + 
  theme_bw()
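
To quantify that deviation (a sketch reusing storage3 from above), we can compute the Kolmogorov-style sup-distance between each synthetic ECDF and the true Uniform$(0, 1)$ CDF, $F(y) = y$. Because the rows of storage3 are already sorted, the ECDF steps from $(i-1)/N$ to $i/N$ at the $i$-th order statistic:

# Sup-distance between each synthetic population's ECDF and F(y) = y
ecdfDist <- apply(storage3, 1, function(y) {
  i <- seq_along(y)
  max(pmax(abs(i / N - y), abs((i - 1) / N - y)))
})
summary(ecdfDist)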

For a different distribution (Normal),

nSynth <- 10000

storage4 <- matrix(rnorm(nSynth * N), nrow = nSynth, ncol = N)
storage4 <- t(apply(storage4, 1, sort))

data.frame(
  "Min" = apply(storage4, 2, min),
  "Q10" = apply(storage4, 2, quantile, probs = 0.1),
  "Q20" = apply(storage4, 2, quantile, probs = 0.2),
  "Q30" = apply(storage4, 2, quantile, probs = 0.3),
  "Q40" = apply(storage4, 2, quantile, probs = 0.4),
  "Q50" = apply(storage4, 2, quantile, probs = 0.5),
  "Q60" = apply(storage4, 2, quantile, probs = 0.6),
  "Q70" = apply(storage4, 2, quantile, probs = 0.7),
  "Q80" = apply(storage4, 2, quantile, probs = 0.8),
  "Q90" = apply(storage4, 2, quantile, probs = 0.9),
  "Max" = apply(storage4, 2, max)
) %>%
  pivot_longer(Min:Max, names_to = "Quantile", values_to = "Value") %>%
  mutate(Quantile = factor(Quantile, levels = c("Min", paste0("Q", 1:9, "0"), "Max"))) %>%
  ggplot(mapping = aes(group = Quantile, x = Value, color = Quantile)) + 
  stat_ecdf(pad = FALSE) + 
  scale_color_discrete(name = NULL) + 
  labs(x = "Y", y = "Cumulative Probability") + 
  theme_bw()

We want to see similar spread, but we only have a sample to work with.

bounds <- data.frame(
  "Min" = apply(storage3, 2, min),
  "Max" = apply(storage3, 2, max)
) %>%
  pivot_longer(Min:Max, names_to = "Quantile", values_to = "Value") %>%
  mutate(Quantile = factor(Quantile, levels = c("Min", "Max")))

data.frame(Sample = samp) %>%
  ggplot(mapping = aes(x = Sample)) + 
  stat_ecdf(pad = FALSE) + 
  stat_ecdf(data = bounds, mapping = aes(group = Quantile, x = Value), color = "red", pad = FALSE) + 
  labs(x = "Y", y = "Cumulative Probability") + 
  theme_bw()
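
A rough numeric companion to this plot (a sketch; FnS, FnMin, and FnMax are illustrative names, not package objects): the fraction of sample points whose ECDF value lies inside the red min/max envelope.

# ECDFs of the sample and of the per-rank min/max envelope curves
FnS <- ecdf(samp)
FnMin <- ecdf(apply(storage3, 2, min))
FnMax <- ecdf(apply(storage3, 2, max))

# Fraction of sample points falling inside the envelope
mean(FnMax(samp) <= FnS(samp) & FnS(samp) <= FnMin(samp))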

Dave's Idea (December 2, 2021)

Let $Y_N$ represent the population and $X_n$ represent a sample.

One option for gauging how representative a sample is involves looking at the population membership between consecutive observed sample values. Specifically, we will look at the quantity $P(X_{(n)} \le x_{(n)}, X_{(n-1)} \le x_{(n-1)}, \dots, X_{(1)} \le x_{(1)})$.

From the laws of conditional probability, we know that

\begin{align}
P(X_{(n)} \le x_{(n)}, \dots, X_{(1)} \le x_{(1)}) &= P(X_{(n)} \le x_{(n)}, \dots, X_{(2)} \le x_{(2)} | X_{(1)} \le x_{(1)}) \, P(X_{(1)} \le x_{(1)}) \\
&= P(X_{(n)} \le x_{(n)} | X_{(n-1)} \le x_{(n-1)}, \dots, X_{(1)} \le x_{(1)}) \cdots \\
&\hspace{1cm} P(X_{(2)} \le x_{(2)} | X_{(1)} \le x_{(1)}) \, P(X_{(1)} \le x_{(1)})
\end{align}
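
A minimal Monte Carlo sketch of this joint probability for a Uniform$(0, 1)$ sample (jointOrderStatProb is a hypothetical helper written for illustration, not part of the package):

# Estimate P(X_(1) <= x_(1), ..., X_(n) <= x_(n)) by repeatedly drawing
# iid Uniform(0, 1) samples and checking every order statistic at once
jointOrderStatProb <- function(x, reps = 1e4) {
  x <- sort(x)
  mean(replicate(reps, all(sort(runif(length(x))) <= x)))
}
jointOrderStatProb(samp)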

Problem: this quantity is maximized by a high sample, i.e., one consisting of the largest population values, $x_{(1)} = y_{(N-n+1)}, \dots, x_{(n)} = y_{(N)}$.


