We are testing the spatial Polya-gamma linear model pg_stlm_mra()

## Set default figure dimensions for every knitr chunk in this vignette
knitr::opts_chunk$set(fig.width = 16, fig.height = 9)
## pgR supplies pg_stlm_mra(); BayesMRA supplies the MRA basis helpers;
## spam provides sparse-matrix support used by the precision matrices
library(pgR)
library(mvnfast)
# library(MCMCpack)
library(splines)
library(tidyverse)
library(patchwork)
library(BayesMRA)
library(spam)

simulate some data

## Simulation settings: N sites on a regular grid, J species (categories),
## M MRA resolutions, p covariates, and n_time time points.
set.seed(11)
N <- 30^2
J <- 6
M <- 3
p <- 2
n_time <- 10
## setup the spatial process: a regular sqrt(N) x sqrt(N) grid on [0, 1]^2
grid_1d <- seq(0, 1, length.out = sqrt(N))
locs <- as.matrix(expand.grid(grid_1d, grid_1d))

## generate the MRA basis
## multi-resolution Wendland basis over the observation locations with M
## resolutions; W is the N x sum(n_dims) basis matrix
MRA <- mra_wendland_2d(locs, M = M, n_coarse_grid = 8)
W <- MRA$W

## generate the MRA spatially-correlated random effects precision matrix
## tau2 is M x (J-1): per-resolution precisions scale like 2^resolution so
## finer resolutions contribute less marginal variance
tau2 <- sapply(1:(J-1), function(j) 100 * 2^(1:M) * rgamma(M, 1, 1))

## per-resolution precision structure (dependence parameter 0.999), then
## scaled by each species' tau2 to get a full precision matrix per species
Q_alpha <-  make_Q_alpha_2d(sqrt(MRA$n_dims), rep(0.999, length(MRA$n_dims)))
Q_alpha_tau2 <- vector(mode = "list", length = J-1)
for (j in 1:(J-1)) {
  Q_alpha_tau2[[j]] <- make_Q_alpha_tau2(Q_alpha, tau2[, j])
}

## define the sum-to-0 constraint for alpha
## NOTE(review): "resolution" constrains each resolution to sum to zero —
## confirm against BayesMRA::make_constraint() documentation
constraints <- make_constraint(MRA, constraint = "resolution", joint = TRUE)
A_constraint <- constraints$A_constraint
a_constraint <- constraints$a_constraint


## temporal autocorrelation parameter rho
## one AR(1) coefficient per species, near 1 (strong temporal persistence)
rho <- runif(J-1, 0.8, 1)

## simulate the spatial random effects
## time 1 is drawn under the sum-to-0 constraint; subsequent times follow
## an AR(1) evolution alpha_t = rho_j * alpha_{t-1} + innovation
alpha <- array(0, dim = c(sum(MRA$n_dims), J-1, n_time))
for (j in 1:(J-1)) {
  alpha[, j, 1] <- rmvnorm.prec.const(n = 1, mu = rep(0, sum(MRA$n_dims)), Q = Q_alpha_tau2[[j]], A = A_constraint, a = a_constraint)
}
## NOTE(review): only t = 1 is drawn with the constraint; later times use
## the unconstrained sampler — confirm this is intended
for (tt in 2:n_time) {
  for (j in 1:(J-1)) {
    alpha[, j, tt] <- rmvnorm.prec.const(n = 1, mu = rho[j] * alpha[, j, tt-1], Q = Q_alpha_tau2[[j]])
  }  
}
## project the basis coefficients onto the observation locations
W_alpha <- array(0, dim = c(N, J-1, n_time))
for (tt in 1:n_time) {
  W_alpha[, , tt] <- W %*% alpha[, , tt]
}

# check if first field is mean 0
apply(W_alpha, c(2, 3), sum)[, 1]

## setup the fixed effects process
## design matrix: intercept plus p uniform covariates
X <- cbind(1, matrix(runif(N*p), N, p))
beta <- matrix(rnorm((J-1) * ncol(X), 0, 0.25), ncol(X), (J-1))
## make the intercepts smaller to reduce stochastic ordering effect
beta[1, ] <- beta[1, ] - seq(from = 2, to = 0, length.out = J-1)

## add in some residual error
sigma2 <- rgamma(J-1, 1, 5)

## linear predictor eta = X beta + spatial effect + iid noise; eta_to_pi()
## maps it to the J-simplex and counts are multinomial with size 500
eta <- array(0, dim = c(N, J-1, n_time))
pi      <- array(0, dim = c(N, J, n_time), 
                 dimnames = list(site = 1:N, species = 1:J, time = 1:n_time))
Y_prop  <- array(0, dim = c(N, J, n_time), 
                 dimnames = list(site = 1:N, species = 1:J, time = 1:n_time))
Y      <- array(0, dim = c(N, J, n_time))
for (tt in 1:n_time) {
  eta[, , tt] <- X %*% beta + W_alpha[, , tt] + sapply(1:(J-1), function(j) rnorm(N, 0, sqrt(sigma2[j])))
  pi[, , tt] <- eta_to_pi(eta[, , tt])
  for (i in 1:N) {
    Y[i, , tt] <- rmultinom(1, 500, pi[i, , tt])
  }
  Y_prop[, , tt]     <- counts_to_proportions(Y[, , tt])
}

Plot the simulated data

## put data into data.frame for plotting
## long-format tables of proportions and probabilities, plus coordinates
dat_Y_prop <- as.data.frame.table(Y_prop, responseName = "Y_prop")
dat_pi     <- as.data.frame.table(pi, responseName = "pi")
dat_locs <- data.frame(
  lon     = locs[, 1],
  lat     = locs[, 2],
  site    = factor(seq_len(N))
)

## full outer join of the three tables on their shared key columns
dat_plot <- merge(
  merge(dat_Y_prop, dat_pi, all = TRUE),
  dat_locs,
  all = TRUE
)


## Raster maps of the simulated probabilities, faceted time x species
p_simulated <- ggplot(dat_plot, aes(x = lon, y = lat, fill = pi)) +
  geom_raster() +
  scale_fill_viridis_c() +
  facet_grid(time ~ species) +
  ggtitle("Simulated pi over time") +
  theme(legend.position = "none")
p_simulated

Spatial Polya-gamma multinomial regression

Below is a DAG for the model

## GGDags

## https://cran.r-project.org/web/packages/ggdag/vignettes/intro-to-ggdag.html

# install.packages("dagitty")
# install.packages("ggdag")
library(dagitty)
library(ggdag)
library(cowplot)
library(tidyverse)
library(latex2exp)

## set coordinates for dag
## set coordinates for dag
## node names here must match the variables used in dagify() below
coords <- tibble::tribble(
  ~name,        ~x,   ~y,
  "Y",          3,    1, 
  "eta",        2,    1,
  "locs",       2.5,  1.25,
  "tau2",       1.5,  0.75,
  "theta",      1.5,  1.25, 
  "beta",       1,    1,
  "X",          0,    1,
  "mu_beta",    0.5,  0.75,
  "Sigma_beta", 0.5,  1.25
)


## DAG for the model: Y depends on eta, which depends on the fixed effects
## and the spatial/temporal hyperparameters
dag <- dagify(
  Y ~ eta,
  eta ~ beta + locs + theta + tau2,
  beta ~ X + mu_beta + Sigma_beta,
  # exposure = "X",
  outcome = "Y",
  coords = coords
)



## classify each node (data / hyperparameter / prior / parameter) for
## coloring; arrange(name) fixes a stable (alphabetical) node order that
## the positional label vector in the plot below depends on
dag_tidy <- dag %>%
  tidy_dagitty(seed = 404) %>%
  arrange(name) %>%
  mutate(type = case_when(
    name %in% c("X", "Y", "locs") ~ "data",
    name %in% c("tau2", "theta") ~ "hyperparameter",
    name %in% c("mu_beta", "Sigma_beta") ~ "prior",
    TRUE ~ "parameter"
  ))

## manually rearrange the values
# dag %>%
#   tidy_dagitty(seed = 404) %>%
#   arrange(name)


## plot the DAG with LaTeX-rendered node labels
## NOTE(review): the label vector is positional and must match the
## alphabetical node order from arrange(name) above — re-check if any
## node is added or renamed
dag_tidy %>%
  ggplot(aes(x = x, y = y, xend = xend, yend = yend, color = type)) +
  geom_dag_point() +
  geom_dag_edges() +
  geom_dag_text(
    color = "black",
    label = c(
      TeX("$\\beta"),
      TeX("$\\eta"),
      "lat/lon",
      TeX("$\\mu_{\\beta}$"),
      TeX("$\\Sigma_{\\beta}$"),
      TeX("$\\tau^2"),
      TeX("$\\theta$"),
      "X",
      "Y"
    )
  ) +
  theme_dag() +
  scale_color_viridis_d(begin = 0.9, end = 0.4) +
  theme(legend.position = "bottom")

Let $\mathbf{y}_i = (y_{i, 1}, \ldots, y_{i, J})'$ be a $J$-dimensional vector of counts where $M_i = \sum_{j=1}^J y_{i, j}$ is the total count and $\boldsymbol{\pi}_i = ( \pi_{i, 1}, \ldots, \pi_{i, J})'$ is a vector of probabilities with $\sum_{j=1}^J \pi_{i, j} = 1$. Then, the likelihood of $\mathbf{y}_i$ is given by

\begin{align} [\mathbf{y}_i | M_i, \boldsymbol{\pi}_i] & = \frac{M_i!}{\prod_{j=1}^J y_{i, j}!} \pi_{i, 1}^{y_{i, 1}} \cdots \pi_{i, J}^{y_{i, J}} (\#eq:multinomial) \end{align}

The canonical multinomial regression uses a soft-max link function where the $J$-dimensional probabilities are modeled in $\mathcal{R}^{J-1}$, that is, in $J-1$ dimensions relative to a fixed reference category. Assigning latent variables $\boldsymbol{\eta}_i = (\eta_{i, 1}, \ldots, \eta_{i, J-1})'$, the softmax (multi-logit) function for $j = 1, \ldots, J-1$ is

\begin{align} \pi_{i, j} = \frac{e^{\eta_{i, j}}}{1 + \sum_{k=1}^{J-1} e^{\eta_{i, k}}} \end{align}

where this can be interpreted in an $\mathcal{R}^{J}$ dimensional space with $\eta_{i, J} \equiv 0$. Multinomial regression assumes that, given an $N \times q$-dimensional design matrix $\mathbf{X}$, for $j = 1, \ldots, J-1$ the latent parameter is $\eta_{i, j} = \mathbf{x}_i' \boldsymbol{\beta}_j$. After assigning each $j = 1, \ldots, J-1$ a $\operatorname{N}(\boldsymbol{\mu}_\beta, \boldsymbol{\Sigma}_\beta)$ prior, the posterior distribution is

\begin{align} [\boldsymbol{\beta} | \mathbf{y}] & \propto \prod_{i=1}^N [\mathbf{y}_i | \boldsymbol{\beta}] \prod_{j=1}^{J-1} [\boldsymbol{\beta}_j]. \end{align}

The difficulty in evaluating the above posterior is that the distribution is not available in closed form and sampling requires a Metropolis-Hastings update (or some other non-conjugate sampler). This motivates the following data augmentation scheme.

The multinomial likelihood as a product of binomial distributions.

We can re-write the multinomial distribution in \@ref(eq:multinomial) as a recursive product of $J-1$ binomial distributions

\begin{align} [\mathbf{y}_i | M_i, \boldsymbol{\pi}_i] & = \operatorname{Mult} \left(M_i, \boldsymbol{\pi}_i \right) \\ & = \prod_{j=1}^{J-1} \operatorname{Binomial} \left( y_{i, j} \middle| \widetilde{M}_{i, j}, \widetilde{\pi}_{i, j} \right) \\ & = \prod_{j=1}^{J-1} \binom{\widetilde{M}_{i, j}}{y_{i, j}} \widetilde{\pi}_{i, j}^{y_{i, j}} (1 - \widetilde{\pi}_{i, j})^{\widetilde{M}_{i, j} - y_{i, j}} \end{align}

where

\begin{align} \widetilde{M}_{i, j} & = \begin{cases} M_i & \mbox{ if } j = 1 \\ M_i - \sum_{k < j} y_{i, k} & \mbox{ if } 1 < j \leq J - 1 \end{cases} \end{align}

and the transformed (conditional) probabilities $\widetilde{\pi}_{i, j}$ recursively defined by

\begin{align} \widetilde{\pi}_{i, j} & = \begin{cases} \pi_{i, 1} & \mbox{ if } j = 1 \\ \frac{\pi_{i, j}}{1 - \sum_{k < j} \pi_{i, k}} & \mbox{ if } 1 < j \leq J - 1 \end{cases} \end{align}

where the stick-breaking transformation $\pi_{SB} \left( \boldsymbol{\eta}_{i} \right)$ maps the $J-1$ dimensional vector $\boldsymbol{\eta}_i$ over $\mathcal{R}^{J-1}$ to the $J$-dimensional unit simplex by

\begin{align} \pi_{SB} \left( \eta_{i, j} \right) = \frac{e^{ \eta_{i, j}} }{ \prod_{k \leq j} \left( 1 + e^{ \eta_{i, k} } \right) }. \end{align}

Pólya-gamma data augmentation

The key idea for the Pólya-gamma data augmentation is that the multinomial likelihood can be written as

\begin{align} [\mathbf{y}_i | \boldsymbol{\eta}_i] & = \prod_{j=1}^{J-1} \binom{\widetilde{M}_{i, j}}{y_{i, j}} \widetilde{\pi}_{i, j}^{y_{i, j}} (1 - \widetilde{\pi}_{i, j})^{\widetilde{M}_{i, j} - y_{i, j}} \nonumber \\ & \propto \prod_{j=1}^{J-1} \frac{ (e^{\eta_{i, j}})^{a_{i, j}} }{(1 + e^{\eta_{i, j}})^{b_{i, j}}} (\#eq:likelihood) \end{align}

where $\widetilde{\pi}_{i, j} = \frac{e^{\eta_{i, j}}}{1 + e^{\eta_{i, j}}}$ for some latent variable $\eta_{i, j}$ on the real line, $a_{i, j} = y_{i, j}$, and $b_{i, j} = \widetilde{M}_{i, j}$. Then, applying the identity [@polson2013bayesian]

\begin{align} \frac{\left( e^{\eta_{i, j}} \right)^{y_{i, j}} }{ \left( 1 + e^{\eta_{i, j}} \right)^{\widetilde{M}_{i, j}} } & = 2^{-\widetilde{M}_{i, j}} e^{\kappa_{i, j} \eta_{i, j}} \int_0^\infty e^{- \omega_{i, j} \eta_{i, j}^2 / 2} \left[\omega_{i, j} | \widetilde{M}_{i, j}, 0 \right] \,d\omega_{i, j} (\#eq:pg-identity) \end{align}

where $\kappa \left( y_{i, j} \right) = y_{i, j} - \widetilde{M}_{i, j} / 2$. Equation \@ref(eq:pg-identity) allows for the expression of the likelihood in \@ref(eq:likelihood) as an infinite convolution over the density $\left[\omega_{i, j} | \widetilde{M}_{i, j}, 0 \right]$, which is the probability density function of a Pólya-gamma random variable $\operatorname{PG} \left(\widetilde{M}_{i, j}, 0 \right)$, and a component $e^{- \omega_{i, j} \eta_{i, j}^2 / 2}$ which is proportional to the kernel of a Gaussian density with precision $\omega_{i, j}$. We make the assumption that for all $i$ and $j$, $\omega_{i, j} \stackrel{iid}{\sim} \operatorname{PG} \left(\widetilde{M}_{i, j}, 0 \right)$. Therefore, we can express a multinomial likelihood as an infinite convolution of a Gaussian random variable with a Pólya-gamma density. After defining a prior $[\boldsymbol{\eta}] = \prod_{i=1}^N \prod_{j=1}^{J-1} [\eta_{i, j} | \boldsymbol{\eta}_{-i, -j}]$, where $\boldsymbol{\eta}_{-i, -j}$ is all of the elements of $\boldsymbol{\eta}$ except the $ij$th element, we can write the joint distribution $[\mathbf{y}, \boldsymbol{\eta}]$ as

(TODO: work on this notation.) \begin{align} [\mathbf{y}, \boldsymbol{\eta}] & \propto \prod_{i=1}^N \prod_{j=1}^{J-1} \frac{ (e^{\eta_{i, j}})^{y_{i, j}} }{(1 + e^{\eta_{i, j}})^{\widetilde{M}_{i, j}}} [\boldsymbol{\eta}] \nonumber \\ & \propto \prod_{i=1}^N \prod_{j=1}^{J-1} 2^{-\widetilde{M}_{i, j}} e^{\kappa(y_{i, j}) \eta_{i, j}} \int_0^\infty e^{- \omega_{i, j} \eta_{i, j}^2 / 2} \left[\omega_{i, j} | \widetilde{M}_{i, j}, 0 \right] \,d\omega_{i, j} [\boldsymbol{\eta}] \nonumber \\ & \propto \prod_{i=1}^N \prod_{j=1}^{J-1} \int_0^\infty [\eta_{i, j} | \boldsymbol{\eta}_{-i, -j}] 2^{-\widetilde{M}_{i, j}} e^{\kappa(y_{i, j}) \eta_{i, j}} e^{- \omega_{i, j} \eta_{i, j}^2 / 2} \left[\omega_{i, j} | \widetilde{M}_{i, j}, 0 \right] \,d\omega_{i, j} \nonumber \\ & \propto \prod_{i=1}^N \prod_{j=1}^{J-1} \int_0^\infty [\mathbf{y}, \eta_{i, j}, \omega_{i, j} | \boldsymbol{\eta}_{-i, -j}] \,d\omega_{i, j} \nonumber \\ & \propto \int_0^\infty [\mathbf{y}, \boldsymbol{\eta}, \boldsymbol{\omega}] \,d\boldsymbol{\omega} (\#eq:da) \end{align}

where $[\mathbf{y}, \boldsymbol{\eta}, \boldsymbol{\omega}]$ is a joint density over the data augmented likelihood. When the prior on $\boldsymbol{\eta}$ is Gaussian, the marginal density $[\boldsymbol{\eta} | \mathbf{y}, \boldsymbol{\omega}] \propto \prod_{i=1}^N \prod_{j=1}^{J-1} e^{\kappa(y_{i, j}) \eta_{i, j}} e^{- \omega_{i, j} \eta_{i, j}^2 / 2} [\boldsymbol{\eta}]$ induced by the integrand in \@ref(eq:da) is also Gaussian. In addition, the exponential tilting property of the Pólya-gamma distribution [@polson2013bayesian] gives the conditional distribution

\begin{align} [\omega_{i, j} | \mathbf{y}, \boldsymbol{\eta}] & \sim \operatorname{PG}(\widetilde{M}_{i, j}, \eta_{i, j}) \end{align}

Pólya-gamma regression

To perform regression on the multinomial vector given an $N \times q$ design matrix $\mathbf{X}$, we assume that $\eta_{i, j} = \mathbf{x}_i' \boldsymbol{\beta}_j$ and $\boldsymbol{\beta}_j \sim \operatorname{N}(\boldsymbol{\mu}_\beta, \boldsymbol{\Sigma}_\beta)$.

Defining $\boldsymbol{\Omega}_i = \operatorname{diag}(\omega_{i, 1}, \ldots, \omega_{i, J-1})$, we can calculate the full conditional distributions.

Full Conditionals

Full Conditional for $\boldsymbol{\beta}_{j}$

\begin{align} [\boldsymbol{\beta}_{j} | \mathbf{y}, \boldsymbol{\omega}] & \propto \prod_{i=1}^N \operatorname{N} \left( \boldsymbol{\Omega}_i^{-1} \kappa \left( \mathbf{y}_{i} \right) \middle| \mathbf{x}_i' \boldsymbol{\beta}_{j}, \boldsymbol{\Omega}_i^{-1} \right) \operatorname{N} \left( \boldsymbol{\beta}_{j} | \boldsymbol{\mu}_{\beta}, \boldsymbol{\Sigma}_{\beta} \right) \\ & \propto \operatorname{N} \left( \boldsymbol{\beta}_{j} | \tilde{\boldsymbol{\mu}}_j, \tilde{\boldsymbol{\Sigma}}_j \right) \end{align}

where

\begin{align} \tilde{\boldsymbol{\mu}}_j & = \tilde{\boldsymbol{\Sigma}}_j \left( \boldsymbol{\Sigma}_{\beta}^{-1} \boldsymbol{\mu}_{\beta} + \sum_{i=1}^N \mathbf{x}_i' \kappa \left( y_{i, j} \right) \right), \mbox{ and } \\ \tilde{\boldsymbol{\Sigma}}_j & = \left( \boldsymbol{\Sigma}_{\beta}^{-1} + \sum_{i=1}^N \mathbf{x}_i' \omega_{i, j} \mathbf{x}_i \right)^{-1} \end{align}

where $\mathbf{x}_i$ is the $i$th row of $\mathbf{X}$.

Full Conditional for $\omega_{i, j}$

If $\widetilde{M}_{i, j} = 0$, then $\omega_{i, j} | \mathbf{y}, \boldsymbol{\beta} \equiv 0$. Otherwise, for $\widetilde{M}_{i, j} > 0$, we have

\begin{align} [\omega_{i, j} | \mathbf{y}, \boldsymbol{\beta}] & \propto \frac{e^{- \frac{1}{2} \omega_{i, j} \left( \mathbf{x}_i' \boldsymbol{\beta}_j \right)^2} [\omega_{i, j}]}{\int_{0}^{\infty} e^{- \frac{1}{2} \omega_{i, j} \left( \mathbf{x}_i' \boldsymbol{\beta}_j \right)^2} [\omega_{i, j}] \,d\omega_{i, j}} \end{align}

which is $\operatorname{PG} \left( \widetilde{M}_{i, j}, \eta_{i, j} \equiv \mathbf{x}_i' \boldsymbol{\beta}_j \right)$ by the exponential tilting property of the Pólya-gamma distribution.

## MCMC settings: 500 adaptation + 500 sampling iterations, no thinning,
## progress message every 50 iterations
params <- default_params()
params$n_adapt <- 500
params$n_mcmc <- 500
params$n_message <- 50
params$n_thin <- 1
priors <- default_priors_pg_stlm(Y, X)
inits  <- default_inits_pg_stlm(Y, X, priors, shared_covariance_params = TRUE)

## fit the model on the full data, caching the result to disk so that
## re-rendering the vignette skips the (slow) MCMC
if (file.exists(here::here("results", "pg_stlm-mra.RData"))) {
  load(here::here("results", "pg_stlm-mra.RData"))
} else {
  start <- Sys.time()
  out <- pg_stlm_mra(Y, as.matrix(X), as.matrix(locs), params, priors, M = 3, n_coarse_grid = 6, n_cores = 1L)
  stop <- Sys.time()
  runtime <- stop - start

  save(out, runtime, file = here::here("results", "pg_stlm-mra.RData"))
  # cell phone notification when done
  pushoverr::pushover(message = "Finished fitting pg_stlm_mra model")

}
## Trace plots of the regression coefficients, one panel per species,
## with horizontal lines at the simulated (true) values.
## Generalized from the hard-coded `layout(matrix(1:6, 3, 2))` / `1:5`
## so the panel count tracks J - 1 species.
layout(matrix(seq_len(2 * ceiling((J - 1) / 2)), ncol = 2))
for (i in seq_len(J - 1)) {
  matplot(out$beta[, , i], type = 'l', main = paste("species", i))
  abline(h = beta[, i], col = 1:nrow(beta))
}

## Posterior mean of beta vs. simulated beta; points near the 1:1 line
## indicate good recovery
plot(apply(out$beta, c(2, 3), mean), beta)
abline(0, 1, col = "red")
## plot eta estimates
## Posterior mean and 95% credible interval of eta, compared against the
## simulated truth. A small helper replaces the triplicated
## apply + dimnames boilerplate.

## Attach (site, species, time) dimnames so as.data.frame.table() yields
## usable factor columns
label_eta_dims <- function(a) {
  dimnames(a) <- list(site = 1:N, species = 1:(J-1), time = 1:n_time)
  a
}

eta_post_mean  <- label_eta_dims(apply(out$eta, c(2, 3, 4), mean))
eta_post_lower <- label_eta_dims(apply(out$eta, c(2, 3, 4), quantile, prob = 0.025))
eta_post_upper <- label_eta_dims(apply(out$eta, c(2, 3, 4), quantile, prob = 0.975))
eta <- label_eta_dims(eta)

dat_eta            <- as.data.frame.table(eta, responseName = "truth")
dat_eta_post_mean  <- as.data.frame.table(eta_post_mean, responseName = "mean")
dat_eta_post_lower <- as.data.frame.table(eta_post_lower, responseName = "lower")
dat_eta_post_upper <- as.data.frame.table(eta_post_upper, responseName = "upper")

## join truth and posterior summaries on (site, species, time)
dat_plot <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  list(dat_eta, dat_eta_post_mean, dat_eta_post_lower, dat_eta_post_upper)
)

## truth vs. estimate with 95% intervals; the red 1:1 line marks perfect
## recovery
dat_plot %>% 
  ggplot(aes(x = truth, y = mean, color = species)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  geom_errorbar(aes(ymin = lower, ymax = upper)) +
  facet_grid(time ~ species) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Estimated vs. simulated eta")
## plot pi estimates
## posterior mean of pi at each (site, species, time)
pi_post_mean <- apply(out$pi, c(2, 3, 4), mean)
dimnames(pi_post_mean) <- list(
  site = 1:N,
  species = 1:J,
  time = 1:n_time
)

dimnames(pi) <- list(
  site = 1:N,
  species = 1:J,
  time = 1:n_time
)
dat_pi           <- as.data.frame.table(pi, responseName = "truth")
dat_pi_post_mean <- as.data.frame.table(pi_post_mean, responseName = "mean")

# dat_plot <- Reduce(
#   function(x, y) merge(x, y, all = TRUE), 
#   list(dat_eta, dat_eta_post_mean)
# )
## join truth and posterior mean on (site, species, time)
dat_plot <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  list(dat_pi, dat_pi_post_mean)
)

## truth vs. estimate; points near the red 1:1 line indicate good recovery
dat_plot %>% 
  ggplot(aes(x = truth, y = mean, color = species)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  facet_grid(time ~ species) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Estimated vs. simulated pi")
## plot beta estimates
## long-format table of posterior-mean and true beta
## NOTE(review): `variable = 1:ncol(X)` is recycled over the
## 2 * (J-1) * ncol(X) rows — relies on ncol(X) dividing the length evenly
dat_plot <- data.frame(
  beta = c(
    c(apply(out$beta, c(2, 3), mean)), 
    c(beta)
  ),
  type = rep(c("estimate", "truth"), each = (J-1) * ncol(X)),
  species = factor(rep(1:(J-1), each = ncol(X))),
  variable = 1:ncol(X)
)

## widen to estimate/truth columns and plot one panel per species
dat_plot %>%
  pivot_wider(names_from = type, values_from = beta) %>%
  ggplot(aes(x = estimate, y = truth)) +
  # scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ species, nrow = 8) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Estimated vs. simulated beta")

predictions

set.seed(111)
## sample n of the N grid sites for model fitting; the remaining sites are
## held out for out-of-sample (oos) spatial prediction
n <- 400
s <- sample(N, n)
Y_s      <- Y[s, , ]
Y_oos    <- Y[-s, , ]
X_s      <- X[s, ]
X_oos    <- X[-s, ]
eta_s    <- eta[s, , ]
eta_oos  <- eta[-s, , ]
pi_s     <- pi[s, , ]
pi_oos   <- pi[-s, , ]
locs_s   <- locs[s, ]
locs_oos <- locs[-s, ]
Y_prop_s   <- Y_prop[s, , ]
Y_prop_oos <- Y_prop[-s, , ]


# ## put data into data.frame for plotting
# dat <- data.frame(
#   Y       = c(Y_prop_s),
#   lon     = c(locs_s[, 1]),
#   lat     = c(locs_s[, 2]),
#   species = factor(rep(1:J, each = n)), 
#   pi      = c(pi_s)
# )
# p_observed <- ggplot(data = dat, aes(x = lon, y = lat, fill = pi)) +
#   geom_raster() +
#   scale_fill_viridis_c() +
#   facet_wrap(time ~ species) +
#   ggtitle("Sampled simulated probabilities") +
#   theme(legend.position = "none")
# p_observed
## put data into data.frame for plotting (observed/sampled sites only)
dat_Y_prop <- as.data.frame.table(Y_prop_s, responseName = "Y_prop")
dat_pi     <- as.data.frame.table(pi_s, responseName = "pi")
dat_locs <- data.frame(
  lon     = locs_s[, 1],
  lat     = locs_s[, 2],
  site    = factor((1:N)[s])
)

## full outer join of the three tables on their shared key columns
dat_plot <- merge(
  merge(dat_Y_prop, dat_pi, all = TRUE),
  dat_locs,
  all = TRUE
)

## Raster maps of simulated pi at the sampled locations
p_sampled <- ggplot(dat_plot, aes(x = lon, y = lat, fill = pi)) +
  geom_raster() +
  scale_fill_viridis_c() +
  facet_grid(time ~ species) +
  ggtitle("Simulated pi at observed locations over time") +
  theme(legend.position = "none")
p_sampled
## MCMC settings for the sampled-site fit (same as the full-data fit)
params <- default_params()
params$n_adapt <- 500
params$n_mcmc <- 500
params$n_message <- 50
params$n_thin <- 1
priors <- default_priors_pg_stlm(Y_s, X_s)

## fit on the n sampled sites only, caching the result to disk
if (file.exists(here::here("results", "pg_stlm-mra-sample.RData"))) {
  load(here::here("results", "pg_stlm-mra-sample.RData"))
} else {
  start <- Sys.time()
  out <- pg_stlm_mra(Y_s, as.matrix(X_s), as.matrix(locs_s), params, priors, M = 3, n_coarse_grid = 6, n_cores = 1L)
  stop <- Sys.time()
  runtime <- stop - start

  save(out, runtime, file = here::here("results", "pg_stlm-mra-sample.RData"))

  # cell phone notification when done
  pushoverr::pushover(message = "Finished fitting sampled pg_stlm_mra model")
}
## Trace plots for the sampled-site fit, one panel per species, plus
## recovery checks for beta and sigma2.
## Generalized from the hard-coded `layout(matrix(1:6, 3, 2))` / `1:5`
## so the panel count tracks J - 1 species.
layout(matrix(seq_len(2 * ceiling((J - 1) / 2)), ncol = 2))
for (i in seq_len(J - 1)) {
  matplot(out$beta[, , i], type = 'l', main = paste("species", i))
  abline(h = beta[, i], col = 1:nrow(beta))
}

## posterior means vs. simulated truths (1:1 line = perfect recovery)
plot(apply(out$beta, c(2, 3), mean), beta)
abline(0, 1, col = "red")
plot(apply(out$sigma2, 2, mean), sigma2)
abline(0, 1, col = "red")
## plot eta estimates
## Posterior mean and 95% credible interval of eta for the sampled-site
## fit. A small helper replaces the triplicated apply + dimnames
## boilerplate.

## Attach (site, species, time) dimnames for the n sampled sites
label_eta_dims_s <- function(a) {
  dimnames(a) <- list(site = 1:n, species = 1:(J-1), time = 1:n_time)
  a
}

eta_post_mean  <- label_eta_dims_s(apply(out$eta, c(2, 3, 4), mean))
eta_post_lower <- label_eta_dims_s(apply(out$eta, c(2, 3, 4), quantile, prob = 0.025))
eta_post_upper <- label_eta_dims_s(apply(out$eta, c(2, 3, 4), quantile, prob = 0.975))
eta_s <- label_eta_dims_s(eta_s)

dat_eta            <- as.data.frame.table(eta_s, responseName = "truth")
dat_eta_post_mean  <- as.data.frame.table(eta_post_mean, responseName = "mean")
dat_eta_post_lower <- as.data.frame.table(eta_post_lower, responseName = "lower")
dat_eta_post_upper <- as.data.frame.table(eta_post_upper, responseName = "upper")

## join truth and posterior summaries on (site, species, time)
dat_plot <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  list(dat_eta, dat_eta_post_mean, dat_eta_post_lower, dat_eta_post_upper)
)

## truth vs. estimate with 95% intervals; the red 1:1 line marks perfect
## recovery
dat_plot %>% 
  ggplot(aes(x = truth, y = mean, color = species)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  geom_errorbar(aes(ymin = lower, ymax = upper)) +
  facet_grid(time ~ species) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Estimated vs. simulated eta")
## plot pi estimates
## posterior mean of pi at the n sampled sites
pi_post_mean <- apply(out$pi, c(2, 3, 4), mean)
dimnames(pi_post_mean) <- list(
  site = 1:n,
  species = 1:J,
  time = 1:n_time
)

dimnames(pi_s) <- list(
  site = 1:n,
  species = 1:J,
  time = 1:n_time
)
dat_pi           <- as.data.frame.table(pi_s, responseName = "truth")
dat_pi_post_mean <- as.data.frame.table(pi_post_mean, responseName = "mean")

# dat_plot <- Reduce(
#   function(x, y) merge(x, y, all = TRUE), 
#   list(dat_eta, dat_eta_post_mean)
# )
## join truth and posterior mean on (site, species, time)
dat_plot <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  list(dat_pi, dat_pi_post_mean)
)

dat_plot %>% 
  ggplot(aes(x = truth, y = mean, color = species)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  facet_grid(time ~ species) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Estimated vs. simulated pi")
## report the MCMC wall-clock time for the sampled-site fit
runtime


## plot beta estimates
## long-format table of posterior-mean and true beta for the sampled fit
## NOTE(review): `variable = 1:ncol(X)` is recycled over the rows —
## relies on ncol(X) dividing the length evenly
dat_plot <- data.frame(
  beta = c(
    c(apply(out$beta, c(2, 3), mean)), 
    c(beta)
  ),
  type = rep(c("estimate", "truth"), each = (J-1) * ncol(X)),
  species = factor(rep(1:(J-1), each = ncol(X))),
  variable = 1:ncol(X)
)

## widen to estimate/truth columns and plot one panel per species
dat_plot %>%
  pivot_wider(names_from = type, values_from = beta) %>%
  ggplot(aes(x = estimate, y = truth)) +
  # scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ species, nrow = 8) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Estimated vs. simulated beta")
## out-of-sample prediction at the N - n held-out locations, cached to disk
if (file.exists(here::here("results", "pg-stlm-mra-sample-preds.RData"))) {
  load(here::here("results", "pg-stlm-mra-sample-preds.RData"))
} else {
  ## now this is super fast
  preds <- predict_pg_stlm_mra(out, X_s, X_oos, locs_s, locs_oos)
  save(preds, file = here::here("results", "pg-stlm-mra-sample-preds.RData"))
}
## plot predicted eta estimates
## posterior predictive mean and 95% interval of eta at the held-out sites
eta_pred_mean <- apply(preds$eta, c(2, 3, 4), mean)
dimnames(eta_pred_mean) <- list(
  site = 1:(N-n),
  species = 1:(J-1),
  time = 1:n_time
)
eta_pred_lower <- apply(preds$eta, c(2, 3, 4), quantile, prob = 0.025)
dimnames(eta_pred_lower) <- list(
  site = 1:(N-n),
  species = 1:(J-1),
  time = 1:n_time
)
eta_pred_upper <- apply(preds$eta, c(2, 3, 4), quantile, prob = 0.975)
dimnames(eta_pred_upper) <- list(
  site = 1:(N-n),
  species = 1:(J-1),
  time = 1:n_time
)
dimnames(eta_oos) <- list(
  site = 1:(N-n),
  species = 1:(J-1),
  time = 1:n_time
)
dat_eta_oos           <- as.data.frame.table(eta_oos, responseName = "truth")
dat_eta_pred_mean <- as.data.frame.table(eta_pred_mean, responseName = "mean")
dat_eta_pred_lower <- as.data.frame.table(eta_pred_lower, responseName = "lower")
dat_eta_pred_upper <- as.data.frame.table(eta_pred_upper, responseName = "upper")

## join truth and prediction summaries on (site, species, time)
dat_plot <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  list(dat_eta_oos, dat_eta_pred_mean, dat_eta_pred_lower, dat_eta_pred_upper)
)

## predicted vs. simulated eta with 95% intervals (red 1:1 line)
dat_plot %>% 
  ggplot(aes(x = truth, y = mean, color = species)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  geom_errorbar(aes(ymin = lower, ymax = upper)) +
  facet_grid(time ~ species) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Spatial predicted vs. simulated eta")
## plot predicted pi
## NOTE(review): this is a plug-in estimate — eta_to_pi() applied to the
## posterior mean of eta — not the posterior mean of preds$pi (which is
## used further below); confirm which summary is intended
pi_pred_mean <- array(0, dim = c(N-n, J, n_time))
for (tt in 1:n_time) {
  pi_pred_mean[, , tt] <- eta_to_pi(eta_pred_mean[, , tt])
}
dimnames(pi_pred_mean) <- list(
  site = 1:(N-n),
  species = 1:J,
  time = 1:n_time
)

dimnames(pi_oos) <- list(
  site = 1:(N-n),
  species = 1:J,
  time = 1:n_time
)
dat_pi_oos       <- as.data.frame.table(pi_oos, responseName = "truth")
dat_pi_pred_mean <- as.data.frame.table(pi_pred_mean, responseName = "mean")

# dat_plot <- Reduce(
#   function(x, y) merge(x, y, all = TRUE), 
#   list(dat_eta, dat_eta_pred_mean)
# )
## join truth and predictions on (site, species, time)
dat_plot <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  list(dat_pi_oos, dat_pi_pred_mean)
)

dat_plot %>% 
  ggplot(aes(x = truth, y = mean, color = species)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  facet_grid(time ~ species) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Spatial prediction vs. simulated pi")
##
## prediction mean
##
## posterior mean of pi for in-sample (fitted) and out-of-sample
## (predicted) sites, stacked into one table flagged by in_sample
pi_post_mean <- apply(out$pi, c(2, 3, 4), mean)
dimnames(pi_post_mean) <- list(
  site = 1:n,
  species = 1:J,
  time = 1:n_time
)
pi_pred_mean <- apply(preds$pi, c(2, 3, 4), mean)
dimnames(pi_pred_mean) <- list(
  site = 1:(N-n),
  species = 1:J,
  time = 1:n_time
)

dat_pi_mean_s   <- as.data.frame.table(pi_post_mean, responseName = "mean")
dat_pi_mean_oos <- as.data.frame.table(pi_pred_mean, responseName = "mean")
dat_pi_mean_s$in_sample <- TRUE
dat_pi_mean_oos$in_sample <- FALSE
dat_pi_mean_full <- rbind(dat_pi_mean_s, dat_pi_mean_oos)

##
## prediction variance
##
## same construction for the posterior variance of pi

pi_post_var <- apply(out$pi, c(2, 3, 4), var)
dimnames(pi_post_var) <- list(
  site = 1:n,
  species = 1:J,
  time = 1:n_time
)
pi_pred_var <- apply(preds$pi, c(2, 3, 4), var)
dimnames(pi_pred_var) <- list(
  site = 1:(N-n),
  species = 1:J,
  time = 1:n_time
)

dat_pi_var_s   <- as.data.frame.table(pi_post_var, responseName = "var")
dat_pi_var_oos <- as.data.frame.table(pi_pred_var, responseName = "var")
dat_pi_var_s$in_sample <- TRUE
dat_pi_var_oos$in_sample <- FALSE
dat_pi_var_full <- rbind(dat_pi_var_s, dat_pi_var_oos)


## Location tables for in-sample and out-of-sample sites, tagged with an
## in_sample flag so the combined maps can distinguish them
dat_locs_s <- data.frame(
  lon       = locs_s[, 1],
  lat       = locs_s[, 2],
  site      = factor(seq_len(n)),
  in_sample = TRUE
)
dat_locs_oos <- data.frame(
  lon       = locs_oos[, 1],
  lat       = locs_oos[, 2],
  site      = factor(seq_len(N - n)),
  in_sample = FALSE
)
dat_locs_full <- rbind(dat_locs_s, dat_locs_oos)

## full outer join of means, variances, and locations
dat_plot <- merge(
  merge(dat_pi_mean_full, dat_pi_var_full, all = TRUE),
  dat_locs_full,
  all = TRUE
)


## Maps of the posterior mean and variance of pi across all sites (fitted
## and predicted), displayed alongside the simulated and sampled maps.
## Fixes the "Esimated" -> "Estimated" typo in both plot titles.
p_mean <- dat_plot %>%
  ggplot(aes(x = lon, y = lat, fill = mean)) +
  geom_raster() +
  scale_fill_viridis_c() +
  facet_grid(time ~ species) +
  ggtitle("Estimated mean pi over time") +
  theme(legend.position = "none")

p_var <- dat_plot %>%
  ggplot(aes(x = lon, y = lat, fill = var)) +
  geom_raster() +
  scale_fill_viridis_c() +
  facet_grid(time ~ species) +
  ggtitle("Estimated variance in pi over time") +
  theme(legend.position = "none")

## combined 2x2 panel layout via patchwork
(p_simulated + p_sampled) / (p_mean + p_var)


jtipton25/pgR documentation built on July 8, 2022, 12:44 a.m.