In jtipton25/pgR: Bayesian Polya-Gamma Regression

To Do

Figure out issue with fast rmvn_*() functions
add in Polya-gamma data augmentation to DAG

We are testing the Polya-gamma linear model pg_lm()

library(pgR)
# library(MCMCpack)
library(splines)
library(tidyverse)
library(patchwork)

simulate some data

set.seed(11)
N <- 5000
J <- 10
X <- runif(N)
df <- 4
Xbs <- bs(X, df)
beta <- matrix(rnorm((J-1) * df), df, (J-1))
## make the intercepts smaller to reduce stochastic ordering effect
beta[1, ] <- beta[1, ] - seq(from = 4, to = 0, length.out = J - 1)
eta <- Xbs %*% beta 
pi <- eta_to_pi(eta)

Y <- matrix(0, N, J)

# simulate a very skewed distribution of total counts
load(here::here("results", "count-variation.RData"))
Mi <- c(counts, sample(counts, N-length(counts)))
for (i in 1:N) {
    Y[i, ] <- rmultinom(1, Mi[i], pi[i, ])
}
Y_prop     <- counts_to_proportions(Y)

Plot the simulated data

dat <- data.frame(
  y        = c(Y_prop),

  x        = X,
  species  = factor(rep(1:J, each = N)))

dat_truth <- data.frame(
  y        = c(pi),
  x        = X,
  species  = factor(rep(1:J, each = N)))

dat %>%
  group_by(species) %>%
  sample_n(pmin(nrow(Y), 500)) %>%
  ggplot(aes(y = y, x = x, group = species, color = species)) +
  geom_point(alpha = 0.2) +
  ylab("Proportion of count") +
  geom_line(data = dat_truth, aes(y = y, x = x, group = species), color = "black", lwd = 0.5) +
  facet_wrap(~ species, ncol = 4) +
  ggtitle("Simulated data")

## fit the model to verify parameters
params <- default_params() 
params$n_adapt <- 1000
params$n_mcmc <- 1000
params$n_message <- 50
params$n_thin <- 1
priors <- default_priors_pg_lm(Y, Xbs)
inits <- default_inits_pg_lm(Y, Xbs, priors)
config <- list(save_omega = TRUE)

if (file.exists(here::here("results", "pg_lm-count-variation.RData"))) {
    load(here::here("results", "pg_lm-count-variation.RData"))
} else {
    start <- Sys.time()
    out <- pg_lm(Y, as.matrix(Xbs), params, priors, config = config, n_cores = 1L, sample_rmvn = TRUE)
    stop <- Sys.time()
    runtime <- stop - start

    save(out, runtime, file = here::here("results", "pg_lm-count-variation.RData"))
}

betas <- out$beta
dimnames(betas) <- list(
  iteration = 1:dim(betas)[1],
  parameter = 1:dim(betas)[2],
  species = 1:dim(betas)[3]
)
as.data.frame.table(betas, responseName = "value") %>%
  mutate(iteration = as.numeric(iteration)) %>%
  ggplot(aes(x = iteration, y = value, group = parameter, color = parameter)) +
  geom_line() +
  facet_wrap(~ species)

# layout(matrix(1:9, 3, 3))
# for (i in 1:9) {
#   matplot(out$beta[, , i], type = 'l', main = paste("species", i))
#   abline(h = beta[, i], col = 1:nrow(beta))
# }

etas <- out$eta

## plot beta estimates
p_beta <- data.frame(
  beta_mean = c(apply(out$beta, c(2, 3), mean)), 
  beta_lower = c(apply(out$beta, c(2, 3), quantile, prob = 0.025)),  
  beta_upper = c(apply(out$beta, c(2, 3), quantile, prob = 0.975)), 
  beta_truth = c(beta),
  species = factor(rep(1:(J-1), each = ncol(Xbs))),
  knots = factor(1:ncol(Xbs))
) %>%
  ggplot(aes(x = beta_truth, y = beta_mean, color = knots)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point() +
  geom_errorbar(aes(ymin = beta_lower, ymax = beta_upper)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ species, nrow = 3) +
  geom_abline(intercept = 0, slope = 1, col = "red")

## plot eta estimates
p_eta <- data.frame(
  eta_mean = c(apply(out$eta, c(2, 3), mean)), 
  eta_lower = c(apply(out$eta, c(2, 3), quantile, prob = 0.025)),  
  eta_upper = c(apply(out$eta, c(2, 3), quantile, prob = 0.975)), 
  eta_truth = c(eta),
  species = factor(rep(1:(J-1), each = ncol(Xbs))),
  knots = factor(1:ncol(Xbs))
) %>%
  ggplot(aes(x = eta_truth, y = eta_mean, color = knots)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point() +
  geom_errorbar(aes(ymin = eta_lower, ymax = eta_upper)) +
  geom_point(alpha = 0.5) +
  facet_wrap(knots ~ species, nrow = 3) +
  geom_abline(intercept = 0, slope = 1, col = "red")

p_beta / p_eta

pi_post <- array(0, dim = c(dim(out$eta)[1], dim(out$eta)[2], dim(out$eta)[3] + 1))
for (i in 1:dim(out$eta)[1]) {
  pi_post[i, , ] <- eta_to_pi(out$eta[i, , ])
}

 dat_pi <- data.frame(
  pi_mean = c(apply(pi_post, c(2, 3), mean)), 
  pi_lower = c(apply(pi_post, c(2, 3), quantile, prob = 0.025)),  
  pi_upper = c(apply(pi_post, c(2, 3), quantile, prob = 0.975)), 
  pi_truth = c(pi),
  species = factor(rep(1:J, each = N)),
  observation = factor(1:N)) 

p_pi <-  dat_pi %>% 
  ggplot(aes(x = pi_truth, y = pi_mean)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point() +
  geom_errorbar(aes(ymin = pi_lower, ymax = pi_upper)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ species, nrow = 3) +
  geom_abline(intercept = 0, slope = 1, col = "red")

p_pi

# plot the fitted response curves
dat <- data.frame(
  y        = c(Y_prop),

  x        = X,
  species  = factor(rep(1:J, each = N)))

dat_fit <- data.frame(
  pi_mean  = c(apply(pi_post, c(2, 3), mean)),
  pi_lower = c(apply(pi_post, c(2, 3), quantile, prob = 0.025)),  
  pi_upper = c(apply(pi_post, c(2, 3), quantile, prob = 0.975)), 
  x        = X,
  species  = factor(rep(1:J, each = N)))

dat %>%
  group_by(species) %>%
  sample_n(pmin(nrow(Y), 500)) %>%
  ggplot(aes(y = y, x = x, group = species, color = species)) +
  geom_point(alpha = 0.2) +
  ylab("Proportion of count") +
  geom_line(data = dat_fit, aes(y = pi_mean, x = x, group = species), color = "black", lwd = 0.5) +
  # geom_line(data = dat_truth, aes(y = y, x = x, group = species), color = "blue", lwd = 0.5) +
  facet_wrap(~ species, ncol = 4) +
  ggtitle("Simulated data")

Fit a mis-specified linear model

## fit the model with a linear response only
params <- default_params() 
params$n_adapt <-1000
params$n_mcmc <- 1000
params$n_message <- 50
params$n_thin <- 1
priors <- default_priors_pg_lm(Y, as.matrix(X))
inits <- default_inits_pg_lm(Y, as.matrix(X), priors)
config <- list(save_omega = TRUE)

if (file.exists(here::here("results", "pg_lm-linear-count-variation.RData"))) {
    load(here::here("results", "pg_lm-linear-count-variation.RData"))
} else {
    start <- Sys.time()
    out_linear <- pg_lm(Y, as.matrix(X), params, priors, 
                        config = config, n_cores = 1L, sample_rmvn = FALSE)
    stop <- Sys.time()
    runtime_linear <- stop - start

    save(out_linear, runtime_linear, file = here::here("results", "pg_lm-linear-count-variation.RData"))
}

# model fit using loo
ll_bs <- calc_ll_pg_lm(Y, Xbs, out)
# fit model using linear response
ll_linear <- calc_ll_pg_lm(Y, as.matrix(X), out_linear)

library(loo)
loo_bs <- loo(t(ll_bs$ll), cores = 32L)
loo_linear <- loo(t(ll_linear$ll), cores = 32L)
comp <- loo_compare(loo_bs, loo_linear)
print(comp)
plot(loo_bs)
plot(loo_linear)

prediction code testing

set.seed(44)
## subsample the simulated data
n <- 2000
s <- sample(N, n)
Y_s     <- Y[s, ]
Y_oos   <- Y[-s, ]
Xbs_s   <- Xbs[s, ]
Xbs_oos <- Xbs[-s, ]
eta_s   <- eta[s, ]
eta_oos <- eta[-s, ]
pi_s    <- pi[s, ]
pi_oos  <- pi[-s, ]

## fit the model to verify parameters
params <- default_params()
params$n_adapt <- 500
params$n_mcmc <- 1000
params$n_message <- 50
params$n_thin <- 1
priors <- default_priors_pg_lm(Y_s, Xbs_s)
inits <- default_inits_pg_lm(Y_s, Xbs_s, priors)
if (file.exists(here::here("results", "pg_lm-sample-count-variation.RData"))) {
    load(here::here("results", "pg_lm-sample-count-variation.RData"))
} else {
    ## need to parallelize the polya-gamma random variables
    ## can I do this efficiently using openMP?
    ## Q: faster to parallelize using openMP or foreach?
    start <- Sys.time()
    out <- pg_lm(Y_s, as.matrix(Xbs_s), params, priors, n_cores = 1L, sample_rmvn = FALSE)
    stop <- Sys.time()
    runtime <- stop - start

    save(out, runtime, file = here::here("results", "pg_lm-sample-count-variation.RData"))
}

layout(matrix(1:9, 3, 3))
for (i in 1:9) {
  matplot(out$beta[, , i], type = 'l', main = paste("species", i))
  abline(h = beta[, i])
}

runtime

## plot beta estimates
dat_plot <- data.frame(
    beta = c(
        c(apply(out$beta, c(2, 3), mean)), 
        c(beta)
    ),
    type = rep(c("estimate", "truth"), each = (J-1) * ncol(Xbs)),
    species = factor(rep(1:(J-1), each = ncol(Xbs))),
    knots = 1:ncol(Xbs)
)

dat_plot %>%
    pivot_wider(names_from = type, values_from = beta) %>%
    ggplot(aes(x = estimate, y = truth, color = species)) +
    scale_color_viridis_d(begin = 0, end = 0.8) +
    geom_point(alpha = 0.5) +
    facet_wrap(~ species, nrow = 8) +
    geom_abline(intercept = 0, slope = 1, col = "red")

## predict from the model
preds <- predict_pg_lm(out, Xbs_oos)

pi_pred_mean <- apply(preds$pi, c(2, 3), mean)
dat_pi_pred <- data.frame(
  pi   = c(
    c(pi_oos),
    c(pi_pred_mean)
  ),
  type = rep(c("observed", "predicted"), each = J * (N-n)),
  species = factor(rep(1:J, each = (N-n))),
  obs     = 1:(N-n) 
)

p_pi_pred <- dat_pi_pred %>%
  pivot_wider(names_from = type, values_from = pi) %>%
  ggplot(aes(x = observed, y = predicted, color = species)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ species, nrow = 5) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Predicted vs simulated latent probability pi")


eta_pred_mean <- apply(preds$eta, c(2, 3), mean)
dat_eta_pred <- data.frame(
  eta   = c(
    c(eta_oos),
    c(eta_pred_mean)
  ),
  type = rep(c("observed", "predicted"), each = (J-1) * (N-n)),
  species = factor(rep(1:(J-1), each = (N-n))),
  obs     = 1:(N-n) 
)

p_eta_pred <- dat_eta_pred %>%
  pivot_wider(names_from = type, values_from = eta) %>%
  ggplot(aes(x = observed, y = predicted, color = species)) +
  scale_color_viridis_d(begin = 0, end = 0.8) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ species, nrow = 5) +
  geom_abline(intercept = 0, slope = 1, col = "red") +
  ggtitle("Predicted vs simulated latent intensity eta")
p_pi_pred + p_eta_pred