rmvn_*()
functionsWe are testing the Polya-gamma linear model pg_lm()
library(pgR) # library(MCMCpack) library(splines) library(tidyverse) library(patchwork)
set.seed(11) N <- 5000 J <- 10 X <- runif(N) df <- 4 Xbs <- bs(X, df) beta <- matrix(rnorm((J-1) * df), df, (J-1)) ## make the intercepts smaller to reduce stochastic ordering effect beta[1, ] <- beta[1, ] - seq(from = 4, to = 0, length.out = J - 1) eta <- Xbs %*% beta pi <- eta_to_pi(eta) Y <- matrix(0, N, J) # simulate a very skewed distribution of total counts load(here::here("results", "count-variation.RData")) Mi <- c(counts, sample(counts, N-length(counts))) for (i in 1:N) { Y[i, ] <- rmultinom(1, Mi[i], pi[i, ]) } Y_prop <- counts_to_proportions(Y)
dat <- data.frame( y = c(Y_prop), x = X, species = factor(rep(1:J, each = N))) dat_truth <- data.frame( y = c(pi), x = X, species = factor(rep(1:J, each = N))) dat %>% group_by(species) %>% sample_n(pmin(nrow(Y), 500)) %>% ggplot(aes(y = y, x = x, group = species, color = species)) + geom_point(alpha = 0.2) + ylab("Proportion of count") + geom_line(data = dat_truth, aes(y = y, x = x, group = species), color = "black", lwd = 0.5) + facet_wrap(~ species, ncol = 4) + ggtitle("Simulated data")
## fit the model to verify parameters params <- default_params() params$n_adapt <- 1000 params$n_mcmc <- 1000 params$n_message <- 50 params$n_thin <- 1 priors <- default_priors_pg_lm(Y, Xbs) inits <- default_inits_pg_lm(Y, Xbs, priors) config <- list(save_omega = TRUE) if (file.exists(here::here("results", "pg_lm-count-variation.RData"))) { load(here::here("results", "pg_lm-count-variation.RData")) } else { start <- Sys.time() out <- pg_lm(Y, as.matrix(Xbs), params, priors, config = config, n_cores = 1L, sample_rmvn = TRUE) stop <- Sys.time() runtime <- stop - start save(out, runtime, file = here::here("results", "pg_lm-count-variation.RData")) }
betas <- out$beta dimnames(betas) <- list( iteration = 1:dim(betas)[1], parameter = 1:dim(betas)[2], species = 1:dim(betas)[3] ) as.data.frame.table(betas, responseName = "value") %>% mutate(iteration = as.numeric(iteration)) %>% ggplot(aes(x = iteration, y = value, group = parameter, color = parameter)) + geom_line() + facet_wrap(~ species) # layout(matrix(1:9, 3, 3)) # for (i in 1:9) { # matplot(out$beta[, , i], type = 'l', main = paste("species", i)) # abline(h = beta[, i], col = 1:nrow(beta)) # } etas <- out$eta
## plot beta estimates p_beta <- data.frame( beta_mean = c(apply(out$beta, c(2, 3), mean)), beta_lower = c(apply(out$beta, c(2, 3), quantile, prob = 0.025)), beta_upper = c(apply(out$beta, c(2, 3), quantile, prob = 0.975)), beta_truth = c(beta), species = factor(rep(1:(J-1), each = ncol(Xbs))), knots = factor(1:ncol(Xbs)) ) %>% ggplot(aes(x = beta_truth, y = beta_mean, color = knots)) + scale_color_viridis_d(begin = 0, end = 0.8) + geom_point() + geom_errorbar(aes(ymin = beta_lower, ymax = beta_upper)) + geom_point(alpha = 0.5) + facet_wrap(~ species, nrow = 3) + geom_abline(intercept = 0, slope = 1, col = "red") ## plot eta estimates p_eta <- data.frame( eta_mean = c(apply(out$eta, c(2, 3), mean)), eta_lower = c(apply(out$eta, c(2, 3), quantile, prob = 0.025)), eta_upper = c(apply(out$eta, c(2, 3), quantile, prob = 0.975)), eta_truth = c(eta), species = factor(rep(1:(J-1), each = ncol(Xbs))), knots = factor(1:ncol(Xbs)) ) %>% ggplot(aes(x = eta_truth, y = eta_mean, color = knots)) + scale_color_viridis_d(begin = 0, end = 0.8) + geom_point() + geom_errorbar(aes(ymin = eta_lower, ymax = eta_upper)) + geom_point(alpha = 0.5) + facet_wrap(knots ~ species, nrow = 3) + geom_abline(intercept = 0, slope = 1, col = "red") p_beta / p_eta
pi_post <- array(0, dim = c(dim(out$eta)[1], dim(out$eta)[2], dim(out$eta)[3] + 1)) for (i in 1:dim(out$eta)[1]) { pi_post[i, , ] <- eta_to_pi(out$eta[i, , ]) } dat_pi <- data.frame( pi_mean = c(apply(pi_post, c(2, 3), mean)), pi_lower = c(apply(pi_post, c(2, 3), quantile, prob = 0.025)), pi_upper = c(apply(pi_post, c(2, 3), quantile, prob = 0.975)), pi_truth = c(pi), species = factor(rep(1:J, each = N)), observation = factor(1:N)) p_pi <- dat_pi %>% ggplot(aes(x = pi_truth, y = pi_mean)) + scale_color_viridis_d(begin = 0, end = 0.8) + geom_point() + geom_errorbar(aes(ymin = pi_lower, ymax = pi_upper)) + geom_point(alpha = 0.5) + facet_wrap(~ species, nrow = 3) + geom_abline(intercept = 0, slope = 1, col = "red") p_pi
# plot the fitted response curves dat <- data.frame( y = c(Y_prop), x = X, species = factor(rep(1:J, each = N))) dat_fit <- data.frame( pi_mean = c(apply(pi_post, c(2, 3), mean)), pi_lower = c(apply(pi_post, c(2, 3), quantile, prob = 0.025)), pi_upper = c(apply(pi_post, c(2, 3), quantile, prob = 0.975)), x = X, species = factor(rep(1:J, each = N))) dat %>% group_by(species) %>% sample_n(pmin(nrow(Y), 500)) %>% ggplot(aes(y = y, x = x, group = species, color = species)) + geom_point(alpha = 0.2) + ylab("Proportion of count") + geom_line(data = dat_fit, aes(y = pi_mean, x = x, group = species), color = "black", lwd = 0.5) + # geom_line(data = dat_truth, aes(y = y, x = x, group = species), color = "blue", lwd = 0.5) + facet_wrap(~ species, ncol = 4) + ggtitle("Simulated data")
## fit the model with a linear response only params <- default_params() params$n_adapt <-1000 params$n_mcmc <- 1000 params$n_message <- 50 params$n_thin <- 1 priors <- default_priors_pg_lm(Y, as.matrix(X)) inits <- default_inits_pg_lm(Y, as.matrix(X), priors) config <- list(save_omega = TRUE) if (file.exists(here::here("results", "pg_lm-linear-count-variation.RData"))) { load(here::here("results", "pg_lm-linear-count-variation.RData")) } else { start <- Sys.time() out_linear <- pg_lm(Y, as.matrix(X), params, priors, config = config, n_cores = 1L, sample_rmvn = FALSE) stop <- Sys.time() runtime_linear <- stop - start save(out_linear, runtime_linear, file = here::here("results", "pg_lm-linear-count-variation.RData")) }
# model fit using loo ll_bs <- calc_ll_pg_lm(Y, Xbs, out) # fit model using linear response ll_linear <- calc_ll_pg_lm(Y, as.matrix(X), out_linear) library(loo) loo_bs <- loo(t(ll_bs$ll), cores = 32L) loo_linear <- loo(t(ll_linear$ll), cores = 32L) comp <- loo_compare(loo_bs, loo_linear) print(comp) plot(loo_bs) plot(loo_linear)
set.seed(44) ## subsample the simulated data n <- 2000 s <- sample(N, n) Y_s <- Y[s, ] Y_oos <- Y[-s, ] Xbs_s <- Xbs[s, ] Xbs_oos <- Xbs[-s, ] eta_s <- eta[s, ] eta_oos <- eta[-s, ] pi_s <- pi[s, ] pi_oos <- pi[-s, ] ## fit the model to verify parameters params <- default_params() params$n_adapt <- 500 params$n_mcmc <- 1000 params$n_message <- 50 params$n_thin <- 1 priors <- default_priors_pg_lm(Y_s, Xbs_s) inits <- default_inits_pg_lm(Y_s, Xbs_s, priors) if (file.exists(here::here("results", "pg_lm-sample-count-variation.RData"))) { load(here::here("results", "pg_lm-sample-count-variation.RData")) } else { ## need to parallelize the polya-gamma random variables ## can I do this efficiently using openMP? ## Q: faster to parallelize using openMP or foreach? start <- Sys.time() out <- pg_lm(Y_s, as.matrix(Xbs_s), params, priors, n_cores = 1L, sample_rmvn = FALSE) stop <- Sys.time() runtime <- stop - start save(out, runtime, file = here::here("results", "pg_lm-sample-count-variation.RData")) }
layout(matrix(1:9, 3, 3)) for (i in 1:9) { matplot(out$beta[, , i], type = 'l', main = paste("species", i)) abline(h = beta[, i]) }
runtime ## plot beta estimates dat_plot <- data.frame( beta = c( c(apply(out$beta, c(2, 3), mean)), c(beta) ), type = rep(c("estimate", "truth"), each = (J-1) * ncol(Xbs)), species = factor(rep(1:(J-1), each = ncol(Xbs))), knots = 1:ncol(Xbs) ) dat_plot %>% pivot_wider(names_from = type, values_from = beta) %>% ggplot(aes(x = estimate, y = truth, color = species)) + scale_color_viridis_d(begin = 0, end = 0.8) + geom_point(alpha = 0.5) + facet_wrap(~ species, nrow = 8) + geom_abline(intercept = 0, slope = 1, col = "red")
## predict from the model preds <- predict_pg_lm(out, Xbs_oos)
pi_pred_mean <- apply(preds$pi, c(2, 3), mean) dat_pi_pred <- data.frame( pi = c( c(pi_oos), c(pi_pred_mean) ), type = rep(c("observed", "predicted"), each = J * (N-n)), species = factor(rep(1:J, each = (N-n))), obs = 1:(N-n) ) p_pi_pred <- dat_pi_pred %>% pivot_wider(names_from = type, values_from = pi) %>% ggplot(aes(x = observed, y = predicted, color = species)) + scale_color_viridis_d(begin = 0, end = 0.8) + geom_point(alpha = 0.5) + facet_wrap(~ species, nrow = 5) + geom_abline(intercept = 0, slope = 1, col = "red") + ggtitle("Predicted vs simulated latent probability pi") eta_pred_mean <- apply(preds$eta, c(2, 3), mean) dat_eta_pred <- data.frame( eta = c( c(eta_oos), c(eta_pred_mean) ), type = rep(c("observed", "predicted"), each = (J-1) * (N-n)), species = factor(rep(1:(J-1), each = (N-n))), obs = 1:(N-n) ) p_eta_pred <- dat_eta_pred %>% pivot_wider(names_from = type, values_from = eta) %>% ggplot(aes(x = observed, y = predicted, color = species)) + scale_color_viridis_d(begin = 0, end = 0.8) + geom_point(alpha = 0.5) + facet_wrap(~ species, nrow = 5) + geom_abline(intercept = 0, slope = 1, col = "red") + ggtitle("Predicted vs simulated latent intensity eta") p_pi_pred + p_eta_pred
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.