In syma-research/CRT_microbiome: Conditional randomized test for microbiome data

knitr::opts_knit$set(root.dir = "../")

library(magrittr)
library(ggplot2)
set.seed(1)

load("../data/genera.RData")
physeq <- physeq_genera %>% 
  phyloseq::subset_samples(dataset_name == "MucosalIBD") %>% 
  smar::prune_taxaSamples(flist_taxa = genefilter::kOverA(k = 5, A = 1))
mat_X_count <- smar::otu_table2(physeq)
rownames(mat_X_count) <- CRTmicrobiome:::simplify_feature(rownames(mat_X_count))
mat_X_p <- apply(mat_X_count, 2, function(x) x / sum(x))
df_marginal <- CRTmicrobiome:::get_marginal(t(mat_X_p))

Agenda

Two different models for $p$ exist, copulasso and gcoda.
copulasso: $f_p(p_1, \dots, p_m) = f_u(u_1, \dots, u_m) \times \prod f_p(p_j)$; $f_u(u_1, \dots, u_m)$ can be estimated with glasso.
- Model does not enforce compositionality!
gcoda: assumes logistic MVN on relative abundances $a_j$. $p_j = \frac{a_j}{\sum a_j}$. $f_a(\log a_1, \dots, \log a_m) \sim MVN(\mu, \Omega^{-1})$. Key insights:
- Given observations of $p$, both $\mu$ and $\Omega$ are non-identifiable.
- Liklihood involves integrating out total abundance $A = \sum a_j$. That is,
$f_p(p_1, \dots, p_m | \mu, \Omega) = \int \left { f_a(A p_1, \dots, A a_m | \mu, \Omega) \times \prod \frac{1}{p_j} \times \frac{1}{A} \right } d A$
- $\hat{\mu} = (\bar{\log p_1}, \dots, \bar{\log p_m})$ is a MLE, and so is any $\hat{\mu} + c$ for constant $c$.
- $\Omega$ can be solved by maximizing regularized likelihood.
- Logistic-MVN does not allow for zeros - has to add pseudo counts to data!
Compare their performance on marginals and correlation/covariance structures
Obtain (regularized) MLE parameters for both.
Generate samples based on MLE.
Compare new samples versus originals.

lfit_copulasso <- CRTmicrobiome:::copulasso(
  data = t(mat_X_p),
  lambda_list = seq(0.01, 0.4,
                    by = 0.01),
  K_CV = 5)
fit_copulasso <-
  lfit_copulasso$fits[[order(lfit_copulasso$df_eval$BIC)[1]]]

mat_X_p_pseudocount <- apply(mat_X_count + 0.5, 2, function(x) x / sum(x))
lfit_gcoda <- CRTmicrobiome:::gcoda(
  data = t(mat_X_p_pseudocount),
  lambda_list = seq(0.02, 0.9,
                    by = 0.02),
  K_CV = 5)
fit_gcoda <-
  lfit_gcoda$fits[[order(lfit_gcoda$df_eval$BIC)[1]]]

R <- 10000
# copulasso
samples_copulasso <-
  CRTmicrobiome:::rcopulasso(n = R,
                             mean = df_marginal$mu,
                             sd = df_marginal$sigma,
                             pi = df_marginal$pi,
                             sigma = solve(fit_copulasso),
                             norm = FALSE) %>%
  t()
rownames(samples_copulasso) <- rownames(mat_X_count)

# gcoda pseudo count
mu_a <- CRTmicrobiome:::get_mean_logisticMVN(t(mat_X_p_pseudocount))
samples_a <- mvtnorm::rmvnorm(n = R,
                              mean = mu_a,
                              sigma = solve(fit_gcoda))
samples_gcoda <- apply(samples_a, 1, function(x) exp(x) / sum(exp(x)))
rownames(samples_gcoda) <- rownames(mat_X_count)

Marginals

copulasso does reasonable job in simulating marginals, with some exceptions
gcoda assumes a unimodal distribution on $(\log) p_j$ when in fact it's often bimodal due to zero-inflation.

df_original <- mat_X_p %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::mutate(data = "original")
df_copulasso <- samples_copulasso %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::mutate(data = "copulasso")
df_copulasso_renorm <- samples_copulasso %>%
  apply(2, function(x) x / sum(x)) %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::mutate(data = "copulasso_renorm")
df_p_copulasso <- rbind(df_original,
                        df_copulasso,
                        df_copulasso_renorm) %>%
  dplyr::filter(abd > 0) %>%
  dplyr::mutate(data = factor(data,
                              levels = c("original",
                                         "copulasso",
                                         "copulasso_renorm")),
                log_abd = log10(abd + 1e-16)) %>%
  dplyr::group_by(feature) %>%
  dplyr::summarise(
    l_p = list(
      data.frame(log_abd = log_abd,
                 data = data) %>%
        ggplot(aes(x = log_abd)) +
        geom_histogram(bins = 50) +
        facet_grid(data~., scales = "free_y") +
        ggtitle(feature[1])
    ))
pdf("../results/copulasso_vs_gcoda/marginal_copulasso.pdf",
    width = 4, height = 10)
for(i in seq_len(nrow(df_p_copulasso)))
  print(df_p_copulasso$l_p[[i]])
dev.off()
df_p_copulasso %>%
  dplyr::filter(feature == "f__Ruminococcaceae") %>%
  extract2("l_p") %>%
  print()

df_original <- mat_X_count %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::group_by(sample) %>%
  dplyr::mutate(is_pos = abd > 0,
                abd = (abd + 0.5) / sum(abd + 0.5)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(data = "original")
df_gcoda <- samples_gcoda %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::mutate(is_pos = TRUE,
                data = "gcoda")
df_p_gcoda <- rbind(df_original,
                    df_gcoda) %>%
  dplyr::mutate(
    data = factor(data, levels = c("original", "gcoda")),
    log_abd = log10(abd)) %>%
  dplyr::group_by(feature) %>%
  dplyr::summarise(
    l_p = list(
      data.frame(log_abd = log_abd,
                 data = data,
                 is_pos = is_pos) %>%
        ggplot(aes(x = log_abd, fill = is_pos)) +
        geom_histogram(bins = 50) +
        facet_grid(data~., scales = "free_y") +
        scale_fill_manual(values = c("FALSE" = "grey", "TRUE" = "black")) +
        ggtitle(feature[1])
    ))
pdf("../results/copulasso_vs_gcoda/marginal_gcoda.pdf",
    width = 4, height = 8)
for(i in seq_len(nrow(df_p_gcoda)))
  print(df_p_gcoda$l_p[[i]])
dev.off()
df_p_gcoda %>%
  dplyr::filter(feature == "f__Christensenellaceae") %>%
  extract2("l_p") %>%
  print()

Correlation/covariance structure of $p$

copulasso obtains shrinkage estimation on $\text{corr}(p_i, p_j)$
In practice needs to renormalize features, which disrupts the correlation structure a little.
gcoda introduces positive covariance between low abundance features (small $a_j$).
Measure covariance between $\log p_i, \log p_j$
Pseudo count introduces positive covariance between features that are often zeros.
- Obtain $\tilde{p}_j$, relative abundances after adding pseudo count to all data.
- Pseudo count covariance is $\text{cov} (\log \tilde{p}_i, \log \tilde{p}_j)$
- Original covariance estimated as $\rho^{-1} (\text{spearman} (p_i, p_j)) * \text{sd} (\log \tilde{p}_i) * \text{sd} (\log \tilde{p}_j)$
Low abundant features' proportions dominated by highly abundant and variable features, thus vary together.

rho_original <- CRTmicrobiome:::cor2(x = t(mat_X_p))
rho_copulaso <- CRTmicrobiome:::cor2(x = t(samples_copulasso))
rho_copulaso_renorm <- samples_copulasso %>%
  apply(2, function(x) x / sum(x)) %>% t() %>% CRTmicrobiome:::cor2()

cov_pseudo <- (mat_X_count + 0.5) %>%
  apply(2, function(x) x / sum(x)) %>% t() %>% log() %>%
  cov()
cov_original <- CRTmicrobiome:::cor2(x = t(mat_X_p)) %>%
  CRTmicrobiome:::iRho() %>%
  `*`(sqrt(diag(cov_pseudo))) %>%
  `*`(rep(sqrt(diag(cov_pseudo)), each = nrow(mat_X_p)))
cov_gcoda <- log(samples_gcoda) %>% t() %>%
  cov()

p_copulaso <- CRTmicrobiome:::plot_cors(rho_original, rho_copulaso,
                                        c("original", "copulaso")) +
  ggtitle("Spearman correlation of copulaso p")
p_copulaso_renorm <- CRTmicrobiome:::plot_cors(rho_original, 
                                               rho_copulaso_renorm,
                                               c("original", "copulaso")) +
  ggtitle("Spearman correlation of copulaso p (renormed)")
p_pseudo <- CRTmicrobiome:::plot_cors(cov_original, cov_pseudo,
                                        c("original", "with pseudo count")) +
  ggtitle("Covariance of log p")
p_gcoda <- CRTmicrobiome:::plot_cors(cov_original, cov_gcoda,
                                     c("with pseudo count", "gcoda")) +
  ggtitle("Covariance of gcoda log p")

cowplot::plot_grid(p_copulaso, p_copulaso_renorm,
                   p_pseudo, p_gcoda, nrow = 2)

Improvement on gcoda (??)

Can use ZI marginals to model $f_a(a_j)$ instead?

Original likelihood is $f_a^{\text{MVN}}(a_1, \dots, a_m)$
Write new likelihood as

$\begin{aligned} f_a(a_1, \dots, a_m) &= \frac{f_a(a_1, \dots, a_m)}{\prod f_a(a_j)} \times \prod f_a(a_j) \ &\approx \frac{f^{\text{MVN}}_a(a_1, \dots, a_m)}{\prod f^{\text{MVN}}_a(a_j)} \times \prod f_a(a_j) \end{aligned}$

This is not a proper likelihood (won't integrate to 1).

Model evaluation criteria comparison

Varying tuning parameter $\lambda$, can evaluate model with:

Sparsity
CV likelihood
AIC, BIC, EBIC

lfit_copulasso$df_eval %>%
  dplyr::mutate(sparsity = df / nrow(mat_X_p) / (nrow(mat_X_p) - 1) * 2) %>%
  tidyr::gather(key = metric_name, value = metric,
                negLogLik_CV, sparsity, AIC, BIC, EBIC) %>%
  ggplot(aes(x = lambda, y = metric)) +
  geom_point() + geom_line() +
  facet_wrap(.~metric_name, scales = "free_y", ncol = 1) +
  ggtitle("Copula lasso fits")
lfit_copulasso$df_eval %>%
  tidyr::gather(key = metric_name, value = metric,
                negLogLik_CV, AIC, BIC, EBIC) %>%
  dplyr::group_by(metric_name) %>%
  dplyr::arrange(metric) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  print()
lfit_gcoda$df_eval %>%
  dplyr::mutate(sparsity = df / nrow(mat_X_p) / (nrow(mat_X_p) - 1) * 2) %>%
  tidyr::gather(key = metric_name, value = metric,
                negLogLik_CV, sparsity, AIC, BIC, EBIC) %>%
  ggplot(aes(x = lambda, y = metric)) +
  geom_point() + geom_line() +
  facet_wrap(.~metric_name, scales = "free_y", ncol = 1) +
  ggtitle("gcoda fits with pseudo count = 0.5")
lfit_gcoda$df_eval %>%
  tidyr::gather(key = metric_name, value = metric,
                negLogLik_CV, AIC, BIC, EBIC) %>%
  dplyr::group_by(metric_name) %>%
  dplyr::arrange(metric) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  print()