knitr::opts_knit$set(root.dir = "../")
library(magrittr)
library(ggplot2)
set.seed(1)
load("../data/genera.RData")
physeq <- physeq_genera %>% 
  phyloseq::subset_samples(dataset_name == "MucosalIBD") %>% 
  smar::prune_taxaSamples(flist_taxa = genefilter::kOverA(k = 5, A = 1))
mat_X_count <- smar::otu_table2(physeq)
rownames(mat_X_count) <- CRTmicrobiome:::simplify_feature(rownames(mat_X_count))
mat_X_p <- apply(mat_X_count, 2, function(x) x / sum(x))
df_marginal <- CRTmicrobiome:::get_marginal(t(mat_X_p))

Agenda

lfit_copulasso <- CRTmicrobiome:::copulasso(
  data = t(mat_X_p),
  lambda_list = seq(0.01, 0.4,
                    by = 0.01),
  K_CV = 5)
fit_copulasso <-
  lfit_copulasso$fits[[order(lfit_copulasso$df_eval$BIC)[1]]]

mat_X_p_pseudocount <- apply(mat_X_count + 0.5, 2, function(x) x / sum(x))
lfit_gcoda <- CRTmicrobiome:::gcoda(
  data = t(mat_X_p_pseudocount),
  lambda_list = seq(0.02, 0.9,
                    by = 0.02),
  K_CV = 5)
fit_gcoda <-
  lfit_gcoda$fits[[order(lfit_gcoda$df_eval$BIC)[1]]]
R <- 10000
# copulasso
samples_copulasso <-
  CRTmicrobiome:::rcopulasso(n = R,
                             mean = df_marginal$mu,
                             sd = df_marginal$sigma,
                             pi = df_marginal$pi,
                             sigma = solve(fit_copulasso),
                             norm = FALSE) %>%
  t()
rownames(samples_copulasso) <- rownames(mat_X_count)

# gcoda pseudo count
mu_a <- CRTmicrobiome:::get_mean_logisticMVN(t(mat_X_p_pseudocount))
samples_a <- mvtnorm::rmvnorm(n = R,
                              mean = mu_a,
                              sigma = solve(fit_gcoda))
samples_gcoda <- apply(samples_a, 1, function(x) exp(x) / sum(exp(x)))
rownames(samples_gcoda) <- rownames(mat_X_count)

Marginals

df_original <- mat_X_p %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::mutate(data = "original")
df_copulasso <- samples_copulasso %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::mutate(data = "copulasso")
df_copulasso_renorm <- samples_copulasso %>%
  apply(2, function(x) x / sum(x)) %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::mutate(data = "copulasso_renorm")
df_p_copulasso <- rbind(df_original,
                        df_copulasso,
                        df_copulasso_renorm) %>%
  dplyr::filter(abd > 0) %>%
  dplyr::mutate(data = factor(data,
                              levels = c("original",
                                         "copulasso",
                                         "copulasso_renorm")),
                log_abd = log10(abd + 1e-16)) %>%
  dplyr::group_by(feature) %>%
  dplyr::summarise(
    l_p = list(
      data.frame(log_abd = log_abd,
                 data = data) %>%
        ggplot(aes(x = log_abd)) +
        geom_histogram(bins = 50) +
        facet_grid(data~., scales = "free_y") +
        ggtitle(feature[1])
    ))
pdf("../results/copulasso_vs_gcoda/marginal_copulasso.pdf",
    width = 4, height = 10)
for(i in seq_len(nrow(df_p_copulasso)))
  print(df_p_copulasso$l_p[[i]])
dev.off()
df_p_copulasso %>%
  dplyr::filter(feature == "f__Ruminococcaceae") %>%
  extract2("l_p") %>%
  print()
df_original <- mat_X_count %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::group_by(sample) %>%
  dplyr::mutate(is_pos = abd > 0,
                abd = (abd + 0.5) / sum(abd + 0.5)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(data = "original")
df_gcoda <- samples_gcoda %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::mutate(is_pos = TRUE,
                data = "gcoda")
df_p_gcoda <- rbind(df_original,
                    df_gcoda) %>%
  dplyr::mutate(
    data = factor(data, levels = c("original", "gcoda")),
    log_abd = log10(abd)) %>%
  dplyr::group_by(feature) %>%
  dplyr::summarise(
    l_p = list(
      data.frame(log_abd = log_abd,
                 data = data,
                 is_pos = is_pos) %>%
        ggplot(aes(x = log_abd, fill = is_pos)) +
        geom_histogram(bins = 50) +
        facet_grid(data~., scales = "free_y") +
        scale_fill_manual(values = c("FALSE" = "grey", "TRUE" = "black")) +
        ggtitle(feature[1])
    ))
pdf("../results/copulasso_vs_gcoda/marginal_gcoda.pdf",
    width = 4, height = 8)
for(i in seq_len(nrow(df_p_gcoda)))
  print(df_p_gcoda$l_p[[i]])
dev.off()
df_p_gcoda %>%
  dplyr::filter(feature == "f__Christensenellaceae") %>%
  extract2("l_p") %>%
  print()

Correlation/covariance structure of $p$

rho_original <- CRTmicrobiome:::cor2(x = t(mat_X_p))
rho_copulaso <- CRTmicrobiome:::cor2(x = t(samples_copulasso))
rho_copulaso_renorm <- samples_copulasso %>%
  apply(2, function(x) x / sum(x)) %>% t() %>% CRTmicrobiome:::cor2()

cov_pseudo <- (mat_X_count + 0.5) %>%
  apply(2, function(x) x / sum(x)) %>% t() %>% log() %>%
  cov()
cov_original <- CRTmicrobiome:::cor2(x = t(mat_X_p)) %>%
  CRTmicrobiome:::iRho() %>%
  `*`(sqrt(diag(cov_pseudo))) %>%
  `*`(rep(sqrt(diag(cov_pseudo)), each = nrow(mat_X_p)))
cov_gcoda <- log(samples_gcoda) %>% t() %>%
  cov()

p_copulaso <- CRTmicrobiome:::plot_cors(rho_original, rho_copulaso,
                                        c("original", "copulaso")) +
  ggtitle("Spearman correlation of copulaso p")
p_copulaso_renorm <- CRTmicrobiome:::plot_cors(rho_original, 
                                               rho_copulaso_renorm,
                                               c("original", "copulaso")) +
  ggtitle("Spearman correlation of copulaso p (renormed)")
p_pseudo <- CRTmicrobiome:::plot_cors(cov_original, cov_pseudo,
                                        c("original", "with pseudo count")) +
  ggtitle("Covariance of log p")
p_gcoda <- CRTmicrobiome:::plot_cors(cov_original, cov_gcoda,
                                     c("with pseudo count", "gcoda")) +
  ggtitle("Covariance of gcoda log p")

cowplot::plot_grid(p_copulaso, p_copulaso_renorm,
                   p_pseudo, p_gcoda, nrow = 2)

Improvement on gcoda (??)

Can use ZI marginals to model $f_a(a_j)$ instead?

$\begin{aligned} f_a(a_1, \dots, a_m) &= \frac{f_a(a_1, \dots, a_m)}{\prod f_a(a_j)} \times \prod f_a(a_j) \ &\approx \frac{f^{\text{MVN}}_a(a_1, \dots, a_m)}{\prod f^{\text{MVN}}_a(a_j)} \times \prod f_a(a_j) \end{aligned}$

Model evaluation criteria comparison

Varying tuning parameter $\lambda$, can evaluate model with:

lfit_copulasso$df_eval %>%
  dplyr::mutate(sparsity = df / nrow(mat_X_p) / (nrow(mat_X_p) - 1) * 2) %>%
  tidyr::gather(key = metric_name, value = metric,
                negLogLik_CV, sparsity, AIC, BIC, EBIC) %>%
  ggplot(aes(x = lambda, y = metric)) +
  geom_point() + geom_line() +
  facet_wrap(.~metric_name, scales = "free_y", ncol = 1) +
  ggtitle("Copula lasso fits")
lfit_copulasso$df_eval %>%
  tidyr::gather(key = metric_name, value = metric,
                negLogLik_CV, AIC, BIC, EBIC) %>%
  dplyr::group_by(metric_name) %>%
  dplyr::arrange(metric) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  print()
lfit_gcoda$df_eval %>%
  dplyr::mutate(sparsity = df / nrow(mat_X_p) / (nrow(mat_X_p) - 1) * 2) %>%
  tidyr::gather(key = metric_name, value = metric,
                negLogLik_CV, sparsity, AIC, BIC, EBIC) %>%
  ggplot(aes(x = lambda, y = metric)) +
  geom_point() + geom_line() +
  facet_wrap(.~metric_name, scales = "free_y", ncol = 1) +
  ggtitle("gcoda fits with pseudo count = 0.5")
lfit_gcoda$df_eval %>%
  tidyr::gather(key = metric_name, value = metric,
                negLogLik_CV, AIC, BIC, EBIC) %>%
  dplyr::group_by(metric_name) %>%
  dplyr::arrange(metric) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  print()


syma-research/CRT_microbiome documentation built on July 8, 2022, 8 a.m.