In syma-research/CRT_microbiome: Conditional randomized test for microbiome data

knitr::opts_knit$set(root.dir = "../")

library(magrittr)
library(ggplot2)
set.seed(1)
ncores <- 6

load("../data/genera.RData")
physeq <- physeq_genera %>% 
  phyloseq::subset_samples(dataset_name == "MucosalIBD") %>% 
  smar::prune_taxaSamples(flist_taxa = genefilter::kOverA(k = 5, A = 1))
mat_X_count <- smar::otu_table2(physeq)
rownames(mat_X_count) <- CRTmicrobiome:::simplify_feature(rownames(mat_X_count))
mat_X_p <- apply(mat_X_count, 2, function(x) x / sum(x))
df_marginal <- CRTmicrobiome:::get_marginal(t(mat_X_p))

mat_X_p_pseudocount <- apply(mat_X_count + 0.5, 2, function(x) x / sum(x))
mu_a <- CRTmicrobiome:::get_mean_logisticMVN(mat_X_p_pseudocount)
fit_gcoda <- CRTmicrobiome:::gcoda(
  data = t(mat_X_p_pseudocount),
  lambda_list = 0.6)$fits[[1]]
params_gcoda <- list(mu = mu_a,
                     Omega = fit_gcoda)

Seems that penalization gives higher estimation features' variances.

tb_sd <- tibble::tibble(
  original = mat_X_p_pseudocount %>% 
    log() %>% t() %>% var() %>% diag() %>% sqrt(),
  estimated = diag(solve(fit_gcoda)) %>% sqrt()
)
tb_sd %>% 
  ggplot(aes(x = original, y = estimated)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color = "red")

dir.create("../results/gcoda_marginal/", 
           recursive = TRUE, showWarnings = FALSE)
R <- 10000
df_marginal_mix <- 
  CRTmicrobiome:::get_marginal_mix(data = t(mat_X_p_pseudocount),
                                   df_marginal = df_marginal,
                                   mean0 = log(0.5) - 
                                     mean(log(apply(mat_X_count, 2, sum))),
                                   var0 = var(log(apply(mat_X_count, 2, sum))))
sds_gcoda <- sqrt(diag(solve(fit_gcoda)))
sd0 <- df_marginal_mix$sigma0 / 
  df_marginal_mix$sigma_overall * 
  sds_gcoda
sd1 <- df_marginal_mix$sigma1 / 
  df_marginal_mix$sigma_overall * 
  sds_gcoda
samples_gcoda <- CRTmicrobiome:::rgcoda(n = R,
                                        mean0 = df_marginal_mix$mu0,
                                        mean1 = df_marginal_mix$mu1,
                                        sd0 = sd0, sd1 = sd1,
                                        pi = df_marginal_mix$pi,
                                        sigma = solve(fit_gcoda))
cov_samples <- cov(samples_gcoda)
p_original <- 
  CRTmicrobiome:::plot_cors(cov(t(log(mat_X_p_pseudocount))), 
                            solve(fit_gcoda),
                            labels = c("Data", "Fit"))
p_sample <- 
  CRTmicrobiome:::plot_cors(cov(t(log(samples_gcoda$abd))), 
                                      solve(fit_gcoda),
                            labels = c("Sample", "Fit"))
cowplot::plot_grid(p_original, p_sample, nrow = 1) %>% 
  ggsave("../results/gcoda_marginal/cov_comparison.pdf",
         .,
         width = 10, height = 5)

df_original <- mat_X_p_pseudocount %>%
  CRTmicrobiome:::longify_abd() %>% 
  dplyr::left_join(
    (mat_X_count != 0) %>% 
      CRTmicrobiome:::longify_abd(abundance = "is_pos"),
    by = c("feature", "sample")
  ) %>% 
  dplyr::mutate(data = "original")

dimnames(samples_gcoda$abd) <- 
  dimnames(samples_gcoda$ind) <- 
  list(rownames(mat_X_count),
       paste0("sample", seq_len(R)))
df_gcoda <- samples_gcoda$abd %>%
  CRTmicrobiome:::longify_abd() %>%
  dplyr::left_join(
    (samples_gcoda$ind == 1) %>% 
      CRTmicrobiome:::longify_abd(abundance = "is_pos"),
    by = c("feature", "sample")
  ) %>% 
  dplyr::mutate(data = "gcoda")
df_p_gcoda <- rbind(df_original,
                    df_gcoda) %>%
  dplyr::mutate(
    data = factor(data, levels = c("original", "gcoda")),
    log_abd = log10(abd)) %>%
  dplyr::group_by(feature) %>%
  dplyr::summarise(
    l_p = list(
      data.frame(log_abd = log_abd,
                 data = data,
                 is_pos = is_pos) %>%
        ggplot(aes(x = log_abd, fill = is_pos)) +
        geom_histogram(bins = 50) +
        facet_grid(data~., scales = "free_y") +
        scale_fill_manual(values = c("FALSE" = "grey", "TRUE" = "black")) +
        ggtitle(feature[1])
    ))
pdf("../results/gcoda_marginal/marginal_gcoda.pdf",
    width = 4, height = 8)
for(i in seq_len(nrow(df_p_gcoda)))
  print(df_p_gcoda$l_p[[i]])
dev.off()