library(phyloseq)
library(tidyverse)
library(genefilter) 
library(Rcpp)
library(rstan)
library(randomcoloR)
library(DESeq2)
library(diffTop)
theme_set(theme_minimal())
theme_update(
  text = element_text(size = 10),
  legend.text = element_text(size = 10)
)

Here, we are interested infer on the topics (bacterial communities) when we use original and modified PNA clams (two different experimental conditions). We use latent Dirichlet allocation to identify topics.

Data

In this workflow we use psE_BARBI phyloseq object in this package. In psE_BARBI, we removed DNA contaminants using BARBI.

data("psE_BARBI")
ps <- psE_BARBI

rm(psE_BARBI)

Preprocessing

We choose ASVs that have at least 25 counts in at least two specimens.

threshold <- kOverA(2, A = 25) 
ps <- phyloseq::filter_taxa(
  ps, 
  threshold, TRUE) 

ps


ps <- prune_taxa(taxa_sums(ps) > 0, ps)
ps

Edit specimen names

We edit specimen names and identify Asteraceae and non-Asteraceae plants.

sam_names <- str_replace(sample_names(ps), "E106", "E-106")
sam_names <- str_replace(sam_names, "_F_filt.fastq.gz", "")
sam_names <- str_replace(sam_names, "Connor-", "E")
sample_names(ps) <- sam_names
sample_data(ps)$X <- sam_names
sample_data(ps)$unique_names <- sam_names
aster <- c("142","143","15","ST","22","40")
non_aster <- c("33", "71", "106")

paired_aster <- c("E-142-1", "E142-1", "E-142-5", "E142-5", "E-142-10", "E142-10", "E-143-2", "E143-2", "E-143-7", "E143-7", "E-15-1", "E15-1", "ST-CAZ-4-R-O", "ST-CAZ-4-R-M", "ST-SAL-22-R-O", "ST-SAL-22-R-M", "ST-TRI-10-R-O", "ST-TRI-10-R-M")
paired_non_aster <- c("E33-7", "E-33-7", "E33-8", "E-33-8", "E33-9", "E-33-9", "E71-10", "E-71-10","E71-2","E-71-2" ,"E71-3", "E-71-3", "E106-1", "E-106-1", "E106-3", "E-106-3", "E106-4", "E-106-4")
paired_specimens <- c(paired_aster, paired_non_aster)

Choose colors for plots. We will use nine colors for nine different plants in the dataset.

plant_colors <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "purple")

Input data for LDA

Based on the ordination, we choose the number of topics as K=11. We set the hyperparameters $\alpha = 0.8$ and $\gamma = 0.8$.

K <- 11
alpha <- 0.8
gamma <- 0.8
stan.data <- setUpData(
  ps = ps,
  K = K,
  alpha = alpha,
  gamma = gamma
)

Posterior sampling

We need to estimate $\theta$ and $B = \left[\beta_{1}, \cdots, \beta_{K} \right]^{T}.$

We estimate the parameters using HMC NUTS with four chains and 2000 iterations. Out of these 2000 iterations, 1000 iterations were used as warm-up samples.

We use high-performance computing and save the results.

We specify a file name.

fileN <- paste0(
  "JABES_Endo_all_K_",
  K,
  "_ite_",
  iter,
  ".RData"
  )
iter <- 2000
chains <- 4

stan.fit <- LDAtopicmodel(
  stan_data = stan.data, 
  iter = iter, 
  chains = chains, 
  sample_file = NULL,
  diagnostic_file = NULL,
  cores = 4,
  control = list(adapt_delta = 0.9),
  save_dso = TRUE,
  algorithm = "NUTS"
  )
save(stan.fit, file = fileN)
load(file = fileN)

We extract posterior samples from the results.

samples <- rstan::extract(
  stan.fit, 
  permuted = TRUE, 
  inc_warmup = FALSE, 
  include = TRUE
  )

Alignment

We estimated the parameters using the HMC-NUTS sampler with four chains and 2000 iterations. Out of these 2000 iterations, 1000 iterations were used as warm-up samples and discarded. Label switching across chains makes it difficult to directly compute log predictive density, split-$\hat{R}$, effective sample size for model assessment, and evaluate convergence and mixing of chains.

To address this issue, we fixed the order of topics in chain one and then found the permutation that best aligned the topics across all four chains. For each chain two to four, we identified the estimated topics pair with the highest correlation, then found the next highest pair among the remaining, and so forth.

We create a Topic $*$ Chain matrix.

theta <- samples$theta 

aligned <- alignmentMatrix(
  theta, 
  ps, 
  K, 
  iterUse = iter/2,
  chain = chains,
  SampleID_name = "unique_names"
  )

Align topic proportion

We align the topic proportion in each specimen across four chains.

theta_aligned <- thetaAligned(
  theta, 
  K, 
  aligned, 
  iterUse = iter/2, 
  chain = chains
  )

Align ASV proportion

We align the ASVs proportion in each topic across four chains.

beta <- samples$beta 


beta_aligned <- betaAligned(
  beta, 
  K, 
  aligned, 
  iterUse = iter/2, 
  chain = chains
  ) 

Visualization

Plot topic proportion

Plot the topic proportion in specimens and we can save it for publication purposes. We can draw an informative summary of bacterial communities in each phenotype.

p <- plotTopicProportion(
  ps,
  theta_aligned,
  K,
  col_names_theta_all = c("iteration", "Sample", "Topic", "topic.dis"),
  chain = 4,
  iterUse = iter/2,
  SampleIDVarInphyloseq = "unique_names",
  design = ~ pna
)

ggsave(paste0("topic_dis_all_",K, ".png")), p, width = 30, height = 16)
knitr::include_graphics(paste0("topic_dis_all_",K,".png"))

Plot ASVs distribution

We plot the ASVs distribution in each topic.

p <- plotASVCircular(
  ps,
  K,
  iterUse = iter/2,
  chain = chains,
  beta_aligned,
  varnames = c("iterations", "topic", "rsv_ix"),
  value_name = "beta_h",
  taxonomylevel = "Class",
  thresholdASVprop = 0.008
  )

ggsave(paste0("circular_plot_plant_all_K_", K, ".png"), p, width = 28, height = 22)
knitr::include_graphics(paste0("circular_plot_plant_all_K_", K, ".png"))

Model assessement

We do goodness of fit test using the posterior estimates and observed data.

p_hist <- modelAssessment(
  ps,
  stan.fit,
  iterUse = iter/2,
  statistic = "max",
  ASVsIndexToPlot = c(1, 3, 10:14, 19:26, 36, 51:53, 148)
)

ggsave(
  paste0(
    "model_assesment_hist_all_obs_sim_K_", 
    K, 
    ".png"
    ), 
  p_hist, 
  width = 12, 
  height = 6
  )

```r. Each facet shows the histogram of G(Kij) of each ASV in specimens from the posterior predictive distribution and the vertical line shows the value of G (Kij ) of each ASV in observed data.", fig.align="center"} knitr::include_graphics(paste0("model_assesment_hist_all_obs_sim_K_", K, ".png"))

## Diagnostics 

### Effective sample size (ESS)

We compute effective sample size for each parameter.

```r
p <- diagnosticsPlot(
  theta_aligned = theta_aligned,
  beta_aligned = beta_aligned,
  iterUse = iter/2,
  chain = 4
)
p_ess_bulk <- p[[[1]]]

ggsave(
  paste0("ess_bulk_plant_all_K_", K, ".png"), 
  p_ess_bulk, 
  width = 9, 
  height = 6
  )
knitr::include_graphics(paste0("ess_bulk_plant_all_K_", K, ".png"))

$\hat{R}$

We can check whether $\hat{R}$ is less than 1.02.

p_rhat <- p[[2]]

ggsave(
  paste0("Rhat_plant_all_K_", K, ".png"), 
  p_rhat, 
  width = 9, 
  height = 6
  )
knitr::include_graphics(paste0("Rhat_plant_all_K_", K, ".png"))

Differential topic analysis

Test on all specimens

We do test on all specimens. We can write DESeq2 results to LaTex.

dds_all <- diffTopAnalysis(
  design = ~ pna,
  ps,
  theta_aligned,
  subsetSample = sample_names(ps),
  fitType = "mean"
)

saveRDS(
  dds_all, 
  file = paste0("dds_all_K_", K,".rds")
  )
data("dds_all_K_11")
res <- results(dds_all_K_11)
res 
#writeResTable(res, fileN = paste0("dds_all_K_", K,".tex"))

Test on paired-specimens

We do the test on paired-specimens.

dds_all_paired <- diffTopAnalysis(
  design = ~ pna,
  ps,
  theta_aligned,
  subsetSample = paired_specimens
)

saveRDS(
  dds_all_paired, 
  file = paste0("dds_all_paired_K_", K,".rds")
  )
data("dds_all_paired_K_11")
res <- results(dds_all_paired_K_11)
res 
#writeResTable(res, fileN = paste0("dds_all_paired_K_", K,".tex"))

Test on paired non-Aster specimens

We do the test on paired non-Aster specimens.

dds_all_paired_nonaster <- diffTopAnalysis(
  design = ~ pna,
  ps,
  theta_aligned,
  subsetSample = paired_non_aster
)


saveRDS(dds_all_paired_nonaster, 
        file = paste0("dds_all_paired_nonaster_K_", K,".rds")
        )
data("dds_all_paired_nonaster_K_11")
res <- results(dds_all_paired_nonaster_K_11)
res 
#writeResTable(res, fileN = paste0("dds_all_paired_nonaster_", K,".tex"))

Test on paired Aster specimens

We do the test on paired Aster specimens.

dds_all_paired_aster <- diffTopAnalysis(
  design = ~ pna,
  ps,
  theta_aligned,
  subsetSample = paired_aster
)


saveRDS(
  dds_all_paired_aster, 
  file = paste0("dds_all_paired_aster_K_", K,".rds")
  )
data("dds_all_paired_aster_K_11")
res <- results(dds_all_paired_aster_K_11)
res 
#writeResTable(res, fileN = paste0("dds_all_paired_aster_", K,".tex"))
rm(ps, stan.data, aster, iter, K, non_aster, paired_aster, paired_non_aster, paired_specimens, plant_colors, sam_names, res)


PratheepaJ/diffTop documentation built on Dec. 18, 2021, 7:48 a.m.