library(phyloseq) library(tidyverse) library(genefilter) library(Rcpp) library(rstan) library(randomcoloR) library(DESeq2) library(diffTop)
theme_set(theme_minimal()) theme_update( text = element_text(size = 10), legend.text = element_text(size = 10) )
Here, we are interested infer on the topics (bacterial communities) when we use original and modified PNA clams (two different experimental conditions). We use latent Dirichlet allocation to identify topics.
In this workflow we use psE_BARBI
phyloseq
object in this package. In psE_BARBI
, we removed DNA contaminants using BARBI.
data("psE_BARBI") ps <- psE_BARBI rm(psE_BARBI)
We choose ASVs that have at least 25 counts in at least two specimens.
threshold <- kOverA(2, A = 25) ps <- phyloseq::filter_taxa( ps, threshold, TRUE) ps ps <- prune_taxa(taxa_sums(ps) > 0, ps) ps
We edit specimen names and identify Asteraceae and non-Asteraceae plants.
sam_names <- str_replace(sample_names(ps), "E106", "E-106") sam_names <- str_replace(sam_names, "_F_filt.fastq.gz", "") sam_names <- str_replace(sam_names, "Connor-", "E") sample_names(ps) <- sam_names sample_data(ps)$X <- sam_names sample_data(ps)$unique_names <- sam_names aster <- c("142","143","15","ST","22","40") non_aster <- c("33", "71", "106") paired_aster <- c("E-142-1", "E142-1", "E-142-5", "E142-5", "E-142-10", "E142-10", "E-143-2", "E143-2", "E-143-7", "E143-7", "E-15-1", "E15-1", "ST-CAZ-4-R-O", "ST-CAZ-4-R-M", "ST-SAL-22-R-O", "ST-SAL-22-R-M", "ST-TRI-10-R-O", "ST-TRI-10-R-M") paired_non_aster <- c("E33-7", "E-33-7", "E33-8", "E-33-8", "E33-9", "E-33-9", "E71-10", "E-71-10","E71-2","E-71-2" ,"E71-3", "E-71-3", "E106-1", "E-106-1", "E106-3", "E-106-3", "E106-4", "E-106-4") paired_specimens <- c(paired_aster, paired_non_aster)
Choose colors for plots. We will use nine colors for nine different plants in the dataset.
plant_colors <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "purple")
Based on the ordination, we choose the number of topics as K=11. We set the hyperparameters $\alpha = 0.8$ and $\gamma = 0.8$.
K <- 11 alpha <- 0.8 gamma <- 0.8 stan.data <- setUpData( ps = ps, K = K, alpha = alpha, gamma = gamma )
We need to estimate $\theta$ and $B = \left[\beta_{1}, \cdots, \beta_{K} \right]^{T}.$
We estimate the parameters using HMC NUTS with four chains and 2000 iterations. Out of these 2000 iterations, 1000 iterations were used as warm-up samples.
We use high-performance computing and save the results.
We specify a file name.
fileN <- paste0( "JABES_Endo_all_K_", K, "_ite_", iter, ".RData" )
iter <- 2000 chains <- 4 stan.fit <- LDAtopicmodel( stan_data = stan.data, iter = iter, chains = chains, sample_file = NULL, diagnostic_file = NULL, cores = 4, control = list(adapt_delta = 0.9), save_dso = TRUE, algorithm = "NUTS" ) save(stan.fit, file = fileN)
load(file = fileN)
We extract posterior samples from the results.
samples <- rstan::extract( stan.fit, permuted = TRUE, inc_warmup = FALSE, include = TRUE )
We estimated the parameters using the HMC-NUTS sampler with four chains and 2000 iterations. Out of these 2000 iterations, 1000 iterations were used as warm-up samples and discarded. Label switching across chains makes it difficult to directly compute log predictive density, split-$\hat{R}$, effective sample size for model assessment, and evaluate convergence and mixing of chains.
To address this issue, we fixed the order of topics in chain one and then found the permutation that best aligned the topics across all four chains. For each chain two to four, we identified the estimated topics pair with the highest correlation, then found the next highest pair among the remaining, and so forth.
We create a Topic $*$ Chain matrix.
theta <- samples$theta aligned <- alignmentMatrix( theta, ps, K, iterUse = iter/2, chain = chains, SampleID_name = "unique_names" )
We align the topic proportion in each specimen across four chains.
theta_aligned <- thetaAligned( theta, K, aligned, iterUse = iter/2, chain = chains )
We align the ASVs proportion in each topic across four chains.
beta <- samples$beta beta_aligned <- betaAligned( beta, K, aligned, iterUse = iter/2, chain = chains )
Plot the topic proportion in specimens and we can save it for publication purposes. We can draw an informative summary of bacterial communities in each phenotype.
p <- plotTopicProportion( ps, theta_aligned, K, col_names_theta_all = c("iteration", "Sample", "Topic", "topic.dis"), chain = 4, iterUse = iter/2, SampleIDVarInphyloseq = "unique_names", design = ~ pna ) ggsave(paste0("topic_dis_all_",K, ".png")), p, width = 30, height = 16)
knitr::include_graphics(paste0("topic_dis_all_",K,".png"))
We plot the ASVs distribution in each topic.
p <- plotASVCircular( ps, K, iterUse = iter/2, chain = chains, beta_aligned, varnames = c("iterations", "topic", "rsv_ix"), value_name = "beta_h", taxonomylevel = "Class", thresholdASVprop = 0.008 ) ggsave(paste0("circular_plot_plant_all_K_", K, ".png"), p, width = 28, height = 22)
knitr::include_graphics(paste0("circular_plot_plant_all_K_", K, ".png"))
We do goodness of fit test using the posterior estimates and observed data.
p_hist <- modelAssessment( ps, stan.fit, iterUse = iter/2, statistic = "max", ASVsIndexToPlot = c(1, 3, 10:14, 19:26, 36, 51:53, 148) ) ggsave( paste0( "model_assesment_hist_all_obs_sim_K_", K, ".png" ), p_hist, width = 12, height = 6 )
```r. Each facet shows the histogram of G(Kij) of each ASV in specimens from the posterior predictive distribution and the vertical line shows the value of G (Kij ) of each ASV in observed data.", fig.align="center"} knitr::include_graphics(paste0("model_assesment_hist_all_obs_sim_K_", K, ".png"))
## Diagnostics ### Effective sample size (ESS) We compute effective sample size for each parameter. ```r p <- diagnosticsPlot( theta_aligned = theta_aligned, beta_aligned = beta_aligned, iterUse = iter/2, chain = 4 ) p_ess_bulk <- p[[[1]]] ggsave( paste0("ess_bulk_plant_all_K_", K, ".png"), p_ess_bulk, width = 9, height = 6 )
knitr::include_graphics(paste0("ess_bulk_plant_all_K_", K, ".png"))
We can check whether $\hat{R}$ is less than 1.02.
p_rhat <- p[[2]] ggsave( paste0("Rhat_plant_all_K_", K, ".png"), p_rhat, width = 9, height = 6 )
knitr::include_graphics(paste0("Rhat_plant_all_K_", K, ".png"))
We do test on all specimens. We can write DESeq2 results to LaTex.
dds_all <- diffTopAnalysis( design = ~ pna, ps, theta_aligned, subsetSample = sample_names(ps), fitType = "mean" ) saveRDS( dds_all, file = paste0("dds_all_K_", K,".rds") )
data("dds_all_K_11") res <- results(dds_all_K_11) res #writeResTable(res, fileN = paste0("dds_all_K_", K,".tex"))
We do the test on paired-specimens.
dds_all_paired <- diffTopAnalysis( design = ~ pna, ps, theta_aligned, subsetSample = paired_specimens ) saveRDS( dds_all_paired, file = paste0("dds_all_paired_K_", K,".rds") )
data("dds_all_paired_K_11") res <- results(dds_all_paired_K_11) res #writeResTable(res, fileN = paste0("dds_all_paired_K_", K,".tex"))
We do the test on paired non-Aster specimens.
dds_all_paired_nonaster <- diffTopAnalysis( design = ~ pna, ps, theta_aligned, subsetSample = paired_non_aster ) saveRDS(dds_all_paired_nonaster, file = paste0("dds_all_paired_nonaster_K_", K,".rds") )
data("dds_all_paired_nonaster_K_11") res <- results(dds_all_paired_nonaster_K_11) res #writeResTable(res, fileN = paste0("dds_all_paired_nonaster_", K,".tex"))
We do the test on paired Aster specimens.
dds_all_paired_aster <- diffTopAnalysis( design = ~ pna, ps, theta_aligned, subsetSample = paired_aster ) saveRDS( dds_all_paired_aster, file = paste0("dds_all_paired_aster_K_", K,".rds") )
data("dds_all_paired_aster_K_11") res <- results(dds_all_paired_aster_K_11) res #writeResTable(res, fileN = paste0("dds_all_paired_aster_", K,".tex"))
rm(ps, stan.data, aster, iter, K, non_aster, paired_aster, paired_non_aster, paired_specimens, plant_colors, sam_names, res)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.