In PratheepaJ/diffTop: Differential topic analysis using latent Dirichlet allocation

knitr::opts_chunk$set(echo = TRUE, 
  fig.width = 7, 
  fig.height = 4, 
  fig.align = 'center',
  message = FALSE, 
  warning = FALSE)

library(phyloseq)
library(tidyverse)
library(genefilter) #KOverA
library(ggrepel) # geom_text_repel
library(randomcoloR)# distinctColorPalette(n)
library(adaptiveGPCA)
library(igraph)
library(phyloseqGraphTest)
devtools::load_all()

theme_set(theme_minimal())
theme_update(
  text = element_text(size = 10),
  legend.text = element_text(size = 10)
)

Data

data("psE_BARBI")
threshold <- kOverA(2, A = 25) 
psE_BARBI <- phyloseq::filter_taxa(
  psE_BARBI, 
  threshold, TRUE) 

psE_BARBI

ps <- psE_BARBI
rm(psE_BARBI)
ps <- prune_taxa(taxa_sums(ps) > 0, ps)
ps

Edit specimen names

We edit specimen names and identify Asteraceae and non-Asteraceae plants.

sam_names <- str_replace(sample_names(ps), "E106", "E-106")
sam_names <- str_replace(sam_names, "_F_filt.fastq.gz", "")
sam_names <- str_replace(sam_names, "Connor-", "E")
sample_names(ps) <- sam_names
sample_data(ps)$X <- sam_names
sample_data(ps)$unique_names <- sam_names
aster <- c("142","143","15","ST","22","40")
non_aster <- c("33", "71", "106")

paired_aster <- c("E-142-1", "E142-1", "E-142-5", "E142-5", "E-142-10", "E142-10", "E-143-2", "E143-2", "E-143-7", "E143-7", "E-15-1", "E15-1", "ST-CAZ-4-R-O", "ST-CAZ-4-R-M", "ST-SAL-22-R-O", "ST-SAL-22-R-M", "ST-TRI-10-R-O", "ST-TRI-10-R-M")
paired_non_aster <- c("E33-7", "E-33-7", "E33-8", "E-33-8", "E33-9", "E-33-9", "E71-10", "E-71-10","E71-2","E-71-2" ,"E71-3", "E-71-3", "E106-1", "E-106-1", "E106-3", "E-106-3", "E106-4", "E-106-4")
paired_specimens <- c(paired_aster, paired_non_aster)

# We will use 9 colors for 9 different plants
plant_colors <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "purple")

7.1 Ordination

Multidimensional scaling (MDS) with weighted unifrac

ps_log <- transform_sample_counts(
  ps, 
  function(x) log(1 + x)
  )

out.wuf.log <- ordinate(
  ps_log, 
  method = "MDS", 
  distance = "wunifrac"
  )

evals <- out.wuf.log$values$Eigenvalues

mds_wuni <- plot_ordination(
  ps_log, 
  out.wuf.log, 
  color = "species_names", 
  shape = "pna"
  ) +
  labs(
    col = "Plant type"
    ) +
  coord_fixed(
    sqrt(evals[2] / evals[1])
    ) + 
  scale_colour_manual(
    values = plant_colors
    ) + 
  facet_grid(~Type) +
  geom_point(size = 3)

# adding only paired-specimen names
df_paired <- mds_wuni$data %>% 
  dplyr::filter(
    unique_names %in% paired_specimens
    )

mds_wuni + 
  geom_text_repel(
    data = df_paired, 
    aes(
      x = Axis.1, 
      y = Axis.2, 
      label = unique_names
      ), 
    color = "black"
    )

We see that Centaurea solstitialis specimens are outlier that were sampled in a field across three countries, while other plants are grown from seeds. We label the paired specimens in all plants. The paired specimens are relatively close to each other, except (E-143-7, E143-7), (E-142-5, E142-5), (E-71-2, E71-2), and (E-71-3, E71-3) that are in positive and negative axes of the first two principal axes.

Fitzpatrick et al. (2018) concluded that there is no effect in within and between specimen bacterial diversity when they used universal or Aster-modified pPNA types.

The above Figure shows that specimens from the aster plants that are paired with universal or Aster-modified pPNA types make clusters.

MDS with BC

An MDS plot using Bray-Curtis distance between specimens.

out.bc.log <- ordinate(
  ps_log, 
  method = "MDS", 
  distance = "bray"
  )

evals <- out.bc.log$values$Eigenvalues

mds_bc <- plot_ordination(
  ps_log, 
  out.bc.log, 
  color = "species_names", 
  shape = "pna"
  ) +
  coord_fixed(
    sqrt(evals[2] / evals[1])
    ) +
  labs(
    col = "Plant type", 
    shape = "pna"
    ) + 
  scale_colour_manual(
    values = plant_colors
    ) + 
  facet_grid(~Type) +
  geom_point(size=3) 

df_paired <- mds_bc$data %>% 
  dplyr::filter(
    unique_names %in% paired_specimens
    )

mds_bc + 
  geom_text_repel(
    data = df_paired, 
    aes(x = Axis.1, 
        y = Axis.2, 
        label = unique_names
        ), 
    color = "black"
    )

The ordination of specimens in MDS with BC is similar to MDS with weighted unifrac.

DPCoA

A DPCoA plot incorporates phylogenetic information.

The DPCoA specimens positions can be interpreted with respect to the ASVs coordinates in this display.

data("ps_ans")
out.dpcoa.log <- ordinate(
  ps_ans, 
  method = "DPCoA"
  )

evals <- out.dpcoa.log$eig

dpcoa_sam <- plot_ordination(
  ps_ans,
  out.dpcoa.log,
  color = "species_names",
  shape = "pna"
  ) +
  coord_fixed(
    sqrt(evals[2] / evals[1])
    ) +
  labs(
    col = "Plant type", 
    shape = "pna"
    ) + 
  scale_colour_manual(
    values = plant_colors
    ) + 
  facet_grid(~Type) +
  geom_point(size = 3) 

df_paired <- dpcoa_sam$data %>% 
  dplyr::filter(
    unique_names %in% paired_specimens
    )

dpcoa_sam + 
  geom_text_repel(
    data = df_paired,
    aes(
      x = CS1, 
      y = CS2,
      label = unique_names
      ), 
    color = "black"
    )


dpcoa_taxa <- plot_ordination(
  ps_ans, 
  out.dpcoa.log,
  type = "species", 
  color = "Class"
  ) 


df_taxa <- dplyr::filter(
  dpcoa_taxa$data, 
  (CS1 > 0.63 | CS1 < -0.14 | CS2 > 0.29 | CS2 < -0.13) & (
    !is.na(Class)
    )
  )

dpcoa_taxa$data <- df_taxa

dpcoa_taxa +
  coord_fixed(
    sqrt(evals[2] / evals[1])
    ) +
  geom_point(size = 2) + 
  scale_color_manual(
    values = distinctColorPalette(
      length(
        unique(
          tax_table(ps_ans)[, "Class"]
          )
        )
      )
    )

7.2 Integrating the phylogenetic tree into the analyses

Adaptive gPCA

pp <- processPhyloseq(
  ps
  )
out.agpca <- adaptivegpca(
  pp$X, 
  pp$Q, 
  k = 2
  )

agPCA_samples <- ggplot(
  data.frame(
    out.agpca$U, 
    sample_data(ps)
    )
  ) +
    geom_point(
      aes(x = Axis1,
          y = Axis2, 
          color = species_names,
          shape = pna),
      size = 3
      ) +
  labs(
    x = "Axis 1", 
    y = "Axis 2",
    col = "Plant type"
    ) + 
  scale_colour_manual(
    values = plant_colors
    ) +
  facet_grid(~ Type)

agPCA_samples

agPCA_asv <- ggplot(
  data.frame(
    out.agpca$QV, 
    tax_table(ps)
    )
  ) +
    geom_point(
      aes(
        x = Axis1, 
        y = Axis2, 
        color = Class
        ), 
      size = 3
      ) +
    xlab(
      "Axis 1"
      ) + 
  ylab(
    "Axis 2"
    )

main_class <- c("Bacilli", 
                "Actinobacteria", 
                "Cytophagia", 
                "Gammaproteobacteria", 
                "Betaproteobacteria", 
                "Deltaproteobacteria", 
                "Alphaproteobacteria", 
                "Opitutae")

df_taxa <- dplyr::filter(
  agPCA_asv$data, 
  Class %in% main_class
  )

agPCA_asv$data <- df_taxa

agPCA_asv <- agPCA_asv +
  scale_color_manual(
    values = distinctColorPalette(
      length(
        unique(
          tax_table(ps_ans)[, "Class"])
        )
      )
    )
agPCA_asv

Centaurea solstitialis specimens are outliers among Asteraceae plants on the left of Axis 1. Axis 2 explains the microbial variability in plant types.

7.3 Correspondence analysis

ps_ca <- ps
out.ca <- ordinate(
  ps_ca, 
  method = "CCA"
  )

evals <- out.ca$CA$eig
evals_prop <- evals/sum(evals)

ca_plot <- plot_ordination(
  ps_ca,
  out.ca,
  type = "sample",
  color = "species_names",
  shape = "pna") +
  labs(
    col = "Plant type",
    shape = "pna"
    ) + 
  scale_color_manual(
    values = plant_colors
    ) + 
  facet_grid(~Type) +
  geom_point(size=2) +
  geom_jitter()+
  coord_fixed(0.4) + 
  theme_update(
    text = element_text(size = 10),
    legend.text = element_text(size = 10),
    panel.spacing = unit(2, "lines")
)


ca_plot

ca_plot_taxa <- plot_ordination(
  ps_ca,
  out.ca,
  type = "species",
  color = "Class"
  ) 

main_class <- c("Bacilli", 
                "Actinobacteria", 
                "Cytophagia", 
                "Gammaproteobacteria",
                "Betaproteobacteria",
                "Deltaproteobacteria",
                "Alphaproteobacteria", 
                "Opitutae", 
                "Sphingobacteriia")

df_taxa <- dplyr::filter(
  ca_plot_taxa$data, 
  Class %in% main_class
  )

df_taxa$Class <- factor(df_taxa$Class)

ca_plot_taxa$data <- df_taxa

ca_plot_taxa <- ca_plot_taxa +
  coord_fixed(0.4) +
  labs(col = "Class") + 
  geom_point(size=3) +
  geom_jitter() +
  scale_color_manual(
    values = distinctColorPalette(
      length(
        unique(
          main_class
          )
        )
      )
    )

ca_plot_taxa

7.4 Network analysis

Network analysis on paired specimens

ps <- subset_samples(
  ps, 
  unique_names %in% paired_specimens
  )
ps <- prune_taxa(
  taxa_sums(ps) > 0, 
  ps
  )
ps

net <- make_network(
  ps, 
  max.dist = 0.8
  )

sampledata <- sample_data(ps) %>%
  data.frame()

V(net)$unique_names <- sampledata[names(V(net)), "unique_names"]
V(net)$pna <- sampledata[names(V(net)), "pna"]
V(net)$species_names <- sampledata[names(V(net)), "species_names"]

p_net <- plot_network(
  net, 
  ps, 
  color = "species_names",
  shape = "pna"
  ) + 
  labs(
    col = "Species names"
    ) + 
  scale_colour_manual(
    values = plant_colors
    )
p_net

Graph-based two-sample tests

We first perform a test using an Minimum spanning tree (MST) with Jaccard dissimilarity. We want to know whether the universal or Asteraceae-modified pPNAs come from the same distribution in paired-specimens.

Since there is a grouping in the data by specimen, we can’t simply permute all the labels, we need to maintain this nested structure – this is what the grouping argument does. Here we permute the pna labels but keep the plant type structure intact.

set.seed(15)
gt <- graph_perm_test(
  ps, 
  sampletype = "pna", 
  grouping = "unique_names",
  distance = "jaccard", 
  type = "mst")
gt$pval

This test has a larger p-value, and we do not reject the null hypothesis that the paired-specimens come from the same distribution.

This result mirrors the observations from the PCA after the truncated-rank transformation, adaptive gPCA, MDS, and DPCoA that paired-specimens have similar microbial variability.

rm(df_paired, df_taxa, dpcoa_sam, dpcoa_taxa, mds_bc, mds_wuni, out.bc.log, out.dpcoa.log, out.wuf.log, ps, ps_log, agPCA_asv, agPCA_samples, out.agpca, pp, aster, evals, main_class, non_aster, paired_aster, paired_non_aster, paired_specimens, plant_colors, sam_names, ca_plot, out.ca, ps_ca, evals_prop, ca_plot_taxa, gt, net, p_net, sampledata)

PratheepaJ/diffTop documentation built on Dec. 18, 2021, 7:48 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

PratheepaJ/diffTop
Differential topic analysis using latent Dirichlet allocation

In PratheepaJ/diffTop: Differential topic analysis using latent Dirichlet allocation

Data

Edit specimen names

7.1 Ordination

Multidimensional scaling (MDS) with weighted unifrac

MDS with BC

DPCoA

7.2 Integrating the phylogenetic tree into the analyses

Adaptive gPCA

7.3 Correspondence analysis

7.4 Network analysis

Network analysis on paired specimens

Graph-based two-sample tests

R Package Documentation

Browse R Packages

We want your feedback!

PratheepaJ/diffTop Differential topic analysis using latent Dirichlet allocation

In PratheepaJ/diffTop: Differential topic analysis using latent Dirichlet allocation

Data

Edit specimen names

7.1 Ordination

Multidimensional scaling (MDS) with weighted unifrac

MDS with BC

DPCoA

7.2 Integrating the phylogenetic tree into the analyses

Adaptive gPCA

7.3 Correspondence analysis

7.4 Network analysis

Network analysis on paired specimens

Graph-based two-sample tests

R Package Documentation

Browse R Packages

We want your feedback!

PratheepaJ/diffTop
Differential topic analysis using latent Dirichlet allocation