This vignette explores the inherent structuring of the microbial communities from the WRS dataset, independent of any environmental patterning. In particular, the Bray-Curtis distance index is employed for dimensional reduction and clustering of the community data.

Prepare environment

# Import data
data("WRS_OTU_model")
opts_chunk$set(fig.width = 7, fig.height = 7)
# Save graphics parameters
op <- par(no.readonly = TRUE)

Metric and nonmetric multidimensional scaling

# perform NMDS on community data
WRS_NMDS <- lapply(WRS_OTU_model, metaMDS)
par(mfrow = c(2, 2), mar = c(2, 2, 2, 2))
lapply(seq_along(WRS_NMDS), FUN = function(i) {
  plot(WRS_NMDS[[i]], main = names(WRS_NMDS)[i], display = "sites")
  text(WRS_NMDS[[i]], cex = 1.5)
})
# Stress values?
WRS_NMDS_stress <- lapply(WRS_NMDS, "[", "stress")
# Archaea (bottom right) have an interesting continuous turnover; other taxa have varying degrees of "core" clustering and a projected spread.
lapply(WRS_NMDS, vegan::stressplot)
lapply(WRS_NMDS, goodness)

# Pretty good fits.

Metric (classic) MDS (i.e. PCoA)

WRS_bray <- lapply(WRS_OTU_model, vegdist)
WRS_PCoA <- lapply(WRS_bray, wcmdscale, eig = TRUE, x.ret = TRUE)

# Stressplots
par(mfrow = c(2, 2), mar = c(2, 2, 2, 2))
taxa_stressplot <- function(PCoA_list) {
  for(i in seq_along(PCoA_list)){
    # assumes all elements have same length as first element. I should abstract this later probably.
        vegan::stressplot(PCoA_list[[i]])
        title(main = names(PCoA_list)[i],
              xlab = "Bray-Curtis distance",
              ylab = "PCoA euclidean distance",
              line = -1)
  }
}
taxa_stressplot(WRS_PCoA)

Not amazing.

Let's examine the eigenvalues:

par(mfrow = c(2, 2), mar = c(2, 2, 2, 2))
taxa_screeplot <- function(PCoA_list) {
  for(i in seq_along(PCoA_list)){ # assumes all elements have same length as first element. I should abstract this later probably.
    # Create broken-stick model
    eig <- PCoA_list[[i]]$eig
    n <- length(eig)
    bsm <- data.frame(j = seq(1:n), p = 0)
    bsm$p[1] <- 1 / n
    for(x in 2 : n) {
      bsm$p[x] <- bsm$p[x - 1] + (1 / (n + 1 - x))
    }
    bsm$p <- 100 * bsm$p / n
    # Initialise plot
    barplot(t(cbind(100 * eig / sum(eig), bsm$p[n:1])),
            beside = TRUE, col = c("bisque", 2), las = 2)
    # Plot legend
    legend("topright", c("% eigenvalue", "Broken stick model"),
           pch = 15, col = c("bisque", 2), bty = "n", title = names(PCoA_list)[i])
  }
}
taxa_screeplot(WRS_PCoA)

Archaea and bacteria: first axis is obviously far superior. Fungi: first two, but lots of variability captured in the subsequent ~6 axes Eukarya: First two, then next two OK, but overall not much better than BSM.

PCoA ordination plots

par(mfrow = c(2, 2))
# Plot the results
taxa_PCoA_plot <- function(PCoA_list) {
  for(i in seq_along(PCoA_list)){
        plot <- vegan::ordiplot(PCoA_list[[i]], display = "sites",
                                type = "none")
        text(plot, what = "sites", cex = 1.5)
    abline(h = 0, lty = 3)
    abline(v = 0, lty = 3)
    mtext(names(PCoA_list)[i])
  } 
}

taxa_PCoA_plot(WRS_PCoA)

No obvious clustering. Significant arch effect happening in Eukarya. Otherwise not bad. Jolly interesting that samples 48 (river), 39 (landfill) and 38 (next in landfill transect) all cluster separately for Eukarya, and a little the same for Archaea. Surface-influenced community composition here, perhaps? Those taxa sensitive to these conditions?

Formal tests of clustering

# Investigate a range of clustering algorithms
WRS_clust_ward <- lapply(WRS_bray, hclust, method = "ward.D2")
WRS_clust_complete <- lapply(WRS_bray, hclust, method = "complete")
WRS_clust_average <- lapply(WRS_bray, hclust, method = "average")
WRS_clust_wa <- lapply(WRS_bray, hclust, method = "mcquitty")
WRS_clusters <- list("ward" = WRS_clust_ward, "complete" = WRS_clust_complete,
                      "average" = WRS_clust_average, "wa" = WRS_clust_wa)
WRS_cophenetic <- WRS_clusters
for(i in 1:length(WRS_clusters)) {
  WRS_cophenetic[[i]] <- lapply(WRS_clusters[[i]], cophenetic)
}
WRS_cophenetic <- unlist(WRS_cophenetic, recursive = FALSE)
# Create permuation table
cycles <- rep(1:4, 4)
cors <- vector(length = length(WRS_cophenetic))
names(cors) <- names(WRS_cophenetic)
for (i in 1:length(WRS_cophenetic)) {
  cors[i] <- cor(WRS_bray[[cycles[i]]], WRS_cophenetic[[i]])
}
# Average is the best

# Plot the Shepherd-type diagrams (write a function later)
# Fungi
plot(WRS_bray$fungi,
     WRS_cophenetic$average.fungi,
     xlab = "bray-curtis distance",
     ylab = "cophenetic distance",
     main = c("Average linkage (Fungi)",
              paste("cophenetic correlation ",
                    round(cor(WRS_bray$fungi, WRS_cophenetic$average.fungi), 3)
                    )
              )
     )
abline(0, 1)
lines(lowess(WRS_bray$fungi, WRS_cophenetic$average.fungi), col = "red")
# Eukarya
plot(WRS_bray$eukarya,
     WRS_cophenetic$average.eukarya,
     xlab = "bray-curtis distance",
     ylab = "cophenetic distance",
     main = c("Average linkage (Eukarya)",
              paste("cophenetic correlation ",
                    round(cor(WRS_bray$eukarya, WRS_cophenetic$average.eukarya), 3)
                    )
              )
     )
abline(0, 1)
lines(lowess(WRS_bray$eukarya, WRS_cophenetic$average.eukarya), col = "red")
# Bacteria
plot(WRS_bray$bacteria,
     WRS_cophenetic$average.bacteria,
     xlab = "bray-curtis distance",
     ylab = "cophenetic distance",
     main = c("Average linkage (Bacteria)",
              paste("cophenetic correlation ",
                    round(cor(WRS_bray$bacteria, WRS_cophenetic$average.bacteria), 3)
                    )
              )
     )
abline(0, 1)
lines(lowess(WRS_bray$bacteria, WRS_cophenetic$average.bacteria), col = "red")
# Archaea
plot(WRS_bray$archaea,
     WRS_cophenetic$average.archaea,
     xlab = "bray-curtis distance",
     ylab = "cophenetic distance",
     main = c("Average linkage (Archaea)",
              paste("cophenetic correlation ",
                    round(cor(WRS_bray$archaea, WRS_cophenetic$average.archaea), 3)
                    )
              )
     )
abline(0, 1)
lines(lowess(WRS_bray$archaea, WRS_cophenetic$average.archaea), col = "red")

# Examine the clusterings
lapply(seq_along(WRS_clust_average), FUN = function(i) {
  plot(WRS_clust_average[[i]], main = names(WRS_clust_average)[i])
})

# Do Mantel test of optimum number of clusters
# Create list of average-only clusterings
WRS_average <- WRS_cophenetic[grep("average", names(WRS_cophenetic))]
# Write a function for this later; for now, just copy-paste the code.
grpdist <- function(X) {
  gr <- as.data.frame(as.factor(X))
  distgr <- daisy(gr, "gower")
  return(distgr)
}
kt <- data.frame(k = 1:nrow(WRS_OTU_model[[1]]), r = 0)

# Fungi
for(i in 2:(nrow(WRS_OTU_model$fungi) - 1)) {
  gr <- cutree(WRS_clust_average$fungi, i)
  distgr <- grpdist(gr)
  mt <- cor(WRS_bray$fungi, distgr, method = "pearson")
  kt[i, 2] <- mt
}
k.best <- which.max(kt$r)
plot(kt$k, kt$r, type = "h", main = "Mantel-optimal number of fungi clusters", xlab = "k (number of groups)", ylab = "Pearson's correlation")
axis(1, k.best, paste("optimum", k.best, sep = "\n"), col = "red", font = 2, col.axis = "red")
points(k.best, max(kt$r), pch = 16, col = "red", cex = 1.5)
# Eukarya
for(i in 2:(nrow(WRS_OTU_model$eukarya) - 1)) {
  gr <- cutree(WRS_clust_average$eukarya, i)
  distgr <- grpdist(gr)
  mt <- cor(WRS_bray$eukarya, distgr, method = "pearson")
  kt[i, 2] <- mt
}
k.best <- which.max(kt$r)
plot(kt$k, kt$r, type = "h", main = "Mantel-optimal number of eukarya clusters", xlab = "k (number of groups)", ylab = "Pearson's correlation")
axis(1, k.best, paste("optimum", k.best, sep = "\n"), col = "red", font = 2, col.axis = "red")
points(k.best, max(kt$r), pch = 16, col = "red", cex = 1.5)
# Bacteria
for(i in 2:(nrow(WRS_OTU_model$bacteria) - 1)) {
  gr <- cutree(WRS_clust_average$bacteria, i)
  distgr <- grpdist(gr)
  mt <- cor(WRS_bray$bacteria, distgr, method = "pearson")
  kt[i, 2] <- mt
}
k.best <- which.max(kt$r)
plot(kt$k, kt$r, type = "h", main = "Mantel-optimal number of bacteria clusters", xlab = "k (number of groups)", ylab = "Pearson's correlation")
axis(1, k.best, paste("optimum", k.best, sep = "\n"), col = "red", font = 2, col.axis = "red")
points(k.best, max(kt$r), pch = 16, col = "red", cex = 1.5)
# Archaea
for(i in 2:(nrow(WRS_OTU_model$archaea) - 1)) {
  gr <- cutree(WRS_clust_average$archaea, i)
  distgr <- grpdist(gr)
  mt <- cor(WRS_bray$archaea, distgr, method = "pearson")
  kt[i, 2] <- mt
}
k.best <- which.max(kt$r)
plot(kt$k, kt$r, type = "h", main = "Mantel-optimal number of archaea clusters", xlab = "k (number of groups)", ylab = "Pearson's correlation")
axis(1, k.best, paste("optimum", k.best, sep = "\n"), col = "red", font = 2, col.axis = "red")
points(k.best, max(kt$r), pch = 16, col = "red", cex = 1.5)

Approximate optimal numbers are as follows: Fungi; 4 - 5, Eukarya, 3 - 4; Bacteria, >5 (without becoming too hard to interpret); Archaea, 5 - 7 Think I'll try 4 for Fungi and Eukarya, and 5 for Bacteria and Archaea.

Compare the ordinations and the clustering results

cutoff <- c(4, 4, 5, 5)
clust_cut <- sapply(seq_along(WRS_clust_average), function(i) {
  cutree(WRS_clust_average[[i]], k = cutoff[i])
})
names(clust_cut) <- names(WRS_clust_average)

# NMDS
lapply(seq_along(WRS_NMDS), FUN = function(i) {
  plot(WRS_NMDS[[i]], main = names(WRS_NMDS)[[i]])
  text(WRS_NMDS[[i]], cex = 1.5, labels = clust_cut[, i])
})

# PCoA
lapply(seq_along(WRS_PCoA), FUN = function(i) {
  plot(WRS_PCoA[[i]], type = "n", main = names(WRS_PCoA)[[i]])
  text(WRS_PCoA[[i]]$points[, 1:2], labels = clust_cut[, i], cex = 1.5)
})

Overall, these don't appear to well-match the structuring of the biotic communities. It could be that higher numbers of clusters would work better; let's have a look.

cutoff <- c(6, 6, 7, 7)
clust_cut <- sapply(seq_along(WRS_clust_average), function(i) {
  cutree(WRS_clust_average[[i]], k = cutoff[i])
})
names(clust_cut) <- names(WRS_clust_average)

# NMDS
lapply(seq_along(WRS_NMDS), FUN = function(i) {
  plot(WRS_NMDS[[i]], main = names(WRS_NMDS)[[i]], display = "sites")
  text(WRS_NMDS[[i]], cex = 1.5, labels = clust_cut[, i])
})

# PCoA
lapply(seq_along(WRS_PCoA), FUN = function(i) {
  plot(WRS_PCoA[[i]], type = "n", main = names(WRS_PCoA)[[i]])
  text(WRS_PCoA[[i]]$points[, 1:2], labels = clust_cut[, i], cex = 1.5)
})

Compared to the abiotic data, which showed a rather noticeable clustering into two major groups, generally along the lines of surface/deep, it looks like the biological communities are better characterised as a "central" grouping of similar communities (arranged over a gradient), with a small number of outliers. This is interesting.

# Compare the trees for each taxon
WRS_dend <- lapply(WRS_clust_average, as.dendrogram)
WRS_dend <- as.dendlist(WRS_dend)
names(WRS_dend) <- names(WRS_clust_average)
all.equal(WRS_dend)
dist.dendlist(WRS_dend)
cor.dendlist(WRS_dend)
# corrplot(cor.dendlist(WRS_dend), "pie", "lower") # Broken?
name_grid <- expand.grid(names(WRS_dend), names(WRS_dend), stringsAsFactors = FALSE)
WRS_BG <- vector(length = nrow(name_grid))
for(i in 1:length(WRS_BG)) {
  WRS_BG[i] <- cor_bakers_gamma(
    WRS_dend[c(unlist(name_grid[i, ]))][[1]],
    WRS_dend[c(unlist(name_grid[i, ]))][[2]]
    )
}
taxa <- c("fungi", "eukarya", "bacteria", "archaea")
WRS_BG <- matrix(WRS_BG, ncol = 4, nrow = 4, dimnames = list(taxa, taxa))
WRS_BG
WRS_cophen <- vector(length = nrow(name_grid))
for(i in 1:length(WRS_BG)) {
  WRS_cophen[i] <- cor_cophenetic(
    WRS_dend[c(unlist(name_grid[i, ]))][[1]],
    WRS_dend[c(unlist(name_grid[i, ]))][[2]]
  )
}
WRS_cophen <- matrix(WRS_cophen, ncol = 4, nrow = 4, dimnames = list(taxa, taxa))
WRS_cophen

# Similarities are very low
# Highest is between Eukarya and Fungi.

Those abiotic ordinations are interesting... there are a few sites which are intermediate (they even cluster separately for some classes), and might represent ecotones. I should check their hydrogeology (likely confluence of more than one hydrogeological zone) and test for increased biodiversity.

The larger approach is to get some measure of "habitat purity" from the environmental ordinations - i.e. is there a correlation between average (environmental i.e. Gower) distance to other sites and richness? Patterns on a per-taxon and per-data-class basis?

The clustering approach could use average cophenetic distance, perhaps, or something similar.



mixtrak/wellington.aquifer.ecol documentation built on Nov. 30, 2017, 4:25 a.m.