JMDplots: Plots from Papers by Jeffrey M. Dick

Documented in geo16S1 geo16S2 geo16S3

# JMDplots/geo16S.R
# Make plots for the paper:
# Chemical links between redox conditions and estimated community proteomes from 16S rRNA and reference protein sequences
# 20210416 Initial commit to JMDplots
# 20210527 Updated plots for RefSeq release 206

# Figure 1: Distinct chemical parameters of reference proteomes for taxonomic groups 20200925
geo16S1 <- function(pdf = FALSE) {

  if(pdf) pdf("geo16S1.pdf", width = 11, height = 5)
  par(mfrow = c(1, 3))

  # Read chemical metrics of all taxa
  datadir <- system.file("RefDB/RefSeq_206", package = "JMDplots")
  metrics <- read.csv(file.path(datadir, "taxon_metrics.csv.xz"), as.is = TRUE)

  ## Panel A: Major phyla
  # Get names of phyla with more than 500 representatives
  phyla <- metrics[metrics$rank == "phylum", ]
  phyla <- phyla[phyla$ntaxa > 500, ]
  phyla <- phyla[order(phyla$ntaxa, decreasing = TRUE), ]
  # Move viruses to end 20200926
  ivirus <- phyla$parent == "Viruses"
  phyla <- rbind(phyla[!ivirus, ], phyla[ivirus, ])
  taxa <- phyla$group
  xlim <- c(-0.25, -0.05)
  ylim <- c(-0.9, -0.65)
  pch <- ifelse(phyla$parent == "Bacteria", 21, ifelse(phyla$parent == "Archaea", 24, 23))
  # Set colors for points
  col <- seq_along(taxa)
  # Use semi-transparent colors for lines 20210518
  lcol <- palette()
  lcol[1] <- "#000000"  # black
  lcol[8] <- "#9e9e9e"  # gray62
  lcol <- paste0(lcol, "80")
  lcol <- rep(lcol, length.out = length(col))

  # Make the plot
  ps1 <- plot_starburst(taxa, metrics = c("Zc", "nH2O"), refdb = "RefSeq_206", xlim = xlim, ylim = ylim,
    pch = pch, lcol = lcol, hline = c(-0.81, -0.68), terminal_H2O = 1)
  # Label Halobacteria and Nanohaloarchaea 20200930
  iEury <- match("Euryarchaeota", names(ps1))
  children <- ps1[[iEury]]$children
  ihalo <- match(c("Methanococci", "Archaeoglobi", "Thermococci", "Halobacteria"), children$taxon)
  dy <- 0.005
  dx <- c(0, 0, 0, 0.002)
  text(children$Xvals[ihalo] + dx, children$Yvals[ihalo] + dy, c(1, 2, 3, 4))
  # Add legend
  len <- length(taxa)
  legend <- c("Cellular", taxa[1:6], "Viruses", taxa[7:11])
  pch <- c(NA, pch[1:6], NA, pch[7:11])
  col <- c(NA, col[1:6], NA, col[7:11])
  legend("bottomleft", legend, text.font = c(2, 1,1,1,1,1,1, 2, 1,1,1,1,1), pch = pch, col = col, pt.bg = col, cex = 0.9, bg = "white")
  title("Major phyla and their classes", font.main = 1, cex.main = 1.4)
  label.figure("A", font = 2, cex = 1.6)
  # Draw lines indicating zoom area in next plot
  par(xpd = NA)
  lines(c(-0.05, -0.015), c(-0.81, -0.9), lty = 2, col = "gray40")
  lines(c(-0.05, -0.015), c(-0.68, -0.65), lty = 2, col = "gray40")
  par(xpd = FALSE)

  ## Panel B: Major cellular phyla
  # This is like Major phyla but excludes Viruses
  phyla <- metrics[metrics$rank == "phylum" & metrics$parent != "Viruses", ]
  phyla <- phyla[phyla$ntaxa > 60, ]
  phyla <- phyla[order(phyla$ntaxa, decreasing = TRUE), ]
  # Swap Chloroflexi and Crenarchaeota so latter doesn't have same color as Euryarchaeota 20210527
  phyla[14:15, ] <- phyla[15:14, ]
  taxa <- phyla$group
  xlim <- c(-0.25, -0.05)
  ylim <- c(-0.81, -0.68)
  pch <- rep(22, length(taxa))
  pch[1:8] <- 21
  pch[phyla$parent == "Archaea"] <- 24
  # Set colors for points
  col <- seq_along(taxa)
  # Use semi-transparent colors for lines 20210518
  lcol <- palette()
  lcol[1] <- "#000000"  # black
  lcol[8] <- "#9e9e9e"  # gray62
  lcol <- paste0(lcol, "80")
  lcol <- rep(lcol, length.out = length(col))

  # Make the plot
  ps2 <- plot_starburst(taxa, metrics = c("Zc", "nH2O"), refdb = "RefSeq_206", xlim = xlim, ylim = ylim,
    pch = pch, lcol = lcol, hline = c(-0.77, -0.71), terminal_H2O = 1)
  # Label Halobacteria and Nanohaloarchaea 20200930
  iEury <- match("Euryarchaeota", names(ps1))
  children <- ps1[[iEury]]$children
  ihalo <- match(c("Methanococci", "Archaeoglobi", "Thermococci", "Halobacteria"), children$taxon)
  dy <- 0.0025
  dx <- c(0, 0, 0, 0.002)
  text(children$Xvals[ihalo] + dx, children$Yvals[ihalo] + dy, c(1, 2, 3, 4))
  # Label Clostridia 20200930
  iFirm <- match("Firmicutes", names(ps1))
  children <- ps1[[iFirm]]$children
  iclos <- match("Clostridia", children$taxon)
  text(children$Xvals[iclos], children$Yvals[iclos] + 0.0025, 5)
  # Add legend
  len <- length(taxa)
  legend("bottomleft", taxa[1:8], pch = pch[1:8], col = col[1:8], pt.bg = col[1:8], cex = 0.9, bg = "white")
  legend("bottomright", taxa[9:len], pch = pch[9:len], col = col[9:len], pt.bg = col[9:len], cex = 0.9, bg = "white")
  title("Major cellular phyla and their classes", font.main = 1, cex.main = 1.4)
  label.figure("B", font = 2, cex = 1.6)
  par(xpd = NA)
  lines(c(-0.05, -0.015), c(-0.77, -0.81), lty = 2, col = "gray40")
  lines(c(-0.05, -0.015), c(-0.71, -0.68), lty = 2, col = "gray40")
  par(xpd = FALSE)

  ## Panel C: Proteobacteria 20200925
  # How to count the representatives in each proteobacterial class:
  #> taxa <- read.csv(system.file("RefDB/RefSeq_206/taxonomy.csv.xz", package = "JMDplots"), as.is = TRUE)
  #> sort(table(na.omit(taxa$class[taxa$phylum == "Proteobacteria"])), decreasing = TRUE)
  #
  #  Gammaproteobacteria   Alphaproteobacteria    Betaproteobacteria 
  #                 8269                  5667                  2456 
  #Epsilonproteobacteria   Deltaproteobacteria           Oligoflexia 
  #                  451                   441                    32 
  #    Acidithiobacillia     Hydrogenophilalia    Zetaproteobacteria 
  #                   20                    11                    11 
  taxa <- c("Alphaproteobacteria", "Betaproteobacteria", "Gammaproteobacteria", "Deltaproteobacteria", "Epsilonproteobacteria", "Zetaproteobacteria",
            "Acidithiobacillia", "Hydrogenophilalia", "Oligoflexia")
  pch <- rep(21:23, 3)
  xlim <- c(-0.25, -0.05)
  ylim <- c(-0.77, -0.71)
  # Set colors for points
  col <- seq_along(taxa)
  # Use semi-transparent colors for lines 20210518
  lcol <- palette()
  lcol[1] <- "#000000"  # black
  lcol[8] <- "#9e9e9e"  # gray62
  lcol <- paste0(lcol, "80")
  lcol <- rep(lcol, length.out = length(col))

  # Make the plot
  ps3 <- plot_starburst(taxa, metrics = c("Zc", "nH2O"), refdb = "RefSeq_206", xlim = xlim, ylim = ylim,
    pch = pch, lcol = lcol, terminal_H2O = 1)
  # Add legend
  len <- length(taxa)
  legend("topright", taxa[1:6], pch = pch[1:6], col = col[1:6], pt.bg = col[1:6], cex = 0.9, bg = "white")
  legend("bottomleft", taxa[7:len], pch = pch[7:len], col = col[7:len], pt.bg = col[7:len], cex = 0.9, bg = "white")
  title("Proteobacterial classes and their orders", font.main = 1, cex.main = 1.4)
  label.figure("C", font = 2, cex = 1.6)

  if(pdf) dev.off()

  # Return data for Supplementary Table 20210831
  datA <- do.call(rbind, lapply(ps1, function(x) {do.call(rbind, x)}))
  datA <- cbind(plot = "A", datA)
  datB <- do.call(rbind, lapply(ps2, function(x) {do.call(rbind, x)}))
  datB <- cbind(plot = "B", datB)
  datC <- do.call(rbind, lapply(ps3, function(x) {do.call(rbind, x)}))
  datC <- cbind(plot = "C", datC)
  out <- rbind(datA, datB, datC)
  rownames(out) <- NULL
  invisible(out)

}

# Figure 2: Natural environment datasets 20200923
geo16S2 <- function(pdf = FALSE) {

  if(pdf) pdf("geo16S2.pdf", width = 9, height = 7)
  oopar <- par(no.readonly = TRUE)
  par(mar = c(4, 4, 3, 2))
  par(mgp = c(2.5, 1, 0))
  layout(matrix(c(1,2,3, 4,9,5, 6,7,8), nrow = 3, byrow = TRUE))

  # Function to add points
  pointfun <- function(pcomp) {
    ifill <- pcomp$pch > 20
    points(pcomp$Zc[ifill], pcomp$nH2O[ifill], pch = pcomp$pch[ifill], col = 1, bg = pcomp$col[ifill])
    points(pcomp$Zc[!ifill], pcomp$nH2O[!ifill], pch = pcomp$pch[!ifill], col = pcomp$col[!ifill])
  }

  p1 <- plotmet_geo16S("BGPF13", title = FALSE, points = FALSE)
#  title("Yellowstone hot springs\nBowen De Le\u00f3n et al., 2013", font.main = 1)
  title("Yellowstone hot springs", font.main = 1)
  add_hull(p1$Zc, p1$nH2O, border= 2)
  pointfun(p1)
  legend <- c("Archaea", "Bacteria")
  legend("bottomleft", legend, pch = c(23, 22), col = c(1, 1), pt.bg = c(6, 5), bg = "white")

  p2 <- plotmet_geo16S("SVH+19", title = FALSE, points = FALSE)
#  title("Black Sea\nSollai et al., 2019", font.main = 1)
  title("Black Sea", font.main = 1)
  add_hull(p2$Zc, p2$nH2O, border = "blue", lty = 2)
  pointfun(p2)
  legend <- c("Oxic", "Suboxic", "Euxinic")
  legend("bottomright", legend, pch = c(24, 20, 25), pt.bg = c(4, 1, 2), bg = "white")

  p3 <- plotmet_geo16S("HLA+16", title = FALSE, points = FALSE)
#  title("Baltic Sea\nHerlemann et al., 2016", font.main = 1)
  title("Baltic Sea", font.main = 1)
  add_hull(p3$Zc, p3$nH2O, border = "blue")
  pointfun(p3)
  legend <- c("< 6", "6-20", "> 20")
  legend("bottomright", legend, pch = c(24, 20, 21), col = c(1, 1, 1), pt.bg = c(3, NA, 4), bg = "white", title = "Salinity")

  p4 <- plotmet_geo16S("MPB+17", title = FALSE, points = FALSE)
#  title("Manus Basin submarine vents\nMeier et al., 2017", font.main = 1)
  title("Manus Basin submarine vents", font.main = 1)
  add_hull(p4$Zc, p4$nH2O, border = 2, lty = 2)
  pointfun(p4)
  legend <- as.expression(c(quote(italic(T)~"< 50 \u00B0C"), quote(italic(T)~"> 50 \u00B0C")))
  legend("bottomleft", legend, pch = c(21, 23), col = c(1, 1), pt.bg = c(4, 2), bg = "white", title = "Water")
  legend("bottomright", c("Rock", "Fauna"), pch = c(20, 8), col = c(1, "#757500C0"), bg = "white")

  p5 <- plotmet_geo16S("ZLM+16", title = FALSE, points = FALSE)
#  title("Tibetan Plateau lakes\nZhong et al., 2016", font.main = 1)
  title("Tibetan Plateau lakes", font.main = 1)
  add_hull(p5$Zc, p5$nH2O, border = "turquoise3", lty = 2)
  pointfun(p5)
  legend <- c("< 10 g/L", "24-99 g/L", "> 300 g/L")
  legend("topright", legend, pch = c(24, 20, 21), col = c(1, 1, 1), pt.bg = c(3, NA, 4), bg = "white", title = "Salinity")

  p6 <- plotmet_geo16S("JHM+16", title = FALSE, points = FALSE)
#  title("Lake Fryxell oxygen gradient\nJungblut et al., 2016", font.main = 1)
  title("Lake Fryxell oxygen gradient", font.main = 1)
  add_hull(p6$Zc, p6$nH2O, border = "tan1")
  pointfun(p6)
  legend <- c("Oxic", "Transition", "Anoxic")
  legend("bottomright", legend, pch = c(24, 20, 25), pt.bg = c(4, 1, 2), bg = "white")

  p7 <- plotmet_geo16S("HCW+13", title = FALSE, points = FALSE, ylim = c(-0.7685, -0.7585))
#  title("Guerrero Negro mat layers\nHarris et al., 2013", font.main = 1)
  title("Guerrero Negro mat layers", font.main = 1)
  add_hull(p7$Zc, p7$nH2O, border = "tan1", lty = 2)
  pointfun(p7)
  text(c(-0.1512, -0.1572, -0.1576), c(-0.7587, -0.7645, -0.7684), c("0-1 mm", "1-2 mm", "2-3 mm"))
  legend <- c("Photic/oxic", "Low sulfide", "High sulfide")
  legend("topleft", legend, pch = c(24, 20, 25), pt.bg = c(4, 1, 2), bg = "white")

  p8 <- plotmet_geo16S("XDZ+17", title = FALSE, points = FALSE)
#  title("Qarhan Salt Lake and\nnormal soils, Xie et al., 2017", font.main = 1)
  title("Qarhan Salt Lake\nand normal soils", font.main = 1)
  add_hull(p8$Zc, p8$nH2O, border = "turquoise3")
  pointfun(p8)
  legend <- c("Normal", "Saline")
  legend("topright", legend, pch = c(24, 21), pt.bg = c(3, 4), bg = "white")

  # Make an index plot
  opar <- par(mar = c(2.5, 2.5, 0.5, 0.5))
  xlim <- c(-0.22, -0.09)
  ylim <- c(-0.77, -0.71)
  plot(xlim, ylim, xlab = "", ylab = "", type = "n")
  lmlines()
  # Add convex hulls for each dataset in this figure
  add_hull(p1$Zc, p1$nH2O, border = 2)
  add_hull(p2$Zc, p2$nH2O, border = "blue", lty = 2)
  add_hull(p3$Zc, p3$nH2O, border = "blue")
  add_hull(p4$Zc, p4$nH2O, border = 2, lty = 2)
  add_hull(p5$Zc, p5$nH2O, border = "turquoise3", lty = 2)
  add_hull(p6$Zc, p6$nH2O, border = "tan1")
  add_hull(p7$Zc, p7$nH2O, border = "tan1", lty = 2)
  add_hull(p8$Zc, p8$nH2O, border = "turquoise3")
  par(opar)

  # Add environment type labels 20210427
  par(xpd = NA)
  text(-0.397, -0.703, "Hydrothermal", cex = 1.5, font = 2, srt = 90)
  text(-0.24, -0.857, "Microbial Mats", cex = 1.5, font = 2)
  text(-0.075, -0.635, "Seawater", cex = 1.5, font = 2)
  text(0.074, -0.785, "Hypersaline", cex = 1.5, font = 2, srt = 90)
  par(xpd = FALSE)

  par(oopar)
  if(pdf) dev.off()

  # Return data for Supplementary Table 20210831
  out <- rbind(p1, p2, p3, p4, p5, p6, p7, p8)
  out$Zc <- round(out$Zc, 6)
  out$nH2O <- round(out$nH2O, 6)
  invisible(out)

}

# Figure 3: Stratified lakes and seawater 20210428
geo16S3 <- function(pdf = FALSE) {

  if(pdf) pdf("geo16S3.pdf", width = 7, height = 9)
  mat <- matrix(c(1,1,1,1, 2,3,4,5, 6,7,8,9, 10,11,12,13), byrow = TRUE, nrow = 4)
  layout(mat, heights = c(1, 10, 10, 10))
  # Make legend
  par(mar = c(0, 0, 0, 0))
  plot.new()
  legend <- as.expression(c(quote(italic(Z)[C]), quote(O[2])))
  legend("top", legend = legend, lty = c(1, 1), lwd = 1.5, col = c(1, 2), pch = c(21, NA), pt.bg = "white", ncol = 3, bty = "n")
  # Setup plot metrics
  par(mgp = c(1.8, 0.5, 0), mar = c(3, 3, 3, 1))

  # Identify datasets to plot
  # "__BSMG__" is flag for Black Sea metagenome 20220119
  study <- c("SVH+19", "MZG+20", "MZG+20", "GBL+15", "GBL+15",
             "__BSMG__", NA, "HXZ+20", "HXZ+20",
             "BCA+21", "BCA+21", "BCA+21", "BCA+21")
  column <- c("study", "lake", "lake", "size", "size",
              "", NA, "Station", "Station",
              "Month", "Month", "Month", "Month")
  ID <- c("SVH+19", "Lake Zug", "Lake Lugano", "0.2-1.6micron", "1.6-30micron",
          "", NA, "SYBL", "C4",
          "Jul", "Nov", "Feb", "Apr")
  # \n are used for vertical offset from plot bottom
  title <- c("Black Sea\n", "Lake Zug\n", "Lake\nLugano\n", "ETNP\n", "ETNP\n",
             "Black Sea\n\n", NA, "Blue Hole\n\n", "Blue Hole\n\n",
             "Ursu Lake\n\n", "Ursu Lake\n\n", "Ursu Lake\n\n", "Ursu Lake\n\n")
  subtitle <- c("", "", "", "", "",
                "Metagenome\n", NA, "Inside\n", "Outside (C4)\n",
                "July 2015\n", "November 2015\n", "February 2016\n", "April 2016\n")
  titlesub <- paste(title, subtitle)

  # Make object to hold data for Supplementary Table
  out <- list()
  # Loop over studies
  for(i in 1:length(study)) {
    if(is.na(study[i])) {
      # Don't make a plot, but add text 20220119
      plot.new()
      par(xpd = NA)
      arrows(0, 0.75, -0.4, 0.75, length = 0.1)
      text(-0.4, 0.5, paste(
        "This plot is for protein sequences",
        "inferred from shotgun metagenomes.",
        "",
        "All other plots are for estimated",
        "community proteomes from 16S rRNA",
        "and reference protein sequences.",
        sep = "\n"), cex = 0.9, adj = 0)
      par(xpd = FALSE)
      next
    }
    # Zc range for plots
    if(study[i] == "BCA+21") Zclim <- c(-0.180, -0.145) else Zclim <- c(-0.170, -0.140)
    if(study[i] == "__BSMG__") {
      ## Get data for Black Sea metagenome
      ARASTdir <- system.file("extdata/geo16S/ARAST", package = "JMDplots")
      aa <- read.csv(file.path(ARASTdir, "Black_Sea_AA.csv"))
      Zc <- Zc(aa)
      nH2O <- nH2O(aa)
      depth = c(50, 70, 80, 85, 90, 95, 100, 105,
        110, 130, 170, 250, 500, 1000, 2000)
      Metagenome = c("SRR12347146", "SRR12347145", "SRR12347139", "SRR12347138", "SRR12347137", "SRR12347136", "SRR12347135", "SRR12347134",
        "SRR12347133", "SRR12347132", "SRR12347144", "SRR12347143", "SRR12347142", "SRR12347141", "SRR12347140")
      ## Get metadata for O2 concentration
      alldat <- getmdat_geo16S("SVH+19")
      # Check that the samples are in the right order
      stopifnot(all(aa$protein == Metagenome))
      stopifnot(all(alldat$depth == depth))
      # Put in correct data for the metagenome
      alldat$study <- "VMW+21"
      alldat$name <- "Black Sea metagenome"
      alldat$Run <- Metagenome
    } else {
      ## Get data for 16S studies
      # Get the metadata and chemical metrics for this study
      # Keep all rows for higher-resolution O2 measurements
      mdat <- getmdat_geo16S(study[i], dropNA = FALSE)
      metrics <- getmetrics_geo16S(study[i])
      # Get the rows matching the ID
      iID <- mdat[, column[i]] == ID[i]
      mdat <- mdat[iID, ]
      # Sort the data by depth
      alldat <- mdat <- mdat[order(mdat$depth), ]
      # Now exclude NA samples
      mdat <- mdat[!is.na(mdat$name), ]
      depth <- mdat$depth
      # Get the Zc and nH2O values
      imet <- match(mdat$Run, metrics$Run)
      Zc <- metrics$Zc[imet]
      nH2O <- metrics$nH2O[imet]
    }

    # Reverse y-axis (depth)
    ylim <- rev(range(depth))
    # Visualize deeper O2 concentrations in Ursu Lake
    if(study[i] == "BCA+21") ylim <- c(11, 0)
    if(study[i] %in% c("SVH+19", "__BSMG__")) {
      # Plot 1000 and 2000 m samples closer to the others 20210608
      ylim <- c(700, 50)
      depth[match(c(1000, 2000), depth)] <- c(600, 700)
    }
    # Add space at bottom of ETNP for spearman correlations 20220114
    if(study[i] == "GBL+15") ylim <- c(320, 30)
    # Determine whether the title has changed
    newplot <- TRUE
    if(i > 1) if(titlesub[i]==titlesub[i-1]) newplot <- FALSE
    if(newplot) {
      if(study[i] %in% c("SVH+19", "__BSMG__")) {
        plot(Zc, depth, xlim = Zclim, ylim = ylim, xlab = axis.label("Zc"), ylab = "Depth (m)", type = "b", yaxt = "n")
        axis(2, at = seq(100, 700, 100), labels = c(100, 200, 300, 400, 500, 1000, 2000), gap.axis = 0)
        # Plot y-axis break 20210715
        par(xpd = NA)
        rect(-0.172, 557, -0.168, 542, col = "white", border = NA)
        text(-0.1712, 542, "/", srt = 90)
        text(-0.1712, 556, "/", srt = 90)
        par(xpd = FALSE)
      } else plot(Zc, depth, xlim = Zclim, ylim = ylim, xlab = axis.label("Zc"), ylab = "Depth (m)", type = "b")
    } else {
      # Add to plot if the title hasn't changed
      points(Zc, depth, type = "b", pch = 0)
    }
    # Save Zc and depth for Spearman correlation 20220114
    Zcdepth <- list(Zc = Zc, depth = depth)

    if(newplot) {
      # Add title in lower right
      if(grepl("Outside", subtitle[i])) {
        text(Zclim[1], ylim[1], title[i], adj = c(0, 0), font = 2)
        text(Zclim[1], ylim[1], subtitle[i], adj = c(0, 0))
      } else {
        text(Zclim[2], ylim[1], title[i], adj = c(1, 0), font = 2)
        text(Zclim[2], ylim[1], subtitle[i], adj = c(1, 0))
      }
      # Plot O2 concentrations
      nc <- nchar(title[i])
      what <- "O2"
      if(study[i] == "BCA+21") xlim <- c(0, 25) else xlim <- c(0, 220)
      col <- 2
      lty <- 1
      icol <- grep("^O2", colnames(alldat))
      # Remove NA values
      alldat <- alldat[!is.na(alldat[, icol]), ]
      depth <- alldat$depth
      par(new = TRUE)
      plot(alldat[, icol], depth, col = col, lty = lty, type = "l", axes = FALSE, xlab = "", ylab = "", xlim = xlim, ylim = ylim)
      # Add second axis labels
      xlab <- colnames(alldat)[icol]
      if(xlab == "O2 (umol kg-1)") xlab <- quote(O[2]~"(\u00B5mol kg"^-1*")")
      if(xlab == "O2 (umol L-1)") xlab <- quote(O[2]~"(\u00B5mol L"^-1*")")
      if(xlab == "O2 (mg L-1)") xlab <- quote(O[2]~"(mg L"^-1*")")
      axis(3)
      mtext(xlab, side = 3, line = 1.7, cex = par("cex"))
      # Extra labels for ETNP
      if(title[i]=="ETNP\n") {
        text(44, 76, "0.2-\n1.6 \u00B5m")
        text(135, 138, "1.6-\n30 \u00B5m")
      }
      # Restore xlim for plotting Zc
      par(new = TRUE)
      plot(0, 0, type = "n", axes = FALSE, xlab = "", ylab = "", xlim = Zclim, ylim = ylim)
    }
    # Save O2 and depth for Spearman correlation 20220114
    O2depth <- list(O2 = alldat[, icol], depth = depth)

    # Add Spearman correlation 20220114
    depths <- intersect(Zcdepth$depth, O2depth$depth)
    iZc <- match(depths, Zcdepth$depth)
    iO2 <- match(depths, O2depth$depth)
    # Calculate Spearman rank correlation; use = "complete.obs" to drop pairs with NA Zc
    spearman <- cor(Zcdepth$Zc[iZc], O2depth$O2[iO2], use = "complete.obs", method = "spearman")
    rtxt <- bquote(rho == .(formatC(spearman, digits = 2, format = "f")))
    if(grepl("ETNP", title[i])) {
      # For ETNP, put the correlations next to the lines for different size fractions
      if(newplot) text(Zclim[2], ylim[1], rtxt, adj = c(3, 0.1))
      else text(Zclim[2], ylim[1], rtxt, adj = c(1.9, 0.1))
    } else {
      if(grepl("Outside", subtitle[i])) text(Zclim[1], ylim[1], rtxt, adj = c(0, 0.1))
      else text(Zclim[2], ylim[1], rtxt, adj = c(1, 0.1))
    }

    # Assemble the data for Supplementary Table
    sd <- alldat[, c("study", "name", "Run", "sample", "depth")]
    sd <- cbind(sd, "O2 (umol kg-1)" = NA, "O2 (umol L-1)" = NA, "O2 (mg L-1)" = NA, Zc = NA, nH2O = NA)
    # Names of the columns with chemical concentrations
    cnames <- c("O2 (umol kg-1)", "O2 (umol L-1)", "O2 (mg L-1)")
    for(cname in cnames) if(cname %in% colnames(alldat)) sd[, cname] <- alldat[, cname]
    if(study[i] == "__BSMG__") {
      sd$Zc <- Zc
      sd$nH2O <- nH2O
    } else {
      # Put Zc and nH2O values in correct place
      metrics <- metrics[imet, ]
      metrics <- metrics[!is.na(metrics$Run), ]
      isd <- match(metrics$Run, sd$Run)
      sd$Zc[isd] <- metrics$Zc
      sd$nH2O[isd] <- metrics$nH2O
    }

    # Replace NA name with study name
    sd.name <- na.omit(sd$name)[1]
    sd$name[is.na(sd$name)] <- sd.name
    # Use NA instead of "" for missing Run and sample name
    sd$Run[sd$Run == ""] <- NA
    sd$sample[sd$sample == ""] <- NA
    # Place data frame into output list
    out[[i]] <- sd

  } # end loop

  if(pdf) dev.off()

  # Return data for Supplementary Table 20210831
  out <- do.call(rbind, out)
  out$Zc <- round(out$Zc, 6)
  out$nH2O <- round(out$nH2O, 6)
  invisible(out)

}

# Figure 4: Shale gas datasets and Zc difference between oxidized and reduced samples 20210414
geo16S4 <- function(pdf = FALSE) {

  if(pdf) pdf("geo16S4.pdf", width = 10, height = 6)
  layout(matrix(c(1, 3, 2, 4, 5, 5), nrow = 2), widths = c(2, 2, 1.8))
  par(mar = c(4, 4, 1, 1))
  par(mgp = c(2.5, 1, 0))

  ## Plot A: Pennsylvania streams affected by Marcellus Shale activity 20210324

  # Data from Ulrich et al., 2018
  xlim <- c(-0.16, -0.13)
  ylim <- c(-0.755, -0.725)
  plotmet_geo16S("UKD+18.water_2014", xlim = xlim, ylim = ylim, title = FALSE)
  legend("topleft", c("MSA-", "MSA+"), pch = c(1, 21), pt.bg = c(1, 2), bg = "white", title = "NW PA streams (2014)")
  label.figure("A", cex = 1.5, xfrac = 0.03, font = 2)

  ## Plot B: Comparison of different studies on Pennsylvania Streams 20210327

  studies <- c("UKD+18.water_2014", "UKD+18.sediment_2014", "MMA+20_spring", "MMA+20_fall")
  # Start plot
  plot(c(-0.148, -0.140), c(-0.748, -0.735), type = "n", xlab = cplab$Zc, ylab = cplab$nH2O)
  pch <- 21:25
  xadj <- c(0.5, 0.5, 0.5, -0.5)
  yadj <- c(-0.8, -0.8, -0.8, 0.5)
  outB <- list()
  # Loop over studies
  for(i in 1:4) {
    pm <- plotmet_geo16S(studies[[i]], plot.it = FALSE, extracolumn = "type")
    # Determine sample groups from values of pch returned by plot_metrics()  20210901
    i1 <- pm$pch == 1
    i2 <- pm$pch == 21
    means <- list(Zc1 = mean(pm$Zc[i1]), Zc2 = mean(pm$Zc[i2]), nH2O1 = mean(pm$nH2O[i1]), nH2O2 = mean(pm$nH2O[i2]))
    points(means$Zc1, means$nH2O1, pch = pch[i], cex = 1.5, lwd = 2, bg = "#ffffffa0")
    lines(c(means$Zc1, means$Zc2), c(means$nH2O1, means$nH2O2))
    points(means$Zc2, means$nH2O2, pch = pch[i], cex = 1.8, lwd = 2, bg = "#df536ba0")
    # Plot number of samples next to points 20210902
    n1 <- length(pm$Zc[i1])
    n2 <- length(pm$Zc[i2])
    if(i==4) dyadj <- 0.2 else dyadj <- 0
    text(means$Zc1, means$nH2O1, n1, adj = c(xadj[i], yadj[i] - dyadj))
    text(means$Zc2, means$nH2O2, n2, adj = c(xadj[i], yadj[i] + dyadj))
    # Save values for Supplementary Table 20210901
    outB[[i]] <- pm
  }
  # Add labels
  text(-0.143, -0.7393, "NW PA\nstreams (2014)")
  text(-0.1468, -0.7433, "NW PA\nsediment (2014)")
  text(-0.1428, -0.7455, "PASF streams (spring)")
  text(-0.1432, -0.7473, "PASF streams (fall)")
  # Add legend
  legend("topleft", c("MSA- or lowest disturbance", "MSA+ or highest disturbance"), pch = c(21, 21), pt.bg = c("#ffffffa0", "#df536ba0"), pt.cex = c(1.4, 1.7), lwd = 2, lty = NA)
  label.figure("B", cex = 1.5, xfrac = 0.03, font = 2)

  ## Plots C-D: Comparison of different studies on produced water 20210330

  # Panel C: Cluff et al., 2014
  plotmet_geo16S("CHM+14_injected-49", title = FALSE)
  legend("topright", c("Injected fluids (day 0)", "Produced water (day 49 and after)"),
         pch = c(21, 21), pt.bg = c("white", 2), bg = "white", title = "Marcellus Shale")
  box()
  label.figure("C", cex = 1.5, xfrac = 0.03, font = 2)

  # Panel D: Multiple studies
  studies <- c("CHM+14_injected-49", "HRR+18_injected-22", "ZLF+19_injected-18")
  # Start plot
  plot(c(-0.22, -0.14), c(-0.75, -0.71), type = "n", xlab = cplab$Zc, ylab = cplab$nH2O)
  pch <- 21:25
  outD <- list()
  # Loop over studies
  for(i in 1:3) {
    pm <- plotmet_geo16S(studies[[i]], plot.it = FALSE, extracolumn = "type")
    # Determine sample groups from values of pch returned by plot_metrics()  20210901
    i1 <- pm$pch == 1
    i2 <- pm$pch == 21
    means <- list(Zc1 = mean(pm$Zc[i1]), Zc2 = mean(pm$Zc[i2]), nH2O1 = mean(pm$nH2O[i1]), nH2O2 = mean(pm$nH2O[i2]))
    points(means$Zc1, means$nH2O1, pch = pch[i], cex = 1.5, lwd = 2, bg = "#ffffffa0")
    lines(c(means$Zc1, means$Zc2), c(means$nH2O1, means$nH2O2))
    points(means$Zc2, means$nH2O2, pch = pch[i], cex = 1.8, lwd = 2, bg = "#df536ba0")
    # Plot number of samples next to points 20210902
    n1 <- length(pm$Zc[i1])
    n2 <- length(pm$Zc[i2])
    if(i == 3) adj <- c(-1, 1) else adj <- c(-1, 0.5)
    text(means$Zc1, means$nH2O1, n1, adj = adj)
    text(means$Zc2, means$nH2O2, n2, adj = c(0.2, -1))
    outD[[i]] <- pm
  }
  # Add labels
  text(-0.168, -0.726, "Marcellus Shale")
  text(-0.202, -0.731, "Denver-Julesburg Basin")
  text(-0.172, -0.745, "Duvernay Formation")
  # Add legend
  legend("topright", c("Source water\nor injected fluids", "Produced water"), pch = c(21, 21), pt.bg = c("#ffffffa0", "#df536ba0"), pt.cex = c(1.4, 1.7), lwd = 2, lty = NA)
  label.figure("D", cex = 1.5, xfrac = 0.03, font = 2)

  ## Panel E: Zc differences between oxidized and reduced sample subsets in various studies
  study <- c(
    "GBL+15", "JHM+16_O2", "MPB+17", "BCA+21",
    "SVH+19_O2", "MZG+20_Zug", "MZG+20_Lugano", "HXZ+20",
    "UKD+18.water_2014", "UKD+18.sediment_2014", "MMA+20_spring", "MMA+20_fall",
    "CHM+14_injected-49", "HRR+18_injected-22", "ZLF+19_injected-18",
    "SMS+12", "EH18"
  )
  description <- c(
    "ETNP (0.2-1.6 \u00B5M)", "Lake Fryxell mat", "Manus Basin (water samples)", "Ursu Lake (all months)",
    "Black Sea", "Lake Zug", "Lake Lugano", "Blue Hole (Inside)",
    "NW PA streams (2014)", "NW PA sediment (2014)", "PASF streams (spring)", "PASF streams (fall)",
    "Marcellus Shale", "Denver-Julesburg Basin", "Duvernay Formation",
    "Bison Pool", "Mono Lake"
  )
  # Description of conditions of sample groups
  cond2 <- c("anoxic", "anoxic", "> 50 \u00B0C", "anoxic",
             "anoxic", "anoxic", "anoxic", "anoxic",
             "MSA+", "MSA+", "highest", "highest",
             "PW day 49+", "PW day 22+", "PW day 18",
             "anoxic", "anoxic"
  )
  cond1 <- c("oxic", "oxic", "< 50 \u00B0C", "oxic",
             "oxic", "oxic", "oxic", "oxic",
             "MSA-", "MSA-", "lowest", "lowest",
             "IF day 0", "SW day 0", "SW day 0",
             "oxic", "oxic"
  )
  # Values of pch (from getmdat()/plot_metrics()) for oxidized and reduced sample groups
  pch_ox <- c(24, 24, 21, 24,
            24, 24, 24, 24,
            1, 1, 1, 1,
            1, 1, 1,
            24, 24
  )
  pch_red <- c(25, 25, 23, 25,
            25, 25, 25, 25,
            21, 21, 21, 21,
            21, 21, 21,
            25, 25
  )

  # Loop over studies and get Zc difference
  P <- n1 <- n2 <- DZc <- numeric()
  for(i in 1:length(study)) {
    # Get metrics for samples in this study
    pm <- plotmet_geo16S(study[[i]], plot.it = FALSE)
    # Determine oxidized and reduced sample groups from values of pch returned by plot_metrics()  20210901
    pm <- pm[!is.na(pm$pch), ]
    iox <- pm$pch == pch_ox[i]
    ired <- pm$pch == pch_red[i]
    # Keep track of the mean difference of Zc
    DZc <- c(DZc, mean(pm$Zc[ired]) - mean(pm$Zc[iox]))
    # Keep track of number of samples 20210902
    n1 <- c(n1, length(pm$Zc[iox]))
    n2 <- c(n2, length(pm$Zc[ired]))
  }

  # Include numbers of samples in condition text 20210902
  condition <- paste0(cond2, " (", n2, ") - ", cond1, " (", n1, ")")

  # Order samples by Delta Zc 20220118
  orderDZc <- order(DZc)
  DZc <- DZc[orderDZc]
  description <- description[orderDZc]
  condition <- condition[orderDZc]

  # Setup plot
  nsamp <- length(DZc)
  par(mar = c(4, 12, 3, 0.5), las = 1)
  plot(c(min(DZc), 0.01), c(nsamp + 0.5, 0.5), ylim = c(nsamp + 0.5, 0.5), yaxs = "i", yaxt = "n", ylab = "", xlab = quote(Delta*italic(Z)[C]), type = "n")
  title("Mean differences between oxidized                         ", font.main = 1, line = 1.6)
  title("and reduced sample groups                         ", font.main = 1, line = 0.5)
  # Add line at DZc = 0
  abline(v = 0, lty = 2, col = "gray40")
  # Use red for shale gas and hydrothermal systems
  col <- rep(1, length(description))
  col[description %in% c("Bison Pool", "Manus Basin (water samples)", "Marcellus Shale", "Denver-Julesburg Basin", "Duvernay Formation")] <- 2
  # Plot Delta Zc 
  for(i in 1:nsamp) lines(c(DZc[i], DZc[i]), c(i - 0.5, i + 0.5), lwd = 2, col = col[i])

  # Add dataset description and conditions
  axis(2, at = 1:nsamp - 0.2, labels = description, tick = FALSE)
  axis(2, at = 1:nsamp + 0.2, labels = condition, tick = FALSE, cex.axis = 0.9)
  label.figure("E", cex = 1.5, yfrac = 0.97, font = 2)

  if(pdf) dev.off()

  # Return data for Supplementary Table 20210901
  outB <- do.call(rbind, outB)
  outD <- do.call(rbind, outD)
  out <- rbind(outB, outD)
  out <- out[, !colnames(out) %in% c("pch", "col")]
  out$nH2O <- round(out$nH2O, 6)
  out$Zc <- round(out$Zc, 6)
  invisible(out)

}

# Function to plot individual datasets for geo16S5 20220112
# Use H2O = FALSE for Zc, H2O = TRUE for nH2O
MG16S <- function(which, plot.lines = TRUE, lowest.level = NULL, lineage = NULL, rm.outliers = FALSE, H2O = FALSE, cex = 1) {

  # For MG datasets analyzed in this study (others are from gradox paper)
  ARASTdir <- system.file("extdata/geo16S/ARAST", package = "JMDplots")
  # Read data for paired metagenomes and amplicon sequences expanded from Tax4Fun paper (Aßhauer et al., 2015)
  AWDM15file <- system.file("extdata/geo16S/AWDM15.csv", package = "JMDplots")
  AWDM15 <- read.csv(AWDM15file)

  # Start a plot if there isn't one 20220122
  if(is.null(dev.list())) {
    xylim <- c(-0.22, -0.08)
    xlab <- quote(italic(Z)[C]~"from shotgun metagenome or metatranscriptome")
    ylab <- quote(italic(Z)[C]~"estimated from 16S rRNA")
    plot(xylim, xylim, type = "n", xlab = xlab, ylab = ylab)
  }

  # Use semi-transparent colors 20220122
  c1 <- adjustcolor(1, alpha.f = 0.5)
  c2 <- adjustcolor(2, alpha.f = 0.69)
  c4 <- adjustcolor(4, alpha.f = 0.69)
  c5 <- adjustcolor(5, alpha.f = 0.69)
  c6 <- adjustcolor(6, alpha.f = 0.69)
  c8 <- adjustcolor(8, alpha.f = 0.69)

  if(which == "Guerrero_Negro") {
    ## Guerrero Negro metagenome (Kunin et al., 2008)
    dat_MG <- mplot("Guerrero_Negro", "IMG_MGP", plot.it = FALSE, H2O = H2O)
    # Reverse the order because upper mat layers are plotted on the right
    metric_MG <- rev(dat_MG$AA)
    # 16S data (Harris et al., 2013)

    metrics <- getmetrics_geo16S("HCW+13", lowest.level = lowest.level, lineage = lineage)
    mdat <- getmdat_geo16S("HCW+13", metrics)
    dat_16S <- mdat$metrics

    if(H2O) metric_16S <- dat_16S$nH2O else metric_16S <- dat_16S$Zc
    # Check that the sample names are the same
    stopifnot(all.equal(rev(rownames(dat_MG$meancomp)), gsub(".*_", "", dat_16S$sample)))
    # Add lines and points
    if(plot.lines) lines(metric_MG, metric_16S)
    points(metric_MG, metric_16S, pch = 21, bg = "white", cex = cex)
    # Fill symbol for most oxidized (surface) sample
    points(metric_MG[1], metric_16S[1], pch = 21, bg = 4, cex = cex)
    # Get sample name and ID for Supplemental Table 20220125
    Sample <- mdat$metadata$sample
    Amplicon <- mdat$metadata$GenBank
    gradox_S1 <- read.csv(system.file("extdata/gradox/Table_S1.csv", package = "JMDplots"))
    Metagenome <- gradox_S1$ID[gradox_S1$study.name == "Guerrero_Negro"]
  }

  if(which == "ETNP_MG") {
    ## ETNP OMZ metagenome (Glass et al., 2015)
    dat_MG <- mplot("ETNP_OMZ", "SRA_MGP", plot.it = FALSE, H2O = H2O)
    # Reverse the order because smaller water depths are plotted on the right
    metric_MG <- rev(dat_MG$AA)
    # 16S data (Ganesh et al., 2015)
    metrics <- getmetrics_geo16S("GBL+15", lowest.level = lowest.level, lineage = lineage)
    mdat <- getmdat_geo16S("GBL+15", metrics)
    dat_16S <- mdat$metrics
    metadata <- mdat$metadata
    # Use smallest size fraction
    dat_16S <- subset(dat_16S, grepl("1.6micron", dat_16S$sample))
    metadata <- metadata[metadata$Run %in% dat_16S$Run, ]
    if(H2O) metric_16S <- dat_16S$nH2O else metric_16S <- dat_16S$Zc
    # Check that the sample names are the same
    stopifnot(all.equal(rev(rownames(dat_MG$meancomp)), gsub("_.*", "", dat_16S$sample)))
    # Add lines and points
    if(plot.lines) lines(metric_MG, metric_16S)
    points(metric_MG, metric_16S, pch = 21, bg = "white", cex = cex)
    # Fill symbol for surface sample
    points(metric_MG[1], metric_16S[1], pch = 21, bg = 4, cex = cex)
    # Get sample name and ID for Supplemental Table 20220125
    Sample <- paste0(metadata$depth, "m_", metadata$size)
    Amplicon <- metadata$Run
    gradox_S1 <- read.csv(system.file("extdata/gradox/Table_S1.csv", package = "JMDplots"))
    Metagenome <- gradox_S1$ID[gradox_S1$study.name == "ETNP_OMZ" & gradox_S1$type == "MG"]
  }

  if(which == "ETNP_MT") {
    ## ETNP OMZ metatranscriptome (Ganesh et al., 2015)
    dat_MT <- mplot("ETNP_OMZ", "SRA_MTP", plot.it = FALSE, H2O = H2O)
    # Reverse the order because smaller water depths are plotted on the right
    metric_MT <- rev(dat_MT$AA)
    # 16S data (Ganesh et al., 2015)
    metrics <- getmetrics_geo16S("GBL+15", lowest.level = lowest.level, lineage = lineage)
    mdat <- getmdat_geo16S("GBL+15", metrics)
    dat_16S <- mdat$metrics
    metadata <- mdat$metadata
    # Use smallest size fraction
    dat_16S <- subset(dat_16S, grepl("1.6micron", dat_16S$sample))
    metadata <- metadata[metadata$Run %in% dat_16S$Run, ]
    if(H2O) metric_16S <- dat_16S$nH2O else metric_16S <- dat_16S$Zc
    # Check that the sample names are the same
    stopifnot(all.equal(rev(rownames(dat_MT$meancomp)), gsub("_.*", "", dat_16S$sample)))
    # Add lines and points
    if(plot.lines) lines(metric_MT, metric_16S, col = 8)
    points(metric_MT, metric_16S, pch = 22, bg = "white", col = 8, cex = cex)
    # Fill symbol for surface sample
    points(metric_MT[1], metric_16S[1], pch = 22, bg = 4, col = 8, cex = cex)
    # For the return value
    metric_MG <- metric_MT
    # Get sample name and ID for Supplemental Table 20220125
    Sample <- paste0(metadata$depth, "m_", metadata$size)
    Amplicon <- metadata$Run
    gradox_S1 <- read.csv(system.file("extdata/gradox/Table_S1.csv", package = "JMDplots"))
    Metagenome <- gradox_S1$ID[gradox_S1$study.name == "ETNP_OMZ" & gradox_S1$type == "MT"]
  }

  if(which == "Bison_Pool") {
    ## Bison Pool metagenome (Havig et al., 2011)
    dat_MG <- mplot("Bison_Pool", "IMG_MGP", plot.it = FALSE, H2O = H2O)
    metric_MG <- dat_MG$AA
    # 16S data (Swingley et al., 2012)
    # mincount needs to be lowered from default for when lineage = "genus"
    # (site 4 (Q) has less than 200 genus-level classifications)
    metrics <- getmetrics_geo16S("SMS+12", lowest.level = lowest.level, lineage = lineage, mincount = 50)
    mdat <- getmdat_geo16S("SMS+12", metrics)
    dat_16S <- mdat$metrics
    metadata <- mdat$metadata
    if(H2O) metric_16S <- dat_16S$nH2O else metric_16S <- dat_16S$Zc
    # Check that the sample names are the same
    stopifnot(all.equal(rownames(dat_MG$meancomp), metadata$"Field Code"))
    # Add lines and points
    if(plot.lines) lines(metric_MG, metric_16S)
    points(metric_MG, metric_16S, pch = 21, bg = "transparent", cex = cex)
    # Fill symbol for low-T sample
    points(metric_MG[5], metric_16S[5], pch = 21, bg = c4, cex = cex)
    # Get sample name and ID for Supplemental Table 20220125
    gradox_S1 <- read.csv(system.file("extdata/gradox/Table_S1.csv", package = "JMDplots"))
    Library <- sapply(strsplit(gradox_S1$sample.description[gradox_S1$study.name == "Bison_Pool"], " "), "tail", 1)
    Sample <- paste0("Site ", metadata$Sample, " (", metadata$`Field Code`, ") (", Library, ")")
    Amplicon <- metadata$Run
    Metagenome <- gradox_S1$ID[gradox_S1$study.name == "Bison_Pool"]
  }

  if(which == "Mono_Lake") {
    ## Mono Lake metatranscriptome (Edwardson and Hollibaugh, 2017)
    dat_MT <- mplot("Mono_Lake", "SRA_MTP", plot.it = FALSE, H2O = H2O)
    # NOTE: reverse the order because smaller water depths are plotted on the right
    metric_MT <- rev(dat_MT$AA)
    # 16S data (Edwardson and Hollibaugh, 2018)
    metrics <- getmetrics_geo16S("EH18", lowest.level = lowest.level, lineage = lineage)
    mdat <- getmdat_geo16S("EH18", metrics)
    dat_16S <- mdat$metrics
    metadata <- mdat$metadata
    if(H2O) metric_16S <- dat_16S$nH2O else metric_16S <- dat_16S$Zc
    # Check that the sample names are the same
    stopifnot(all.equal(rev(rownames(dat_MT$meancomp)), gsub(".*_", "", dat_16S$sample)))
    # Add lines and points
    if(plot.lines) lines(metric_MT, metric_16S, pch = 0, col = 8)
    points(metric_MT, metric_16S, pch = 22, bg = "white", col = 8, cex = cex)
    # Fill symbol for surface sample
    points(metric_MT[1], metric_16S[1], pch = 22, bg = 4, col = 8, cex = cex)
    # For the return value
    metric_MG <- metric_MT
    # Get sample name and ID for Supplemental Table 20220125
    Sample <- paste0(metadata$sample)
    Amplicon <- metadata$Run
    gradox_S1 <- read.csv(system.file("extdata/gradox/Table_S1.csv", package = "JMDplots"))
    Metagenome <- gradox_S1$ID[gradox_S1$study.name == "Mono_Lake"]
  }

  if(which == "Marcellus_Shale") {
    ## Marcellus metagenomes (Daly et al., 2016) 20211218
    aa <- read.csv(file.path(ARASTdir, "Marcellus_Shale_AA.csv"))
    if(H2O) metric_MG <- nH2O(aa) else metric_MG <- Zc(aa)
    # Marcellus 16S (Cluff et al., 2014)
    metrics <- getmetrics_geo16S("CHM+14", lowest.level = lowest.level, lineage = lineage)
    mdat <- getmdat_geo16S("CHM+14", metrics)
    dat_16S <- mdat$metrics
    metadata <- mdat$metadata
    # Time points: input, T7, T13, T82, T328
    Sample = paste("Day", c(0, 7, 13, 82, 328))
    # List run IDs here
    Metagenome = c("SRR3111417", "SRR3111625", "SRR3111724", "SRR3111729", "SRR3111737")
    Amplicon = c("SRR1184016", "SRR1184060", "SRR1184062", "SRR1184081", "SRR1184083")
    # GC content from https://trace.ncbi.nlm.nih.gov/Traces/sra/?run=SRR******* 20220126
    GC_MG <- c(58.4, 37.7, 57, 39.6, 53.1)
    GC_16S <- c(53.9, 55, 54.8, 51.8, 53.2)
    ## 16S replicate 2
    #Amplicon2 = c("SRR1184049", "SRR1184061", "SRR1184063", "SRR1184082", "SRR1184084")
    # Make sure metagenomes are in correct order
    stopifnot(all(aa$protein == Metagenome))
    # Get 16S runs corresponding to metagenomes
    idat <- match(Amplicon, dat_16S$Run)
    dat_16S <- dat_16S[idat, ]
    if(H2O) metric_16S <- dat_16S$nH2O else metric_16S <- dat_16S$Zc
    # Assign colors: open circle for injected fluid, gray for flowback, red for produced
    metadata <- metadata[idat, ]
    col <- rep(4, nrow(metadata))
    col[metadata$type == "flowback fluid"] <- 8
    col[metadata$type == "produced fluid"] <- 2
    if(plot.lines) type <- "b" else type <- "p"
    points(metric_MG, metric_16S, pch = 23, bg = col, type = type, cex = cex)
  }

  if(which == "Manus_Basin") {
    ## Manus Basin metagenomes (Meier et al., 2017) 20220110
    aa <- read.csv(file.path(ARASTdir, "Manus_Basin_AA.csv"))
    if(H2O) metric_MG <- nH2O(aa) else metric_MG <- Zc(aa)
    # Manus Basin 16S (Meier et al., 2017)
    metrics <- getmetrics_geo16S("MPB+17", lowest.level = lowest.level, lineage = lineage)
    mdat <- getmdat_geo16S("MPB+17", metrics)
    dat_16S <- mdat$metrics
    metadata <- mdat$metadata
    # Samples: NSu-F2b, NSu-F5, Fw-F1b, Fw-F3, RR-F1b
    Sample = c("MNB27-NSu-F2b", "MNB29-NSu-F5", "MNB14-Fw-F1b", "MNB17-Fw-F3", "MNB45-RR-F1b")
    # List run IDs here
    Metagenome = c("ERR1679394", "ERR1679395", "ERR1679397", "ERR1679396", "ERR1679398")
    Amplicon = c("ERR1665247", "ERR1665249", "ERR1665234", "ERR1665237", "ERR1665265")
    # Make sure metagenomes are in correct order
    stopifnot(all(aa$protein == Metagenome))
    # Get 16S runs corresponding to metagenomes
    idat <- match(Amplicon, dat_16S$Run)
    dat_16S <- dat_16S[idat, ]
    if(H2O) metric_16S <- dat_16S$nH2O else metric_16S <- dat_16S$Zc
    # Assign colors: blue for < 10 degC, red for > 50 degC
    metadata <- metadata[idat, ]
    col <- rep("white", nrow(metadata))
    col[metadata$T > 50] <- c2
    col[metadata$T < 10] <- c4
    points(metric_MG, metric_16S, pch = 22, bg = col, col = c1, cex = cex)
  }

  if(which == "Black_Sea") {
    ## Black Sea metagenome (Villanueva et al., 2021) 20220115
    aa <- read.csv(file.path(ARASTdir, "Black_Sea_AA.csv"))
    if(H2O) metric_MG <- nH2O(aa) else metric_MG <- Zc(aa)
    # 16S data (Sollai et al., 2019)
    metrics <- getmetrics_geo16S("SVH+19")
    ## TODO: add missing arguments 20220506
    #metrics <- getmetrics_geo16S("SVH+19", lowest.level = lowest.level, lineage = lineage)
    mdat <- getmdat_geo16S("SVH+19", metrics)
    dat_16S <- mdat$metrics
    metadata <- mdat$metadata
    if(H2O) metric_16S <- dat_16S$nH2O else metric_16S <- dat_16S$Zc
    Sample = c(50, 70, 80, 85, 90, 95, 100, 105,
      110, 130, 170, 250, 500, 1000, 2000)
    Metagenome = c("SRR12347146", "SRR12347145", "SRR12347139", "SRR12347138", "SRR12347137", "SRR12347136", "SRR12347135", "SRR12347134",
      "SRR12347133", "SRR12347132", "SRR12347144", "SRR12347143", "SRR12347142", "SRR12347141", "SRR12347140")
    # Check that the samples are in the right order
    stopifnot(all(metadata$depth == Sample))
    stopifnot(all(aa$protein == Metagenome))
    # Assign colors and symbols: blue up for < 100 m, red down for >= 100 m
    col <- rep(c4, nrow(metadata))
    pch <- rep(24, nrow(metadata))
    col[metadata$depth >= 100] <- c2
    pch[metadata$depth >= 100] <- 25
    points(metric_MG, metric_16S, pch = pch, bg = col, col = c1, cex = cex)
    Amplicon <- metadata$Run
  }

  if(which == "HMP") {
    # HMP 16S
    met <- getmetrics_geo16S("HMP12", lowest.level = lowest.level, lineage = lineage)
    # NOTE: don't use 'metrics' argument here in order to get metadata for *all* samples
    metadata <- getmdat_geo16S("HMP12")
    # HMP metagenomes
    aa <- read.csv(file.path(ARASTdir, "HMP_AA.csv"))
    # Put data in same order
    dat <- AWDM15[AWDM15$Name == "HMP", ]
    imet <- match(dat$Amplicon, met$Run)
    met <- met[imet, ]
    iaa <- match(dat$Metagenome, aa$protein)
    aa <- aa[iaa, ]
    # Make sure the 16S and metagenomes are paired correctly
    stopifnot(all(na.omit(met$Run == dat$Amplicon)))
    stopifnot(all(aa$protein == dat$Metagenome))
    # Get Zc values
    if(H2O) metric_16S <- met$nH2O else metric_16S <- met$Zc
    if(H2O) metric_MG <- nH2O(aa) else metric_MG <- Zc(aa)
    # Don't plot MG with low numbers of protein fragments 20220122
    ilow <- aa$chains < 50000
    if(any(ilow)) {
      metric_MG <- metric_MG[!ilow]
      metric_16S <- metric_16S[!ilow]
      metadata <- metadata[!ilow, ]
      aa <- aa[!ilow, ]
      dat <- dat[!ilow, ]
    }
    # Remove outliers (anomalously high Zc in metagenome) 20221215
    if(rm.outliers & !H2O) {
      iout.Nasal <- dat$Body.site=="Nasal cavity" & metric_MG > -0.12
      iout.UG <- dat$Body.site=="UG tract" & metric_MG > -0.14
      iout <- iout.Nasal | iout.UG
      metric_MG <- metric_MG[!iout]
      metric_16S <- metric_16S[!iout]
      metadata <- metadata[!iout, ]
      aa <- aa[!iout, ]
      dat <- dat[!iout, ]
    }
    # Colors: blue (Skin), green (Nasal cavity), gray (Oral cavity), red (GI tract), magenta (UG tract)
    # Symbols: up triangle (skin, GI tract), circle (Oral cavity), down triangle (Nasal cavity, UG tract)
    col <- sapply(metadata$"Body site", switch, "Skin" = c5, "Nasal cavity" = c4, "Oral cavity" = c8, "GI tract" = c2, "UG tract" = c6)
    pch <- sapply(metadata$"Body site", switch, "Skin" = 24, "Nasal cavity" = 25, "Oral cavity" = 21, "GI tract" = 24, "UG tract" = 25)
    points(metric_MG, metric_16S, pch = pch, bg = col, col = c1)
    # Get sample name and ID for Supplemental Table 20220125
    Sample <- dat$Sample.name
    Amplicon <- dat$Amplicon
    Metagenome <- dat$Metagenome
  }

  if(which == "Guts") {
    # Guts 16S
    met <- getmetrics_geo16S("MKK+11", lowest.level = lowest.level, lineage = lineage, mincount = 50)
    # Guts metagenomes
    aa <- read.csv(file.path(ARASTdir, "Guts_AA.csv"))
    # Make sure the 16S and metagenomes are paired correctly
    dat <- AWDM15[AWDM15$Name == "Guts", ]
    stopifnot(all(met$Run == paste0("mgm", dat$Amplicon)))
    stopifnot(all(aa$protein == paste0("mgm", dat$Metagenome)))
    # Get Zc values
    if(H2O) metric_16S <- met$nH2O else metric_16S <- met$Zc
    if(H2O) metric_MG <- nH2O(aa) else metric_MG <- Zc(aa)
    points(metric_MG, metric_16S, pch = 21, bg = c2, col = c1, cex = cex)
    # Get sample name and ID for Supplemental Table 20220125
    Sample <- dat$Sample.name
    Amplicon <- paste0("mgm", dat$Amplicon)
    Metagenome <- paste0("mgm", dat$Metagenome)
  }

  if(which == "Soils") {
    # Soils 16S
    met <- getmetrics_geo16S("FLA+12", lowest.level = lowest.level, lineage = lineage)
    # Soils metagenomes
    aa <- read.csv(file.path(ARASTdir, "Soils_AA.csv"))
    # Put data in same order
    dat <- AWDM15[AWDM15$Name == "Soils", ]
    imet <- match(paste0("mgm", dat$Amplicon), met$Run)
    met <- met[imet, ]
    iaa <- match(paste0("mgm", dat$Metagenome), aa$protein)
    aa <- aa[iaa, ]
    # Make sure the 16S and metagenomes are paired correctly
    stopifnot(all(met$Run == paste0("mgm", dat$Amplicon)))
    stopifnot(all(aa$protein == paste0("mgm", dat$Metagenome)))
    # Get Zc values
    if(H2O) metric_16S <- met$nH2O else metric_16S <- met$Zc
    if(H2O) metric_MG <- nH2O(aa) else metric_MG <- Zc(aa)
    points(metric_MG, metric_16S, pch = 21, bg = c4, col = c1, cex = cex)
    # Get sample name and ID for Supplemental Table 20220125
    Sample <- dat$Sample.name
    Amplicon <- paste0("mgm", dat$Amplicon)
    Metagenome <- paste0("mgm", dat$Metagenome)
  }

  list(which = rep(which, length(Sample)), metric_MG = metric_MG, metric_16S = metric_16S, Sample = Sample, Amplicon = Amplicon, Metagenome = Metagenome)

}

# Comparison of protein Zc from metagenomic or metatranscriptomic data with estimates from 16S and reference sequences 20211017
# Add Marcellus, HMP, and Soils and Guts 20211218
geo16S5 <- function(pdf = FALSE) {

  if(pdf) pdf("geo16S5.pdf", width = 10, height = 6)
  mat <- matrix(c(1,1, 2,2, 3,3, 0, 4,4, 5,5, 0), nrow = 2, byrow = TRUE)
  layout(mat)
  par(mar = c(4, 4, 2, 1), mgp = c(2.8, 1, 0))
  xylim <- c(-0.22, -0.12)
  xlimHMP <- c(-0.22, -0.08)

  ### Panel A: Comparisons with metagenomes analyzed by Dick et al. (2019)

  # Start plot A
  xlab <- quote(italic(Z)[C]~"from shotgun metagenome or metatranscriptome")
  ylab <- quote(italic(Z)[C]~"estimated from 16S rRNA")
  plot(xylim, xylim, type = "n", xlab = xlab, ylab = ylab)
  lines(xylim, xylim, lty = 2, col = "gray40")
  out1 <- MG16S("Guerrero_Negro")
  # Label points for upper 3 layers
  text(c(-0.135, -0.1382, -0.1422), c(-0.1472, -0.1598, -0.1618), c("1", "2", "3"), cex = 0.8)
  text(-0.135, -0.174, "Guerrero\nNegro\nmat")
  out2 <- MG16S("ETNP_MG")
  out3 <- MG16S("ETNP_MT")
  text(-0.172, -0.160, "ETNP water")
  out4 <- MG16S("Bison_Pool", cex = 1.4, plot.lines = FALSE)
  # Make arrows to show outflow channel 20220120
  dx.fraction <- c(0.1, 0.08, 0.08)
  for(i in 1:3) {
    x <- out4$metric_MG[i:(i+1)]
    y <- out4$metric_16S[i:(i+1)]
    lmxy <- lm(y ~ x)
    # Find coordinates for a fraction of the line length
    dx <- diff(x) * dx.fraction[i]
    xs <- c(x[1] + dx, x[2] - dx)
    ys <- predict(lmxy, data.frame(x = xs))
    arrows(xs[1], ys[1], xs[2], ys[2], length = 0.1)
  }
  text(-0.16, -0.212, "Bison Pool")
  text(-0.1905, -0.2172, "Hot spring source", cex = 0.9)
  text(-0.1875, -0.208, "Outflow channel", cex = 0.9)
  out5 <- MG16S("Mono_Lake")
  text(-0.182, -0.148, "Mono Lake water")
  # Add legend
  legend("topleft", rep("                                ", 3), pch = c(21, 21, 22), col = c(1, 1, 8), pt.bg = c(4, "white", "white"))
  ltxt <- as.expression(c(quote("Highest "*O[2]), "Metagenome", "Metatranscriptome"))
  legend("topleft", ltxt, pch = c(22, NA, NA), col = c(8, NA, NA), pt.bg = c(4, NA, NA), inset = c(0.025, 0), bty = "n")
  # Add title and figure label
  title("Various Environments", font.main = 1, cex.main = 1.1)
  label.figure("A", cex = 1.5, font = 2, xfrac = 0.04, yfrac = 0.96)

  ### Panels B-E: Comparisons with metagenomes analyzed in this study

  # Start plot B
  xlab <- quote(italic(Z)[C]~"from shotgun metagenome")
  plot(xylim, xylim, type = "n", xlab = xlab, ylab = ylab)
  lines(xylim, xylim, lty = 2, col = "gray40")
  out6 <- MG16S("Marcellus_Shale")
  dy <- c(-0.004, 0, 0, -0.005, -0.005)
  dx <- c(0.007, -0.008, 0.009, 0, 0)
  text(out6$metric_MG + dx, out6$metric_16S + dy, out6$Sample)
  # Add legend
  legend("topleft", c("Injected", "Flowback", "Produced"), pch = 23, pt.bg = c(4, 8, 2))
  # Add title and figure label
  title("Marcellus Shale Fluids", font.main = 1, cex.main = 1.1)
  label.figure("B", cex = 1.5, font = 2, xfrac = 0.04, yfrac = 0.96)

  # Use semi-transparent colors 20220122
  c1 <- adjustcolor(1, alpha.f = 0.5)
  c2 <- adjustcolor(2, alpha.f = 0.69)
  c4 <- adjustcolor(4, alpha.f = 0.69)
  c5 <- adjustcolor(5, alpha.f = 0.69)
  c6 <- adjustcolor(6, alpha.f = 0.69)
  c8 <- adjustcolor(8, alpha.f = 0.69)

  # Start plot C
  xlab <- quote(italic(Z)[C]~"from shotgun metagenome")
  plot(xylim, xylim, type = "n", xlab = xlab, ylab = ylab)
  lines(xylim, xylim, lty = 2, col = "gray40")
  out7 <- MG16S("Manus_Basin", cex = 1.4)
  # Plot sample names and O2 concentrations (from Figure S5 of Meier et al., 2017)
  dx <- c(0.010, 0.009, -0.012, -0.008, 0.007)
  dy <- c(0, 0, 0.012, 0, -0.0018)
  samptxt <- substr(out7$Sample, 7, 13)
  text(out7$metric_MG + dx, out7$metric_16S - 0.004 + dy, samptxt, cex = 0.9)
  O2txt <- as.expression(c(quote(0.07~"mM"~O[2]), quote(0.14~"mM"~O[2]), quote(0.17~"mM"~O[2]), quote("ND"~O[2]), quote(0.2~"mM"~O[2])))
  text(out7$metric_MG + dx, out7$metric_16S - 0.009 + dy, O2txt, cex = 0.9)

  # Add Black Sea 20220115
  out8 <- MG16S("Black_Sea", cex = 0.9)
  # Add legends
  legend("topleft", c("Depth < 100 m", "Depth >= 100 m"), pch = c(24, 25), pt.bg = c(c4, c2), col = c1, pt.cex = 0.9, title = "Black Sea")
  legend <- as.expression(c(quote(italic(T)~"< 10 \u00B0C"), quote("10 \u00B0C <"~italic(T)~"< 50 \u00B0C"), quote(italic(T)~"> 50 \u00B0C")))
  legend("bottomright", legend = legend, pch = 22, pt.bg = c(c4, "white", c2), col = c1, pt.cex = 1.3, title = "Manus Basin")
  # Add title and figure label
  title("Manus Basin Vents and Black Sea", font.main = 1, cex.main = 1.1)
  label.figure("C", cex = 1.5, font = 2, xfrac = 0.04, yfrac = 0.96)

  ### Panels D-E: Plot Zc of 16S vs metagenomes for datasets used in Tax4Fun/PICRUSt papers 20211215

  # Start plot D
  xlab <- quote(italic(Z)[C]~"from shotgun metagenome")
  plot(xlimHMP, xylim, type = "n", xlab = xlab, ylab = ylab)
  lines(xylim, xylim, lty = 2, col = "gray40")
  out9 <- MG16S("HMP")
  legend("topleft", c("Skin", "Nasal cavity", "Oral cavity", "GI tract", "UG tract"), pch = c(24, 25, 21, 24, 25), pt.bg = c(c5, c4, c8, c2, c6), col = c1)
  title("Human Microbiome Project", font.main = 1, cex.main = 1.1)
  label.figure("D", cex = 1.5, font = 2, xfrac = 0.04, yfrac = 0.96)

  # Start plot E
  xlab <- quote(italic(Z)[C]~"from shotgun metagenome")
  plot(xylim, xylim, type = "n", xlab = xlab, ylab = ylab)
  lines(xylim, xylim, lty = 2, col = "gray40")
  out10 <- MG16S("Guts")
  out11 <- MG16S("Soils")
  legend("topleft", c("Soils", "Guts"), pch = 21, pt.bg = c(c4, c2), col = c1)
  title("Soils and Mammalian Guts", font.main = 1, cex.main = 1.1)
  label.figure("E", cex = 1.5, font = 2, xfrac = 0.04, yfrac = 0.96)

  if(pdf) dev.off()

  # Return values for Supplemental Table 20220125
  out <- list(out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11)
  out <- data.frame(
    name = unlist(lapply(out, "[[", "which")),
    sample = unlist(lapply(out, "[[", "Sample")),
    ID_MG = unlist(lapply(out, "[[", "Metagenome")),
    ID_16S = unlist(lapply(out, "[[", "Amplicon")),
    Zc_MG = round(unlist(lapply(out, "[[", "metric_MG")), 6),
    Zc_16S = round(unlist(lapply(out, "[[", "metric_16S")), 6)
  )
  invisible(out)

}


# RefSeq and 16S rRNA data processing outline 20220104
geo16S_S1 <- function(pdf = FALSE) {

  if(pdf) pdf("geo16S_S1.pdf", width = 17.1, height = 5)

  par(mar = c(0.1, 0.1, 0.1, 0.1))
  plot(c(1, 10.2), c(1.9, 10.1), type = "n", axes = FALSE, xlab = "", ylab = "", xaxs = "i", yaxs = "i")
  text(1, 10, "Reference Sequence Processing", adj = c(0, 1), font = 2)
  text(1, 9, "1. Use taxids classified at species level", adj = 0)
  text(1, 8, "2. Sum AA of all sequences for each species", adj = 0)
  text(1, 7, "3. Divide by number of sequences to get mean AA for each species (AA[species])", adj = 0)
  text(1, 6, "4. Calculate mean AA[genus] from all AA[species] in each genus", adj = 0)
  text(1, 5, "5. Calculate mean AA[family] from all AA[species] in each family", adj = 0)
  text(1, 4, "6. Calculate mean AA[order] from all AA[species] in each order", adj = 0)
  text(1, 3, "7. Calculate mean AA[class] from all AA[species] in each class", adj = 0)
  text(1, 2, "8. Calculate mean AA[phylum] from all AA[species] in each phylum", adj = 0)
#  text(1, 1, "9. Calculate and record Zc and nH2O for all taxonomic groups at each level", adj = 0)

  text(3.35, 10, "(Data Source)", adj = c(0, 1), font = 2)
  text(3.35, 9, "(NCBI taxonomy files)", adj = 0, font = 2)
  text(3.35, 8, "(RefSeq protein sequences)", adj = 0, font = 2)

  text(5, 10, "Amino Acid Compositions\nfor Taxonomic Groups\n(Reference Proteomes)", adj = c(0, 1), font = 2)
  # Read file with precomputed metrics for taxa at different ranks
  metrics <- read.csv(system.file("RefDB/RefSeq_206/taxon_metrics.csv.xz", package = "JMDplots"))
  ranks <- c("superkingdom", "phylum", "class", "order", "family", "genus")
  plural <- c("superkingdoms (*)", "phyla", "classes", "orders", "families", "genera")
  for(irank in 1:6) {
    nrank <- sum(metrics$rank == ranks[irank])
    text(5, 9-irank, paste(nrank, plural[irank]), adj = 0)
  }
  lines(c(3.87, 3.87), c(1.8, 6.2))
  lines(c(4.9, 4.9), c(7.2, 2.8))
  arrows(3.9, 4, 4.88, 5.00)
  lines(c(5.68, 5.68), c(7.2, 2.8))
  arrows(5.71, 5.00, 6.43, 5.00)

  text(6.5, 10, "16S rRNA Classification and Analysis", adj = c(0, 1), font = 2)
  text(6.5, 9, "1. Run RDP Classifier on public 16S rRNA datasets (see Tables 1 and S1)", adj = 0)
  text(6.5, 8, "2. Assemble counts of lowest-level classifications (genus to phylum)", adj = 0)
  text(6.5, 7, "3. Mapping step 1: Manually convert some RDP names to NCBI names (see Methods)", adj = 0)
  text(6.5, 6, "4. Mapping step 2: Automatic text match between RDP and NCBI names", adj = 0)
  text(6.5, 5, "5. Multiply classification counts by reference proteomes", adj = 0)
  text(6.5, 4, "6. Divide by number of classifications to get estimated community proteome (AA[community])", adj = 0)
  text(6.5, 3, "7. Use AA[community] to calculate Zc and nH2O (see Dick et al., 2020)", adj = 0)
  text(6.5, 2, "8. Visualize data", adj = 0)

  if(pdf) dev.off()

}

# Scatterplots of Zc and nH2O for genera vs higher taxonomic levels 20211130
geo16S_S2 <- function(pdf = FALSE) {
  if(pdf) pdf("geo16S_S2.pdf", width = 12, height = 6)

  # Read file with precomputed metrics for taxa at different ranks
  metrics <- read.csv(system.file("RefDB/RefSeq_206/taxon_metrics.csv.xz", package = "JMDplots"))
  # Get tables of all taxonomic names and amino acid compositions
  names <- read.csv(system.file("RefDB/RefSeq_206/taxonomy.csv.xz", package = "JMDplots"))
  # Take out viruses
  ivirus <- names$superkingdom == "Viruses"
  ivirus[is.na(ivirus)] <- TRUE
  names <- names[!ivirus, ]
  # Take out NA genera and dereplicate lineages 20220107
  names <- names[!is.na(names$genus), ]
  gfocp <- apply(names[, c("genus", "family", "order", "class", "phylum")], 1, paste, collapse = " ")
  names <- names[!duplicated(gfocp), ]

  # Initialize list of Zc and nH2O values for taxa in each rank
  NAvec <- rep(NA, nrow(names))
  nH2O <- Zc <- list(genus = NAvec, family = NAvec, order = NAvec, class = NAvec, phylum = NAvec, superkingdom = NAvec)

  for(rank in c("genus", "family", "order", "class", "phylum", "superkingdom")) {
    icol <- match(rank, colnames(names))
    isrank <- !is.na(names[, icol])
    # Use a copy of the metrics table with blanked-out names of taxa not at this rank
    thismet <- metrics
    thismet$group[metrics$rank != rank] <- ""
    # Lookup these taxa in the metrics table
    imetrics <- match(names[, icol][isrank], thismet$group)
    # Make sure all taxa are in this rank
    stopifnot(all(na.omit(unique(thismet$rank[imetrics])) %in% rank))
    # Store Zc and nH2O values
    Zc[[rank]][isrank] <- thismet$Zc[imetrics]
    names(Zc[[rank]])[isrank] <- names[, icol][isrank]
    nH2O[[rank]][isrank] <- thismet$nH2O[imetrics]
    names(nH2O[[rank]])[isrank] <- names[, icol][isrank]
  }

#  # Print number of genera
#  ngenus <- length(names(Zc[["genus"]]))
#  print(paste("There are", ngenus, "genera"))

  # Make plots
  par(mfrow = c(2, 5))
  par(mgp = c(2.9, 1, 0))
  # Identify superkingdoms
  iArc <- names(Zc[["superkingdom"]]) == "Archaea"
  iBac <- names(Zc[["superkingdom"]]) == "Bacteria"

  cex <- 2
  cex.an <- 1
  # Outer loop: Zc or nH2O
  for(metric in c("Zc", "nH2O")) {
    
    # Inner loop: higher-level ranks
    ranks <- c("family", "order", "class", "phylum", "superkingdom")
    for(rank in ranks) {
      if(metric == "Zc") { x <- Zc[["genus"]]; y <- Zc[[rank]] }
      if(metric == "nH2O") { x <- nH2O[["genus"]]; y <- nH2O[[rank]] }
      xylim <- range(na.omit(x))
      # Start plot and add 1:1 line
      if(metric == "Zc") plot(xylim, xylim, type = "n", xlab = quote("genus"~italic(Z)[C]), ylab = bquote(.(rank)~italic(Z)[C]))
      if(metric == "nH2O") plot(xylim, xylim, type = "n", xlab = quote("genus"~italic(n)*H[2]*O), ylab = bquote(.(rank)~italic(n)*H[2]*O))
      lines(xylim, xylim, lty = 2, col = "gray40")
      # Add points: Bacteria then Archaea
      points(x[iBac], y[iBac], pch = ".", col = "steelblue3", cex = cex)
      points(x[iArc], y[iArc], pch = ".", col = 2, cex = cex)
      # Add points and linear regression
      thislm <- lm(y ~ x)
      lines(xylim, predict(thislm, data.frame(x = xylim)), col = "#00000080")
      # Show R2 and slope
      R2 <- summary(thislm)$r.squared
      R2txt <- bquote(italic(R)^2 == .(formatC(R2, digits = 3, format = "f")))
      legend("topleft", legend = R2txt, bty = "n")
      Slope <- coef(thislm)["x"]
      Slopetxt <- paste("slope =", formatC(Slope, digits = 3, format = "f"))
      legend("bottomright", legend = Slopetxt, bty = "n")
      if(metric == "Zc") {
        # Add title with number of taxa at this rank
        plural <- switch(rank, "genus" = "genera", "family" = "families", "order" = "orders",
          "class" = "classes", "phylum" = "phyla", "superkingdom" = "superkingdoms")
        ntaxa <- length(unique(names(Zc[[rank]])))
        title(paste(ntaxa, plural), font.main = 1)
      }
    }
  }

  if(pdf) dev.off()

}

# nH2O-Zc plots for major phyla and their genera 20220114
geo16S_S3 <- function(pdf = FALSE) {

  if(pdf) pdf("geo16S_S3.pdf", width = 13, height = 11)

  mat <- matrix(c(1,1,2,2, 0,3,3,0), nrow = 2, byrow = TRUE)
  layout(mat)
  par(mar = c(4, 4, 2, 1))
  par(cex= 1.2)
  xlim <- c(-0.3, 0)
  ylim <- c(-0.85, -0.65)

  metrics <- read.csv(system.file("RefDB/RefSeq_206/taxon_metrics.csv.xz", package = "JMDplots"))
  names <- read.csv(system.file("RefDB/RefSeq_206/taxonomy.csv.xz", package = "JMDplots"))
  # Only keep taxa with non-NA genus and phylum
  names <- names[!(is.na(names$genus) | is.na(names$phylum)), ]

  # Plot phylum connected to all genera
  plotit <- function(phylum, col = 1) {
    genera <- unique(names$genus[names$phylum == phylum])
    # Lookup phylum and genera in metrics table
    iphylum <- match(phylum, metrics$group)
    igenera <- match(genera, metrics$group)
    # Use thicker lines and less transparency for phyla with fewer genera
    if(length(genera) < 50) lwd <- 2 else lwd <- 1
    if(length(genera) < 50) alpha.f <- 0.626 else alpha.f <- 0.312
    newcol <- adjustcolor(col, alpha.f = alpha.f)
    for(i in igenera) lines(c(metrics$Zc[iphylum], metrics$Zc[i]), c(metrics$nH2O[iphylum], metrics$nH2O[i]), col = newcol, lwd = lwd)
    lwd
  }

  # Get "majorcellular" phyla used in Figure 1
  phyla <- metrics[metrics$rank == "phylum" & metrics$parent != "Viruses", ]
  phyla <- phyla[phyla$ntaxa > 60, ]
  phyla <- phyla[order(phyla$ntaxa, decreasing = TRUE), ]
  # Swap Chloroflexi and Crenarchaeota so latter doesn't have same color as Euryarchaeota 20210527
  phyla[14:15, ] <- phyla[15:14, ]

  # Plot first 8 phyla
  plot(xlim, ylim, xlab = cplab$Zc, ylab = cplab$nH2O, type = "n", xaxs = "i", yaxs = "i")
  lwd <- lapply(1:8, function(i) {plotit(phyla$group[i], col = i)} )
  legend("topright", phyla$group[1:8], col = 1:8, lwd = lwd, cex = 0.8, bg = "white")
  label.figure("A", font = 2, cex = 1.5, xfrac = 0.03)

  # Plot second 8 phyla
  plot(xlim, ylim, xlab = cplab$Zc, ylab = cplab$nH2O, type = "n", xaxs = "i", yaxs = "i")
  lwd <- lapply(9:16, function(i) {plotit(phyla$group[i], col = i)} )
  legend("topright", phyla$group[9:16], col = 1:8, lwd = lwd, cex = 0.8, bg = "white")
  label.figure("B", font = 2, cex = 1.5, xfrac = 0.03)

  # Plot 8 more phyla 20220126
  phyla <- metrics[metrics$rank == "phylum" & metrics$parent != "Viruses", ]
  phyla <- phyla[phyla$ntaxa >= 18 & phyla$ntaxa <= 60, ]
  phyla <- phyla[order(phyla$ntaxa, decreasing = TRUE), ]
  plot(xlim, ylim, xlab = cplab$Zc, ylab = cplab$nH2O, type = "n", xaxs = "i", yaxs = "i")
  lwd <- lapply(1:8, function(i) {plotit(phyla$group[i], col = i)} )
  legend <- phyla$group[1:8]
  legend <- gsub("Candidatus Thermoplasmatota", "Candidatus", legend)
  legend <- c(legend, "Thermoplasmatota")
  legend("topright", legend, col = c(1:8, NA), lwd = lwd, cex = 0.8, bg = "white")
  label.figure("C", font = 2, cex = 1.5, xfrac = 0.03)

  if(pdf) dev.off()

}

# Venn diagrams for phylum and genus names in the RefSeq (NCBI), RDP, and SILVA taxonomies 20220117
geo16S_S4 <- function(pdf = FALSE) {

  # Read lists of RDP and SILVA names
  datadir <- system.file("extdata/geo16S/taxonomy", package = "JMDplots")
  RDPphyla <- readLines(file.path(datadir, "RDPphyla.txt"))
  RDPgenera <- readLines(file.path(datadir, "RDPgenera.txt"))
  SILVAphyla <- readLines(file.path(datadir, "SILVAphyla.txt"))
  SILVAgenera <- readLines(file.path(datadir, "SILVAgenera.txt"))
  # Read NCBI names
  NCBI <- read.csv(system.file("RefDB/RefSeq_206/taxonomy.csv.xz", package = "JMDplots"))
  NCBIphyla <- unique(na.omit(NCBI$phylum))
  NCBIgenera <- unique(na.omit(NCBI$genus))

  pl <- list()

  # Make Venn diagrams
  fillsRDP <- list(fill = c("transparent", "slategray3"))
  fillsSILVA <- list(fill = c("transparent", "lightpink2"))

  A <- length(setdiff(NCBIphyla, RDPphyla))
  B <- length(setdiff(RDPphyla, NCBIphyla))
  AB <- length(intersect(NCBIphyla, RDPphyla))
  pl[[1]] <- plot( euler(c(A = A, B = B, "A&B" = AB)), legend = list(labels = c("RefSeq (NCBI)", "RDP"), side = "bottom"), quantities = TRUE, fills = fillsRDP )

  A <- length(setdiff(NCBIphyla, SILVAphyla))
  B <- length(setdiff(SILVAphyla, NCBIphyla))
  AB <- length(intersect(NCBIphyla, SILVAphyla))
  pl[[2]] <- plot( euler(c(A = A, B = B, "A&B" = AB)), legend = list(labels = c("RefSeq (NCBI)", "SILVA"), side = "bottom"), quantities = TRUE, fills = fillsSILVA )

  A <- length(setdiff(NCBIgenera, RDPgenera))
  B <- length(setdiff(RDPgenera, NCBIgenera))
  AB <- length(intersect(NCBIgenera, RDPgenera))
  pl[[3]] <- plot( euler(c(A = A, B = B, "A&B" = AB)), legend = list(labels = c("RefSeq (NCBI)", "RDP"), side = "bottom"), quantities = TRUE, fills = fillsRDP )

  A <- length(setdiff(NCBIgenera, SILVAgenera))
  B <- length(setdiff(SILVAgenera, NCBIgenera))
  AB <- length(intersect(NCBIgenera, SILVAgenera))
  pl[[4]] <- plot( euler(c(A = A, B = B, "A&B" = AB)), legend = list(labels = c("RefSeq (NCBI)", "SILVA"), side = "bottom"), quantities = TRUE, fills = fillsSILVA )

  # Make the plot
  if(pdf) pdf("geo16S_S4.pdf", width = 8, height = 6)

  lg <- gridExtra::tableGrob(c("A. phyla", "B. genera"), theme = gridExtra::ttheme_minimal(base_size = 18))
  rg <- gridExtra::arrangeGrob(grobs = pl, ncol = 2)
  grid.draw(cbind(lg, rg, size = "last"))
  if(pdf) dev.off()

  # Save RDP and SILVA names not in NCBI for Supplementary Table
  out <- list(
    RDP_phylum_name_not_in_RefSeq = sort(setdiff(RDPphyla, NCBIphyla)),
    RDP_genus_name_not_in_RefSeq = sort(setdiff(RDPgenera, NCBIgenera)),
    SILVA_phylum_name_not_in_RefSeq = sort(setdiff(SILVAphyla, NCBIphyla)),
    SILVA_genus_name_not_in_RefSeq = sort(setdiff(SILVAgenera, NCBIgenera))
  )
  # Pad each list element to the same length with NA
  len <- max(sapply(out, length))
  out <- lapply(out, "[", 1:len)
  out <- do.call(cbind, out) 
  invisible(out)

}

# Correlations between Zc estimated from MG and 16S 20220112
geo16S_S5 <- function(pdf = FALSE, H2O = FALSE) {

  if(pdf) pdf("geo16S_S5.pdf", width = 10, height = 6)
  par(mfrow = c(2, 3))
  par(mar = c(4, 4, 4, 1), mgp = c(2.8, 1, 0))
  if(H2O) xylim <- c(-0.8, -0.70) else xylim <- c(-0.22, -0.12)

  # Plot regression line and R2
  lmfun <- function(dat, xylim) {
    x <- unlist(lapply(dat, "[[", "metric_MG"))
    y <- unlist(lapply(dat, "[[", "metric_16S"))
    # Add points and linear regression
    thislm <- lm(y ~ x)
    lines(xylim, predict(thislm, data.frame(x = xylim)), col = "#00000080")
    # Add R2 value
    R2 <- summary(thislm)$r.squared
    R2txt <- bquote(italic(R)^2 == .(formatC(R2, digits = 3, format = "f")))
    legend("topleft", legend = R2txt, bty = "n")
  }

  for(row in 1:2) {

    if(row == 2) lineage = "genus" else lineage <- NULL
    # Not used: show the estimates using only phylum-level classifications
    if(row == 3) lowest.level <- "phylum" else lowest.level <- NULL

    ## Panel 1: Various Environments
    if(H2O) {
      xlab <- quote(italic(n)*H[2]*O~"from shotgun metagenome or metatranscriptome")
      ylab <- quote(italic(n)*H[2]*O~"estimated from 16S rRNA")
    } else {
      xlab <- quote(italic(Z)[C]~"from shotgun metagenome or metatranscriptome")
      ylab <- quote(italic(Z)[C]~"estimated from 16S rRNA")
    }
    plot(xylim, xylim, type = "n", xlab = xlab, ylab = ylab)
    lines(xylim, xylim, lty = 2, col = "gray40")
    cex <- c(1, 1, 1, 1.4, 1, 1, 1.4, 0.9)
    dat <- mapply(MG16S, which = c("Guerrero_Negro", "ETNP_MG", "ETNP_MT", "Bison_Pool", "Mono_Lake", "Marcellus_Shale", "Manus_Basin", "Black_Sea"),
                   cex = cex, plot.lines = FALSE, MoreArgs = list(lowest.level = lowest.level, lineage = lineage, H2O = H2O), SIMPLIFY = FALSE)
    lmfun(dat, xylim)
    # Add title and figure label
    title("Various Environments + Marcellus + Manus + Black Sea     ", font.main = 1, cex.main = 1.1, line = 1)

    ## Panel 2: Human Microbiome Project
    if(H2O) xlab <- quote(italic(n)*H[2]*O~"from shotgun metagenome") else xlab <- quote(italic(Z)[C]~"from shotgun metagenome")
    if(H2O) xylimHMP <- c(-0.95, -0.70) else xylimHMP <- xylim
    plot(xylimHMP, xylimHMP, type = "n", xlab = xlab, ylab = ylab)
    lines(xylimHMP, xylimHMP, lty = 2, col = "gray40")
    dat <- lapply("HMP", MG16S, plot.lines = FALSE, lowest.level = lowest.level, lineage = lineage, rm.outliers = TRUE, H2O = H2O)
    lmfun(dat, xylimHMP)
    title("Human Microbiome Project", font.main = 1, cex.main = 1.1, line = 1)
    par(xpd = NA)
    if(row == 1) title("A. Lowest-level classifications (genus to phylum)", line = 2.5)
    if(row == 2) title("B. Genus-level classifications only (higher-level classifications discarded)", line = 2.5)
    if(row == 3) title("C. Phylum-level classifications (lineages truncated at phylum)", line = 2.5)
    par(xpd = FALSE)

    ## Panel 3: Soils and Mammamlian Guts
    plot(xylim, xylim, type = "n", xlab = xlab, ylab = ylab)
    lines(xylim, xylim, lty = 2, col = "gray40")
    dat <- lapply(c("Guts", "Soils"), MG16S, plot.lines = FALSE, lowest.level = lowest.level, lineage = lineage, H2O = H2O)
    lmfun(dat, xylim)
    title("Soils and Mammalian Guts", font.main = 1, cex.main = 1.1, line = 1)

  }

  if(pdf) dev.off()

}

# Correlation of Zc with GC content of metagenomic and 16S amplicon reads 20220126
geo16S_S6 <- function(pdf = FALSE) {

  if(pdf) pdf("geo16S_S6.pdf", width = 8, height = 7)
  par(mfrow = c(2, 2))
  par(mar = c(4, 4, 2, 1))
  par(mgp = c(2.8, 1, 0))

  # Change these to extract specific parts of the taxonomy
  lowest.level <- NULL
  lineage <- NULL

  # For MG datasets analyzed in this study (others are from gradox paper)
  ARASTdir <- system.file("extdata/geo16S/ARAST", package = "JMDplots")
  # Read data for paired metagenomes and amplicon sequences expanded from Tax4Fun paper (Aßhauer et al., 2015)
  AWDM15file <- system.file("extdata/geo16S/AWDM15.csv", package = "JMDplots")
  AWDM15 <- read.csv(AWDM15file)

  # Use semi-transparent colors 20220122
  c1 <- adjustcolor(1, alpha.f = 0.5)
  c2 <- adjustcolor(2, alpha.f = 0.69)
  c4 <- adjustcolor(4, alpha.f = 0.69)
  c5 <- adjustcolor(5, alpha.f = 0.69)
  c6 <- adjustcolor(6, alpha.f = 0.69)
  c8 <- adjustcolor(8, alpha.f = 0.69)

  for(name in c("Marcellus", "HMP")) {

    if(name == "HMP") {
      # HMP 16S
      met <- getmetrics_geo16S("HMP12", lowest.level = lowest.level, lineage = lineage)
      # NOTE: don't use 'metrics' argument here in order to get metadata for *all* samples
      metadata <- getmdat_geo16S("HMP12")
      # HMP metagenomes
      aa <- read.csv(file.path(ARASTdir, "HMP_AA.csv"))
      # Put data in same order
      dat <- AWDM15[AWDM15$Name == "HMP", ]
      imet <- match(dat$Amplicon, met$Run)
      met <- met[imet, ]
      iaa <- match(dat$Metagenome, aa$protein)
      aa <- aa[iaa, ]
      # Make sure the 16S and metagenomes are paired correctly
      stopifnot(all(na.omit(met$Run == dat$Amplicon)))
      stopifnot(all(aa$protein == dat$Metagenome))
      # Don't plot MG with low numbers of protein fragments 20220122
      ilow <- aa$chains < 50000
      if(any(ilow)) {
        metadata <- metadata[!ilow, ]
        met <- met[!ilow, ]
        aa <- aa[!ilow, ]
        dat <- dat[!ilow, ]
      }
      # Get Zc
      Zc_16S <- met$Zc
      Zc_MG <- Zc(aa)
      # Get GC
      GC_16S <- dat$GC_16S
      GC_MG <- dat$GC_MG
      # Colors: blue (Skin), green (Nasal cavity), gray (Oral cavity), red (GI tract), magenta (UG tract)
      col <- sapply(metadata$"Body site", switch, "Skin" = c5, "Nasal cavity" = c4, "Oral cavity" = c8, "GI tract" = c2, "UG tract" = c6)
      # Symbols: up triangle (skin, GI tract), circle (Oral cavity), down triangle (Nasal cavity, UG tract)
      pch <- sapply(metadata$"Body site", switch, "Skin" = 24, "Nasal cavity" = 25, "Oral cavity" = 21, "GI tract" = 24, "UG tract" = 25)
      legend.x <- "bottomright"
    }

    if(name == "Marcellus") {
      ## Marcellus metagenomes (Daly et al., 2016) 20211218
      aa <- read.csv(file.path(ARASTdir, "Marcellus_Shale_AA.csv"))
      Zc_MG <- Zc(aa)
      # Marcellus 16S (Cluff et al., 2014)
      metrics <- getmetrics_geo16S("CHM+14", lowest.level = lowest.level, lineage = lineage)
      mdat <- getmdat_geo16S("CHM+14", metrics)
      dat_16S <- mdat$metrics
      metadata <- mdat$metadata
      # Time points: input, T7, T13, T82, T328
      Sample = paste("Day", c(0, 7, 13, 82, 328))
      # List run IDs here
      Metagenome = c("SRR3111417", "SRR3111625", "SRR3111724", "SRR3111729", "SRR3111737")
      Amplicon = c("SRR1184016", "SRR1184060", "SRR1184062", "SRR1184081", "SRR1184083")
      # GC content from https://trace.ncbi.nlm.nih.gov/Traces/sra/?run=SRR******* 20220126
      GC_MG <- c(58.4, 37.7, 57, 39.6, 53.1)
      GC_16S <- c(53.9, 55, 54.8, 51.8, 53.2)
      # Make sure metagenomes are in correct order
      stopifnot(all(aa$protein == Metagenome))
      # Get 16S runs corresponding to metagenomes
      idat <- match(Amplicon, dat_16S$Run)
      dat_16S <- dat_16S[idat, ]
      Zc_16S <- dat_16S$Zc
      # Assign colors: open circle for injected fluid, gray for flowback, red for produced
      metadata <- metadata[idat, ]
      col <- rep(4, nrow(metadata))
      col[metadata$type == "flowback fluid"] <- 8
      col[metadata$type == "produced fluid"] <- 2
      pch <- 23
      legend.x <- "topleft"
    }

  #  ## Plot A: GC 16S vs GC MG
  #  plot(range(GC_MG), range(GC_16S), xlab = "GC content of shotgun metagenome reads (%)", ylab = "GC content of 16S amplicon reads (%)", type = "n")
  #  points(GC_MG, GC_16S, pch = pch, bg = col, col = c1)

    lmfun <- function(x, y) {
      # Add points and linear regression
      thislm <- lm(y ~ x)
      lines(range(x), predict(thislm, data.frame(x = range(x))), col = "#00000080")
      # Show R2 and slope
      R2 <- summary(thislm)$r.squared
      R2txt <- bquote(italic(R)^2 == .(formatC(R2, digits = 3, format = "f")))
      R2txt
    }

    ## Plot A: Zc MG vs GC MG
    plot(range(GC_MG), range(Zc_MG, na.rm = TRUE), xlab = "%GC content - metagenome", ylab = quote("Protein"~italic(Z)[C]~"- metagenome"), type = "n")
    points(GC_MG, Zc_MG, pch = pch, bg = col, col = c1)
    R2txt <- lmfun(GC_MG, Zc_MG)
    legend(legend.x, legend = R2txt, bty = "n")

    if(name == "Marcellus") legend("bottomright", c("Injected", "Flowback", "Produced"), pch = 23, pt.bg = c(4, 8, 2))

    ## Plot B: Zc 16S vs GC 16S
    plot(range(GC_16S), range(Zc_16S, na.rm = TRUE), xlab = "%GC content - 16S amplicon", ylab = quote("Protein"~italic(Z)[C]~"- 16S amplicon"), type = "n")
    points(GC_16S, Zc_16S, pch = pch, bg = col, col = c1)
    R2txt <- lmfun(GC_16S, Zc_16S)
    legend(legend.x, legend = R2txt, bty = "n")

    if(name == "HMP") legend("topleft", c("Skin", "Nasal cavity", "Oral cavity", "GI tract", "UG tract"), pch = c(24, 25, 21, 24, 25), pt.bg = c(c5, c4, c8, c2, c6), col = c1)

    # Add title
    par(xpd = NA)
    if(name == "Marcellus") title("A. Marcellus Shale                                                                                       ")
    if(name == "HMP") title("B. Human Microbiome Project                                                                                       ")
    par(xpd = FALSE)

  }

  if(pdf) dev.off()

}

# Get metadata for a study, appending columns for pch and col 20200914
getmdat_geo16S <- function(study, metrics = NULL, dropNA = TRUE) {
  # Read metadata file
  # Remove suffix after underscore 20200929
  studyfile <- gsub("_.*", "", study)
  datadir <- system.file("extdata/geo16S", package = "JMDplots")
  file <- file.path(datadir, "metadata", paste0(studyfile, ".csv"))
  metadata <- read.csv(file, as.is = TRUE, check.names = FALSE)

  if(dropNA) {
    # Exclude samples with NA name (e.g. outliers?) 20200916
    iname <- match("name", tolower(colnames(metadata)))
    noname <- is.na(metadata[, iname])
    if(any(noname)) {
      print(paste0("getmetadata [", study, "]: dropping ", sum(noname), " samples with NA name"))
      metadata <- metadata[!is.na(metadata[, iname]), ]
    }
  }
  # Use NULL pch as flag for unavailable dataset 20210820
  pch <- NULL

  ## Identify samples for computing differences in each study

  # Natural environments
  if(study == "BGPF13") { # Heart Lake Geyser Basin, Yellowstone
    pch <- sapply(metadata$cohort, switch, Bacteria = 22, Archaea = 23)
    col <- sapply(metadata$cohort, switch, Bacteria = 5, Archaea = 6)
  }
  if(study == "MPB+17") { # Manus Basin
    type <- metadata$type
    iwater <- type == "water/fluid"
    type[iwater][metadata$T[iwater] > 50] <- "highT"
    type[iwater][metadata$T[iwater] < 50] <- "lowT"
    pch <- sapply(type, switch, lowT = 21, highT = 23, "fauna surface" = 8, "rock/chimney" = 20, NA)
    # For fauna, use a darkened yellow4 with transparency 20210609
    col <- sapply(type, switch, lowT = 4, highT = 2, "fauna surface" = "#757500C0", "rock/chimney" = 1, NA)
  }
  if(study == "SVH+19") { # Black Sea
    pch <- sapply(metadata$Type, switch, Oxic = 24, Suboxic = 20, Euxinic = 25, NA)
    col <- sapply(metadata$Type, switch, Oxic = 4, Suboxic = 1, Euxinic = 2, NA)
  }
  if(study == "SVH+19_O2") {
    # oxic and anoxic groups for geo16S4() 20220118
    pch <- ifelse(metadata$O2 > 0.5, 24, 25)
    col <- ifelse(metadata$O2 > 0.5, 4, 2)
  }
  if(study == "XDZ+17") { # Qarhan Salt Lake
    pch <- sapply(metadata$type, switch, normal = 24, saline = 21)
    col <- sapply(metadata$type, switch, normal = 3, saline = 4)
  }
  if(study == "JHM+16") { # Lake Fryxell microbial mats
    pch <- sapply(metadata$type, switch, oxic = 24, transition = 20, anoxic = 25)
    col <- sapply(metadata$type, switch, oxic = 4, transition = 1, anoxic = 2)
  }
  if(study == "JHM+16_O2") {
    # oxic and anoxic groups for geo16S4() 20220118
    pch <- ifelse(metadata$O2 > 0.5, 24, 25)
    col <- ifelse(metadata$O2 > 0.5, 4, 2)
  }
  if(study == "HLA+16") { # Baltic Sea
    #pch <- sapply(metadata$type, switch, Oligohaline = 24, Mesohaline = 20, Marine = 21)
    #col <- sapply(metadata$type, switch, Oligohaline = 3, Mesohaline = 1, Marine = 4)
    type <- rep("moderate", nrow(metadata))
    type[metadata$salinity < 6] <- "low"
    type[metadata$salinity > 20] <- "high"
    pch <- sapply(type, switch, low = 24, moderate = 20, high = 21)
    col <- sapply(type, switch, low = 3, moderate = 1, high = 4)
  }
  if(study == "ZLM+16") { # Tibetan Plateau Lakes
    type <- rep("moderate", nrow(metadata))
    type[metadata$lake %in% c("Keluke", "Qing")] <- "low"
    type[metadata$lake == "Gasikule"] <- "high"
    pch <- sapply(type, switch, low = 24, moderate = 20, high = 21)
    col <- sapply(type, switch, low = 3, moderate = 1, high = 4)
  }
  if(study == "HCW+13") { # Guerrero Negro microbial mat
    pch <- sapply(metadata$zone, switch, A = 24, B = 20, C = 25)
    col <- sapply(metadata$zone, switch, A = 4, B = 1, C = 2)
  }

  # Shale gas datasets
  if(study == "UKD+18.water") {
    pch <- sapply(metadata$type, switch, "MSA+" = 21, "MSA-" = 1)
    col <- sapply(metadata$type, switch, "MSA+" = 2, "MSA-" = 1)
  }
  if(study == "UKD+18.sediment") {
    pch <- sapply(metadata$type, switch, "MSA+" = 21, "MSA-" = 1)
    col <- sapply(metadata$type, switch, "MSA+" = 2, "MSA-" = 1)
  }
  if(grepl("UKD\\+18.*2012", study)) {
    metadata <- metadata[metadata$year == 2012, ]
    pch <- sapply(metadata$type, switch, "MSA+" = 21, "MSA-" = 1)
    col <- sapply(metadata$type, switch, "MSA+" = 2, "MSA-" = 1)
  }
  if(grepl("UKD\\+18.*2013", study)) {
    metadata <- metadata[metadata$year == 2013, ]
    pch <- sapply(metadata$type, switch, "MSA+" = 21, "MSA-" = 1)
    col <- sapply(metadata$type, switch, "MSA+" = 2, "MSA-" = 1)
  }
  if(grepl("UKD\\+18.*2014", study)) {
    metadata <- metadata[metadata$year == 2014, ]
    pch <- sapply(metadata$type, switch, "MSA+" = 21, "MSA-" = 1)
    col <- sapply(metadata$type, switch, "MSA+" = 2, "MSA-" = 1)
  }
  if(grepl("UKD\\+18.*2015", study)) {
    metadata <- metadata[metadata$year == 2015, ]
    pch <- sapply(metadata$type, switch, "MSA+" = 21, "MSA-" = 1)
    col <- sapply(metadata$type, switch, "MSA+" = 2, "MSA-" = 1)
  }
  if(grepl("UKD\\+18.*2016", study)) {
    metadata <- metadata[metadata$year == 2016, ]
    pch <- sapply(metadata$type, switch, "MSA+" = 21, "MSA-" = 1)
    col <- sapply(metadata$type, switch, "MSA+" = 2, "MSA-" = 1)
  }
  if(study == "CUN+18") {
    pch <- sapply(metadata$type, switch, "UOG+" = 21, "UOG-" = 1, 0)
    col <- sapply(metadata$type, switch, "UOG+" = 2, "UOG-" = 1, 1)
  }
  if(study == "MMA+20") {
    # Exclude AMD streams
    metadata <- metadata[!(grepl("Bark_Camp_Sed", metadata$sample) | grepl("Boone_Sed", metadata$sample) | grepl("Boone_Dup_Sed", metadata$sample)), ]
#    # Include only streams categorized as "low/lowest" or "high/highest" disturbance intensity
#    metadata <- metadata[metadata$sDII < 15 | metadata$sDII > 30, ]
    # Include only streams categorized as "lowest" or "highest" disturbance intensity
    metadata <- metadata[metadata$sDII < 5 | metadata$sDII > 40, ]
    pch <- ifelse(metadata$sDII >= 20, 21, 1)
    col <- ifelse(metadata$sDII >= 20, 2, 1)
  }
  if(study == "MMA+20_spring") {
    # Exclude AMD streams
    metadata <- metadata[!(grepl("Bark_Camp_Sed", metadata$sample) | grepl("Boone_Sed", metadata$sample) | grepl("Boone_Dup_Sed", metadata$sample)), ]
    # Include only streams categorized as "lowest" or "highest" disturbance intensity
    metadata <- metadata[metadata$sDII < 5 | metadata$sDII > 40, ]
    # Include only spring samples
    ispring <- grep("^04", sapply(strsplit(metadata$sample, "_"), tail, 1))
    metadata <- metadata[ispring, ]
    pch <- ifelse(metadata$sDII >= 20, 21, 1)
    col <- ifelse(metadata$sDII >= 20, 2, 1)
  }
  if(study == "MMA+20_fall") {
    # Exclude AMD streams
    metadata <- metadata[!(grepl("Bark_Camp_Sed", metadata$sample) | grepl("Boone_Sed", metadata$sample) | grepl("Boone_Dup_Sed", metadata$sample)), ]
    # Include only streams categorized as "lowest" or "highest" disturbance intensity
    metadata <- metadata[metadata$sDII < 5 | metadata$sDII > 40, ]
    # Include only fall samples
    ifall <- grep("^09", sapply(strsplit(metadata$sample, "_"), tail, 1))
    metadata <- metadata[ifall, ]
    pch <- ifelse(metadata$sDII >= 20, 21, 1)
    col <- ifelse(metadata$sDII >= 20, 2, 1)
  }
  if(grepl("CHM+14", study, fixed = TRUE)) {
    # Injected fluids and later produced water
    if(study == "CHM+14_injected-49") metadata <- metadata[metadata$day == 0 | metadata$day >= 49, ]
    pch <- ifelse(metadata$day >= 49, 21, 1)
    col <- ifelse(metadata$day >= 49, 2, 1)
  }
  if(grepl("HRR+18", study, fixed = TRUE)) {
    if(study == "HRR+18_injected-22") metadata <- metadata[metadata$day == 0 | metadata$day >= 22, ]
    pch <- ifelse(metadata$day > 10, 21, 1)
    col <- ifelse(metadata$day > 10, 2, 1)
  }
  if(grepl("ZLF+19", study, fixed = TRUE)) {
    # Source water and flowback day 18
    if(study == "ZLF+19_injected-18") metadata <- metadata[metadata$day %in% c(-1, 18), ]
    pch <- ifelse(metadata$day >= 1, 21, 1)
    col <- ifelse(metadata$day >= 1, 2, 1)
  }

  # Stratified water datasets
  if(study == "MZG+20") {
    # Identify shallowest and deepest samples from Lakes Zug and Lugano
    newdat <- lapply(c("Lake Zug", "Lake Lugano"), function(lake) {
      ilake <- metadata$lake == lake
      range <- range(metadata$depth[ilake])
      iext <- metadata$depth[ilake] %in% range
      extdat <- metadata[ilake, ][iext, ]
      extdat$type <- ifelse(extdat$depth == range[1], "shallowest", "deepest")
      extdat
    })
    newdat <- do.call(rbind, newdat)
    # Keep all samples, but only assign pch and col shallowest and deepest samples
    metadata$type <- newdat$type[match(metadata$Run, newdat$Run)]
    pch <- ifelse(metadata$type == "shallowest", 24, 25)
    col <- ifelse(metadata$type == "shallowest", 4, 2)
  }
  if(study == "MZG+20_Zug") {
    metadata <- metadata[metadata$lake == "Lake Zug", ]
    pch <- ifelse(metadata$O2 > 0.5, 24, 25)
    col <- ifelse(metadata$O2 > 0.5, 4, 2)
  }
  if(study == "MZG+20_Lugano") {
    metadata <- metadata[metadata$lake == "Lake Lugano", ]
    pch <- ifelse(metadata$O2 > 0.5, 24, 25)
    col <- ifelse(metadata$O2 > 0.5, 4, 2)
  }
  if(study == "HXZ+20") {
    type <- rep("transition", nrow(metadata))
    type[metadata$O2 > 100] <- "oxic"
    type[metadata$O2 == 0] <- "anoxic"
    pch <- sapply(type, switch, oxic = 24, transition = 20, anoxic = 25)
    col <- sapply(type, switch, oxic = 4, transition = 1, anoxic = 2)
    type[metadata$station == "C4"] <- NA
    col[metadata$station == "C4"] <- NA
  }
  if(study == "GBL+15") {
    pch <- ifelse(metadata$O2 > 0.5, 24, 25)
    col <- ifelse(metadata$O2 > 0.5, 4, 2)
    pch[metadata$size != "0.2-1.6micron"] <- NA
    col[metadata$size != "0.2-1.6micron"] <- NA
  }
  if(study == "BCA+21") {
    # "type" column is for orp16S paper, not geo16S
    type <- rep("transition", nrow(metadata))
    type[metadata$depth < 3] <- "oxic"
    type[metadata$depth > 4] <- "anoxic"
    # pch is for geo16S4() 20220118
    pch <- ifelse(metadata$O2 > 0.5, 24, 25)
    col <- ifelse(metadata$O2 > 0.5, 4, 2)
  }
  if(study == "EH18") {
    pch <- ifelse(metadata$O2 > 0.5, 24, 25)
    col <- ifelse(metadata$O2 > 0.5, 4, 2)
  }

  # Dataets for metagenome comparison 20220111
  if(study == "MKK+11") {
    pch <- rep(NA, nrow(metadata))
    col <- rep(NA, nrow(metadata))
  }
  if(study == "FLA+12") {
    pch <- rep(NA, nrow(metadata))
    col <- rep(NA, nrow(metadata))
  }
  if(study == "HMP12") {
    pch <- rep(NA, nrow(metadata))
    col <- rep(NA, nrow(metadata))
  }
  if(study == "SMS+12") {
    # pch is for geo16S4() 20220118
    pch <- ifelse(metadata$O2 > 0.5, 24, 25)
    col <- ifelse(metadata$O2 > 0.5, 4, 2)
  }

  if(is.null(pch)) stop(paste(study, "metadata file exists, but not set up for processing"))

  metadata <- cbind(metadata, pch, col)
  # Return both metadata and metrics, if provided 20220506
  if(is.null(metrics)) metadata else {
    # Keep metadata only for samples with metrics 20201006
    metadata <- metadata[metadata$Run %in% metrics$Run, ]
    # Put metrics in same order as metadata 20220505
    imet <- match(metadata$Run, metrics$Run)
    metrics <- metrics[imet, ]
    # Insert sample column in metrics
    # Use first column name starting with "sample" or "Sample" 20210818
    sampcol <- grep("^sample", colnames(metadata), ignore.case = TRUE)[1]
    metrics <- cbind(data.frame(Run = metrics$Run, sample = metadata[, sampcol]), metrics[, -1, drop = FALSE])
    list(metadata = metadata, metrics = metrics)
  }
}

# Function to calculate metrics for a given study 20220506
getmetrics_geo16S <- function(study, quiet = TRUE, ...) {
  # Remove suffix after underscore 20200929
  studyfile <- gsub("_.*", "", study)
  datadir <- system.file("extdata/geo16S/RDP", package = "JMDplots")
  RDPfile <- file.path(datadir, paste0(studyfile, ".tab.xz"))
  # If there is no .xz file, look for a .tab file 20210607
  if(!file.exists(RDPfile)) RDPfile <- file.path(datadir, paste0(studyfile, ".tab"))
  RDP <- read_RDP(RDPfile, quiet = quiet, ...)
  map <- map_taxa(RDP, refdb = "RefSeq_206", quiet = quiet)
  get_metrics(RDP, map = map, refdb = "RefSeq_206", taxon_AA = taxon_AA$RefSeq_206)
}

# Function to calculate and plot metrics for a given study 20220506
plotmet_geo16S <- function(study, quiet = TRUE, ...) {
  metrics <- getmetrics_geo16S(study, quiet = quiet)
  mdat <- getmdat_geo16S(study, metrics)
  pm <- plot_metrics(mdat, ...)
  # Prepend study column
  cbind(study = study, pm)
}