JMDplots: Plots from Papers by Jeffrey M. Dick

Documented in genoGOE_1

# JMDplots/genoGEO.R
# Make plots for paper:
# Genomes record the Great Oxidation Event
# 20231206 jmd first version
# 20240328 Moved to JMDplots
# 20240409 Add Rubisco plots
# 20240528 Analyze methanogen genomes
# 20240803 Compare Rubisco proteins and unrelated genomes (Fig. 3C)

# Figure 1: Genome-wide differences of oxidation state between two lineages of methanogens
genoGOE_1 <- function(pdf = FALSE) {

  if(pdf) pdf("Figure_1.pdf", width = 8, height = 6)
  mat <- matrix(c(1,2,3, 1,2,4, 5,5,5), nrow = 3, byrow = TRUE)
  layout(mat, heights = c(1, 1, 2))
  opar <- par(mgp = c(2.8, 1, 0), mar = c(5.1, 4.1, 2.1, 2.1))

  # Read methanogen genomes information
  mgfile <- system.file("extdata/genoGOE/GTDB/methanogen_genomes.csv", package = "JMDplots")
  mg <- read.csv(mgfile)
  # Genomes in Halobacteriota
  Halo <- mg$Genome[mg$Methanogen_class == "II"]
  # Genomes in Methanobacteriota
  Methano <- mg$Genome[mg$Methanogen_class == "I"]

  # Panels A-B: Zc and GC of marker genes 20240528

  # Read data for GTDB marker genes
  markerfile <- system.file("extdata/genoGOE/GTDB/ar53_msa_marker_info_r220_XHZ+06.csv", package = "JMDplots")
  markerdat <- read.csv(markerfile)
  markerid <- sapply(strsplit(markerdat$Marker.Id, "_"), "[", 2)
  methanogendir <- system.file("extdata/genoGOE/methanogen", package = "JMDplots")

  # NULL values for variables used in subset()
  protein <- gene <- NULL

  get_Zc <- function(phylum = "Halo") {
    files <- file.path(methanogendir, "marker", "faa", paste0(markerid, ".faa.xz"))
    aalist <- lapply(files, function(file) {
      aa <- suppressMessages(read_fasta(file))
      fivenum(Zc(subset(aa, protein %in% get(phylum))))
    })
    do.call(rbind, aalist)
  }

  get_GC <- function(phylum = "Halo") {
    files <- file.path(methanogendir, "marker", "fna", paste0(markerid, ".fna.xz"))
    nalist <- lapply(files, function(file) {
      na <- suppressMessages(read_fasta(file, molecule = "DNA"))
      na <- subset(na, gene %in% get(phylum))
      GC <- (na$G + na$C) / (na$G + na$C + na$A + na$T)
      fivenum(GC)
    })
    do.call(rbind, nalist)
  }

  # Get Zc for species in each phylum
  Zc_Halo <- get_Zc("Halo")
  Zc_Methano <- get_Zc("Methano")

  # Order by median Zc of Methanobacteriota
  iord <- order(Zc_Methano[, 3])
  Zc_Halo <- Zc_Halo[iord, ]
  Zc_Methano <- Zc_Methano[iord, ]

  # Plot IQR of Zc
  plot(c(1, 53), c(-0.28, -0.04), xlab = "Marker gene", ylab = quote("Protein"~italic(Z)[C]), type = "n")
  for(i in 1:53) {
    lines(c(i, i) - 0.1, Zc_Methano[i, c(2, 4)], col = 2)
    lines(c(i, i) + 0.1, Zc_Halo[i, c(2, 4)], col = 4)
  }
  # Add legend for Class I and II methanogens
  legend("bottomright", "Class I", lty = 1, col = 2, bty = "n")
  legend("topleft", "Class II", lty = 1, col = 4, bty = "n")
  label.figure("A", font = 2, cex = 1.6)
  # Calculate p-value 20250304
  # Use median value in each group (3rd column) and paired observations
  p <- t.test(Zc_Halo[, 3], Zc_Methano[, 3], paired = TRUE)$p.value
  ptext <- bquote(italic(p) == .(signif(p, 2)))
  text(5, par("usr")[3], ptext, adj = c(0, -0.5))

  # Get GC for species in each phylum
  GC_Halo <- get_GC("Halo")
  GC_Methano <- get_GC("Methano")
  GC_Halo <- GC_Halo[iord, ]
  GC_Methano <- GC_Methano[iord, ]

  # Plot IQR of GC
  plot(c(1, 53), c(0.25, 0.65), xlab = "Marker gene", ylab = "GC content", type = "n")
  for(i in 1:53) {
    lines(c(i, i) - 0.1, GC_Methano[i, c(2, 4)], col = 2)
    lines(c(i, i) + 0.1, GC_Halo[i, c(2, 4)], col = 4)
  }
  label.figure("B", font = 2, cex = 1.6)
  # Calculate p-value 20250304
  # Use median value in each group (3rd column) and paired observations
  p <- t.test(GC_Halo[, 3], GC_Methano[, 3], paired = TRUE)$p.value
  ptext <- bquote(italic(p) == .(signif(p, 2)))
  text(5, par("usr")[3], ptext, adj = c(0, -0.5))

  # Panel C: Delta Zc for marker genes

  # Plot Delta Zc vs Delta GC
  par(mar = c(4.1, 4.1, 1.1, 2.1))
  Delta_Zc <- na.omit(Zc_Halo[, 3] - Zc_Methano[, 3])
  Delta_GC <- na.omit(GC_Halo[, 3] - GC_Methano[, 3])
  plot(Delta_GC, Delta_Zc, xlab = quote(Delta*"GC"),
    ylab = quote(Delta*italic(Z)[C]~"(Class II - Class I)                                                         "),
    pch = 19, col = adjustcolor(1, alpha.f = 0.5), xpd = NA)
  # Calculate linear fit
  mylm <- lm(Delta_Zc ~ Delta_GC)
  x <- range(Delta_GC)
  y <- predict.lm(mylm, data.frame(Delta_GC = x))
  # Plot linear fit and show R2
  lines(x, y, lty = 2, lwd = 1.5, col = 8)
  R2 <- summary(mylm)$r.squared
  R2_txt <- bquote(italic(R)^2 == .(formatC(R2, digits = 2, format = "f")))
  legend("topleft", legend = R2_txt, bty = "n", inset = c(-0.05, 0))
  label.figure("C", font = 2, cex = 1.6, yfrac = 0.9)

  # Plot Delta Zc vs log10 protein abundance in M. maripaludis 20240531
  Delta_Zc <- Zc_Halo[, 3] - Zc_Methano[, 3]
  abundance <- markerdat$Redundant.Peptides / markerdat$MW
  log10a <- log10(abundance)
  plot(log10a, Delta_Zc, xlab = quote(log[10]~"protein abundance in"~italic("M. maripaludis")), ylab = "", pch = 19, col = adjustcolor(1, alpha.f = 0.5))
  # Calculate linear fit
  mylm <- lm(Delta_Zc ~ log10a)
  x <- range(log10a)
  y <- predict.lm(mylm, data.frame(log10a = x))
  # Plot linear fit and show R2
  lines(x, y, lty = 2, lwd = 1.5, col = 8)
  R2 <- summary(mylm)$r.squared
  R2_txt <- bquote(italic(R)^2 == .(formatC(R2, digits = 2, format = "f")))
  legend("topleft", legend = R2_txt, bty = "n", inset = c(-0.05, 0))

  par(opar)

  # Panel D: Zc controlled for various factors 20240529
   
  # Get values of Zc, GC, and Cost
  genomes <- mg$Genome
  values <- lapply(genomes, function(genome) {
    aa <- read.csv(file.path(methanogendir, "aa", paste0(genome, "_aa.csv.xz")))
    data.frame(
      Zc = Zc(aa),
      GC = aa$abbrv,
      Cost = Cost(aa)
    )
  })
  names(values) <- genomes

  # NULL values for variables used in subset()
  GC <- Cost <- NULL
  # Get mean Zc for segment (phylum x condition)
  get_mean_Zc <- function(phylum = "Halo", condition = "all") {
    genomes <- get(phylum)
    myval <- values[genomes]
    if(condition == "low_GC") myval <- lapply(myval, subset, GC < 0.34)
    if(condition == "mid_GC") myval <- lapply(myval, subset, GC >= 0.34 & GC <= 0.36)
    if(condition == "high_GC") myval <- lapply(myval, subset, GC > 0.36)
    if(condition == "low_Cost") myval <- lapply(myval, subset, Cost < 23)
    if(condition == "mid_Cost") myval <- lapply(myval, subset, Cost >= 23 & Cost <= 24)
    if(condition == "high_Cost") myval <- lapply(myval, subset, Cost > 25)
    print(paste("median", median(sapply(myval, nrow)), "proteins for", phylum, condition))
    sapply(sapply(myval, "[", "Zc"), "mean")
  }

  Zc <- data.frame(
    Methano_all = get_mean_Zc("Methano", "all"),
    Halo_all = get_mean_Zc("Halo", "all"),
    Methano_low_GC = get_mean_Zc("Methano", "low_GC"),
    Halo_low_GC = get_mean_Zc("Halo", "low_GC"),
    Methano_mid_GC = get_mean_Zc("Methano", "mid_GC"),
    Halo_mid_GC = get_mean_Zc("Halo", "mid_GC"),
    Methano_high_GC = get_mean_Zc("Methano", "high_GC"),
    Halo_high_GC = get_mean_Zc("Halo", "high_GC"),
    Methano_low_Cost = get_mean_Zc("Methano", "low_Cost"),
    Halo_low_Cost = get_mean_Zc("Halo", "low_Cost"),
    Methano_mid_Cost = get_mean_Zc("Methano", "mid_Cost"),
    Halo_mid_Cost = get_mean_Zc("Halo", "mid_Cost"),
    Methano_high_Cost = get_mean_Zc("Methano", "high_Cost"),
    Halo_high_Cost = get_mean_Zc("Halo", "high_Cost")
  )

  # Don't plot overall line
  what <- c(FALSE, TRUE, TRUE, TRUE)
  # Start beanplot with all proteins
  par(mar = c(3.1, 4.1, 4.1, 2.1))
  bp <- beanplot(Zc[, 1:2], side = "both", col = list(c(2, 7, 2, 2), c(4, 3, 4, 4)), xlim = c(0.5, 7.5), what = what, names = "")
  # Add means for species in each phylum
  abline(h = bp$stats[1], col = 2, lty = 2)
  abline(h = bp$stats[2], col = 4, lty = 2)
  # Add labels for methanogen classes
  text(0.6, bp$stats[1] + 0.008, "Class I")
  text(1.6, bp$stats[2] + 0.008, "Class II")
  # Add beans for GC and Cost
  beanplot(Zc[, 3:14], side = "both", col = list(c(2, 7, 2, 2), c(4, 3, 4, 4)), xlim = c(0.5, 7.5), what = what, names = character(6), add = TRUE, at = 2:7)
  mtext(quote("Protein"~italic(Z)[C]), 2, line = 2.8, cex = par("cex"))


  # Add group names
  axis(1, at = 2:4, labels = c("GC < 0.34", "0.34 < GC < 0.36", "GC > 0.36"))
  axis(1, at = 5:7, labels = c("Cost < 23", "23 < Cost < 25", "Cost > 25"))
  axis(3, at = c(1, 3, 6), labels = c("Entire genomes", "Control for GC content", "Control for metabolic cost"), tick = FALSE, font = 2)

  label.figure("D", font = 2, cex = 1.6, xfrac = 0.018)

  if(pdf) dev.off()

}

# Figure 2: Carbon oxidation state of proteins in eukaryotic gene age groups
genoGOE_2 <- function(pdf = FALSE, metric = "Zc") {

  if(pdf) pdf("Figure_2.pdf", width = 7, height = 6)
  layout(matrix(1:2), heights = c(1.2, 1.7))

  # Gene ages from Liebeskind et al. (2016)
  datadir <- system.file("extdata/evdevH2O/LMM16", package = "JMDplots")
  modeAges <- read.csv(file.path(datadir, "modeAges_names.csv"))
  # Read list of reference proteomes
  refprot <- read.csv(file.path(datadir, "reference_proteomes.csv"))
  # Read summed amino acid compositions for proteins in each modeAge in each organism 20231218
  aa <- read.csv(file.path(datadir, "modeAges_aa.csv"))

  if(metric == "Zc") {
    ylab <- "Carbon oxidation state of proteins"
    ylim <- c(-0.18, -0.06)
    yOE <- -0.07
    ytick.at <- seq(-0.18, -0.06, 0.04)
    ylabels.at <- c(-0.18, -0.06)
  }
  if(metric == "nO2") {
    ylab <- "Stoichiometric oxidation state of proteins"
    ylim <- c(-0.75, -0.55)
    yOE <- -0.01
    ytick.at <- seq(-0.75, -0.55, 0.05)
    ylabels.at <- c(-0.75, -0.55)
  }
  if(metric == "nH2O") {
    ylab <- "Stoichiometric hydration state of proteins"
    ylim <- c(-0.8, -0.65)
    yOE <- -0.67
    ytick.at <- seq(-0.8, -0.65, 0.05)
    ylabels.at <- c(-0.8, -0.65)
  }
  if(metric == "Cost") {
    ylab <- "Metabolic cost"
    ylim <- c(22, 24)
    yOE <- 11
    ytick.at <- seq(22, 24, 0.5)
    ylabels.at <- c(22, 24, 1)
  }

  # Loop over lineages
  col <- c(adjustcolor("blue1", 0.6), adjustcolor("green4", 0.6))
  coltext <- c("blue1", "green4")
  lineages <- c("Mammalia", "Saccharomyceta")
  for(j in seq_along(lineages)) {
    # Adjust margins and colors
    if(j == 1) {
      par(mar = c(2.5, 4.4, 2, 3))
    }
    if(j == 2) {
      par(mar = c(7.3, 4.4, 2.5, 3))
    }
    # Start plot
    plot(c(1, 7), ylim, xaxt = "n", xlab = "", yaxt = "n", ylab = "", yaxs = "i", type = "n", font.lab = 2)
    axis(2, ytick.at, labels = FALSE)
    axis(2, ylabels.at, tick = FALSE)
    # Loop over organisms in this lineage
    ilineage <- which(modeAges$X8 %in% lineages[j])
    for(i in ilineage) {
      # Get modeAge and amino acid composition for this organism
      OSCODE <- refprot$OSCODE[i]
      myaa <- aa[aa$organism == OSCODE, ]
      modeAge <- myaa$protein
      # Get chemical metric for each modeAge
      vals <- get(metric)(myaa)
      # Remove Euk+Bacteria (non-phylogenetic age category) 20231210
      vals <- vals[-3]
      modeAge <- head(modeAge, -1)
      # Add lines to plot
      lines(modeAge, vals, col = col[j])
    }
    if(j == 1) inset <- c(-0.02, 0.5) else inset <- c(-0.02, 0.28)
    legend("topleft", lineages[j], bty = "n", inset = inset)
    # Lines for GOE
    lines(c(2.1, 2.5), c(yOE, yOE), lwd = 4, col = 2)
    if(j == 1) {
      # Top axis: GOE and NOE
      mtext("GOE", side = 3, at = 2.3, line = 0.5, font = 2)
      mtext("NOE", side = 3, at = 5, line = 0.5, font = 2)
      # Lines for NOE
      lines(c(4.5, 6.5), c(yOE, yOE), lwd = 4, col = "red3")
      # Divergence times (TimeTree 5)
      Mya <- c(4250, "3500-2600", 1598, 1275, 743, 563, 180)
      # Put Mya labels on bottom of first panel ...
      Mya_axis <- 1
    } else {
      # ... or top of second panel
      Mya_axis <- 3
      Mya <- c(4250, "3500-2600", 1598, 1275, 642, 528, 523)
      # Lines for NOE
      lines(c(4.5, 5.5), c(yOE, yOE), lwd = 4, col = 2)
      # Bottom axis: tick marks and names for shared ancestry
      axis(1, at = 1:7, labels = FALSE)
      labels <- c("Cellular organisms", "Eukaryotes + Archaea", "Eukaryota", "Opisthokonta")
      text(x = 1:4, y = par()$usr[3] - 1.5 * strheight("A"), labels = labels, srt = 45, adj = 1, xpd = TRUE)
      # Bottom axis: names for Mammalia and Saccharomyceta lineages
      dx <- 0.2
      labels_slash <- c("/", "/", "/")
      text(x = (5:7) - dx, y = par()$usr[3] - 1.5 * strheight("A"), labels = labels_slash, srt = 45, adj = 1, xpd = TRUE)
      labels_Mammalia <- c("Eumetazoa  ", "Vertebrata  ", "Mammalia  ")
      text(x = (5:7) - dx, y = par()$usr[3] - 1.5 * strheight("A"), labels = labels_Mammalia, srt = 45, adj = 1, xpd = TRUE, col = coltext[1])
      labels_Saccharomyceta <- c("Dikarya", "Ascomycota", "Saccharomyceta")
      text(x = (5:7) + dx, y = par()$usr[3] - 1.5 * strheight("A"), labels = labels_Saccharomyceta, srt = 45, adj = 1, xpd = TRUE, col = coltext[2])
    }
    # Add labels for divergence times
    at <- seq_along(Mya)
    axis(Mya_axis, at, Mya)
    # Add plot label 20240803
    label.plot(LETTERS[j], font = 2, cex = 1.2, xfrac = 0.04)
  }
  # Outer axis labels
  mtext(ylab, side = 2, line = 3, adj = -0.68, font = 2)
  mtext("Gene\nage\n(Ma)", side = 4, line = 1.5, font = 2, las = 1, at = -0.0215, adj = 0.5)

  if(pdf) dev.off()

}

# Figure 3: Evolutionary oxidation of ancestral Rubiscos and thermodynamic prediction of redox boundaries around the GOE
genoGOE_3 <- function(pdf = FALSE) {

  if(pdf) pdf("Figure_3.pdf", width = 9, height = 11)
  layout(matrix(c(0,1,1,0, 2,2,3,3, 4,4,5,5), nrow = 3, byrow = TRUE))
  par(cex = 1)

  # Read amino acid compositions
  fasta_file <- system.file("extdata/fasta/KHAB17.fasta", package = "canprot")
  aa <- read_fasta(fasta_file)
  # Assign protein names
  aa$protein <- sapply(strsplit(aa$protein, "_"), "[", 2)

  # Panel A: Zc vs ancestry

  xlab <- "Ancestral sequences (older to younger)"
  par(mar = c(4.0, 4.0, 2.5, 1.0), mgp = c(2.5, 1, 0))
  # Get point locations
  xs <- 1:6
  ys <- Zc(aa)
  # Start plot
  plot(xs, ys, type = "n", xaxt = "n", xlab = xlab, ylab = cplab$Zc)
  # Plot main branch (excluding Anc I/III')
  lines(xs[-3], ys[-3], type = "b", pch = NA)
  # Add points
  points(xs[-3], ys[-3], pch = 19)
  points(xs[3], ys[3], pch = 19, col = 8)
  axis(1, at = 1:6, aa$protein)
  abline(v = 3.5, lty = 2, col = 4, lwd = 2)
  text(2.5, -0.126, "GOE (proposed)")
  title("Evolutionary oxidation of Rubiscos", font.main = 1)
  label.figure("A", cex = 1.5, font = 2, yfrac = 0.936)

  # Panel B: Pairwise stability boundaries for Rubisco

  # Add proteins to CHNOSZ
  ip <- add.protein(aa, as.residue = TRUE)
  # Setup basis species and swap O2 for e- to make Eh-pH diagram
  basis("QEC+")
  swap.basis("O2", "e-")
  # Set resolution
  res <- 300
  
  # Loop over individual pairs
  for(pre in 1:3) {
    for(post in 4:6) {
      add <- TRUE
      if(pre == 1 & post == 4) add <- FALSE
      a <- affinity(pH = c(0, 14, res), Eh = c(-0.5, 0.8, res), iprotein = ip[c(pre, post)])
      d <- diagram(a, names = "", lty = 2, col = "#000000b0", add = add, limit.water = !add, fill.NA = "gray80", xlab = "pH", ylab = axis.label("Eh"))
      # Only label lines for reaction with Anc. I
      if(post == 4) {
        # Sort x values and get x and y values of boundary line
        order <- order(d$linesout[[1]])
        xs <- d$linesout[[1]][order]
        ys <- d$linesout[[2]][order]
        # Get a single value along the length of the line
        ilab <- floor(length(xs) * pre * 3 / 10)
        x <- xs[ilab]
        y <- ys[ilab]
        text(x, y - 0.03, aa$protein[pre], cex = 0.6)
        text(x, y + 0.03, aa$protein[post], cex = 0.6)
      }
    }
  }

  text(5.5, 0.64, hyphen.in.pdf("Higher AFFINITY\nfor post-GOE protein\nin each pair"), cex = 0.8)
  text(4.5, -0.15, hyphen.in.pdf("Higher AFFINITY for\npre-GOE protein in each pair"), cex = 0.8, srt = -24)
  title("Pairwise relative stabilities of Rubiscos", font.main = 1)
  label.figure("B", cex = 1.5, font = 2, yfrac = 0.936)

  # Panel C: Groupwise stability boundaries for Rubisco

  # Calculate affinity of composition reactions for all proteins
  aout <- affinity(pH = c(0, 14, res), Eh = c(-0.5, 0.8, res), iprotein = ip)
  # Set up groups for affinity ranking:
  # 3 pre-GOE and 3 post-GOE proteins
  groups <- list(pre = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE), post = c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE))
  arank <- rank.affinity(aout, groups = groups)
  diagram(arank, lwd = 2, col = 4, names = "", xlab = "pH", ylab = axis.label("Eh"))
  text(3.5, -0.07, hyphen.in.pdf("Higher affinity RANKING\nfor pre-GOE proteins"), col = 4, font = 2, cex = 0.8, srt = -24)
  text(3.5, 0.25, hyphen.in.pdf("Higher affinity RANKING\nfor post-GOE proteins"), col = 4, font = 2, cex = 0.8, srt = -24)
  title("Groupwise relative stabilities of Rubiscos", font.main = 1)
  # Add arrow
  arrows(7, -0.2, 7, 0.1, length = 0.2, lwd = 2, col = 4)
  text(9.5, 0.15, "Rubiscos became\nmore oxidized\nover the GOE", cex = 0.9)
  label.figure("C", cex = 1.5, font = 2, yfrac = 0.936)


  # Panel D: Comparison between Rubiscos and methanogen and Nitrososphaeria genomes
  yvar <- "O2"
  genoGOE_3D(yvar, res)

  # Label lines
  if(yvar == "Eh") {
    text(8.1, -0.07, "Rubiscos", cex = 0.8)
    text(7.4, -0.081, hyphen.in.pdf("Post-GOE"), srt = -27, cex = 0.75)
    text(7.33, -0.105, hyphen.in.pdf("Pre-GOE"), srt = -27, cex = 0.75)

    text(8.3, -0.19, "Methanogen\ngenomes", cex = 0.8)
    text(7.35, -0.215, "Class I", srt = -33, cex = 0.75)
    text(7.5, -0.2, "Class II", srt = -33, cex = 0.75)

    text(4.9, -0.18, "Nitrososphaeria\ngenomes", cex = 0.8)
    text(6.03, -0.17, "Basal", srt = -33, cex = 0.75)
    text(6.2, -0.155, "Terrestrial", srt = -33, cex = 0.75)
  }
  if(yvar == "O2") {
    text(7, -61, " Rubiscos", cex = 0.8, adj = 0)
    text(7, -61, hyphen.in.pdf("Pre-GOE "), cex = 0.75, adj = 1, srt = 20)
    text(7, -59.8, hyphen.in.pdf("Post-GOE "), cex = 0.75, adj = 1, srt = 20)

    text(7, -67, " Methanogen genomes", cex = 0.8, adj = 0)
    text(6.5, -67.8, "Class I ", cex = 0.75)
    text(6.5, -66.7, "Class II ", cex = 0.75)

    text(7, -69.2, " Nitrososphaeria genomes", cex = 0.8, adj = 0)
    text(6.5, -70.2, "Basal ", cex = 0.75)
    text(6.5, -69.2, "Terrestrial ", cex = 0.75)
  }

  title("Groupwise relative stabilities of Rubiscos\nand unrelated genomes", font.main = 1, xpd = NA)
  label.figure("D", cex = 1.5, font = 2, yfrac = 0.936)

  # Add more arrows 20240812
  plot.new()
  par(xpd = NA)
  if(yvar == "Eh") {
    arrows(0, 0.1, 0, 0.3, length = 0.2, lwd = 2, col = 7)
    arrows(0.1, 0.1, 0.1, 0.3, length = 0.2, lwd = 2, col = 2)
    arrows(0.05, 0.4, 0.05, 0.6, length = 0.2, lwd = 2, col = 4)
    text(0.5, 0.35, "1: Proteins in many organisms\nbecame more oxidized\nover the GOE")
    text(0.05, 0.85, hyphen.in.pdf("2: Rubisco records\nmore oxidizing conditions\ncompared to genomes of\nnon-photosynthetic organisms\nat the time of the GOE"))
  }
  if(yvar == "O2") {
    arrows(-0.08, 0.12, -0.08, 0.32, length = 0.2, lwd = 2, col = 7)
    arrows(-0.02, 0.18, -0.02, 0.38, length = 0.2, lwd = 2, col = 2)
    arrows(-0.05, 0.7, -0.05, 0.9, length = 0.2, lwd = 2, col = 4)
    text(0.05, 0.25, "1: Protein sequences in many lineages\nwere oxidized over the GOE", adj =0)
    text(0.05, 0.8, hyphen.in.pdf("2: Rubisco records more oxidizing conditions\ncompared to genomes of non-photosynthetic\norganisms at the time of the GOE"), adj = 0)
  }

  if(pdf) dev.off()

}


# Code for Figure 3D
# Comparison of Rubiscos and methanogen and Nitrososphaeria genomes
genoGOE_3D <- function(yvar = "Eh", res = 400, add = FALSE, lwd = 2, pHlim = c(4, 10), Ehlim = c(-0.3, 0.1), O2lim = c(-72.5, -58), alpha.f = 1, Eh7_las = 1) {

  # Setup basis species
  basis("QEC+")
  # Swap O2 for e- to make Eh-pH diagram
  if(yvar == "Eh") swap.basis("O2", "e-")

  for(i in 1:3) {

    if(i == 1) {
      # Methanogen proteomes 20220424
      # Read amino acid composition and compute Zc
      aa <- read.csv(system.file("extdata/utogig/methanogen_AA.csv", package = "JMDplots"))
      # Indices of Class I and Class II methanogens
      iI <- 20:36
      iII <- 1:19
      # Get the species in each group
      groups <- list("Class I" = iI, "Class II" = iII)
      col <- 2
      add <- add
    }

    if(i == 2) {
      # Thaumarchaeota 20220414
      # Amino acid compositions of predicted (Glimmer) and database (NCBI or IMG) proteomes
      predicted <- read.csv(system.file("extdata/utogig/Thaumarchaeota_predicted_AA.csv", package = "JMDplots"))
      database <- read.csv(system.file("extdata/utogig/Thaumarchaeota_database_AA.csv", package = "JMDplots"))
      # If both are available, use predicted instead of database
      aa <- rbind(predicted, database)
      aa <- aa[!duplicated(aa$organism), ]
      groupnames <- c("Basal", "Terrestrial", "Shallow", "Deep")
      # Get the species in each group
      groups <- sapply(groupnames, function(group) aa$protein == group, simplify = FALSE)
      # Compare Basal to Terrestrial 20240802
      groups <- groups[1:2]
      col <- 7
      add <- TRUE
    }

    if(i == 3) {
      # Rubisco 20240802
      # Read amino acid compositions
      fasta_file <- system.file("extdata/fasta/KHAB17.fasta", package = "canprot")
      aa <- read_fasta(fasta_file)
      # Assign protein names
      aa$protein <- sapply(strsplit(aa$protein, "_"), "[", 2)
      # Set up groups for affinity ranking:
      # 3 pre-GOE and 3 post-GOE proteins
      groups = list(pre = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE), post = c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE))
      col <- 4
      add <- TRUE
    }

    # Make affinity ranking plot 20220602
    # Load proteins and calculate affinity
    ip <- add.protein(aa, as.residue = TRUE)
    if(yvar == "Eh") aout <- affinity(pH = c(pHlim, res), Eh = c(Ehlim, res), iprotein = ip)
    if(yvar == "O2") aout <- affinity(pH = c(pHlim, res), O2 = c(O2lim, res), iprotein = ip)
    # Calculate average ranking for each group and make diagram
    arank <- rank.affinity(aout, groups)
    diagram(arank, col = adjustcolor(col, alpha.f = alpha.f), lwd = lwd, add = add, names = "")

    # Add Eh7 axis 20241218
    if(yvar == "O2") {
      # Calculate range of pe at pH = 7
      # H2O = 0.5 O2(gas) + 2 H+ + 2 e- 
      # logK = -41.55
      # --> pe7 = 0.25 * logfO2 + 13.775
      # --> logfO2 = 4 * pe7 - 55.1
      Eh7_to_logfO2 <- function(Eh7) {
        pe7 <- convert(Eh7, "pe")
        logfO2 <- 4 * pe7 - 55.1
        logfO2
      }
      Eh7ticks <- seq(-0.25, 0.05, 0.05)
      logfO2ticks <- Eh7_to_logfO2(Eh7ticks)
      axis(4, at = logfO2ticks, labels = Eh7ticks, tcl = -0.3, mgp = c(2, 0.5, 0))
      mtext("Eh7 (V)", 4, line = 3, las = Eh7_las)
    }

  }

}

# Evolutionary oxidation and relative stabilities for genomes with S-cycling genes 20241211
genoGOE_4 <- function(pdf = FALSE) {
  # genoGOE/sulfur_genomes.R
  # 20241211 add Eh-pH affinity ranking
  # 20241223 convert to logfO2-pH

  # List genomes with single sulfur-cycling genes
  genomes <- list(
    dsrAB = c("GCA_002782605.1", "GCF_000517565.1", "GCA_002878135.1"),
    soxC = c("GCF_000153205.1", "GCF_000024725.1", "GCA_001914955.1", "GCA_002731275.1", 
      "GCA_002007425.1", "GCA_001780165.1", "GCA_002721445.1", "GCF_900129635.1", 
      "GCA_002162915.1", "GCA_002712885.1", "GCF_000484535.1", "GCF_000969705.1", 
      "GCF_002148795.1", "GCF_002514725.1", "GCF_900106035.1", "GCA_002687025.1", 
      "GCA_003222815.1", "GCF_900187885.1", "GCA_002712165.1", "GCA_002705185.1", 
      "GCA_003228115.1"),
    soxABXYZ = c("GCF_000021565.1", "GCF_900142435.1", "GCA_000830255.1", "GCF_000227215.1"),
    aprAB = c("GCA_001800245.1", "GCA_002898195.1", "GCA_002717185.1", "GCF_000328625.1", 
      "GCF_002252565.1", "GCA_001805205.1", "GCA_001784555.1", "GCA_001443375.1"),
    dmsA = c("GCF_000384115.1", "GCA_001593855.1", "GCF_000020005.1", "GCA_003242675.1", 
      "GCA_001771285.1", "GCA_002717245.1", "GCA_003223635.1", "GCF_001051235.1", 
      "GCA_001304035.1", "GCF_000772535.1", "GCA_002898895.1", "GCF_001049895.1", 
      "GCF_001860525.1", "GCA_001775395.1", "GCA_001775995.1", "GCA_001830835.1", 
      "GCF_000487995.1", "GCA_002747435.1", "GCA_002839495.1", "GCA_001515205.2", 
      "GCA_001742785.1", "GCA_001775755.1", "GCA_001768675.1"),
    mddA = c("GCA_003223145.1", "GCA_001872725.1", "GCF_000018105.1", "GCA_001447805.1", 
      "GCA_002400775.1", "GCA_001563325.1", "GCA_002746235.1", "GCA_001780825.1", 
      "GCF_000970205.1", "GCA_002746185.1", "GCA_002790835.1", "GCF_001886815.1", 
      "GCF_000192575.1", "GCA_002256595.1", "GCF_900111015.1", "GCA_001664505.1", 
      "GCA_002841995.1", "GCA_002699105.1", "GCF_002563855.1", "GCA_003219195.1"),
    dmdA = c("GCA_002707655.1", "GCF_900102465.1", "GCA_001800745.1", "GCF_001029505.1", 
      "GCA_002717565.1", "GCA_002701885.1", "GCA_002722565.1")
  )

  # Get amino acid compositions and Zc for genomes
  aa <- read.csv(system.file("extdata/genoGOE/MCK+23/genome_aa.csv", package = "JMDplots"))
  Zcvals <- Zc(aa)
  # List Zc for each genome in list
  Zclist <- lapply(genomes, function(genome) Zcvals[aa$organism %in% genome])
  # Use colors from Mateos et al., 2023
  dsr <- "#bcb2ce"
  sox <- "#45b78d"
  mdd <- "#c24a96"
  # Colors for protein groups
  col <- c(dsr, sox, sox, dsr, mdd, mdd, mdd)

  # Plot Zc of genomes with S-cycling genes from Mateos et al. (2023)  20240916
  sulfur_Zc <- function() {
    # Setup plot
    par(mar = c(4.1, 4.1, 4.1, 2.1))
    n <- length(Zclist)
    boxplot(Zclist, col = col, names = character(n), xlab = "Age of earliest gene event (Ga)", ylab = "Zc of all proteins in genome")
    text(2.5, -0.24, hyphen.in.pdf("Dissimilatory sulfate-sulfite-sulfide oxidation/reduction"), col = dsr)
    text(2.8, -0.12, hyphen.in.pdf("Sulfate-thiosulfate\noxidation/reduction"), col = sox)
    text(6, -0.22, "Organic sulfur cycling", col = mdd)
    # Ages from Table 2 of Mateos et al.
    ages <- c("3.3-3.35", "2.65-2.88", "2.6", "2.33-2.47", "2.28", "1.77", "0-2.38")
    axis(1, at = 1:n, labels = ages, lwd = 0)
    axis(3, at = 1:n, labels = names(Zclist), line = 0.5, lwd = 0, font = 3)
    # Add number of genomes to labels
    n_genomes <- paste0("(", sapply(Zclist, length), ")")
    axis(3, at = 1:n, labels = n_genomes, line = -0.5, lwd = 0)
    title(hyphen.in.pdf("Sulfur-cycling gene or gene cluster (# of exclusive genomes)"), font.main = 1, line = 3)
  }

  # Affinity ranking for genomes with different S-cycling genes
  sulfur_affinity <- function() {
    par(mar = c(4.1, 4.1, 4.1, 4.1))
    basis("QEC+")
    # Keep genomes with single sulfur-cycling genes listed above
    myaa <- aa[aa$organism %in% unlist(genomes), ]
    # Load proteins for each genome
    ip <- add.protein(myaa, as.residue = TRUE)
    # Set resolution
    res <- 150
    # Calculate affinity of composition reactions as a function of Eh and pH
    a <- affinity(pH = c(3, 10, res), O2 = c(-72.5, -58, res), iprotein = ip)
    # Group genomes according to presence of sulfur-cycling genes
    groups <- sapply(genomes, function(genome) match(genome, myaa$organism))
    # Calculate normalized sum of ranks for each group and make diagram
    arank <- rank.affinity(a, groups)
    # Lighten colors
    fill <- adjustcolor(col, alpha.f = 0.3)
    # Adjust labels
    names <- names(genomes)
    names[5] <- ""
    diagram(arank, fill = fill, lty = 1, lwd = 1.5, font = 3, names = names)
    text(4.3, -68.6, "dmsA", font = 3)
    lines(c(4.04, 3.68), c(-68.60, -68.76))
    lines(c(3.28, 3.17), c(-65.30, -66.25))
  }

  if(pdf) pdf("Figure_4.pdf", width = 8, height = 12)
  layout(matrix(1:2), heights = c(5, 7))
  sulfur_Zc()
  label.figure("A", font = 2, cex = 1.6)
  sulfur_affinity()
  # Overlay stability boundaries for other genomes
  genoGOE_3D("O2", add = TRUE, lwd = 4, pHlim = c(3, 10), alpha.f = 0.7, Eh7_las = 0)
  title(main = hyphen.in.pdf("Groupwise relative stabilities of proteins in\ngenomes with different S-cycling genes"), font.main = 1)
  label.figure("B", font = 2, cex = 1.6)
  if(pdf) dev.off()

}

# Carbon oxidation state of ancestral and extant nitrogenases
# 20240216 first version
# 20250325 added to JMDplots
# 20250327 use more representative extant nitrogenases (Garcia et al. Fig. 6)
genoGOE_5 <- function(pdf = FALSE) {

  if(pdf) pdf("Figure_5.pdf", width = 7, height = 5.5)

  ## Read FASTA file of ancient and extant sequences,
  ## downloaded from https://github.com/kacarlab/AncientNitrogenase.git
  #aa <- canprot::read_fasta("GMKK20/Extant-MLAnc_Align.fasta")
  # Get amino acid sequences precomputed from Extant-MLAnc_Align.fasta
  aa <- read.csv(system.file("extdata/genoGOE/GMKK20/nitrogenase_aa.csv", package = "JMDplots"))

  # List forms of nitrogenase and their ancestors
  form_to_anc <- list(
    "Clfx" = "D",
    "F-Mc" = "C",
    "Mb-Mc" = "B",
    "Anf" = "A",
    "Vnf" = "A",
    "Nif-II" = "E",
    "Nif-I" = "E"
  )
  # Use colors from Garcia et al. (2020) Fig. 2
  cols <- c(A = "#df674f", B = "#b779dc", C = "#27a08d", D = "#e8c44c", E = "#759dce")
  pt_cols <- adjustcolor(cols, alpha.f = 0.75)
  names(pt_cols) <- names(cols)

  # Start plot
  plot(extendrange(c(1, 7)), c(-0.20, -0.12), xlab = "Form of nitrogenase", ylab = axis.label("ZC"), type = "n", axes = FALSE)
  axis(side = 1, at = seq_along(form_to_anc), labels = hyphen.in.pdf(names(form_to_anc)))
  axis(side = 2)
  box()

  # Loop over nitrogenase forms
  for(iform in seq_along(form_to_anc)) {

    # Calculate Zc of the ancestral proteins
    node <- form_to_anc[[iform]]
    ianc <- grepl(paste0("^Anc", node), aa$protein)
    Zc_anc <- Zc(aa[ianc, ])

    # Plot lines for ancestral proteins
    dx <- 0.25
    for(Zc in Zc_anc) lines(c(iform-dx, iform+dx), c(Zc, Zc), col = cols[node])

    # Calculate Zc of the extant proteins
    form <- names(form_to_anc[iform])
    iext <- which(aa$ref == form)
    Zc_ext <- Zc(aa[iext, ])
    points(rep(iform, length(Zc_ext)), Zc_ext, pch = 19, col = pt_cols[node])

  }

  # Add legend and title
  legend("bottomright", c("Extant", "Ancestral"), lty = c(NA, 1), pch = c(19, NA), bty = "n")
  title("Carbon oxidation state of nitrogenase sequences", font.main = 1)

  if(pdf) dev.off()

}