R/plots.R

## plots.R
##
## apr-10-2016 (exe) created
## feb-14-2017 (exe) added b-value, occupany, normalized b-value, and mobility
##                   summary histograms
## mar-16-2017 (exe) updated the documentation (rMarkdown)
## apr-12-2017 (exe) updated plotting functions to take results from ConservedWaters()
## apr-12-2017 (exe) added multi-page facet plot examples
## apr-13-2017 (exe) updated plotting functions to take results from CleanProteinStructures()
## apr-16-2017 (exe) updated documentation of ClusterSummaryPlots() sub-plots
## jul-07-2017 (exe) updated functions to accept ConservedWaters() output
## jul-25-2017 (exe) updated documentation
## jul-29-2017 (exe) MobNormBvalEvalPlots() uses multiple facets to create plot
## jul-31-2017 (exe) updated *.summ() plot examples for ggforce
## jul-31-2017 (exe) updated *.summ() plot examples with \dontrun{}
## jul-31-2017 (exe) updated CleanProteinStructure() -> CleanProteinStructures() in documentation
## jul-31-2017 (exe) updated ClusterSummaryPlots() documentation
## aug-04-2017 (exe) added protein mobility to BoundWaterEnvSummaryPlot()
## aug-04-2017 (exe) removed protein hydrophilicity from BoundWaterEnvSummaryPlot()
## jan-17-2019 (exe) updated BvalueBarplot.summ() to accomodate fringe values
## jan-17-2019 (exe) updated normBvalueBarplot.summ() to accomodate fringe values
## jan-17-2019 (exe) updated OccupancyBarplot.summ() to accomodate fringe values
##
## Please direct all questions to Emilio Xavier Esposito, PhD
## exeResearch LLC, East Lansing, Michigan 48823 USA
## http://www.exeResearch.com
## emilio AT exeResearch DOT com
## emilio DOT esposito AT gmail DOT com
##
## Copyright (c) 2019, Emilio Xavier Esposito
##
## Permission is hereby granted, free of charge, to any person obtaining
## a copy of this software and associated documentation files (the
## "Software"), to deal in the Software without restriction, including
## without limitation the rights to use, copy, modify, merge, publish,
## distribute, sublicense, and/or sell copies of the Software, and to
## permit persons to whom the Software is furnished to do so, subject to
## the following conditions:
##
## The above copyright notice and this permission notice shall be
## included in all copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
## EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
## MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
## NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
## LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
## OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
## WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
##
## Based on http://opensource.org/licenses/MIT
##


## SUMMARY BARPLOTS (log10) ----------------------------------------------------

## Occupancy Summary Barplots (log10) docs -------------------------------------
#' @title Occupancy Summary Barplots
#' @description Occupancy summary barplots for the PDB structures. The plots are
#'   faceted and displays the binned occupancy values for all the structures.
#'   The counts are presented on a `log10` scale.
#'
#' @param data The results from the [CleanProteinStructures()] function. Will
#'   use the binned occupancy data.
#'
#' @export
#'
#' @import ggplot2
#' @import reshape2
#'
#' @examples
#'   \dontrun{
#'   OccupancyBarplot.summ(data)
#'
#'   ##----- multiple pages
#'   library(ggforce)
#'   occ.barplots.summary <- OccupancyBarplot.summ(data)
#'   num.pages <- ceiling(nrow(data$occupancy.counts) / 10)
#'
#'   pdf(file="multiple_pages.pdf", height=11, width=8.5)
#'   for (page in seq_len(num.pages)) {
#'     print(occ.barplots.summary +
#'           ggforce::facet_wrap_paginate(~PDBid,
#'                                        ncol = 2, nrow = 5, page = page) )
#'   }
#'   dev.off()
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
OccupancyBarplot.summ <- function(data) {

  ##_ extract occupancy.counts from cleaning results -----
  occupancy.counts <- data$occupancy.counts

  ##_ melt the data -----
  occ.summ.melt <- reshape2::melt(occupancy.counts, id.var = "PDBid")

  ##_ construct x-axis tick labels -----
  x.axis.tick.labels <- c(-1, seq(from = 0, to = 1, by = 0.1), 2)
  x.axis.tick.labels[1] <- expression(paste('\u003c', "0", sep = ""))
  x.axis.tick.labels[length(x.axis.tick.labels)] <-
    expression(paste('\u003e', "1", sep = ""))

  ##_ construct the plot -----
  occ.summ.plots <- ggplot2::ggplot(data = occ.summ.melt,
                                    aes_string(x = "variable", y = "value")) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::facet_wrap(~PDBid, ncol = 2) +
    ggplot2::scale_y_log10() +
    ggplot2::labs(title = "Occupancy Summary (Initial Structures)",
                  x = "Occupancy Values",
                  y = "Counts (log10)") +
    ggplot2::scale_x_discrete(labels = x.axis.tick.labels) +
    ggplot2::theme(legend.position = "none",
                   axis.text.x = element_text(angle = 90,
                                              size = 10,
                                              hjust = 1,
                                              vjust = 0.5))

  ##_ return plot -----
  occ.summ.plots

}


## B-value Summary Barplots (log10) docs -------------------------------------
#' @title B-value Summary Barplots
#' @description B-value summary barplots for the PDB structures. The plots are
#'   faceted and displays the binned B-value values for all the structures.
#'   The counts are presented on a `log10` scale.
#'
#' @param data The results from the [CleanProteinStructures()] function. Will use
#'   the binned B-value data.
#'
#' @export
#'
#' @import ggplot2
#' @import reshape2
#'
#' @examples
#'   \dontrun{
#'   BvalueBarplot.summ(data)
#'
#'   ##----- multiple pages
#'   library(ggforce)
#'   Bvalue.barplots.summary <- BvalueBarplot.summ(data)
#'   num.pages <- ceiling(nrow(data$Bvalue.counts) / 10)
#'
#'   pdf(file="multiple_pages.pdf", height=11, width=8.5)
#'   for (page in seq_len(num.pages)) {
#'     print(Bvalue.barplots.summary +
#'           ggforce::facet_wrap_paginate(~PDBid,
#'                                        ncol = 2, nrow = 5, page = page) )
#'   }
#'   dev.off()
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
BvalueBarplot.summ <- function(data) {

  ##_ extract Bvalue.counts from cleaning results -----
  Bvalue.counts <- data$Bvalue.counts

  ##_ melt the data -----
  bvalue.summ.melt <- reshape2::melt(Bvalue.counts, id.var = "PDBid")

  ##_ construct x-axis tick labels -----
  x.axis.tick.labels <- c(-100, seq(from = 0, to = 100, by = 5), 200)
  x.axis.tick.labels[1] <- expression(paste('\u003c', "0", sep = ""))
  x.axis.tick.labels[length(x.axis.tick.labels)] <-
    expression(paste('\u003e', "100", sep = ""))

  ##_ construct the plot -----
  bvalue.summ.plots <- ggplot2::ggplot(data=bvalue.summ.melt,
                                       aes_string(x = "variable", y = "value")) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::facet_wrap(~PDBid, ncol = 2) +
    ggplot2::scale_y_log10() +
    ggplot2::labs(title = "B-value Summary (Initial Structures)",
                  x = "B-values",
                  y = "Counts (log10)") +
    ggplot2::scale_x_discrete(labels = x.axis.tick.labels ) +
    ggplot2::theme(legend.position = "none",
                   axis.text.x = element_text(angle = 90, size = 10,
                                              hjust = 1, vjust = 0.5))

  ##_ return plot -----
  bvalue.summ.plots

}


## Normalized B-value Summary Barplots (log10) docs ---------------------------
#' @title Normalized B-value Summary Barplots
#' @description B-value summary barplots for the PDB structures. The plots are
#'   faceted and displays the binned B-value values for all the structures.
#'   The counts are presented on a `log10` scale.
#'
#' @param data The results from the [CleanProteinStructures()] function. Will
#'   use the binned normalized B-value data.
#'
#' @export
#'
#' @import ggplot2
#' @import reshape2
#'
#' @examples
#'   \dontrun{
#'   normBvalueBarplot.summ(data)
#'
#'   ##----- multiple pages
#'   library(ggforce)
#'   nBvalue.barplots.summary <- normBvalueBarplot.summ(data)
#'   num.pages <- ceiling(nrow(data$normBvalue.counts) / 10)
#'
#'   pdf(file="multiple_pages.pdf", height=11, width=8.5)
#'   for (page in seq_len(num.pages)) {
#'     print(nBvalue.barplots.summary +
#'           ggforce::facet_wrap_paginate(~PDBid,
#'                                        ncol = 2, nrow = 5, page = page) )
#'   }
#'   dev.off()
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
normBvalueBarplot.summ <- function(data) {

  ##----- extract occupancy.counts from cleaning results
  normBvalue.counts <- data$normBvalue.counts

  ##----- sum the less populated z-scores
  ##--- -7 to -2.5 z-scores
  zScores.lower.sums <- rowSums(normBvalue.counts[, 2:47])
  ##--- 3 to 7 z-scores
  zScores.upper.sums <- rowSums(normBvalue.counts[, 102:142])
  ##--- construct a new data.frame
  nBvalue.counts.short <- data.frame(PDBid=normBvalue.counts[, 1],
                                     zScores.lower.sums,
                                     normBvalue.counts[, 48:101],
                                     zScores.upper.sums,
                                     stringsAsFactors = FALSE)
  ##--- construct x-axis tick labels
  x.axis.tick.labels <- round(seq(from = -2.5, to = 3, by = 0.1), digits = 1)
  x.axis.tick.minor.tf <- rep(c(FALSE, TRUE, TRUE, TRUE, TRUE), 11)
  x.axis.tick.labels[x.axis.tick.minor.tf] <- ""
  x.axis.tick.labels[1] <- expression(paste('\u2264', "-2.5", sep = ""))
  x.axis.tick.labels[length(x.axis.tick.labels)] <-
    expression(paste('\u2265', "3", sep = ""))

  ##----- melt the data
  nBvalue.counts.melt <- reshape2::melt(nBvalue.counts.short, id.var = "PDBid")

  ##----- construct the plot
  nBvalue.counts.plots <- ggplot2::ggplot(data = nBvalue.counts.melt,
                                          aes_string(x = "variable", y = "value")) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::facet_wrap(~PDBid, ncol = 2) +
    ggplot2::scale_y_log10() +
    ggplot2::labs(title = "Normalized B-value Summary (Initial Structures)",
                  x = "Z-Score",
                  y = "Counts (log10)") +
    ggplot2::scale_x_discrete(labels = x.axis.tick.labels) +
    ggplot2::theme(legend.position = "none",
                   axis.text.x = element_text(size = 10))

  ##----- return plot
  nBvalue.counts.plots

}


## Mobility Summary Barplots (log10) docs --------------------------------------
#' @title Mobility Summary Barplots
#' @description Mobility summary barplots for PDB structures. The plots are
#'   faceted and displays the binned B-value values for all the structures. The
#'   counts are presented on a `log10` scale. The function will automatically
#'   plot ten plots per page.
#'
#' @param data The results from the [CleanProteinStructures()] function. Will
#'   use the binned mobility data.
#'
#' @export
#'
#' @import ggplot2
#' @import reshape2
#'
#' @examples
#'   \dontrun{
#'   MobilityBarplot.summ(data)
#'
#'   ##----- multiple pages
#'   library(ggforce)
#'   mob.barplots.summary <- MobilityBarplot.summ(data)
#'   num.pages <- ceiling(nrow(data$mobility.counts) / 10)
#'
#'   pdf(file="multiple_pages.pdf", height=11, width=8.5)
#'   for (page in seq_len(num.pages)) {
#'     print(mob.barplots.summary +
#'           ggforce::facet_wrap_paginate(~PDBid,
#'                                        ncol = 2, nrow = 5, page = page) )
#'   }
#'   dev.off()
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
MobilityBarplot.summ <- function(data) {

  ##----- extract occupancy.counts from cleaning results
  mobility.counts <- data$mobility.counts

  ##----- sum the less populated z-scores
  ##--- 3 to 6 z-scores
  zScores.upper.sums <- rowSums(mobility.counts[, 32:62])
  ##--- construct a new data.frame
  mobility.counts.short <- data.frame(PDBid=mobility.counts[, 1],
                                      mobility.counts[, 2:31],
                                      zScores.upper.sums,
                                      stringsAsFactors = FALSE)
  ##--- construct x-axis tick labels
  x.axis.tick.labels <- round(seq(from = 0, to = 3, by = 0.1), digits = 1)
  x.axis.tick.minor.tf <- rep(c(FALSE, TRUE, TRUE, TRUE, TRUE), 6)
  x.axis.tick.labels[x.axis.tick.minor.tf] <- ""
  x.axis.tick.labels[length(x.axis.tick.labels)] <-
    expression(paste('\u2265', "3", sep = ""))

  ##----- melt the data
  mobility.counts.melt <- reshape2::melt(mobility.counts.short, id.var = "PDBid")

  ##----- construct the plot
  mobility.counts.plots <- ggplot2::ggplot(data = mobility.counts.melt,
                                           aes_string(x = "variable",
                                                      y = "value")) +
    ggplot2::geom_bar(stat = "identity") +
    ggplot2::facet_wrap(~PDBid, ncol = 2) +
    ggplot2::scale_y_log10() +
    ggplot2::labs(title = "Mobility Summary (Initial Structures)",
                  x = "Z-Score",
                  y = "Counts (log10)") +
    ggplot2::scale_x_discrete(labels = x.axis.tick.labels) +
    ggplot2::theme(legend.position = "none",
                   axis.text.x = element_text(size = 10
                                              # , angle = 90,
                                              # hjust = 1,
                                              # vjust = 0.5
                   ) )

  ##----- return plot
  mobility.counts.plots

}


## RESULTS FOCUSED PLOTS -------------------------------------------------------

## Conservation Plot docs ------------------------------------------------------
#' @title Conservation Plot (Number of Waters Per Cluster Histogram)
#' @description Histogram and density plots for number of cluster with number of
#'   atoms
#' @details Constructs a histogram for the number of waters per cluster.
#'   Clusters with less than 50\% conservation are light grey, clusters with 50
#'   to 69\% water conseration are dark red, clusters with 70 to 79\%
#'   conservation are red, 80 to 89\% conservation are light blue, 90 to 99\%
#'   conservation are blue, and 100\% conservation (waters from all structures)
#'   are dark blue.
#'
#' @param data The `h2o.clusters.summary` data.frame from the
#'   [ClusterWaters()] function containing the `num.waters` information.
#'   The `num.waters` values are integers.
#' @param passed.waters Logical indicator to plot results for waters **passing**
#'   [Mobility()] and [NormalizedBvalue()] _**OR**_ using **all** waters within
#'   the `PDB` files.
#'
#' @export
#'
#' @import ggplot2
#'
#' @examples
#'   \dontrun{
#'   Conservation.plot <- ConservationPlot(data=thrombin10.conservedWaters,
#'                                         passed.waters=TRUE)
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family "Results Visualization" "Results Plots"
#'
#' @references
#'   Paul C Sanschagrin and Leslie A Kuhn. Cluster analysis of
#'   consensus water sites in thrombin and trypsin shows conservation between
#'   serine proteases and contributions to ligand specificity. _Protein
#'   Science_, 1998, **7** (_10_), pp 2054-2064.
#'   [DOI: 10.1002/pro.5560071002](http://doi.org/10.1002/pro.5560071002)
#'   [PMID: 9792092](http://www.ncbi.nlm.nih.gov/pubmed/9792092)
#'   [WatCH webpage](http://www.kuhnlab.bmb.msu.edu/software/watch/index.html)
#'
ConservationPlot <- function(data, passed.waters=TRUE) {

  ##_ get the PASSED or ALL waters plot data -----
  if (passed.waters == TRUE) {
    plot.data <- data$h2o.cluster.passed$h2o.clusters.summary
  } else {
    plot.data <- data$h2o.cluster.all$h2o.clusters.summary
  }

  ##_ add the conservation sets if not present -----
  if ( !any(colnames(plot.data) == "conserve.set") ) {
    plot.data$conserve.set <- ConservationSet(plot.data$pct.conserved)
  }

  ##_ determine binwidth based on the largest number of waters in a cluster -----
  num.waters.max <- max(plot.data$num.waters)
  if ( num.waters.max >= 100 ) bin.width <- 5
  if ( num.waters.max <  100 ) bin.width <- 2
  if ( num.waters.max <=  25 ) bin.width <- 1

  ##_ create the plot -----
  hist.plot <- ggplot2::ggplot(data = plot.data,
                               aes_string(x = "num.waters",
                                          fill = "conserve.set")) +
    ggplot2::geom_histogram(stat = "bin", binwidth = bin.width) +
    ##__ set the fill and line colors -----
    ggplot2::scale_fill_manual(values = cons.color6,
                               name = cons.color6.legend,
                               breaks = cons.color6.breaks,
                               labels = cons.color6.labels) +
    ##__ format axis labels -----
    # hist.plot <- hist.plot + scale_y_continuous(breaks=NULL)
    ggplot2::labs(title = "Number of Waters per Cluster",
                  x="Number of Protein Structures",
                  y="Number of Water Clusters") +
    ggplot2::theme(legend.position = "bottom")

  ##_ return plot -----
  hist.plot

}


## Occupancy Barplot docs ------------------------------------------------------
#' @title Occupancy Barplots
#' @description Occupancy Barplots for Cluster with at least 50\% Conservation
#' @details Constructs a barplot with corresponding density plot for the mean
#'   occupancy value for all water within each cluster with at least 50\% water
#'   conservation. Clusters with 50 to 69\% water conseration are dark red,
#'   clusters with 70 to 79\% conservation are red, 80 to 89\% conservation are
#'   light blue, 90 to 99\% conservation are blue, and 100\% conservation (waters
#'   from all structures) are dark blue.
#'
#'   This plot was inspired by Figure 1 of Sanschagrin and Kuhn (1998).
#'
#' @param data The `h2o.clusters.summary` data.frame from the
#'   [ClusterWaters()] function containing the `o.mu` information.
#' @param passed.waters Logical indicator to plot results for waters **passing**
#'   [Mobility()] and [NormalizedBvalue()] _**OR**_ using **all** waters within
#'   the `PDB` files.
#'
#' @export
#'
#' @import ggplot2
#'
#' @examples
#'   \dontrun{
#'   occupancy.plot <- OccupancyBarplot(data=thrombin10.conservedWaters,
#'                                      passed.waters=TRUE)
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
#' @references
#'   Paul C Sanschagrin and Leslie A Kuhn. Cluster analysis of
#'   consensus water sites in thrombin and trypsin shows conservation between
#'   serine proteases and contributions to ligand specificity. _Protein
#'   Science_, 1998, **7** (_10_), pp 2054-2064.
#'   [DOI: 10.1002/pro.5560071002](http://doi.org/10.1002/pro.5560071002)
#'   [PMID: 9792092](http://www.ncbi.nlm.nih.gov/pubmed/9792092)
#'   [WatCH webpage](http://www.kuhnlab.bmb.msu.edu/software/watch/index.html)
#'
OccupancyBarplot <- function(data, passed.waters=TRUE) {

  ##_ get the PASSED or ALL waters plot data -----
  if (passed.waters == TRUE) {
    plot.data <- data$h2o.cluster.passed$h2o.clusters.summary
  } else {
    plot.data <- data$h2o.cluster.all$h2o.clusters.summary
  }

  ##_ only the waters with 50% or greater conservation ------
  plot.data <- plot.data[plot.data$pct.conserved >= 50, ]

  ##_ determine the minimum and maximum x-axis values -----
  x.axis.min <- floor(min(plot.data$o.mu))
  x.axis.max <- ceiling(max(plot.data$o.mu))

  ##_ add the conservation sets if not present -----
  if ( !any(colnames(plot.data) == "conserve.set") ) {
    plot.data$conserve.set <- ConservationSet(plot.data$pct.conserved)
  }

  ##_ construct the plot -----
  occ.plot <- ggplot2::ggplot(data = plot.data,
                              aes_string(x = "o.mu",
                                         fill = "conserve.set",
                                         colour = "conserve.set")) +
    ggplot2::geom_density(alpha = 0.2, size = 1) +
    ggplot2::geom_histogram(aes_string(y = "..density.."),
                            position = "dodge", stat = "bin",
                            binwidth = 0.05, colour = "white") +
    ##__ set the fill and line colors -----
    ggplot2::scale_fill_manual(values = cons.color5,
                               name = cons.color5.legend,
                               breaks = cons.color5.breaks,
                               labels = cons.color5.labels) +
    ggplot2::scale_colour_manual(values=cons.color5,
                                 name = cons.color5.legend,
                                 breaks = cons.color5.breaks,
                                 labels = cons.color5.labels) +
    ##__ format axis labels -----
    # occ.plot <- occ.plot + scale_y_continuous(breaks=NULL)
    ggplot2::labs(title = "Occupancy",
                  x = "Mean Occupancy Value", y = "") +
    ggplot2::theme(legend.position = "bottom")

  ##_ return plot -----
  occ.plot

}


## Mobility Barplot docs --------------------------------------------------
#' @title Mobility Barplots
#' @description Mobility Barplots for Cluster with at least 50\% Conservation
#' @details Constructs a barplot with corresponding density plot for the mean
#'   mobility value for all water within each cluster with at least 50\% water
#'   conservation. Clusters with 50 to 69\% water conseration are dark red,
#'   clusters with 70 to 79\% conservation are red, 80 to 89\% conservation are
#'   light blue, 90 to 99\% conservation are blue, and 100\% conservation (waters
#'   from all structures) are dark blue.
#'
#'   The mobility values are calculated by the [Mobility()] function.
#'
#'   This plot was inspired by Figure 1 of Sanschagrin and Kuhn (1998).
#'
#' @param data The `h2o.clusters.summary` data.frame from the
#'   [ClusterWaters()] function containing the `mobility.mu`
#'   information.
#' @param passed.waters Logical indicator to plot results for waters **passing**
#'   [Mobility()] and [NormalizedBvalue()] _**OR**_ using **all** waters within
#'   the `PDB` files.
#'
#' @export
#'
#' @import ggplot2
#'
#' @examples
#'   \dontrun{
#'   mobility.plot <- MobilityBarplot(data=thrombin10.conservedWaters,
#'                                    passed.waters=TRUE)
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
#' @references
#'   Paul C Sanschagrin and Leslie A Kuhn. Cluster analysis of
#'   consensus water sites in thrombin and trypsin shows conservation between
#'   serine proteases and contributions to ligand specificity. _Protein
#'   Science_, 1998, **7** (_10_), pp 2054-2064.
#'   [DOI: 10.1002/pro.5560071002](http://doi.org/10.1002/pro.5560071002)
#'   [PMID: 9792092](http://www.ncbi.nlm.nih.gov/pubmed/9792092)
#'   [WatCH webpage](http://www.kuhnlab.bmb.msu.edu/software/watch/index.html)
#'
MobilityBarplot <- function(data, passed.waters=TRUE) {

  ##_ get the PASSED or ALL waters plot data -----
  if (passed.waters == TRUE) {
    plot.data <- data$h2o.cluster.passed$h2o.clusters.summary
  } else {
    plot.data <- data$h2o.cluster.all$h2o.clusters.summary
  }

  ##_ only the waters with 50% or greater conservation ------
  plot.data <- plot.data[plot.data$pct.conserved >= 50, ]

  ##_ determine the minimum and maximum x-axis values -----
  x.axis.min <- floor(min(plot.data$mobility.mu))
  x.axis.max <- ceiling(max(plot.data$mobility.mu))

  ##_ add the conservation sets if not present -----
  if ( !any(colnames(plot.data) == "conserve.set") ) {
    plot.data$conserve.set <- ConservationSet(plot.data$pct.conserved)
  }

  ##_ construct the plot -----
  mobility.plot <- ggplot2::ggplot(data = plot.data,
                                   aes_string(x = "mobility.mu",
                                              fill = "conserve.set",
                                              colour = "conserve.set")) +
    ggplot2::geom_density(alpha = 0.4, size = 1) +
    ggplot2::geom_histogram(aes_string(y = "..density.."),
                            position = "dodge", stat = "bin",
                            binwidth = 0.10, colour = "white") +
    ##__ set the fill and line colors -----
    ggplot2::scale_fill_manual(values = cons.color5,
                               name = cons.color5.legend,
                               breaks = cons.color5.breaks,
                               labels = cons.color5.labels) +
    ggplot2::scale_colour_manual(values = cons.color5,
                                 name = cons.color5.legend,
                                 breaks = cons.color5.breaks,
                                 labels = cons.color5.labels) +
    ##__ format axis labels ----- -----
    # mobility.plot <- mobility.plot + scale_y_continuous(breaks=NULL)
    ggplot2::scale_x_continuous(breaks = seq(x.axis.min, x.axis.max, 0.5)) +
    ggplot2::labs(title = "Mobility",
                  x = "Mean Mobility Value", y = "") +
    ggplot2::theme(legend.position = "bottom")

  ##_ return the plot -----
  mobility.plot

}


## B-value Barplot docs -------------------------------------------------------
#' @title B-value Barplots
#' @description B-value Barplots for Cluster with at least 50\% Conservation
#' @details Constructs a barplot with corresponding density plot for the mean
#'   B-value value for all water within each cluster with at least 50\% water
#'   conservation. Clusters with 50 to 69\% water conseration are dark red,
#'   clusters with 70 to 79\% conservation are red, 80 to 89\% conservation are
#'   light blue, 90 to 99\% conservation are blue, and 100\% conservation (waters
#'   from all structures) are dark blue.
#'
#'   This plot was inspired by Figure 1 of Sanschagrin and Kuhn (1998).
#'
#' @param data The `h2o.clusters.summary` data.frame from the
#'   [ClusterWaters()] function containing the `b.exp.mu`
#'   information.
#' @param passed.waters Logical indicator to plot results for waters **passing**
#'   [Mobility()] and [NormalizedBvalue()] _**OR**_ using **all** waters within
#'   the `PDB` files.
#' @param calc.values Plot the calculated B-values the mean experimental B-values; default: TRUE
#'
#' @export
#'
#' @import ggplot2
#'
#' @examples
#'   \dontrun{
#'   Bvalue.plot <- BvalueBarplot(data=thrombin10.conservedWaters,
#'                                passed.waters=TRUE)
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
#' @references
#'   Paul C Sanschagrin and Leslie A Kuhn. Cluster analysis of
#'   consensus water sites in thrombin and trypsin shows conservation between
#'   serine proteases and contributions to ligand specificity. _Protein
#'   Science_, 1998, **7** (_10_), pp 2054-2064.
#'   [DOI: 10.1002/pro.5560071002](http://doi.org/10.1002/pro.5560071002)
#'   [PMID: 9792092](http://www.ncbi.nlm.nih.gov/pubmed/9792092)
#'   [WatCH webpage](http://www.kuhnlab.bmb.msu.edu/software/watch/index.html)
#'
BvalueBarplot <- function(data, passed.waters=TRUE, calc.values=TRUE) {

  ##_ get the PASSED or ALL waters plot data -----
  if (passed.waters == TRUE) {
    plot.data <- data$h2o.cluster.passed$h2o.clusters.summary
  } else {
    plot.data <- data$h2o.cluster.all$h2o.clusters.summary
  }

  ##_ experimental mean or calculated B-values -----
  if (calc.values == TRUE) {
    x.column.name <- "b.calc"
    x.axis.name <- "Calculated B-value"
    title <- "B-value (Calculated from cluster's RMSF)"
  } else {
    x.column.name <- "b.exp.mu"
    x.axis.name <- "Mean B-value"
    title <- "B-value (Mean calculated from cluster's experimental B-values)"
  }

  ##_ only the waters with 50% or greater conservation ------
  plot.data <- plot.data[plot.data$pct.conserved >= 50, ]

  ##----- determine the minimum and maximum x-axis values
  # x.axis.min <- floor(min(plot.data[[x.column.name]]))
  # x.axis.max <- ceiling(max(plot.data[[x.column.name]]))

  ##_ add the conservation sets if not present -----
  if ( !any(colnames(plot.data) == "conserve.set") ) {
    plot.data$conserve.set <- ConservationSet(plot.data$pct.conserved)
  }

  ##----- construct the plot
  Bvalue.plot <- ggplot2::ggplot(data = plot.data,
                                 aes_string(x = x.column.name,
                                            fill = "conserve.set",
                                            colour = "conserve.set")) +
    ggplot2::geom_density(alpha = 0.2, size = 1) +
    ggplot2::geom_histogram(aes_string(y = "..density.."),
                            position = "dodge", stat = "bin",
                            binwidth = 5, colour = "white") +
    ##--- set the fill and line colors
    ggplot2::scale_fill_manual(values = cons.color5,
                               name = cons.color5.legend,
                               breaks = cons.color5.breaks,
                               labels = cons.color5.labels) +
    ggplot2::scale_colour_manual(values = cons.color5,
                                 name = cons.color5.legend,
                                 breaks = cons.color5.breaks,
                                 labels = cons.color5.labels) +
    ##--- format axis labels
    # scale_y_continuous(breaks = NULL) +
    # scale_x_continuous(breaks = seq(x.axis.min, x.axis.max, 0.250)) +
    ggplot2::labs(title = title,
                  x = x.axis.name, y = "") +
    ggplot2::theme(legend.position = "bottom")

  ##----- return the plot
  Bvalue.plot

}


## Normalized B-value Barplot docs --------------------------------------------
#' @title Normalized B-value Barplots
#' @description Normalized B-value Barplots for Cluster with at least 50\%
#'   Conservation
#' @details Constructs a barplot with corresponding density plot for the mean
#'   normalized B-value value for all water within each cluster with at least
#'   50\% water conservation. Clusters with 50 to 69\% water conseration are dark
#'   red, clusters with 70 to 79\% conservation are red, 80 to 89\% conservation
#'   are light blue, 90 to 99\% conservation are blue, and 100\% conservation
#'   (waters from all structures) are dark blue.
#'
#'   The normalized B-value values are calculated by the
#'   [NormalizedBvalue()] function.
#'
#'   This plot was inspired by Figure 1 of Sanschagrin and Kuhn (1998).
#'
#' @param data The `h2o.clusters.summary` data.frame from the
#'   [ClusterWaters()] function containing the `nBvalue.mu`
#'   information.
#' @param passed.waters Logical indicator to plot results for waters **passing**
#'   [Mobility()] and [NormalizedBvalue()] _**OR**_ using **all** waters within
#'   the `PDB` files.
#'
#' @export
#'
#' @import ggplot2
#'
#' @examples
#'   \dontrun{
#'   nBvalue.plot <- nBvalueBarplot(data=thrombin10.conservedWaters,
#'                                  passed.waters=TRUE)
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
#' @references
#'   Paul C Sanschagrin and Leslie A Kuhn. Cluster analysis of
#'   consensus water sites in thrombin and trypsin shows conservation between
#'   serine proteases and contributions to ligand specificity. _Protein
#'   Science_, 1998, **7** (_10_), pp 2054-2064.
#'   [DOI: 10.1002/pro.5560071002](http://doi.org/10.1002/pro.5560071002)
#'   [PMID: 9792092](http://www.ncbi.nlm.nih.gov/pubmed/9792092)
#'   [WatCH webpage](http://www.kuhnlab.bmb.msu.edu/software/watch/index.html)
#'
nBvalueBarplot <- function(data, passed.waters=TRUE) {

  ##_ get the PASSED or ALL waters plot data -----
  if (passed.waters == TRUE) {
    plot.data <- data$h2o.cluster.passed$h2o.clusters.summary
  } else {
    plot.data <- data$h2o.cluster.all$h2o.clusters.summary
  }

  ##_ only the waters with 50% or greater conservation ------
  plot.data <- plot.data[plot.data$pct.conserved >= 50, ]

  ##_ determine the minimum and maximum x-axis values -----
  x.axis.min <- floor(min(plot.data$nBvalue.mu))
  x.axis.max <- ceiling(max(plot.data$nBvalue.mu))

  ##_ add the conservation sets if not present -----
  if ( !any(colnames(plot.data) == "conserve.set") ) {
    plot.data$conserve.set <- ConservationSet(plot.data$pct.conserved)
  }

  ##_ construct the plot -----
  nBvalue.plot <- ggplot2::ggplot(data = plot.data,
                                  aes_string(x = "nBvalue.mu",
                                             fill = "conserve.set",
                                             colour = "conserve.set")) +
    ggplot2::geom_density(alpha = 0.2, size =1) +
    ggplot2::geom_histogram(aes_string(y = "..density.."),
                            position = "dodge", stat = "bin",
                            binwidth = 0.25, colour = "white") +
    ##--- set the fill and line colors
    ggplot2::scale_fill_manual(values = cons.color5,
                               name = cons.color5.legend,
                               breaks = cons.color5.breaks,
                               labels = cons.color5.labels) +
    ggplot2::scale_colour_manual(values = cons.color5,
                                 name = cons.color5.legend,
                                 breaks = cons.color5.breaks,
                                 labels = cons.color5.labels) +
    ##__ format axis labels -----
    # nBvalue.plot <- nBvalue.plot + scale_y_continuous(breaks=NULL)
    ggplot2::scale_x_continuous(breaks = seq(x.axis.min, x.axis.max, 0.5)) +
    ggplot2::labs(title = "Normalized B-value",
                  x = "Mean Normalized B-value", y = "") +
    ggplot2::theme(legend.position = "bottom")

  ##_ return the plot -----
  nBvalue.plot

}


## Cluster Summary Plots docs --------------------------------------------------
#' @title Cluster Summary Plots
#' @description Collection of cluster summary plots.
#' @details The Number of Water Cluster (see [ConservationPlot()]), Occupancy
#'   (see [OccupancyBarplot()]), Mobility (see [MobilityBarplot()]), B-value
#'   (see [BvalueBarplot()]), and Normalized B-value (see [nBvalueBarplot()])
#'   plots are combined into a single plot image. The ability to label each plot
#'   with capital letters (upper-case) or lower-case letters is available.
#'
#' @param data The results from the [ConservedWaters()] function.
#' @param passed.waters Logical indicator to plot results for waters **passing**
#'   [Mobility()] and [NormalizedBvalue()] _**OR**_ using **all** waters within
#'   the `PDB` files.
#' @param plot.labels Using the same options as [cowplot::plot_grid()] plus
#'   `NULL`. The option `"AUTO"` labels each plot with upper-case letters
#'   (_e.g._, A, B, C, D, E), `"auto"` labels each plot with lower-case letters
#'   (_e.g._, a, b, c, d, e), and `NULL` returns plots without labels. Default
#'   is `NULL`.
#'
#' @export
#'
#' @import ggplot2
#' @importFrom cowplot plot_grid
#'
#' @examples
#'   \dontrun{
#'   cluster.summary.plot <- ClusterSummaryPlots(data=thrombin10.conservedWaters,
#'                                              passed.waters=TRUE,
#'                                              labels=NULL)
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
#' @references
#'   Paul C Sanschagrin and Leslie A Kuhn. Cluster analysis of
#'   consensus water sites in thrombin and trypsin shows conservation between
#'   serine proteases and contributions to ligand specificity. _Protein
#'   Science_, 1998, **7** (_10_), pp 2054-2064.
#'   [DOI: 10.1002/pro.5560071002](http://doi.org/10.1002/pro.5560071002)
#'   [PMID: 9792092](http://www.ncbi.nlm.nih.gov/pubmed/9792092)
#'   [WatCH webpage](http://www.kuhnlab.bmb.msu.edu/software/watch/index.html)
#'
ClusterSummaryPlots <- function(data, passed.waters=TRUE, plot.labels=NULL) {

  ##_ check plot.labels parameter -----
  plot.labels.available <- c("AUTO", "auto")
  if ( !any(plot.labels.available %in% plot.labels) & !is.null(plot.labels) ) {
    mess <- paste("The provided plot.labels value of", plot.labels,
                  "is not valid and has been replace with NULL. Please",
                  "consider using \"AUTO\", \"auto\", or NULL.", sep = " ")
    message(mess)
    plot.labels <- NULL
  }

  ##_ construct the individual plots -----
  h2o.per.cluster.hist <- ConservationPlot(data=data, passed.waters=passed.waters)
  occupancy.plot <- OccupancyBarplot(data=data, passed.waters=passed.waters)
  mobility.plot <- MobilityBarplot(data=data, passed.waters=passed.waters)
  Bvalue.plot <- BvalueBarplot(data=data, passed.waters=passed.waters)
  nBvalue.plot <- nBvalueBarplot(data=data, passed.waters=passed.waters)

  ##_ combine the plots -----
  cluster.summary.plots <- cowplot::plot_grid(
    h2o.per.cluster.hist + ggplot2::theme(legend.position="none"),
    occupancy.plot + ggplot2::theme(legend.position="none"),
    mobility.plot + ggplot2::theme(legend.position="none"),
    Bvalue.plot + ggplot2::theme(legend.position="none"),
    nBvalue.plot + ggplot2::theme(legend.position="none"),
    align = 'v',
    labels = plot.labels,
    hjust = -1,
    nrow = 5)

  ##_ legend at the bottom of all plots -----
  ##  extract the legend from the water per cluster histogram
  grobs <- ggplot2::ggplotGrob(h2o.per.cluster.hist +
                                 # guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
                                 ggplot2::theme(legend.position = "bottom",
                                                legend.box = "horizontal"))$grobs
  legend.bottom <- grobs[[which(sapply(grobs, function(x) x$name) == "guide-box")]]
  ##__ add the legend under the last plot in the column. the legend is given a -----
  ##   of 10% of the height of a plot row using the rel_heights option
  cluster.summary.plots <- cowplot::plot_grid(cluster.summary.plots,
                                              legend.bottom,
                                              ncol=1,
                                              rel_heights=c(1, .1))

  ##_ return the plot -----
  cluster.summary.plots

}


## Mobility and Normalized B-values evaluation plots -----------------------------
#' @title Mobility and Normalized B-values Evaluation Plots
#' @description Mean bound water environment summary per percent conservation
#' @details Constructs a series of scatterplots illustrating the relationship
#'   between mobility and normalized B-values and (i) percent water
#'   conservation, (ii) mean distance between waters in a cluster, and (iii)
#'   mean distance between waters in a cluster and the cluster's centroid. The
#'   dots are colored based on water cluster "percent conservation":
#'   - **light grey dots**: less than 50\% conservation
#'   - **dark red dots**: 50\% to 69\% conservations
#'   - **red dots**: 70\% to 79\% conservation
#'   - **light blue dots**: 80\% to 89\% conservation
#'   - **blue dots**: 90\% to 99\% conservation
#'   - **dark blue dots**: 100\% conservation (all structures contribute to the
#'     water cluster).
#'
#'   The mean distance plots will have a column of dots at a distance of 0.0 if
#'   there are clusters composed of a single water molecule. Thus, these
#'   clusters have a zero distance between and to other waters in their cluster
#'   because there are _**no other waters**_ in their cluster.
#'
#'   This plot was inspired by Figure 2 of Ogata and Wodak (2002).
#'
#' @param data The `h2o.clusters.summary` data.frame from the
#'   [ConservedWaters()] function containing the `nBvalue.mu`
#'   information. This data.frame is found within the `h2o.cluster.passed`
#'   and `h2o.cluster.all`
#' @param passed.waters Logical indicator to plot results for waters **passing**
#'   [Mobility()] and [NormalizedBvalue()] _**OR**_ using **all** waters within
#'   the `PDB` files.
#' @param title The title for the plot
#'
#' @export
#'
#' @import ggplot2
#' @import reshape2
#' @import scales
#' @importFrom cowplot ggdraw draw_label plot_grid
#'
#' @examples
#'   \dontrun{
#'   bwe.summary.plot <- MobNormBvalEvalPlots(data=thrombin10.conservedWaters,
#'                                            passed.waters=TRUE,
#'                                            title="Mobility and Normalized B-value Evaluation")
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
#' @references
#'   Koji Ogata and Shoshana J Wodak. Conserved water molecules in MHC
#'   class-I molecules and their putative structural and functional roles.
#'   _Protein Engineering_, 2002, **15** (_8_), pp 697-705.
#'   [DOI: 10.1093/protein/15.8.697](http://doi.org/10.1093/protein/15.8.697)
#'   [PMID: 12364585](http://www.ncbi.nlm.nih.gov/pubmed/12364585)
#'
MobNormBvalEvalPlots <- function(data,
                                 passed.waters = TRUE,
                                 title = "Mobility and Normalized B-value Evaluation") {

  ##_ get the PASSED or ALL waters plot data -----
  if (passed.waters == TRUE) {
    plot.data <- data$h2o.cluster.passed$h2o.clusters.summary
  } else {
    plot.data <- data$h2o.cluster.all$h2o.clusters.summary
  }

  ##_ add the conservation sets if not present -----
  if ( !any(colnames(plot.data) == "conserve.set") ) {
    plot.data$conserve.set <- ConservationSet(plot.data$pct.conserved)
  }

  ##_ convert percent to fraction -----
  # plot.data$pct.conserved <- (plot.data$pct.conserved / 100)

  ##_ setup the names for the facet labeling -----
  conserve.set.names <- c("set0", "set1", "set2", "set3", "set4", "set5")
  facet.labels <- c("Mobility", "Normalized B-values")

  ##_ create the needed data.frames -----
  ##__ percent conservation -----
  df.conserv <- plot.data[, c("pct.conserved",
                              "mobility.mu",
                              "nBvalue.mu",
                              "conserve.set")]
  df.conserv$group <- "PercentConserved"
  colnames(df.conserv)[1] <- "X.values"
  ##__ mean distance between waters -----
  df.dist <- plot.data[, c("dist.mu",
                           "mobility.mu",
                           "nBvalue.mu",
                           "conserve.set")]
  df.dist$group <- "DistWaters"
  colnames(df.dist)[1] <- "X.values"
  ##__ mean distance between the water and the centroid -----
  df.cent <- plot.data[, c("dist.centroid.mu",
                           "mobility.mu",
                           "nBvalue.mu",
                           "conserve.set")]
  df.cent$group <- "DistCentroid"
  colnames(df.cent)[1] <- "X.values"

  ##__ combine all the data -----
  df.plot.data <- rbind(df.conserv,
                        df.dist,
                        df.cent)

  ##_ melt the plot.data -----
  df.plot.data.melt <- reshape2::melt(df.plot.data[c(nrow(df.plot.data):1), ],
                                      id.vars = c("X.values", "conserve.set", "group") )
  df.plot.data.melt$conserve.set <- factor(df.plot.data.melt$conserve.set,
                                           levels = conserve.set.names)
  df.plot.data.melt$variable <- factor(df.plot.data.melt$variable,
                                       labels = c(expression(paste("Mobility", sep="")),
                                                  expression(paste("Normalized B-values", sep=""))
                                       )
  )
  df.plot.data.melt$group <- factor(df.plot.data.melt$group,
                                    levels = c("PercentConserved",
                                               "DistWaters",
                                               "DistCentroid")
                                    )
  df.plot.data.melt$group <- factor(df.plot.data.melt$group,
                                       labels = c(expression(paste("Percent Conservation", sep="")),
                                                  expression(paste("Mean Distance Between Waters (", ring(A), ")", sep = "")),
                                                  expression(paste("Mean Distance to Centroid (", ring(A), ")", sep = ""))
                                                  )
                                    )

  ##_ make the plot -----
  summary.plots <- ggplot2::ggplot(df.plot.data.melt,
                                   aes_string(x = "X.values", y = "value")) +
    ggplot2::geom_point(aes_string(colour = "conserve.set"), size = 3) +
    ggplot2::facet_grid(variable ~ group, scales = "free", labeller = label_parsed) +
    ggplot2::labs(x = "", y = "") +
    ggplot2::theme(legend.position = "bottom", legend.box = "horizontal") +
    ggplot2::scale_colour_manual(values = cons.color6,
                                 name = cons.color6.legend,
                                 breaks = cons.color6.breaks,
                                 labels = cons.color6.labels)

  ##_ add the title -----
  title.main <- cowplot::ggdraw() + cowplot::draw_label(title, fontface = 'bold')
  ## rel_heights values control title margins
  summary.plots <- cowplot::plot_grid(title.main, summary.plots,
                                      ncol = 1, rel_heights = c(0.04, 1))


  ##_ return the plot -----
  summary.plots

}


## bound water environment plots -----------------------------------------------
#' @title Bound Water Environment Barplots
#' @description Normalized B-value Barplots for Cluster with at least 50\%
#'   Conservation
#' @details Constructs a barplot with corresponding density plot for the mean
#'   normalized B-value value for all water within each cluster with at least
#'   50\% water conservation. Clusters with 50 to 69\% water conseration are dark
#'   red, clusters with 70 to 79\% conservation are red, 80 to 89\% conservation
#'   are light blue, 90 to 99\% conservation are blue, and 100\% conservation
#'   (waters from all structures) are dark blue.
#'
#'   The normalized B-value values are calculated by the
#'   \code{NormalizedBvalue} function.
#'
#'   This plot was inspired by Figure 1 of Sanschagrin and Kuhn (1998).
#'
#' @param data The \code{h2o.clusters.summary} data.frame from the
#'   \code{ClusterWaters} function containing the \code{nBvalue.mu}
#'   information.
#' @param passed.waters Logical indicator to plot results for waters **passing**
#'   [Mobility()] and [NormalizedBvalue()] _**OR**_ using **all** waters within
#'   the `PDB` files.
#' @param pct.conserved.gte minimum percent conservation within a water cluster;
#'   default: `50.0`If the number of clusters is less than the number of
#'   clusters defined by `num.clusters`, then the number of clusters defined by
#'   `pct.conserved.gte` is displayed.
#' @param num.clusters number (integer) of clusters to display. If the number of
#'   clusters is less than the number of clusters defined by
#'   `pct.conserved.gte`, then the number of clusters defined by `num.clusters`
#'   is displayed. A value of `NULL` results in the provided value for
#'   `pct.conserved.gte` being used.
#'
#' @export
#'
#' @import ggplot2
#' @import reshape2
#' @import scales
#'
#' @examples
#'   \dontrun{
#'   bwe.plots <- BoundWaterEnvPlots(data=thrombin10.conservedWaters,
#'                                   passed.waters=TRUE,
#'                                   pct.conserved.gte = 50.0,
#'                                   num.clusters = 50)
#'   }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
#' @references
#'   Paul C Sanschagrin and Leslie A Kuhn. Cluster analysis of
#'   consensus water sites in thrombin and trypsin shows conservation between
#'   serine proteases and contributions to ligand specificity. _Protein
#'   Science_, 1998, **7** (_10_), pp 2054-2064.
#'   [DOI: 10.1002/pro.5560071002](http://doi.org/10.1002/pro.5560071002)
#'   [PMID: 9792092](http://www.ncbi.nlm.nih.gov/pubmed/9792092)
#'   [WatCH webpage](http://www.kuhnlab.bmb.msu.edu/software/watch/index.html)
#'
BoundWaterEnvPlots <- function(data,
                               passed.waters = TRUE,
                               pct.conserved.gte = 50.0,
                               num.clusters = 50) {

  ##_ get the PASSED or ALL waters plot data -----
  if (passed.waters == TRUE) {
    plot.data <- data$h2o.cluster.passed$h2o.clusters.summary
  } else {
    plot.data <- data$h2o.cluster.all$h2o.clusters.summary
  }

  ##_ check user provided parameters -----
  ##__ percentage of conserved waters -----
  if ( pct.conserved.gte <= 1.0) {
    mess <- paste("Looks like you provided a fraction and not a percentage. ",
                  "The provided value (", pct.conserved.gte,
                  ") has been converted to ", pct.conserved.gte * 100, "%.",
                  sep = "")
    message(mess)
    pct.conserved.gte <- pct.conserved.gte * 100
  }

  ##__ number of clusters -----
  num.clusters <- as.integer(num.clusters)
  if ( num.clusters <= 1 ) {
    mess <- paste("The provided num.cluster value of", num.clusters,
                  "is less than or equal to 1 and has been changed to NULL.",
                  "The percent conversed value will be used instead.",
                  sep = " ")
    message(mess)
    num.clusters <- NULL
  }

  ##_ determine the clusters passing the pct.conserved.gte value -----
  pct.conserved.gte.tf <- plot.data$pct.conserved >= pct.conserved.gte
  num.pct.conserved.gte <- sum(pct.conserved.gte.tf)

  if ( !is.null(num.clusters) && (num.clusters <= num.pct.conserved.gte) ) {
    plot.data <- plot.data[1:num.clusters, ]
    mess <- paste("The number of requested conserved water clusters (",
                  num.clusters,
                  ") is less than the number (", num.pct.conserved.gte,
                  ") of conserved water clusters with ",
                  pct.conserved.gte, "% conservation. Thus, only ",
                  "the results for the top ",
                  num.clusters, " conserved water clusters will be plotted.",
                  sep = "")
    message(mess)
  }

  if ( is.null(num.clusters) ) {
    plot.data <- plot.data[pct.conserved.gte.tf, ]
    mess <- paste("The number of conserved water clusters with ",
                  pct.conserved.gte, "% conservation is ",
                  num.pct.conserved.gte, ".", sep = "")
    message(mess)
  }

  ##_ bound water environment plots -----
  cluster.name <- paste("cluster_", plot.data$cluster, sep="")
  num.clusters <- length(cluster.name)
  cluster.breaks <- seq(from = 10, to = num.clusters, by = 10)
  cluster.breaks <- c(1, cluster.breaks)
  cluster.numbering <- seq(from = 1, to = num.clusters)
  cluster.numbering <- rep("", num.clusters)
  cluster.numbering[cluster.breaks] <- cluster.breaks

  ##_ create the plot's data.frame -----
  df.prot <- data.frame(name = cluster.name,
                        adn = plot.data$prot.adn / plot.data$num.waters,
                        ahp = plot.data$prot.ahp.mu,
                        hbonds = plot.data$prot.hbonds / plot.data$num.waters,
                        mobiity = plot.data$prot.mobility.mu,
                        nBvalue = plot.data$prot.nBvalue.mu,
                        type = "Protein",
                        stringsAsFactors = FALSE)

  df.h2o <- data.frame(name = cluster.name,
                       adn = plot.data$h2o.adn / plot.data$num.waters,
                       ahp = NA,
                       hbonds = plot.data$h2o.hbonds / plot.data$num.waters,
                       mobiity = plot.data$h2o.mobility.mu,
                       nBvalue = plot.data$h2o.nBvalue.mu,
                       type = "Water",
                       stringsAsFactors = FALSE)

  df.het <- data.frame(name = cluster.name,
                       adn = plot.data$het.adn / plot.data$num.waters,
                       ahp = NA,
                       hbonds = plot.data$het.hbonds / plot.data$num.waters,
                       mobiity = plot.data$het.mobility.mu,
                       nBvalue = plot.data$het.nBvalue.mu,
                       type = "Other",
                       stringsAsFactors = FALSE)

  ##__ combine the rows -----
  df.bwe <- rbind(df.prot, df.h2o, df.het)

  ##_ modify the data to make the plots look nice -----
  mobility <- plot.data$call$mobility
  nBvalue <- plot.data$call$nBvalue
  df.bwe$mobiity[df.bwe$mobiity >= mobility] <- mobility + 0.1
  df.bwe$nBvalue[df.bwe$nBvalue >= nBvalue] <- nBvalue + 0.1

  ##_ rename the columns -----
  colnames(df.bwe) <- c("name", "Atomic Density", "Hydrophilicity",
                        "Hydrogen Bonds", "Mobility", "Norm B-value",
                        "type")

  ##_ melt the data.frame and order the values using factor() -----
  df.bwe.melt <- reshape2::melt(df.bwe, id.vars = c("name", "type") )
  df.bwe.melt$name <- factor(df.bwe.melt$name, levels=cluster.name)
  df.bwe.melt$type <- factor(df.bwe.melt$type, levels=c("Protein",
                                                        "Water",
                                                        "Other"))

  ##_ color -----
  bwe.colors <- c("#33a02c", ## medium green
                  "#1f78b4", ## medium blue
                  "#b2df8a"  ## light green
  )

  ##_ make the facet plot -----
  bwe.plots <- ggplot2::ggplot(data=df.bwe.melt,
                               aes_string(x = "name",
                                          y = "value",
                                          fill = "type")) +
    ggplot2::geom_bar(position = "dodge", stat = "identity") +
    ggplot2::scale_fill_manual(values = bwe.colors, name = "") +
    ggplot2::theme(axis.text.x = element_text(angle = 90,
                                              hjust = 1, vjust = 0.5)) +
    ggplot2::facet_grid(variable ~ ., scales = "free_y") +
    ggplot2::theme(legend.position = "bottom") +
    ggplot2::labs(x = "Water Cluster", y = "Mean Value") +
    ggplot2::scale_x_discrete(labels = cluster.numbering)

  ##_ return the plot -----
  bwe.plots

}


## bound water environment summary plot ----------------------------------------
#' @title Bound Water Environment Summary Plot
#' @description Mean bound water environment summary per percent conservation
#' @details Constructs a line plot with the bound water environment measures for
#'   the nearby protein and water atoms. The protein atomic density (ADN),
#'   hydrophilicity, mobility, normalized B-values, and potential hydrogen bonds
#'   are summarized for protein heavy atoms with 3.6 Angstroms along with the
#'   mobility, normalized B-values, and hydrogen bonds are summarized for the
#'   waters within 3.6 Angstroms of the protein and water atoms of interest,
#'   respectively. The raw values are scaled to values between 0 and 1 and
#'   plotted for each of the percent conservation available. Thus if there are
#'   ten structures being analyzed the percent conservation can range from 10 to
#'   100\% in 10\% increments. The protein related values are shown as solid
#'   lines and the water related values are shown as dotted lines.
#'
#'   _Interpreting the plot_
#'   - **dark green**: protein atom density
#'   - **medium green**: protein atom hydrophilicity
#'   - **green**: protein mobility
#'   - **pale green**: protein nBvalue
#'   - **light green**: protein hydrogen bonds
#'   - **dark blue**: water mobility
#'   - **medium blue**: water nBvalue
#'   - **blue**: water hydrogen bonds
#'
#'   This plot is based on Figure 3 of Sanschagrin and Kuhn (1998). Please note
#'   the B-value have been replaced with normalized B-values and hydrophilicity
#'   has been removed. Hydrophilicity was removed because the range between
#'   average hydrophilicity values for the percent conservations would likely be
#'   narrow. Due to the way scaling works, the lowest value is scaled to zero
#'   and the greatest value is scaled to one. Scaling the mean hydrophilicity
#'   values works against our goal of showing an overall tread and instead
#'   creates confusion about the values.
#'
#' @param data The `h2o.clusters.summary` data.frame from the `ClusterWaters`
#'   function containing the `nBvalue.mu` information. This data.frame is found
#'   within the `h2o.cluster.passed` and `h2o.cluster.all`
#' @param passed.waters Logical indicator to plot results for waters **passing**
#'   [Mobility()] and [NormalizedBvalue()] _**OR**_ using **all** waters within
#'   the `PDB` files.
#' @param title The title for the plot
#'
#' @export
#'
#' @import ggplot2
#' @import reshape2
#' @import scales
#' @importFrom stats aggregate
#'
#' @examples
#'   \dontrun{
#'   bwe.summary.plot <- BoundWaterEnvSummaryPlot(data=thrombin10.conservedWaters,
#'                                                passed.waters=TRUE,
#'                                                title="Bound Water Environment per Conservation")
#'  }
#'
#' @author Emilio Xavier Esposito \email{emilio@@exeResearch.com}
#'
#' @family plots
#'
#' @references
#'   Paul C Sanschagrin and Leslie A Kuhn. Cluster analysis of
#'   consensus water sites in thrombin and trypsin shows conservation between
#'   serine proteases and contributions to ligand specificity. _Protein
#'   Science_, 1998, **7** (_10_), pp 2054-2064.
#'   [DOI: 10.1002/pro.5560071002](http://doi.org/10.1002/pro.5560071002)
#'   [PMID: 9792092](http://www.ncbi.nlm.nih.gov/pubmed/9792092)
#'   [WatCH webpage](http://www.kuhnlab.bmb.msu.edu/software/watch/index.html)
#'
BoundWaterEnvSummaryPlot <- function(data,
                                     passed.waters = TRUE,
                                     title = "Bound Water Environment per Conservation") {

  ##_ get the PASSED or ALL waters plot data -----
  if (passed.waters == TRUE) {
    plot.data <- data$h2o.cluster.passed$h2o.clusters.summary
  } else {
    plot.data <- data$h2o.cluster.all$h2o.clusters.summary
  }

  ##_ construct the data.frame for the plot -----
  ##__ calculate the mean values per the percent conservation -----
  list.pct.conserved <- list(plot.data$pct.conserved)
  mean.pct.conserved <- aggregate(plot.data, list.pct.conserved, mean, na.rm = TRUE)
  ##__ get the needed values
  df.plot <- mean.pct.conserved[, c("pct.conserved",
                                    "prot.adn",
                                    "prot.mobility.mu", "prot.nBvalue.mu",
                                    "prot.hbonds",
                                    "mobility.mu", "nBvalue.mu", "h2o.hbonds")]
  ##__ normalize the values -----
  df.plot.norm <- apply(df.plot[, -1], 2, RescaleValues)
  ##__ construct the normalized data.frame
  df.plot.norm <- data.frame(pct.conserved = (df.plot[,1] / 100),
                             df.plot.norm,
                             stringsAsFactors = FALSE)

  ##_ information for the plot -----
  ##__ make easy to read legend labels -----
  legend.labels <- c("Protein Atom Density",
                     "Protein Mobility", "Protein nBvalues", "Protein H-bonds",
                     "Water Mobility", "Water nBvalues",
                     "Water H-bonds"
  )
  ##__ color values | colorbrewer2.org; single hue; 9 data classes; only 7 used -----
  prot.h2o.color <- c("#006d2c", ## dark green (protein atom density)
                      "#238b45", ## medium green (protein mobility)
                      "#41ab5d", ## green (protein nBvalue mu)
                      "#74c476", ## pale green (protein hydrogen bonds)
                      "#08519c", ## dark blue (water mobility mu)
                      "#2171b5", ## medium blue (water nBvalue mu)
                      "#4292c6"  ## blue (water hydrogen bonds)
  )
  ##__ define the line types -----
  prot.h2o.line.types <- c(rep("solid", 5), rep("dotted", 3))

  ##_ melt the data for ggplot2 -----
  ##__ protein values -----
  df.plot.prot <- df.plot.norm[, c("pct.conserved",
                                   "prot.adn",
                                   "prot.mobility.mu", "prot.nBvalue.mu",
                                   "prot.hbonds")]
  ##__ add the type column -----
  df.plot.prot <- data.frame(df.plot.prot, type = "protein",
                             stringsAsFactors = FALSE)
  ##__ melt the data.frame -----
  df.plot.prot.melt <- reshape2::melt(df.plot.prot,
                                      id.vars = c("pct.conserved", "type") )
  ##__ easy to read legend labels -----
  df.plot.prot.melt$variable <- factor(df.plot.prot.melt$variable,
                                       labels = legend.labels[1:4])
  ##__ water values -----
  df.plot.h2o <- df.plot.norm[, c("pct.conserved",
                                  "mobility.mu", "nBvalue.mu",
                                  "h2o.hbonds")]
  ##__ add the type column -----
  df.plot.h2o <- data.frame(df.plot.h2o, type = "water",
                            stringsAsFactors = FALSE)
  ##__ melt the data.frame -----
  df.plot.h2o.melt <- reshape2::melt(df.plot.h2o,
                                     id.vars = c("pct.conserved", "type") )
  ##__ easy to read legend labels -----
  df.plot.h2o.melt$variable <- factor(df.plot.h2o.melt$variable,
                                      labels = legend.labels[5:7])
  ##__ combine the protein and water data -----
  df.plot.norm.melt <- rbind(df.plot.prot.melt, df.plot.h2o.melt)

  ##_ make the plot -----
  bwe.plot <- ggplot2::ggplot(data = df.plot.norm.melt,
                              aes_string(x = "pct.conserved",
                                         y = "value",
                                         colour = "variable",
                                         linetype = "variable")) +
    ggplot2::geom_line(size = 2, lineend = "round") +
    ggplot2::scale_x_continuous(labels = scales::percent) +
    ggplot2::scale_colour_manual(values = prot.h2o.color,
                                 name = "",
                                 labels = legend.labels) +
    ggplot2::scale_linetype_manual(values = prot.h2o.line.types,
                                   name = "",
                                   labels = legend.labels) +
    ggplot2::labs(x = "Percent Conservation", y = "Scaled Values",
                  title = title) +
    ggplot2::theme(legend.position = "bottom")

  ##_ return the plot -----
  bwe.plot

}

Try the vanddraabe package in your browser

Any scripts or data that you put into this service are public.

vanddraabe documentation built on June 8, 2019, 1:03 a.m.