R/Plots.R

Defines functions plot_SP_summary plot_dissimilarity_summary plot_similarity_summary plot_dissimilarity_matrix plot_similarity_heatmap percent

Documented in plot_dissimilarity_matrix plot_dissimilarity_summary plot_similarity_heatmap plot_similarity_summary plot_SP_summary

############
# Pre-reqs #
############

# Column names that the plotting functions reference as bare symbols inside
# ggplot2::aes(). Declaring them stops R CMD check from reporting
# "no visible binding for global variable" notes. "Position" is included
# because several plots map it with aes(x = Position).
utils::globalVariables(c("Reference", "Comparison", "Proportion", "Change",
                         "value", "Identity", "PropCys", "Position"))

# Render a proportion (or vector of proportions) as percentage text.
#
# x       numeric proportion(s); multiplied by 100 before formatting
# digits  passed to formatC() as `digits` (default 1)
# format  passed to formatC() as `format` (default "f")
# ...     any further arguments are forwarded to formatC()
#
# Returns a character vector with a trailing "%" on each element.
percent <- function(x, digits = 1, format = "f", ...) {
  as_hundredths <- x * 100
  rendered      <- formatC(as_hundredths, digits = digits, format = format, ...)
  paste0(rendered, "%")
}



#########
# Plots #
#########

#' A heatmap plot of the column identities between two multiple sequence alignments
#'
#' @param x          an object of type "pairwise alignment comparison" (typically the summary file generated by compare_alignments)
#' @param scale      scale data to proportion of characters that are not conserved gaps (default = TRUE)
#' @param display    display this plot (default = TRUE)
#'
#' @return the ggplot object, returned invisibly so that a plot already drawn because of `display = TRUE` is not drawn a second time by auto-printing
#'
#' @export
#' @examples
#' data(reference_alignment)
#' data(comparison_alignment)
#' PAC <- compare_alignments(reference_alignment,comparison_alignment)
#' plot_similarity_heatmap(PAC)
#'
#' @note This function displays the similarity between each pairwise column comparison for the reference and comparison MSAs. Colour density is determined by the proportion of identical character matches between the columns, normalised to the number of characters that are not merely conserved gaps. This gives a representation of which columns are well agreed upon by the MSAs, and which columns are split by one MSA relative to the other.
#'
plot_similarity_heatmap <- function(x, scale = TRUE, display = TRUE) {

  # Transpose once; after melting, the rows become the 'Reference' index and
  # the columns the 'Comparison' index.
  hm_data <- t(x$similarity_S)
  if (scale) {
    # Similarity, excluding conserved gaps. The divisor vector is recycled
    # down the rows, so element i rescales row i of hm_data.
    hm_data <- hm_data / (1 - x$results_R[2, ])
  }
  md           <- reshape2::melt(hm_data)
  colnames(md) <- c("Reference", "Comparison", "value")

  p <- ggplot2::ggplot(md)                                                        +
       ggplot2::geom_tile(ggplot2::aes(x = Reference, y = Comparison, fill = value)) +
       ggplot2::scale_fill_gradient("Similarity", low = "white", high = "black") +
       ggplot2::labs(x = "Reference MSA column", y = "Comparison MSA column")    +
       ggplot2::scale_x_continuous(expand = c(0, 0))                             +
       ggplot2::scale_y_reverse(expand = c(0, 0))                                +
       ggplot2::theme(plot.background  = ggplot2::element_rect(fill = "white"),
                      panel.background = ggplot2::element_rect(fill = "white"))

  if (display) {
    print(p)
  }
  # Invisible: a visible return would auto-print at top level, drawing the
  # plot twice when display = TRUE and defeating display = FALSE.
  invisible(p)
}



#' A heatmap plot of the dissimilarity matrix of two multiple sequence alignments
#'
#' @param x          an object of type "pairwise alignment comparison" (typically the summary file generated by compare_alignments)
#' @param display    display this plot (default = TRUE)
#'
#' @return the ggplot object, returned invisibly so that a plot already drawn because of `display = TRUE` is not drawn a second time by auto-printing
#'
#' @export
#' @examples
#' data(reference_alignment)
#' data(comparison_alignment)
#' PAC <- compare_alignments(reference_alignment,comparison_alignment)
#' plot_dissimilarity_matrix(PAC)
#'
#' @note This function displays the dissimilarity categories for all characters in the reference alignment. This gives a representation of which columns are well agreed upon by the MSAs, and which sequence regions of the reference alignment are split, merged, or shifted.
#'
plot_dissimilarity_matrix <- function(x, display = TRUE) {

  # Expand the single-letter category codes into readable labels.
  # The order of substitutions matters: "g" has to be expanded before "m",
  # because the replacement text "Merge" itself contains a "g".
  hm_data <- as.matrix(t(x$dissimilarity_simple))
  hm_data <- gsub(hm_data, pattern = "M", replacement = "Match")
  hm_data <- gsub(hm_data, pattern = "g", replacement = "Gap")
  hm_data <- gsub(hm_data, pattern = "m", replacement = "Merge")
  hm_data <- gsub(hm_data, pattern = "s", replacement = "Split")
  hm_data <- gsub(hm_data, pattern = "x", replacement = "Shift")

  md           <- reshape2::melt(hm_data)
  colnames(md) <- c("Position", "Sequence", "Dissimilarity")

  # Row names of the reference matrix are used as the y-axis tick labels.
  seq_names <- rownames(x$reference_P)

  # Colours are keyed by category name. An unnamed vector is assigned
  # positionally to whichever categories happen to be present, so the colours
  # would shift whenever one category (e.g. "Merge") is absent from the data.
  category_colours <- c(Gap   = "white",
                        Match = "black",
                        Merge = "darkred",
                        Shift = "firebrick2",
                        Split = "pink")

  p <- ggplot2::ggplot(md)                                                                         +
       ggplot2::geom_tile(ggplot2::aes_string(x = "Position", y = "Sequence", fill = "Dissimilarity")) +
       ggplot2::scale_x_continuous(expand = c(0, 0))                                               +
       ggplot2::scale_y_reverse(expand = c(0, 0),
                                labels = seq_names,
                                breaks = seq_along(seq_names))                                     +
       ggplot2::labs(x = "Reference MSA column")                                                   +
       ggplot2::scale_fill_manual(values = category_colours)                                       +
       ggplot2::theme(plot.background  = ggplot2::element_rect(fill = "white"),
                      panel.background = ggplot2::element_rect(fill = "white"))

  if (display) {
    print(p)
  }
  # Invisible: a visible return would auto-print at top level, drawing the
  # plot twice when display = TRUE and defeating display = FALSE.
  invisible(p)
}



#' A line plot summary of column similarity between two multiple sequence alignments 
#'
#' @param x          an object of type "pairwise alignment comparison" (typically the summary file generated by compare_alignments)
#' @param scale      scale data to proportion of characters that are not conserved gaps (default = TRUE)
#' @param CS         additionally indicate columns with 100 percent identity using markers on the x-axis (default = FALSE)
#' @param cys        additionally show the cysteine abundance for each column (default = FALSE)
#' @param display    display this plot (default = TRUE)
#'
#' @return the ggplot object, returned invisibly so that a plot already drawn because of `display = TRUE` is not drawn a second time by auto-printing
#'
#' @export
#' @examples
#' data(reference_alignment)
#' data(comparison_alignment)
#' PAC <- compare_alignments(reference_alignment, comparison_alignment, CS=TRUE)
#' plot_similarity_summary(PAC, CS=TRUE, cys=TRUE)
#'
#' @note This function generates a plot that summarises the similarity between the two multiple sequence alignments for each column of the reference alignment. For each column, it plots the proportion of identical character matches as a proportion of the characters that are not merely conserved gaps. The overall average proportion of identical characters that are not conserved gaps is overlaid as a percentage. For alignments of cysteine-rich proteins, the cysteine abundance for each column may also be plotted to indicate columns containing conserved cysteines (`cys=TRUE`).
#'
plot_similarity_summary <- function(x, scale = TRUE, CS = FALSE, cys = FALSE, display = TRUE) {

  similarity <- x$results_R[1, ]
  if (scale) {
    similarity <- similarity / (1 - x$results_R[2, ]) # Similarity, excluding conserved gaps
  }

  # Linear rescale of x$cys so that a value of 1 lands on y = 0 and a value of
  # 0 on y = -0.2, i.e. the trace is drawn beneath the similarity line
  # (assumes x$cys is a proportion in [0, 1] -- TODO confirm).
  proportion_cys <- 0.2 * (x$cys) - 0.2

  # x$column_score is a list when column scores are available; otherwise the
  # fallback branch is taken. Testing with is.list() yields a single
  # TRUE/FALSE, whereas is.na() on a list returns one value per element and
  # `if` rejects conditions of length > 1 in R >= 4.2.
  column_score <- x$column_score
  if (is.list(column_score)) {
    columnwise.CS <- column_score$columnwise.column.score == 1
    sum.CS        <- column_score$column.score
  } else {
    columnwise.CS <- FALSE
    sum.CS        <- NA
  }

  score <- x$similarity_score
  # Full element name: `x$results` only worked through partial matching.
  col   <- seq_len(ncol(x$results_R))
  plot_data <- data.frame(Identity      = similarity,
                          columnwise.CS = columnwise.CS,
                          PropCys       = proportion_cys,
                          Position      = col)

  p <- ggplot2::ggplot(plot_data, ggplot2::aes(x = Position))                 +
       ggplot2::geom_line(ggplot2::aes(y = Identity, colour = "Similarity"))  +
       ggplot2::labs(x = "Reference MSA column", y = "Proportion")            +
       ggplot2::scale_x_continuous(expand = c(0, 0))                          +
       ggplot2::scale_y_continuous(expand = c(0, 0), breaks = seq(0, 1, 1/10)) +
       ggplot2::theme_classic()                                               +
       ggplot2::theme(legend.title = ggplot2::element_text(face = "bold"))    +
       ggplot2::scale_colour_discrete(breaks = c("Similarity", "Cysteines"),
                                      name = paste("Score =", percent(score),
                                                   "\nCS score =", percent(sum.CS)))

  if (cys) {
    # Cysteine trace plus a baseline at y = 0 separating it from the
    # similarity trace.
    p <- p + ggplot2::geom_line(ggplot2::aes(y = PropCys, colour = "Cysteines")) +
             ggplot2::geom_line(ggplot2::aes(y = 0))
  }

  if (CS) {
    # which() also drops NA entries, so only columns flagged TRUE are marked.
    cs_data <- plot_data[which(plot_data$columnwise.CS), ]
    if (nrow(cs_data) > 0) {
      p <- p + ggplot2::geom_point(data = cs_data, ggplot2::aes(y = 0))
    }
  }

  if (display) {
    print(p)
  }
  # Invisible: a visible return would auto-print at top level, drawing the
  # plot twice when display = TRUE and defeating display = FALSE.
  invisible(p)
}



#' An area plot summary of the different causes of column dissimilarity between two multiple sequence alignments
#'
#' @param x          an object of type "pairwise alignment comparison" (typically the summary file generated by compare_alignments)
#' @param scale      scale data to proportion of characters that are not conserved gaps (default = TRUE)
#' @param stack      stacked area plot instead of line plot (default = TRUE)
#' @param display    display this plot (default = TRUE)
#'
#' @return the ggplot object, returned invisibly so that a plot already drawn because of `display = TRUE` is not drawn a second time by auto-printing
#'
#' @export
#' @examples
#' data(reference_alignment)
#' data(comparison_alignment)
#' PAC <- compare_alignments(reference_alignment, comparison_alignment)
#' plot_dissimilarity_summary(PAC, stack=TRUE)
#'
#' @note This function generates a detailed breakdown of the differences between the multiple sequence alignments for each column of the reference alignment. For each column, the relative proportions of merges, splits and shifts are plotted as a proportion of characters that are not merely conserved gaps.
#'
plot_dissimilarity_summary <- function(x, scale = TRUE, stack = TRUE, display = TRUE) {

  # Dissimilarity, excluding conserved gaps when scale = TRUE; a divisor of 1
  # leaves the raw proportions untouched otherwise.
  divisor <- if (scale) 1 - x$results_R[2, ] else 1

  # Full element name: `x$results` only worked through partial matching.
  plot_data <- data.frame(Merge    = x$results_R[3, ] / divisor,
                          Shift    = x$results_R[5, ] / divisor,
                          Split    = x$results_R[4, ] / divisor,
                          Position = seq_len(ncol(x$results_R)))

  md           <- reshape2::melt(plot_data, id.vars = "Position")
  colnames(md) <- c("Position", "Dissimilarity", "Proportion")

  if (stack) {
    # The outline layer is grouped by category so that it is stacked in step
    # with the filled areas. `group` is the aesthetic for this; the former
    # `data`/`ymax` mappings are not line aesthetics.
    p <- ggplot2::ggplot(md, ggplot2::aes(x = Position, y = Proportion))                      +
         ggplot2::geom_area(ggplot2::aes_string(fill = "Dissimilarity"), position = "stack")  +
         ggplot2::geom_line(ggplot2::aes_string(group = "Dissimilarity"), position = "stack") +
         ggplot2::scale_x_continuous(expand = c(0, 0))                                        +
         ggplot2::scale_y_continuous(expand = c(0, 0))                                        +
         ggplot2::labs(x = "Reference MSA column")                                            +
         ggplot2::scale_fill_manual(values = c("darkred", "firebrick2", "pink"))              +
         ggplot2::theme_classic()
  } else {
    p <- ggplot2::ggplot(md, ggplot2::aes(x = Position, y = Proportion))  +
         ggplot2::geom_line(ggplot2::aes_string(color = "Dissimilarity")) +
         ggplot2::scale_x_continuous(expand = c(0, 0))                    +
         ggplot2::scale_y_continuous(expand = c(0, 0))                    +
         ggplot2::theme_classic()
  }

  if (display) {
    print(p)
  }
  # Invisible: a visible return would auto-print at top level, drawing the
  # plot twice when display = TRUE and defeating display = FALSE.
  invisible(p)
}



#' A line plot summary of sum of pairs score between two multiple sequence alignments 
#'
#' @param x          an object of type "pairwise alignment comparison" (typically the summary file generated by compare_alignments)
#' @param CS         indicate columns with 100 percent identity using markers on the x-axis (default = TRUE)
#' @param display    display this plot (default = TRUE)
#'
#' @return the ggplot object, returned invisibly so that a plot already drawn because of `display = TRUE` is not drawn a second time by auto-printing
#'
#' @export
#' @examples
#' data(reference_alignment)
#' data(comparison_alignment)
#' PAC <- compare_alignments(reference_alignment, comparison_alignment, SP=TRUE)
#' plot_SP_summary(PAC)
#'
#' @note This function generates a plot that summarises the columnwise sums of pairs for the two multiple sequence alignments. For each column of the comparison alignment, it plots the proportion of conserved residue pairs as a proportion of the possible residue pairs. The overall sum of pairs score, reverse sum of pairs score, and column score are also reported as percentages.
#'
plot_SP_summary <- function(x, CS = TRUE, display = TRUE) {

  # Everything below is read from the list stored in x$sum_of_pairs. Fail with
  # a clear message when it is absent instead of a cryptic `$` error.
  sp <- x$sum_of_pairs
  if (!is.list(sp)) {
    stop("`x$sum_of_pairs` is not available; re-run compare_alignments() with SP = TRUE",
         call. = FALSE)
  }

  columnwise.SPS  <- sp$columnwise.SPS
  columnwise.CS   <- sp$columnwise.CS
  columnwise.CS.y <- -0.05 * columnwise.CS
  sum.SP          <- sp$sum.of.pairs.score
  sum.PS          <- sp$reverse.sum.of.pairs.score
  sum.CS          <- sp$column.score
  col             <- seq_along(columnwise.SPS)

  # columnwise.CS.y is not mapped by any layer below; it is kept so that the
  # data attached to the returned plot is unchanged.
  plot_data <- data.frame(columnwise.SPS  = columnwise.SPS,
                          columnwise.CS   = columnwise.CS,
                          columnwise.CS.y = columnwise.CS.y,
                          Position        = col)

  p <- ggplot2::ggplot(plot_data, ggplot2::aes(x = Position))                            +
       ggplot2::geom_line(ggplot2::aes(y = columnwise.SPS, colour = "Sum of pairs score")) +
       ggplot2::labs(x = "Comparison MSA column", y = "Proportion")                      +
       ggplot2::scale_x_continuous(expand = c(0, 0))                                     +
       ggplot2::theme_classic()                                                          +
       ggplot2::theme(legend.title = ggplot2::element_text(face = "bold"))               +
       ggplot2::scale_colour_discrete(breaks = c("Sum of pairs score", "Column score"),
                                      name = paste("SP score =", percent(sum.SP),
                                                   "\nPS score =", percent(sum.PS),
                                                   "\nCS score =", percent(sum.CS)))

  if (CS) {
    cs_data <- plot_data[which(plot_data$columnwise.CS), ]
    # Only add the marker layer when at least one column qualifies, matching
    # the guard used in plot_similarity_summary().
    if (nrow(cs_data) > 0) {
      p <- p + ggplot2::geom_point(data = cs_data, ggplot2::aes(y = 0))
    }
  }

  if (display) {
    print(p)
  }
  # Invisible: a visible return would auto-print at top level, drawing the
  # plot twice when display = TRUE and defeating display = FALSE.
  invisible(p)
}
#########
TS404/AlignStat documentation built on Oct. 13, 2021, 4:13 p.m.