R/clustering_plots.R

Defines functions get_theme_blank get_stats mutate_colors get_color_for_nodes plot_NA get_colors plot_condition_compositions plot_clusters plot_clusters_all

Documented in get_color_for_nodes get_colors get_stats get_theme_blank mutate_colors plot_clusters plot_clusters_all plot_condition_compositions plot_NA

##############################################
# The deveroper and the maintainer:          #
#   Lang Ho Lee (lhlee@bwh.harvard.edu)      #
#   Sasha A. Singh (sasingh@bwh.harvard.edu) #
##############################################

# Mute warnings
options(warn=1)

#' @title get_theme_blank
#' @description Predefined ggplot theme for removing ticks, titles and labels of X and Y axis
#' @import ggplot2
#' @return A ggplot theme
get_theme_blank <- function(){
  return(theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x=element_blank(),
    axis.text.y=element_blank(),
    axis.ticks.x=element_blank(),
    axis.ticks.y=element_blank(),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "white")
  ))
}

#' @title length2
#' @description Customized function for vector length calculation
#' @param x A vector
#' @param na.rm If it is FALSE, no exclusion of NA values.
#' @return A vector length
length2 <- function (x, na.rm=FALSE) {
  if (na.rm) sum(!is.na(x))
  else       length(x)
}

#' @title get_stats
#' @description Calculate statistics of the given data for XINA network analysis
#' @param centrality_results Network centrality score data frame calculated by XINA network module
#' @param na.rm If it is FALSE, no exclusion of NA values.
#' @import plyr
#' @return A data frame containing statistics of XINA network centrality scores
get_stats <- function(centrality_results, na.rm=FALSE) {
  tgc <- ddply(data.frame(d=centrality_results$Score, r=centrality_results$Rank), c("r"), .drop=TRUE,
               .fun = function(xx, col) {
                 c(N    = length2(xx[[col]], na.rm=na.rm),
                   max  = max(xx[[col]], na.rm=na.rm),
                   min  = min(xx[[col]], na.rm=na.rm),
                   mean = mean   (xx[[col]], na.rm=na.rm)
                 )
               }, "d")
  return(tgc)
}

#' @title mutate_colors
#' @description 'mutate_colors' generates new color scheme for XINA clustering plot based on condition composition results (\link[XINA]{plot_condition_compositions}).
#' If any clusters have higher percentage than the 'threshold_percent', XINA will assign new colors in accordance to 'color_for_condition'.
#' If not, XINA will give 'gray' color or user-defined color via 'null_color' parameter.
#' @param condition_composition A data frame generated by \link[XINA]{plot_condition_compositions}
#' @param color_for_condition A vector like 'color_for_condition' of \link[XINA]{xina_clustering}
#' @param threshold_percent Default is 50.  The percentage threshold for giving new colors
#' @param null_color Default is 'gray'. This color is for clusters that are not biased to any of experimental conditions
#' @return A data frame containing statistics of XINA network centrality scores
#' @export
#' @examples
#' # load XINA example data
#' data(xina_example)
#'
#' # Plot condition composition pie-chart with default option
#' condition_composition <- plot_condition_compositions(example_clusters)
#' example_clusters$color_for_clusters <- mutate_colors(condition_composition,
#' example_clusters$color_for_condition)
#' plot_clusters(example_clusters, xval=c(0,2,6,12,24,48,72), xylab=FALSE)
#'
mutate_colors <- function(condition_composition, color_for_condition, null_color='gray', threshold_percent=50) {
  Cluster <- Condition <- NULL
  color_for_clusters <- c()
  conditions <- unique(condition_composition$Condition)
  cl=1
  for (cl in seq_len(max(condition_composition$Cluster))) {
    sub_df <- subset(condition_composition, Cluster==cl)
    color_for_cluster <- NULL
    for (condition in conditions) {
      sub_df2 <- subset(sub_df, Condition==condition)
      if (nrow(sub_df2) > 0)
        if (sub_df2$Percent_ratio >= threshold_percent)
          color_for_cluster <- as.character(color_for_condition[condition])
    }
    if (is.null(color_for_cluster)) {
      color_for_cluster <- null_color
    }
    color_for_clusters <- c(color_for_clusters, color_for_cluster)
  }
  return(color_for_clusters)
}

#' @title get_color_for_nodes
#' @description Pre-defined 30 colors
#' @return A vector for color code of XINA graphics
get_color_for_nodes <- function(){
  # http://colorbrewer2.org/#type=qualitative&scheme=Set3&n=12
  return(c('#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854',
           '#ffd92f', '#e5c494', '#b3b3b3',
           '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99',
           '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a',
           '#ffff99', '#b15928',
           '#8dd3c7', '#ffffb3', '#bebada',
           '#fb8072', '#80b1d3', '#fdb462', '#b3de69', '#fccde5',
           '#d9d9d9', '#bc80bd', '#ccebc5', '#ffed6f'))
}

#' @title plot_NA
#' @description Draw NULL plot
#' @import graphics
#' @return a empty plot
plot_NA <- function() {
  plot(1, type="n", axes=FALSE, xlab="", ylab="", main="Not Available")
}


#' @title get_colors
#' @description Generate color series for XINA graphics
#' @param nClusters The number of clusters
#' @param set Pre-defined color series set
#' @param colorset manually defined color codes
#' @return A vector for color code of XINA graphics
get_colors <- function(nClusters, set='', colorset=NULL)
{
  if (is.null(colorset)) {
    if (set == '') {
      # Pastel tone
      colorset <- c("#FFB5E8", "#FF9CEE", "#FFCCF9", "#FCC2FF", "#F6A6FF",
                    "#B28DFF", "#C5A3FF", "#D5AAFF", "#ECD4FF", "#FBE4FF",
                    "#DCD3FF", "#A79AFF", "#B5B9FF", "#97A2FF", "#AFCBFF",
                    "#AFF8DB", "#C4FAF8", "#85E3FF", "#ACE7FF", "#6EB5FF",
                    "#BFFBCB", "#DBFFD6", "#BEBEBE", "#E7FFAC", "#FFFFD1",
                    "#FFC9DE", "#FFABAB", "#FFBEBC", "#FFCBC1", "#FFF5BA")
      # #FFFFD1 --> #d3d300
      N <- 30
    } else {
      # Original color
      colorset <- c("#8B008B", "#F1C40F", "#00008B","#1E90FF", "#104E8B",
                    "#FF0000", "#8B0000", "#9ACD32", "#008B00", "#00FFFF",
                    "#008B8B","#0000FF", "#8B0A50", "#FFFF00", "#8B7500",
                    "#00FF00", "#FF69B4", "#A020F0", "#FFA500", "#FF00FF")
      N <- 20
    }
  }
  if (nClusters>N){
    # color_for_graph <- colorRampPalette(colorset)(nClusters)
    color_for_graph <- c(rep(colorset, nClusters/length(colorset)),
                         colorset[seq_len(nClusters%%N)])
  } else {
    color_for_graph <- colorset
  }
  return(color_for_graph)
}

#' @title plot_condition_compositions
#' @description
#'  computes condition composition of the XINA clustering results and draws pie-charts.
#' @param clustering_result A list containing XINA clustering results.
#' See \link[XINA]{xina_clustering}
#' @param bullseye If it is TRUE, draw bullseye plot instead of the pie-chart. Default is FALSE
#' @param ggplot_theme This is ggplot theme to modify condition composition pie-chart and bulles eye plots.
#' @return A condition composition plot and a data frame containing condition compositions of the clusters
#' @import ggplot2
#' @import plyr
#' @import gridExtra
#' @export
#' @examples
#'
#' # load XINA example data
#' data(xina_example)
#'
#' # Plot condition composition pie-chart with default option
#' plot_condition_compositions(example_clusters)
#'
#' # Make a new color code for conditions
#' condition_colors <- c("tomato","steelblue1","gold")
#' names(condition_colors) <- example_clusters$condition
#' example_clusters$color_for_condition <- condition_colors
#'
#' # Draw condition composition pie-chart with the new color code
#' plot_condition_compositions(example_clusters)
#'
#' # Draw condition composition bullseye plot
#' plot_condition_compositions(example_clusters, bullseye = TRUE)
#'
plot_condition_compositions <- function(clustering_result, bullseye=FALSE, ggplot_theme=NULL)
{
  Cluster <- Condition <- Percent_ratio <- NULL
  # Collect cluster information by proteins
  clustering_dir <- clustering_result$out_dir
  nClusters <- clustering_result$nClusters
  max_cluster <- clustering_result$max_cluster
  data_column <- clustering_result$data_column
  column_numbers <- seq_len(length(data_column))
  super_ds <- clustering_result$clusters
  num_conditions <- length(unique(super_ds$Condition))
  # Color setting
  color_for_nodes <- clustering_result$color_for_condition
  if (length(color_for_nodes)==length(clustering_result$condition)) {
    names(color_for_nodes) <- clustering_result$condition
  } else {
    stop(paste("Length of color_for_nodes vector is not same to the number of conditions",
               length(color_for_nodes), length(clustering_result$condition)))
  }
  # Draw plots
  list_output <- list()
  out_summary <- data.frame()
  for (cn in seq_len(max_cluster))
  {
    cl_subset <- subset(super_ds, Cluster == cn)
    plot_title <- paste("#",cn, " (n=",nrow(cl_subset),")", sep='')
    if (nrow(cl_subset)==0) {
      g_summary <- data.frame(Condition=as.vector(unique(super_ds$Condition)),N=NA,Percent_ratio=NA)
      list_output[[cn]] <- ggplot(g_summary, aes(x="", y=Percent_ratio, fill=Condition)) +
        labs(title = plot_title) + geom_blank() + theme_bw() + get_theme_blank()
    } else {
      g_summary <- ddply(cl_subset, c("Condition"), .drop=TRUE,
                         .fun = function(xx, col) {
                           c(N    = length2(xx[[col]], na.rm=TRUE))},
                         c("Condition"))
      g_summary$Percent_ratio <- round((g_summary$N/sum(g_summary$N))*100, 2)
      if (bullseye) {
        # Bullseye plot
        # http://ggplot2.tidyverse.org/reference/coord_polar.html
        list_output[[cn]] <- ggplot(cl_subset, aes(x = factor(1), fill = factor(Condition))) +
          geom_bar(width = 1) + coord_polar() +
          labs(title = plot_title) +
          scale_fill_manual(values=color_for_nodes) +
          get_theme_blank() +
          guides(fill=guide_legend(title=NULL))
      } else {
        # PieChart
        # http://www.sthda.com/english/wiki/ggplot2-pie-chart-quick-start-guide-r-software-and-data-visualization
        list_output[[cn]] <- ggplot(g_summary, aes(x="", y=Percent_ratio, fill=Condition)) +
          geom_bar(width = 1, stat = "identity") +
          coord_polar("y", start=0) +
          labs(title = plot_title) +
          scale_fill_manual(values=color_for_nodes) +
          get_theme_blank() +
          guides(fill=guide_legend(title=NULL))
      }
    }
    if (!is.null(ggplot_theme)) {
      list_output[[cn]] <- list_output[[cn]] + ggplot_theme
    }
    out_summary <- rbind(out_summary,cbind(Cluster=cn, g_summary))
  }
  do.call(grid.arrange,list_output)
  return(out_summary)
}

#' @title plot_clusters
#' @description Draw all the clustering results.
#' 'plot_clusters' draws two plots, scaled and unscaled line graphs.
#' Scaled graphs have same y limits that are 0 to 1 by default,
#' but can be changed via 'y_lim' parameter.
#' @param clustering_result A list containing XINA clustering results.
#' See \link[XINA]{xina_clustering}
#' @param y_lim Y axis limit. If you set y_lim=c(0,1),
#' 'plot_clusters' will plot line graphs scaled from 0 to 1 in y-axis
#' Default is NULL, which means unscaled line graphs.
#' @param xval XINA basically considers time points as a ordinary variable, like 1,2,3,4...n.
#' You can make the time points as a continuous variable using xval.
#' @param xtickmark Change X axis tick marks.
#' Default is data_column of the clustering result list.
#' @param xylab If it is FALSE, x and y labels will be blank.
#' If it is TRUE (defualt), x and y labels will be shown.
#' @param ggplot_theme This is ggplot theme to modify XINA clustering plot.
#' @return Line graphs of all the clusters
#' @import ggplot2
#' @export
#' @examples
#' library(ggplot2)
#'
#' # load XINA example data
#' data(xina_example)
#'
#' # Draw clustering plots
#' plot_clusters(example_clusters)
#'
#' # Apply theme to the clustering plot
#' theme1 <- theme(title=element_text(size=8, face='bold'),
#' axis.text.x = element_text(size=7),
#' axis.text.y = element_blank(),
#' axis.ticks.x = element_blank(),
#' axis.ticks.y = element_blank(),
#' axis.title.x = element_blank(),
#' axis.title.y = element_blank())
#' plot_clusters(example_clusters, ggplot_theme=theme1)
#'
plot_clusters <- function(clustering_result, y_lim=NULL, xval=NULL,
                          xtickmark=NULL, xylab=TRUE, ggplot_theme=NULL)
{
  Cluster <- indviduals <- x <- y <- NULL
  clustering_dir <- clustering_result$out_dir
  nClusters <- clustering_result$nClusters
  max_cluster <- clustering_result$max_cluster
  data_column <- clustering_result$data_column
  data <- clustering_result$clusters
  # X axis set up
  if (is.null(xval)) {
    column_numbers <- seq_len(length(data_column))
  } else {
    column_numbers <- xval
  }
  if (is.null(xtickmark)) {
    xtickmark <- data_column
  }
  # Plot parameter setting
  color_for_graph <- clustering_result$color_for_clusters
  if (length(color_for_graph)<max_cluster) {
    stop(paste("Length of color_for_clusters vector should be bigger than max_cluster",
               length(clustering_result$color_for_clusters), max_cluster))
  }
  # Draw line graphs and save it as a PNG image
  scaled_plots <- list()
  plot_out <- list()
  for (i in seq_len(max_cluster)) {
    sub_data<-as.matrix(subset(data, Cluster %in% i, select = data_column))
    nlines <- nrow(sub_data)
    if (nlines > 0) {
      data_Sds<-cbind(data.frame(expand.grid(x = column_numbers, indviduals = seq_len(nlines))),
                      y=as.numeric(t(sub_data[, data_column])))
      num_prots <- nrow(data_Sds)/length(data_column)
      plt_title <- paste("#",i, " (n=",num_prots,")", sep='')
      plot_out[[i]]<-ggplot(data_Sds, aes(x=x, y=y, group=indviduals)) +
        geom_line(colour = color_for_graph[i]) +
        geom_hline(yintercept=1/length(data_column), color="red", linetype="dashed") +
        labs(x="", y="", title=plt_title) +
        scale_x_continuous(breaks=column_numbers, labels=xtickmark)
      if(!xylab){
        plot_out[[i]] <- plot_out[[i]] + get_theme_blank()
      }
      if (!is.null(y_lim)) {
        plot_out[[i]] <- plot_out[[i]] + ylim(y_lim)
      }
      if (!is.null(ggplot_theme)) {
        plot_out[[i]] <- plot_out[[i]] + ggplot_theme
      }
    } else {
      plot_out[[i]] <- plot_NA()
      print(paste("Cluster #",i," doesn't have any members", sep=''))
    }
  }
  do.call(gridExtra::grid.arrange,plot_out)
  # return(plot_out)
}

#' @title plot_clusters_all
#' @description Draw line graphs of all the proteins in the given dataset
#' @param clustering_result A list containing XINA clustering results.
#' See \link[XINA]{xina_clustering}
#' @param selected_condition A condition name to draw the kinetics plot
#' @return a list containing clustering results and pdf file
#' containing a BIC plot in current working directory.
#' @import ggplot2
#' @export
#' @examples
#'
#' # load XINA example data
#' data(xina_example)
#'
#' # Plot kinetics of all the proteins in Control
#' plot_clusters_all(example_clusters, selected_condition="Control")
#'
#' # Plot kinetics of all the proteins in Stimulus1
#' plot_clusters_all(example_clusters, selected_condition="Stimulus1")
#'
#' # Plot kinetics of all the proteins in Stimulus2
#' plot_clusters_all(example_clusters, selected_condition="Stimulus2")
#'
#' # Plot kinetics of all the proteins in three data
#' plot_clusters_all(example_clusters)
#'
plot_clusters_all <- function(clustering_result, selected_condition=NULL)
{
  condition <- x <- y <- z_levels_Sds <- NULL
  data_column <- clustering_result$data_column
  data <- clustering_result$clusters
  column_numbers <- seq_len(length(data_column))
  color_for_condition <- clustering_result$color_for_condition
  all_prots <- data.frame(expand.grid(x = seq_len(length(data_column)), y=seq_len(nrow(data))),
                          condition = as.vector(data[,"Condition"]),
                          z_levels_Sds=as.numeric(t(data[data_column])))
  if (!is.null(selected_condition)) {
    all_prots <- subset(all_prots, condition==selected_condition)
    title2show <- paste("Unclustered profiles of",selected_condition)
  } else {
    title2show <- "Unclustered profiles of all"
  }
  ggplot(all_prots, aes(x=x, y=z_levels_Sds, group=y, colour=condition)) +
    geom_line() +
    scale_color_manual(values=color_for_condition) +
    geom_hline(yintercept=1/max(column_numbers), color="red", linetype="dashed") +
    labs(x = "", y = paste("Normalized intensity by",clustering_result$norm_method)) +
    labs(title =  title2show) +
    scale_x_continuous(breaks=column_numbers, labels=data_column)
}
langholee/XINA documentation built on March 17, 2020, 5:23 p.m.