cellwrangler: Supplemental toolkit for single-cell RNAseq analysis

Documented in plot_CoGAPS_pattern_cor_heatmap

#'Plot CoGAPS pattern correlation heatmap
#'
#' @description plot_CoGAPS_pattern_cor_heatmap() plots a heatmap showing the correlation of a CoGAPS pattern set 
#' with a grouping variable
#'
#' @param CoGAPS_res_set a CoGAPS result set. If set to NULL, a Pmeans matrix may be supplied directly as input
#' for the Pattern_set parameter.
#' @param Pattern_set a string specifying the name of a pattern set (e.g. "nP20") in a CoGAPS_res_set. 
#' Alternatively, if CoGAPS_res_set is set to NULL, a Pmeans matrix may be supplied directly.
#' @param pattern_subset a vector specifying a subset of the pattern set to be plotted. Defaults to NULL.
#' @param annotation an annotation dataframe. If group_column is set to NULL, must contain logical columns with 
#' column names corresponding to values in group_vector. Each logical column (1 or 0) should indicate whether an
#' annotation row is of the value (column name).
#' @param group_column column name of column in annotation object with values corresponding to those 
#' in group_vector to use for computing logical columns. Defaults to NULL
#' @param group_vector a vector of with pre-specified values corresponding to those in group_column in
#' the annotation object.
#' @param cluster_groups logical; if groups should be clustered.
#' @param order_subgroups logical; if subgroups should be ordered according to clustering of first subgroup. 
#' Subgroups can be specified in a string with separation by " - ".
#' @param cluster_groups_distance distance measure used in clustering groups. Possible values are "correlation",
#' "euclidean", "maximum", "manhattan", "canberra", "binary", or "minkowski" (defined in dist() function.)
#' @param cluster_groups_method clustering method used for groups. Accepts the same values as hclust: "ward.D", 
#' "ward.D2", "single", "complete", "average" (=UPGMA), "mcquitty" (=WPGMA), "median" (=WPGMC) or 
#' "centroid" (=UPGMC).
#' @param cluster_patterns logical; if patterns should be clustered.
#' @param cluster_patterns_vector a vector specifying the order of patterns. If NULL, patterns will be clustered
#' according to distance and method specified below.
#' @param cluster_patterns_distance distance measure used in clustering patterns. Possible values are the
#' same as cluster_groups_distance().
#' @param cluster_patterns_method clustering method used for patterns. Possible values are the
#' same as cluster_groups_method().
#' @param limits limits of the color scale for the heatmap.
#' @param square_tiles logical; if tiles should be made square
#' 
#' @keywords plot_CoGAPS_pattern_cor_heatmap
#' @export
#' @return a ggplot object
#' @examples
#' plot_CoGAPS_pattern_cor_heatmap(CoGAPS_res_set = my_CoGAPS_res, Pattern_set = "nP30",annotation=pData(cds), 
#' group_column=NULL, group_vector=genotype_vector, cluster_groups=T, order_subgroups=T, 
#' cluster_groups_distance = "correlation", cluster_groups_method = "complete", cluster_patterns= T, 
#' cluster_patterns_distance = "correlation", cluster_patterns_method = "complete")


#plot_CoGAPS_pattern_cor_heatmap
plot_CoGAPS_pattern_cor_heatmap <- function(CoGAPS_res_set=NULL, Pattern_set, pattern_subset= NULL, annotation, group_column=NULL, 
                                     group_vector, cluster_groups=F, order_subgroups=F ,cluster_groups_distance="correlation", 
                                     cluster_groups_method="complete", cluster_patterns=F, 
                                     cluster_patterns_vector=NULL, cluster_patterns_distance="correlation", 
                                     cluster_patterns_method="complete",limits=NULL, square_tiles=T) {
  if(is.null(CoGAPS_res_set)==F) {
    Pmeans <- CoGAPS_res_set[[Pattern_set]]@sampleFactors } else { Pmeans <- Pattern_set }
  if(is.null(pattern_subset) == F) {
    Pmeans <- Pmeans[,pattern_subset]
  } else { Pmeans <- Pmeans }
  Pmeans <- Pmeans[unique( as.character(rownames(Pmeans)) ),]
  #Create columns in annotation according to group_vector
  if(is.null(group_column) == F) {
  lapply(group_vector, function(x){
    tmp <- annotation[group_column] == x
    annotation[[x]] <<- as.numeric(tmp)
  }) } else {annotation <- annotation}
  #Merge Pmeans and annotation
  Pmeans_annot <- merge_by_rownames(annotation, Pmeans, all.x = F, all.y=T)
  for(i in c( (ncol(annotation)+1):(ncol(Pmeans)+ncol(annotation)) )){
    Pmeans_annot[,i]<-as.numeric(Pmeans_annot[,i])
  }
  patterns <- colnames(Pmeans_annot)[grep("Patt",colnames(Pmeans_annot))]
  
  #Calculate correlation
  tmp <- cor(Pmeans_annot[,patterns], Pmeans_annot[,group_vector])
  
  #Melt correlation matrix   
  tmp_melt <- melt(tmp)
  colnames(tmp_melt) <- c("pattern", "group_vector","Correlation")
  tmp_melt$group_vector <- factor(tmp_melt$group_vector,levels=rev(group_vector[order(group_vector)]))
  
  #cluster patterns
  if(cluster_patterns== T) {
    if(is.null(cluster_patterns_vector)==F){
      tmp_melt$pattern = factor(tmp_melt$pattern,levels=unique(tmp_melt$pattern)[cluster_patterns_vector])
    } else {
      pattern.order<-order.dendrogram(as.dendrogram(pheatmap:::cluster_mat((tmp), distance=cluster_patterns_distance,method=cluster_patterns_method)))
      tmp_melt$pattern = factor(tmp_melt$pattern, levels=unique(tmp_melt$pattern)[pattern.order])
    }
  } else { tmp_melt$pattern = factor(tmp_melt$pattern, levels=rev(unique(tmp_melt$pattern)[order(tmp_melt$pattern,decreasing=T)])) }
  
  #cluster groups
  if(cluster_groups == T) {
    ref_table <- as.data.frame(group_vector)
    colnames(ref_table) <- c("group_vector")
    group.order<-order.dendrogram(as.dendrogram(pheatmap:::cluster_mat(t(tmp), distance=cluster_groups_distance,
                                                                       method=cluster_groups_method)))
    ref_table$group_vector <- factor(ref_table$group_vector, levels=ref_table$group_vector[group.order])
    #order_subgroups
    if(order_subgroups == T) {
    n_subgroups <- length(stringr::str_split(group_vector, pattern = " - ")[[1]])
    for(i in 1:n_subgroups) {
      ref_table[paste("subgroup",i,sep="")] <- as.factor(stringr::str_split_fixed(group_vector, pattern=" - ",
                                                                                  n=n_subgroups)[,i])
    }
    #order by subgroup1,2 ..
    for(i in (n_subgroups+1):2) {
      ref_table <- ref_table[with(ref_table, order(ref_table[,i])),]
    }
    group_vector_ordered <- ref_table$group_vector
    tmp_melt$group_vector <-factor(tmp_melt$group_vector,levels=rev(group_vector_ordered)) 
    } else {
    tmp_melt$group_vector <-factor(tmp_melt$group_vector,levels=rev(ref_table$group_vector[group.order]))
    }
  } else { tmp_melt <- tmp_melt }
  
  #Check correlation range
  print(c(min(tmp_melt$`Correlation`),0, max(tmp_melt$`Correlation`)))
  
  #plot
  p<-ggplot(tmp_melt,aes(x=pattern,y=group_vector, fill=`Correlation`))
  p <- p + geom_tile(size=0.25) + theme_classic() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5)) 
  p <- p + scale_fill_gradientn(colours =  c("dodgerblue","ghostwhite","brown2"),values=scales::rescale(c(min(tmp_melt$`Correlation`), 0,max(tmp_melt$`Correlation`))), limits=limits,  oob=scales::squish)
  if(square_tiles==T) {
    p <- p + coord_equal()
  } else { p <- p }
  return(p)
}