R/bic_plots.R

c3 <- c("#5395b4", "#f1592a", "#85c440")
c48 <- c("#1d915c","#5395b4",
                 "#964a48",
                 "#2e3b42",
                 "#b14e72",
                 "#402630","#f1592a",
                 "#81aa90","#f79a70", # lt pink
                 "#b5ddc2",
                 "#8fcc8b", # lt purple
                 "#9f1f63", # lt orange
                 "#865444", "#a7a9ac",
                 "#d0e088","#7c885c","#d22628","#343822","#231f20",
                 "#f5ee31","#a99fce","#54525e","#b0accc",
                 "#5e5b73","#efcd9f", "#68705d", "#f8f391", "#faf7b6", "#c4be5d", "#764c29", "#c7ac74", "#8fa7aa", "#c8e7dd", "#766a4d", "#e3a291", "#5d777a", "#299c39", "#4055a5", "#b96bac", "#d97646", "#cebb2d", "#bf1e2e", "#d89028", "#85c440", "#36c1ce", "#574a9e")


#' Arrange multiple plots on one page with one shared legend
#' 
#' Place multiple plots on one page, with a shared legend either at the right or at the bottom
#'
#' @param ...       ggplot objects
#' @param ncol      number of columns
#' @param nrow      number of rows
#' @param main      main title
#' @param position  position of legend ["bottom"|"right"]
bic.grid.arrange.shared.legend <- function(..., ncol = length(list(...)), nrow = 1,
                                           main = "", position = c("bottom", "right")) {
  ## copied and pasted from 
  ## https://github.com/tidyverse/ggplot2/wiki/share-a-legend-between-two-ggplot2-graphs

  plots <- list(...)
  position <- match.arg(position)
  top <- textGrob(main,gp=gpar(fontsize=20))
  g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs
  legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
  lheight <- sum(legend$height)
  lwidth <- sum(legend$width)
  gl <- lapply(plots, function(x) x + theme(legend.position="none"))
  gl <- c(gl, ncol = ncol, nrow = nrow)
  combined <- switch(position,
                     "bottom" = arrangeGrob(do.call(arrangeGrob, gl),
                                            legend,
                                            ncol = 1,
                                            top = top,
                                            heights = unit.c(unit(1, "npc") - lheight, lheight)),
                     "right" = arrangeGrob(do.call(arrangeGrob, gl),
                                           legend,
                                           ncol = 2,
                                           top = top,
                                           widths = unit.c(unit(1, "npc") - lwidth, lwidth)))
  grid.newpage()
  grid.draw(combined)

  # return gtable invisibly
  invisible(combined)
}

#' Check formatting of input to bic.plot.rseqc.line.chart()
#'
#' Ensure object(s) passed to 'dat' and/or 'dat2' arguments of bic.plot.rseqc.line.chart()
#' are data frames where column one contains integers in order from least to greatest,
#' and each remaining column contains metric values for one sample
#' 
#' @param  dat   data frame passed to plot function
#' @param  dat2  second data frame passed to plot function (optional)
bic.check.line.chart.data <- function(dat,dat2=NULL){
  if(is.unsorted(dat[,1],strictly=TRUE)){
    stop("first data set must be sorted by first column")
  }
  tryCatch({apply(dat[,-1],1,as.numeric)},
    warning = function(w){
      stop("first data set contains non-numeric values")
    },
    error = function(e){
        stop("first data set contains non-numeric values")
    }
  )
  if(!is.null(dat2)){
    if(is.unsorted(dat2[,1],strictly=TRUE)){
      stop("second data set must be sorted by first column")
    }
    tryCatch({apply(dat2[,-1],1,as.numeric)},
      warning = function(w){
        stop("second data set contains non-numeric values")
      },
      error = function(e){
        stop("second data set contains non-numeric values")
      }
    )
    #if(!all(dim(dat) == dim(dat2))){
    #  stop(paste("first and second data sets have different dimensions\n\tdim(dat) = ",
    #             dim(dat),"\tdim(dat2) = ",dim(dat2),sep="")
    #      )
    #}
  }
  invisible(NULL)
} 

#' Plot RSeQC sequential metrics in a line graph 
#'
#' Create a line chart for merged RSeQC metrics where each line represents
#' a sample with the x axis showing sequential numeric value (e.g., read
#' read position or GC content value) and the y axis is a count
#' 
#' @param dat         data frame where column1 contains sequential numeric values 
#'                    and the remaining columns are metric values for each sample
#' @param title       plot title
#' @param dat2        second data frame with the same structure as dat1, to be plotted
#'                    adjacent to the plot of dat (optional; used to plot data for read 
#'                    1 and read 2 on one page)
#' @param title2      title of second plot (optional)
#' @param main        if plotting two datasets, this is the main title for the page
#' @param col.pal     color palette (Default: "Set3")
#' @param file        PDF file to which plot should be saved (optional)
#'
#' @export
bic.plot.rseqc.line.chart <- function(dat,title,dat2=NULL,title2=NULL,main=NULL,col.pal="Set3",file=NULL){
  width <- 10
  height <- 10
  p <- NULL
  p2 <- NULL

  ## validate input 
  bic.check.line.chart.data(dat,dat2=dat2)

  dat.m <- melt(dat,id.vars=colnames(dat)[1])
  p <- ggplot(dat.m,
              aes(x = dat.m[1], y = as.numeric(value), color = variable)
       ) +
       geom_line(size=0.75) +
       labs(title=title) +
       xlab(colnames(dat)[1]) +
       ylab("") +
       theme(legend.position="none",aspect.ratio=4/3) +
       scale_colour_brewer(direction=-1,name="Sample",palette=col.pal) +
       scale_x_continuous()
  ## only add legend to first plot if it is the only one
  if(!is.null(dat2)){
      width <- 14
      dat2.m <- melt(dat2, id.vars=colnames(dat2)[1])
      p2 <- ggplot(dat2.m,
              aes(x = dat2.m[1], y = as.numeric(value), color = variable)
         ) +
         geom_line(size=0.75) +
         theme(legend.position="none",aspect.ratio=4/3) +
         labs(title=title2) +
         xlab(colnames(dat)[1]) +
         ylab("") +
         scale_colour_brewer(direction=-1,name="Sample",palette=col.pal) +
         scale_x_continuous() 
  } else {
    p <- p + theme(legend.position="right")
  }
  if(!is.null(file)){ 
    pdf(file,onefile=FALSE,width=width,height=height)
  }
  if(!is.null(p2)){
    bic.grid.arrange.shared.legend(p=p,p2=p2,ncol=2,main=main,position="right")
  } else {
    print(p) 
  }
  if(!is.null(file)){
    dev.off()
  }
}

#' Check formatting of input to bic.plot.read.distribution()
#'
#' Ensure object(s) passed to 'dat' argument of bic.plot.read.distribution()
#' is a data frame containing a 'Samples' slot and at least one other slot
#' containing data to plot
#' 
#' @param  dat  data frame passed to plot function 
bic.check.read.distribution.data <- function(dat){
  if(!"Samples" %in% colnames(dat)){
    stop("data set must contain a 'Samples' column")
  }
  if(dim(dat)[2] < 2){
    stop("Data frame does not contain any data")
  }
  invisible(NULL)  
}

#' Plot RSeQC read distribution stats
#' 
#' Plot distribution of reads across different genomic features
#' like exons, introns, TSS, etc. for all samples in a project
#' 
#' @param dat        data frame containing merged output from multiple runs of
#'                   RSeQC's read_distribution.py script, where rows are
#'                   samples and columns are metrics; must contain "Samples" slot
#'                   and may contain any other slots 
#' @param pct        logical indicating that plot should show percentages
#' @param stack      logical indicating that bar chart should be stacked; Default: TRUE
#' @param horizontal logical indicating that bars should be horizontal; Default: TRUE
#' @param col.pal    name of color palette; must be from list of RColorBrewer palettes
#'                   Default: "Set3"
#' @param file       PDF file to which plot should be saved (optional)
#' @export
bic.plot.read.distribution <- function(dat,file=NULL,stack=TRUE,pct=FALSE,col.pal="Set3",horizontal=TRUE){

  ## validate input
  bic.check.read.distribution.data(dat)

  suppressMessages(dat.m <- melt(dat))

  position <- "stack"
  if(!stack){
    position <- position_dodge(width=0.7)
  }
  if(pct){
    position <- "fill"
  }

  p <- ggplot(dat.m, aes(x = Samples, y = value, fill = variable)) +
    geom_bar(stat = "identity", position = position, width = 0.7, color = "black") +
    theme(axis.text.x = element_text(angle=45,size=9,hjust=1,color="black"),
          legend.position="right",
          legend.title = element_blank()
       ) +
    labs(title="RSeQC Read Distribution") +
    scale_fill_brewer(direction=1,palette=col.pal) +
    xlab("") +
    ylab("") 
  if(horizontal){
    p <- p + coord_flip()
  }
  if(pct){
    p <- p + scale_y_continuous(labels=percent)
  }
  if(horizontal){
    p <- p + coord_flip()
  }


  if(!is.null(file)){
    pdf(file)
  }
  print(p)
  if(!is.null(file)){
    dev.off()
  }

}


#' Plot dispersion estimates
#'
#' Plot dispersion estimates for all conditions in experiment. Draw one
#' plot per PDF file.
#' 
#' @param cds          DESeq countsDataSet
#' @param out.dir      Directory in which plots should be saved; Default: $PWD
#' @param file.prefix  String to prepend to PDF file names (optional)
#'
#' @export
bic.plot.dispersion.estimates <- function(cds,out.dir=getwd(),file.prefix=""){
  if(file.prefix != ""){
    file.prefix <- paste(file.prefix,"_",sep="")
  }
  for(cond in ls(cds@fitInfo)){
    pdf(file.path(out.dir,paste(file.prefix,"dispersion_estimates_",cond,".pdf",sep="")))
    plotDispEsts(cds,name=cond)
    dev.off()
  }  
}

#' Draw a heatmap of counts table
#' 
#' Draw heatmap of either raw counts or variance stabilization
#' transformed (?) data for all samples. Write plot to PDF file.
#' 
#' @param cds       DESeq countsDataSet object
#' @param file      PDF file to which heatmap should be saved (optional)
#' @param num.gns   Include this number of most highly expressed genes in 
#'                  the heatmap; Default: 100
#' @param transform logical indicating whether to transform counts; 
#'                  Default: FALSE
#' @export
bic.deseq.heatmap <- function(cds,file=NULL,num.gns=100,transform=FALSE){
  hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
  dat <- NULL
  if(transform){
    cdsBlind <- estimateDispersions(cds, method="blind")
    vst <- varianceStabilizingTransformation(cdsBlind)
    dat <- exprs(vst)
    dat <- dat[-grep("alignment_not_unique|ambiguous|no_feature|not_aligned|too_low_aQual",
                           rownames(dat)),]
    select <- order(rowMeans(dat),decreasing=TRUE)[1:num.gns]
    dat <- dat[select,]
  } else {
    counts.cds <- counts(cds)
    counts.cds <- counts.cds[-grep("alignment_not_unique|ambiguous|no_feature|not_aligned|too_low_aQual",
                           rownames(counts.cds)),]
    select <- order(rowMeans(counts.cds), decreasing=TRUE)[1:num.gns]
    dat <- counts.cds[select,]
  }
  if(!is.null(file)){
    pdf(file)
  }
  par(cex.main=0.8)
  heatmap.2(dat, col=hmcol, trace="none", margin=c(10,10),
            main=paste("Expression of the \n",num.gns," most highly expressed genes",sep=""),
            cexRow=0.6,cexCol=1.0,keysize=1.5,key.title=NA)
  if(!is.null(file)){
    dev.off()
  }
}

#' Draw a heatmap showing sample-to-sample distances using variance 
#' stabilized data
#' 
#' Find potential sample mislabeling errors by viewing distances
#' between every pair of samples. Save plot to PDF file.
#' 
#' @param cds    DESeq countDataSet
#' @param conds  vector of sample conditions that was used to generate \code{cds}
#' @param file   PDF file to which heatmap whould be saved (optional)
#' 
#' @export
bic.sample.to.sample.distances <- function(cds,conds,file=NULL){
  #if(is.null(file)){
  #  file = "heatmap_sample_to_sample_distances.pdf"
  #}
  hmcol = colorRampPalette(brewer.pal(9, "GnBu"))(100)
  cdsBlind <- estimateDispersions(cds,method="blind")
  vst <- varianceStabilizingTransformation(cdsBlind)
  distances <- dist(t(exprs(vst)))
  dist.mat <- as.matrix(distances)
  rownames(dist.mat) = colnames(dist.mat) = with(pData(cdsBlind), paste(rownames(dist.mat), " : " ,conds,sep=""))
  if(!is.null(file)){
    pdf(file)
  }
  par(cex.main=0.8)
  heatmap.2(dist.mat, main="Sample to Sample Distances", trace="none", col=rev(hmcol), margin=c(13,13), key.title=NA)
  if(!is.null(file)){
    dev.off()
  }
}

#' Plot PCA
#' 
#' Run DESeq \code{plotPCA()} on variance stabilised tranformed data
#' 
#' @param cds  DESeq countDataSet
#' @param file PDF file to which plot should be saved
#' @export
bic.deseq.plot.pca <- function(cds,file=NULL){
  #if(is.null(file)){
  #  file="pca.pdf"
  #}
  cdsBlind <- estimateDispersions(cds,method="blind")
  vst <- varianceStabilizingTransformation(cdsBlind)
  if(!is.null(file)){
    pdf(file)
  }
  plt <- DESeq::plotPCA(vst)
  print(plt)
  if(!is.null(file)){
    dev.off()
  }
}

#' Plot MA
#'
#' Plot log2 fold changes versus mean of normalized counts for a DESeq
#' comparison
#' 
#' @param res   object returned from DESeq function nbinomTest
#' @param file  PDF file to which plot should be saved (optional)
#' @export
bic.plot.ma <- function(res, file=NULL){
  #if(is.null(file)){
  #  file = "ma_plot.pdf"
  #}
  if(!is.null(file)){
    pdf(file)
  }
  DESeq::plotMA(res)
  if(!is.null(file)){
    dev.off()
  }
}

#' Histogram of p-values
#' 
#' @param dat  data frame with at least one column with name 'pvals'
#' @param file PDF file to which plot should be saved (optional)
#' @export
bic.pval.histogram <- function(dat,file=NULL){
  #if(is.null(file)){
  #  file="pval_histogram.pdf"
  #}
  if(!is.null(file)){
    pdf(file)
  }
  hist(dat$pval, breaks=100, col="blue", border="slateblue", main="",xlab="p-value",ylab="Frequency")
  if(!is.null(file)){
    dev.off()
  }
}

#' Validate data from PICARD CollectRnaSeqMetrics or AlignmentSummaryMetrics
#'
#' Check that data frame contains the correct column names and that
#' all data is numeric
#'
#' @param dat    data frame passed to plot function
#' @param name   name of plot (Options: ["alignment.distribution" |
#'                                       "5prime3prime.bias" |
#'                                       "alignment.summary"]
bic.check.picard.data <- function(dat,name){
  required.slots <- switch(name,
                           "alignment.distribution" = c("SAMPLE","RIBOSOMAL_BASES","CODING_BASES",
                                                      "UTR_BASES","INTRONIC_BASES","INTERGENIC_BASES"),
                           "5prime3prime.bias" = c("SAMPLE","MEDIAN_CV_COVERAGE","MEDIAN_5PRIME_BIAS",
                                                 "MEDIAN_3PRIME_BIAS","MEDIAN_5PRIME_TO_3PRIME_BIAS"),
                           "alignment.summary" = c("SAMPLE","CATEGORY","PF_READS","PF_READS_ALIGNED")
                          )
  missing.slots = c()
  for(rs in required.slots){
    if (!rs %in% colnames(dat)){
      missing.slots = c(missing.slots,rs)
    }
  }
  if(length(missing.slots) > 0){
    stop(paste("data is missing the following required column(s): ",
               paste(missing.slots,collapse=", "),
               sep="")
        )
  }
  tryCatch({apply(dat[,required.slots[-grep("SAMPLE|CATEGORY",required.slots)]],1,as.numeric)},
    warning = function(w){
      stop("data set contains non-numeric values")
    },
    error = function(e){
      stop("data set contains non-numeric values")
    }
  )
  if(name=="alignment.summary"){
    if(!"FIRST_OF_PAIR" %in% dat$CATEGORY | !"SECOND_OF_PAIR" %in% dat$CATEGORY){
      stop("'CATEGORY' column must contain at least one instance of 'FIRST_OF_PAIR' and one of 'SECOND_OF_PAIR'")
    }
  }
  invisible(NULL)
}


#' Plot distibution of alignment across different genomic locations
#'
#' Bar chart showing for each sample the distribution of read
#' alignments across ribosomal, coding, UTR, intronic and 
#' intergenic bases
#' 
#' @param dat        data frame consisting of data from CollectRNASeqMetrics
#'                   output
#' @param col.pal    name of color palette; must be from list of RColorBrewer palettes
#'                   Default: "Set3"
#' @param file       PDF file to which plot should be saved (optional)
#' @param horizontal logical indicating that bars should be horizontal; Default: TRUE
#' @param pct        plot percentages
#'
#' @export
bic.plot.alignment.distribution <- function(dat,pct=FALSE,horizontal=TRUE,col.pal="Set3",file=NULL){
  ## validate data
  bic.check.picard.data(dat,"alignment.distribution")
  y <- data.frame(Sample = dat$SAMPLE, 
                 Ribosomal = dat$RIBOSOMAL_BASES, 
                 Coding = dat$CODING_BASES,
                 UTR = dat$UTR_BASES,
                 Intronic = dat$INTRONIC_BASES,
                 Intergenic = dat$INTERGENIC_BASES)

  y.m <- melt(y,id.var="Sample")

  position <- "stack"
  y.lbl <- "Total Bases (millions)"
  if(pct){ 
    position <- "fill" 
    y.lbl <- "Percent Bases"
  } 

  ### counts
  p <- ggplot(y.m, aes(x = Sample, y = value/1000000, fill = variable)) + 
    geom_bar(stat="identity", position=position, width=0.7, color="black")+
    theme(axis.text.x = element_text(angle=45,size=9,hjust=1,color="black"),
        legend.position="right",
        legend.title = element_blank()
     ) + 
    scale_fill_brewer(direction=1,palette=col.pal) +      
    labs(title="Alignment Distribution") + 
    xlab("") +  
    ylab(y.lbl)
    if(horizontal){
      p <- p + coord_flip()
    }
    if(pct){
      p <- p + scale_y_continuous(labels=percent)
    } 
  if(horizontal){
    p <- p + coord_flip()
  }
  if(!is.null(file)){
    pdf(file)
  }
  print(p)
  if(!is.null(file)){
    dev.off()
  }
}

#' Plot coverage uniformity (5' and 3' bias)
#' 
#' Bar chart showing median CV coverage, median 5' and 3' bias,
#' and median 5' to 3' bias for each sample.
#'
#' @param dat      data frame consisting of data from CollectRNASeqMetrics
#'                 output
#' @param col.pal  name of color palette; must be from list of RColorBrewer palettes
#'                 Default: "Set3"
#' @param horizontal logical indicating that bars should be horizontal; Default: TRUE
#' @param file     PDF file to which plot should be saved (optional)
#'
#' @export
bic.plot.5prime3prime.bias <- function(dat,col.pal="Set3",horizontal=TRUE,file=NULL){
  ## validate data
  bic.check.picard.data(dat,"5prime3prime.bias")
  y <- data.frame(Sample = dat$SAMPLE,
                  cvCoverage = dat$MEDIAN_CV_COVERAGE,
                  fivePrimeBias = dat$MEDIAN_5PRIME_BIAS,
                  threePrimeBias = dat$MEDIAN_3PRIME_BIAS,
                  fivetoThreePrimeBias = dat$MEDIAN_5PRIME_TO_3PRIME_BIAS)

  suppressMessages(y.m <- melt(y))

  position <- position_dodge(width=0.7)

  p <- ggplot(y.m, aes(x = Sample, y = value, fill = variable)) + 
    geom_bar(stat = "identity", position = position, width = 0.7, color = "black") + 
    theme(axis.text.x = element_text(angle=45,size=9,hjust=1,color="black"),
          legend.position="right",
          legend.title = element_blank()
       ) +    
    scale_fill_brewer(direction=1,palette=col.pal, labels = c("Median CV of Coverage","Median 5\' Bias","Median 3\' Bias","Median 5\' to 3\' Bias")) +
    labs(title="Coverage Uniformity") + 
    xlab("") + 
    ylab("")
  
  if(horizontal){
    p <- p + coord_flip()
  }
  if(!is.null(file)){
    pdf(file)
  }
  print(p)
  if(!is.null(file)){
    dev.off()
  }

}

#' Plot normalized coverage
#'
#' Line chart showing normalized coverage for each sample
#'
#' @param dat  data frame containing combined histograms from collectrnaseqmetrics
#' @param col.pal  name of color palette; must be from list of RColorBrewer palettes
#'                 Default: "Set3"
#' @param file PDF file to which plot should be saved (optional)
#' @export
bic.plot.coverage <- function(dat,col.pal="Set3",file=NULL){

  suppressMessages(x.m <- melt(dat, id.vars="position"))
  x.m$position <- as.integer(as.character(x.m$position))
  p <-  ggplot(x.m, aes(x = position, y = value, color = variable)) +  
        geom_line(size=0.7) + 
        theme(legend.position="right",
            legend.text = element_text(size=9),
            legend.title = element_blank()
            ) +
        labs(title="Normalized Coverage") + 
         xlab("Read position") + 
         ylab("") +
         scale_colour_brewer(direction=-1,name="Sample",palette=col.pal) 
         #scale_color_hue(colnames(dat)[-1]) + 
         #guides(colour = guide_legend(override.aes = list(size=5)))
  if(!is.null(file)){
    pdf(file)
  }
  print(p)
  if(!is.null(file)){
    dev.off()
  }
}


#' Plot numbers of mapped and unmapped reads for each sample
#'
#' Stacked bar chart showing either absolute values or percentages of mapped 
#' and unmapped reads for R1 and R2 of each sample. Takes in data frame
#' containing PICARD AlignmentSummaryMetrics, or at minimum a data frame with
#' columns: CATEGORY, PF_READS, PF_READS_ALIGNED, SAMPLE, where CATEGORY column
#' contains at least categories FIRST_OF_PAIR and SECOND_OF_PAIR.
#'
#' @param dat      data frame of PICARD AlignmentSummaryMetrics, or at least 
#'                 containing columns CATEGORY, PF_READS, PF_READS_ALIGNED and 
#'                 SAMPLE, where CATEGORY contains at least values FIRST_OF_PAIR
#'                 and SECOND_OF_PAIR
#' @param pct      logical indicating whether to show percentages rather than absolute
#'                 read counts
#' @param col.pal  name of color palette; must be from list of RColorBrewer palettes
#'                 Default: "Set3"
#' @param position position of grouped bars; Default: "stack", option: "dodge"
#' @param file     PDF file to which plot should be saved (optional)
#'
#' @export
bic.plot.alignment.summary <- function(dat,position="stack",pct=FALSE,col.pal="Set3",file=NULL){
  ## validate input data
  bic.check.picard.data(dat,"alignment.summary")

  dat <- dat[-which(dat$CATEGORY=="PAIR"),]
  dat$UNMAPPED <- dat$PF_READS-dat$PF_READS_ALIGNED
  dat <- dat[,c("CATEGORY","PF_READS","UNMAPPED","SAMPLE")]
  dat$CATEGORY <- revalue(dat$CATEGORY,c("FIRST_OF_PAIR"="R1","SECOND_OF_PAIR"="R2"))  
  colnames(dat) <- c("Category","MappedReads","UnmappedReads","Sample")
  suppressMessages(dat.m <- melt(dat))
  
  cat.lbl.y <- -2
  y.lbl <- "Reads (xMillion)"
  if(pct){ 
    position <- "fill" 
    cat.lbl.y <- -0.04
    y.lbl <- ""
  }

  p <- ggplot(dat.m[which(dat.m$variable!="Total"),], aes(x=Category, y=value/1000000, fill=variable)) + 
   geom_bar(stat="identity", position=position, colour="black") +
   facet_wrap( ~ Sample, nrow=1, strip.position="bottom") + 
   geom_text(data=dat.m[which(dat.m$variable=="MappedReads"),], mapping=aes(x=Category, y=cat.lbl.y, label=Category), vjust=0) +
   theme(axis.text.x = element_blank(),
      axis.ticks=element_blank(),
      legend.position="right",
      legend.title = element_blank(),
      strip.text.x = element_text(size = 9,color="black",angle=90, hjust=0.5, vjust=1)
    ) +
  scale_fill_brewer(direction=1,palette=col.pal) + 
  labs(title="Alignment Summary") + 
  xlab("") +
  ylab(y.lbl)
  if(pct){
    p = p + scale_y_continuous(labels=percent) +
    ylab(y.lbl)
  } 
  if(!is.null(file)){
    pdf(file)
  }
  print(p)
  if(!is.null(file)){
    dev.off()
  }
}

#' Write PDF file containing heierarchical clustering tree
#'
#' Quickly plot heirarchical clustering of data with several default
#' parameters like width, height, font size, etc.
#' 
#' @param dat            matrix containing data to plot
#' @param file.name      file name. Default: "tmp.pdf"
#' @param title          Title of plot
#' @param sample.labels  Vector of sample labels. Default: column names in matrix
#' @param conds          vector of sample conditions, in same order as column names
#'                       in matrix or sample.labels; if given, nodes will be colored
#'                       according to conditions; Default: NULL
#' @param width          plot width (see plot docs)
#' @param height         plot height (see plot docs)
#' @param lwd            line width
#' @param cex.main       size of title
#' @param cex.lab        size of labels
#' @param cex            font size
#' @param xlab           label of x axis
#' @param ylab           label of y axis
#' @export
bic.pdf.hclust<-function(dat,conds=NULL,file.name="tmp.pdf",title="",width=20,height=14,lwd=3,
    cex.main=2.5,cex.lab=3.0,cex=3.0,xlab="",ylab="",sample.labels=NULL)
{
  if(!is.null(sample.labels)){
    colnames(dat) <- sample.labels
  }
  h <- hclust(dist(t(dat)))
  dend <- as.dendrogram(h)

  lab.cex = 2.0
  max.name = 1
  ## get longest column name in order to ensure margins are correct
  for(n in colnames(dat)){
    if (nchar(n) > max.name){
      max.name <- nchar(n)
    }
  }
  ## adjust label size if number of samples is 20
  if(ncol(dat) > 20){
    lab.cex <- 30/ncol(dat)
  }

  dend <- dendextend::set(dend,"labels_cex",lab.cex)

  ## if conditions are given, color points on leaves accordingly
  if(!is.null(conds)){
    names(conds) <- colnames(dat)
    conds <- conds[labels(dend)]
    pch <- c(19)
    col=as.numeric(as.factor(conds))
    dend <- dendextend::set(dend,"leaves_pch",pch)
    dend <- dendextend::set(dend,"leaves_col",col)
    dend <- dendextend::set(dend,"leaves_cex",lab.cex)
  }

  if(!is.null(file.name)){
    pdf(file.name,width=width,height=height)
  }
  rmar <- max.name*(lab.cex/2)
  par(oma=c(3,2,3,1))
  par(mar=c(3,2,2,rmar))
  plot(dend, horiz=T,lwd=lwd, main=title, cex.main=cex.main,xlab=xlab,ylab=ylab, cex.lab=lab.cex, cex=cex)
  if(!is.null(file.name)){
    dev.off()
  }
}


#' Plot heirachical clustering of all samples
#'
#' Plot clustering of samples using the heierarchical clustering method
#' 
#' @param norm.counts  data matrix containing normalized counts, where
#'                     a column represents a sample and a row represents
#'                     a gene. may or may not contain "GeneID" and "GeneSymbol"
#'                     columns.
#' @param conds        vector of sample conditions, in same order as column names 
#'                     in matrix or sample.labels; if given, nodes will be colored
#'                     according to conditions; Default: NULL
#' @param title        main title of plot
#' @param log2         logical indicating whether data is on log2 scale (Default: FALSE)
#' @param file.name    plot will be saved in this file; Default: 
#'                     $PWD/counts_scaled_hclust.pdf
#' @export
bic.hclust.samples <- function(norm.counts,conds=NULL,log2=FALSE,file.name=NULL,title=""){

  if("GeneID" %in% colnames(norm.counts) | 
     "GeneSymbol" %in% colnames(norm.counts)){
    norm.counts <- norm.counts[,-grep("GeneID|GeneSymbol",colnames(norm.counts))]
  }
  norm.counts <- bic.matrix2numeric(norm.counts)

  if(length(colnames(norm.counts)) < 3){
    cat("Less than three samples; can not run cluster analysis\n")
    return()
  }

  if(is.null(file.name)){
    file.name <- "counts_scaled_hclust.pdf"
  }
  
  if(log2==FALSE){
    pseudo <- min(norm.counts[norm.counts > 0])
    counts2hclust <- log2(norm.counts + pseudo)
  } else {
    counts2hclust <- norm.counts
  }
  tryCatch({
   bic.pdf.hclust(counts2hclust,conds=conds,file.name=file.name,title=title)
     }, error = function(err) {
        stop(err)
        traceback()
     }, warning= function(war){
        warn(war)
     }
  )

}

#' Plot MDS clustering of all samples
#'
#' Plot clustering of samples using the MDS method
#' 
#' @param norm.counts  data matrix containing normalized counts, where
#'                     a column represents a sample and a row represents
#'                     a gene
#' @param conds        vector of conditions to be appended to sample IDs
#'                     for labeling
#' @param file         PDF file to which plot should be saved (optional)
#' @param log2         logical indicating whether norm.counts is on log2 scale 
#'                     Default: FALSE
#' @param labels       logical indicating whether to include sample labels on plot; 
#'                     Default: TRUE
#' @export
bic.mds.clust.samples <- function(norm.counts,log2=FALSE,file=NULL,conds=NULL,labels=TRUE){

  if("GeneID" %in% colnames(norm.counts) |
     "GeneSymbol" %in% colnames(norm.counts)){
    norm.counts <- norm.counts[,-grep("GeneID|GeneSymbol",colnames(norm.counts))]
  }

  norm.counts <- bic.matrix2numeric(norm.counts)

  if(length(colnames(norm.counts)) < 3){
    cat("Less than three samples; can not run cluster analysis\n")
    return
  }

  if(is.null(conds)){
    conds=rep('s',length(colnames(norm.counts)))
  }

  if(log2==FALSE){
    pseudo <- min(norm.counts[norm.counts > 0])
    counts2hclust <- log2(norm.counts + pseudo)
  } else {
    counts2hclust <- norm.counts
  }

  md <- cmdscale(dist(t(counts2hclust)),2)

  if(!is.null(file)){
    pdf(file,width=18,height=12)
  }

  if(labels){
    plot(md, col=as.factor(conds),lwd=2.5,cex=1.5, main="Multidimensional Scaling (MDS)")
    legend("topleft", levels(as.factor(conds)),col=as.factor(levels(as.factor(conds))),pch=1,cex=1.2)
    text(md,colnames(counts2hclust),cex=0.9)
  } else {
    plot(md, col=as.factor(conds),pch=21,main="Multidimensional Scaling (MDS)",lwd=2.5,cex=1.5)
    legend("topleft", levels(as.factor(conds)),col=seq(along=levels(as.factor(conds))),pch=21,cex=1.2)
  }
  if(!is.null(file)){
    dev.off()
  }
}

#' Create PDF of generic red/green/black heatmap showing relative
#' expression values
#'
#' Given the results of DESeq comparison of two conditions and the
#' full normalized count matrix, generate a heatmap showing the relative 
#' expression values of "top 100" differentially expressed genes. If
#' there are fewer than 100 genes in the results file, all of them will
#' be included in the heatmap. Color scheme is red/black/green.
#'
#' @param norm.counts.matrix  matrix of normalized counts, where a row is a gene
#'                            and a column is a sample. May contain "GeneID" and/or
#'                            "GeneSymbol" column. If GeneSymbol column is present, 
#'                            it will be used for heatmap labeling. If not, GeneID 
#'                            column will be used.
#' @param condA               the first condition named in the bic.deseq.file name
#' @param condB               the second condition named in the bic.deseq.file name
#' @param genes               a vector of genes to be included in the heatmap;
#'                            if normalized counts matrix includes "GeneSymbol" column,
#'                            genes must be from that column. Otherwise, they must
#'                            be from the "GeneID" column.                      
#' @param file                name of PDF file to which heatmap should be written. (optional) 
#' 
#' @export
bic.standard.heatmap <- function(norm.counts.matrix,condA,condB,genes=NULL,file=NULL){

  dat <- norm.counts.matrix
  if ("GeneSymbol" %in% colnames(dat)){
    idHeader <- "GeneSymbol"
  } else {
    idHeader <- "GeneID"
  }

  tryCatch({ 
    ## extract data for the DE genes
    htmp.dat <- dat[genes,grep(paste(idHeader,condA,condB,sep="|"),colnames(dat))]

    ## remove duplicate genes
    if(length(htmp.dat[which(duplicated(htmp.dat[,idHeader])),idHeader])>0){
      htmp.dat <- htmp.dat[-which(duplicated(as.vector(htmp.dat[,idHeader]))),]
    }

    ## remove any rows that have NAs
    htmp.dat <- htmp.dat[complete.cases(htmp.dat),]
    ## as long as gene symbols are unique, assign them
    ## as rownames; if not, we'll have to average values
    ## for genes that occur multiple times
    rownames(htmp.dat) <- htmp.dat[,idHeader]
    htmp.dat <- htmp.dat[,-1]
    htmp.dat <- bic.matrix2numeric(htmp.dat)
  }, err = function(){ 
     warning(paste0("Can not generate heatmap for ",condA," vs ",condB))
  })

  ## replace any zeros with ones before taking log2
  htmp.dat[htmp.dat==0] <- 1
  htmp.dat <- as.matrix(log2(htmp.dat))

  ## take only the top 100 genes in order to make heatmap legible
  if(dim(htmp.dat)[1]>100){
    htmp.dat <- htmp.dat[1:100,]
  }

  #if(is.null(out.file)){
  #  out.file <- paste("ResDESeq_",condA,"_vs_",condB,"_heatmap.pdf",sep="")
  #}
 
  if(dim(htmp.dat)[1]>1 && dim(htmp.dat)[2]>1){
    ## make heatmap pdf
    if(!is.null(file)){
      pdf(file,width=16,height=16)
    }
    par(cex.main=1.4)
    heatmap.2(htmp.dat - apply(htmp.dat, 1, mean), 
              trace='none', 
              col=colorpanel(16,"green","black","red"),
              cexRow=0.9,
              #cexCol=2.0, 
              dendrogram="both",
              main=paste("Top Differentially Expressed Genes ",condA," vs ",condB,sep=""), 
              symbreaks=TRUE, 
              keysize=0.5, 
              margin=c(20,10))
    if(!is.null(file)){
      dev.off()
    }
  }
 
}
bic-mskcc/bicrnaseq documentation built on May 24, 2019, 3:04 a.m.