R/comparePlot.R

Defines functions comparePrep comparePlot

Documented in comparePlot comparePrep

### Function comparePlot ###
#' Function ComparePlot
#'
#' In the simpliest form it draws a nicely formatted scatterplot of the
#' first two columns in a dataframe.  Several formatting options are
#' available to enhance the plots.  If you supply pvalues or FDR values and a threshold,
#' the plot is color coded to show X unique, Y Unique and
#' Common differentialy expressed (DE) genes in different colors.
#'
#' Other options add
#' an identity line (slope=1, intercept=0) and/or a crosshair at (0,0).
#'
#' The comparePlot has also been implemented
#' with a scalable theme that makes it easy to scale font sizes larger
#' or smaller for PPT or Knitr output.  Just add either theme_grey(n) or theme_bw(n),
#' where n equals the base font size (12 works well for knitr, 18 or 24 works well
#' for PPT).  e.g. MyPlot + theme_grey(18).  However, theme_gray and theme_bw have the
#' side effect of resetting the legend position.  Thus, do not use those standard themes
#' if you want to preserve the custom legend locations.  You must rely on the baseFontSize
#' argument in this function to set font sizes to preserve a custom legend location.
#'
#' \strong{Data Structure for the input dataframe:}
#'
#' The x and y values should be in the first two columns. By default, their
#' column names will be used for the axis labels.  You can overide the xylabels
#' the xlab and ylab arguments.
#'
#' Optionally, significance measures in the form of pvalues or FDR values can be supplied
#' for X and Y respectively.  If provided, these columns \strong{must} be named "xp" and "yp" respectively.
#' Together with a threshold (which defaults to 0.01), these values will
#' be used to color code the plot to show X Unique, Y Unique and Common DE
#' genes.  You can use either pvalues or FDR values for the signficance columns.  Use the
#' pthreshold argument to set a proper threshold for FDR values.
#'
#' Sensible defaults are chosen for symbols (Size, Shape, Color and Fill).  Optionally,
#' there are arguments to allow you to fiddle with these settings.  A length of 4 is
#' required for these arguments which applies the attributes in this order:
#' Common, X Unique, Y Unique and Not Significant.
#'
#' Note if you're not using pvalues or FDR values to color the plot, the X Unique color
#' values are used.
#'
#' @author John Thompson, \email{jrt@@thompsonclan.org}
#' @keywords compare plot ggplot2 logratio scatterplot
#'
#' @param df A dataframe with the first two columns representing the x and y variables.
#'          Optionally add xp and yp columns to hold pvalues or FDR values.
#' @param xlab X axis label (default to first column name)
#' @param ylab Y axis label (default to second column name)
#' @param title Plot title (optional)
#' @param pthreshold Used to color points (default = 0.01)
#' @param symbolSize Size of symbols; default = c(4, 4, 4, 2)
#' @param symbolShape Shape of the symbols; Default = c(21, 21, 21, 20)  (20 = filled circle,
#'        21 = fillable open circle) See \url{http://www.cookbook-r.com/Graphs/Shapes_and_line_types}
#' @param symbolColor c(Common, xUnique, yUnique, NoChange);
#'        default = c("black", "grey0", "grey1", "grey25")
#' @param symbolFill Set the fill color for the symbols. Note only symbols 21-25 are fillable. This will
#'        have no effect on other symbols. Default = c("darkgoldenrod1", "deepskyblue4", "red3", "grey25")
#' @param alpha Controls the transparency of the plotted points (0-1; default = 0.5)
#' @param rugColor Specify color for rug density plot along xy axes. Set to NULL
#'    to disable rug layer. (Default=NULL)
#' @param rugAlpha Sets the transparency for the rug layer.  An alpha <1.0 takes
#'   a long time to draw (~10 min on 32bit Win7).  So I set the default at 1.0,
#'   but for a final plot try 0.1 or 0.2 for alpha for more informative rug.
#' @param crosshair Color for the crosshair; (Default = "grey50", NULL disables)
#'        See \url{http://research.stowers-institute.org/efg/R/Color/Chart}
#' @param referenceLine Color for a slope=1, intercept=0 reference line
#'        (Default = "darkgoldenrod1"; NULL disables)
#' @param refLineThickness Set thickness for crosshair and referenceLine (Default=1)
#' @param dens2D Overlay a density2D layer (Default = TRUE) Note: only applies
#'        to simple compare plot without pvalue coloring.
#' @param legendPosition One of "top", "bottom", "left", "right", "ne", "se", "nw", "sw", NULL.
#'        top/bottom/left/right place the legend outside the figure.  ne/se/nw/sw place the figure
#'        inside the figure. NULL disables the legend. Default = "right"
#' @param baseFontSize The smallest size font in the figure in points. Default = 12
#' @param themeStyle "bw" or "grey" which corresponds to theme_bw or theme_grey respectively.
#'        Default = bw"
#' @param footnote optional string placed right justified at bottom of plot.
#' @param footnoteSize applies to footnote. (default = 3)
#' @param footnoteColor applies to footnote. (default = "black")
#' @param footnoteJust Value 0-1. 0 is left justified, 1 is right justified, 0.5 is centered. (default=1)
#'
#' @return ggplot object
#'
#' @examples
#'
#'   #Retrieve the 1st two contrasts from a DGEobj as a list of dataframes (length=2; named items)
#'   ttList <- getType(dgeObj, "topTable")[1:2]
#'
#'   #capture the default logFC and P.Value
#'   compareDat <- comparePrep(ttList)
#'
#'   #switch to an FDR value for the significance measure
#'   compareDat <- comparePrep(ttList, significanceCol="adj.P.Val")
#'
#'   #draw the plot
#'   cPlot <- comparePlot(compareDat, title = "Plot Title")
#'   print(cPlot)
#'
#'    Deluxe Plot with bells and whistles.
#'
#'    MyPlot = comparePlot (df, pthreshold = 0.5,
#'      xlab = "x Axis Label", ylab = "y Axis Label",
#'      title = "Plot Title",
#'      crosshair = "red",
#'      referenceLine = "blue",
#'      legendPosition="nw")
#'
#' @import ggplot2 magrittr
#' @importFrom dplyr left_join
#'
#' @export
comparePlot <- function(df, pthreshold=0.01,
                        xlab=NULL, ylab=NULL, title=NULL,
                        symbolSize = c(4, 4, 4, 2),
                        symbolShape = c(21, 21, 21, 20),
                        symbolColor = c("black", "grey0", "grey1", "grey25"),
                        symbolFill = c("darkgoldenrod1", "deepskyblue4", "red3", "grey25"),
                        alpha = 0.5,
                        rugColor = NULL,
                        rugAlpha = 1.0,
                        crosshair="grey50",
                        referenceLine="darkgoldenrod1",
                        refLineThickness = 1,
                        dens2D = TRUE,
                        legendPosition = "right",
                        baseFontSize = 12,
                        themeStyle = "bw",
                        footnote,
                        footnoteSize=3,
                        footnoteColor="black",
                        footnoteJust=1
                        ) {

  with.seed <- function(seed, expr) {
    saved.seed <- .Random.seed
    on.exit(.Random.seed <<- saved.seed)
    set.seed(seed)
    expr
  }

  #argument checks
  #
  # 1st two columns should be numeric
  if (ncol(df) <2) {
    stop ("Need at least 2 numeric columns to proceed.")
  } else if (!is.numeric(df[,1]) || !is.numeric(df[,2])) {
    stop("Column 1 and 2 must hold numeric X and Y values.")
  }
  #symbol parameters must all be length=4
  if (!length(symbolSize)==4 || !length(symbolShape)==4 ||
      !length(symbolColor)==4 || !length(symbolFill)==4){
    stop("symbol arguments must be length=4 (symbolSize, SymbolShape, SymbolColor, symbolFill)")
  }

  names(symbolShape) = c("Common", "xUnique", "yUnique", "NoChange" )
  names(symbolSize) = c("Common", "xUnique", "yUnique", "NoChange" )
  names(symbolColor) = c("Common", "xUnique", "yUnique", "NoChange" )
  names(symbolFill) = c("Common", "xUnique", "yUnique", "NoChange" )

  ssc = data.frame(group = c("Common", "X Unique", "Y Unique", "Not Significant"),
                   symbolShape = symbolShape,
                   symbolSize = symbolSize,
                   symbolColor = symbolColor,
                   symbolFill = symbolFill,
                   stringsAsFactors = FALSE)

  #used to set uniform square scale
  scalemax = df[,1:2] %>% as.matrix %>% abs %>% max %>% multiply_by(1.05)

  #columns to plot
  #capture the labels from the colname
  xlabel = colnames(df)[1]
  ylabel = colnames(df)[2]
  #now make the columnames suitable for use with aes_string
  x = make.names(colnames(df)[1])
  y = make.names(colnames(df)[2])
  colnames(df)[1:2] = make.names(colnames(df)[1:2])

  ###SIMPLE PLOT: plot all data (no significance measures supplied)
  if (is.null(df[["xp"]]) | is.null(df[["yp"]])) {

    CompPlot = ggplot(df, aes_string(x=x, y=y)) +
      geom_point(shape=1,
                 size=symbolSize[["xUnique"]],
                 color=symbolFill[["xUnique"]],
                 fill=symbolFill[["xUnique"]],
                 alpha=alpha) +
      coord_equal(xlim=c(-scalemax, scalemax), ylim=c(-scalemax, scalemax))
    if (dens2D == TRUE){
      CompPlot = CompPlot + geom_density2d(color=symbolFill[["yUnique"]])
    }
  } else
  ### DELUXE PLOT: plot groups in different colors/shapes
  if (!is.null(df[["xp"]]) & !is.null(df[["yp"]])){

    #Let's plot the subsets
    xindx = df[["xp"]] <= pthreshold
    yindx = df[["yp"]] <= pthreshold

    #boolean indexes to parse groups
    bothindx = xindx & yindx
    neitherindx = !xindx & !yindx
    xindx = xindx & !bothindx #unique to X
    yindx = yindx & !bothindx #unique to y

    #create group factor column in df
    df$group=NA
    df$group[bothindx] = "Common"
    df$group[xindx] = "X Unique"
    df$group[yindx] = "Y Unique"
    df$group[neitherindx] = "Not Significant"
    df %<>% dplyr::left_join(ssc)
    df$group %<>% factor(levels=c("Common", "X Unique", "Y Unique", "Not Significant"))

    #set an order field to control order of plotting
    #plot order is 1) Not Significant plotted first, 2) randomly plot X and Y Unique
    #then plot Common last
    df$order = with.seed(1954, sample.int(nrow(df))) + #add random order (seeded for reproducibility)
      nrow(df) * (df$group == "Common") +  #assign common a high value to sort last
      -nrow(df) * (df$group == "Not Significant") #assign NotSig group neg values to sort first

    CompPlot <- ggplot (df, aes_string(x=x, y=y)) +
                      aes(shape=group, size=group,
                        color=group, fill=group,
                        order=order) +
        #scale lines tell it to use the actual values, not treat them as factors
        scale_shape_manual(name="Group", guide="legend", labels=ssc$group,
                             values=ssc$symbolShape) +
        scale_size_manual(name="Group", guide="legend", labels=ssc$group,
                            values=ssc$symbolSize) +
        scale_color_manual(name="Group", guide="legend", labels=ssc$group,
                             values=ssc$symbolColor) +
        scale_fill_manual(name="Group", guide="legend", labels=ssc$group,
                            values=ssc$symbolFill) +
        geom_point(alpha=alpha) +
        #make it square with same axis scales
        coord_equal(xlim=c(-scalemax, scalemax), ylim=c(-scalemax, scalemax)) +
        #box around the legend
        theme(legend.background = element_rect(fill="gray95", size=.5, linetype="dotted"))
  } ###end DELUXE PLOT

  ### Optional Decorations

  if (!is.null(rugColor)){
    CompPlot = CompPlot + geom_rug(data=df, inherit.aes=FALSE,
                                         color = rugColor,
                                         alpha=rugAlpha,
                                         show.legend=FALSE,
                                         aes_string(x=x, y=y))
  }

  if (!is.null(crosshair)){
    CompPlot = CompPlot +
    geom_hline(yintercept=0, color=crosshair,
               size=refLineThickness, alpha=0.5) +
    geom_vline(xintercept=0, color=crosshair,
               size=refLineThickness, alpha=0.5)
  }

  if (!is.null(referenceLine)){
    CompPlot = CompPlot +
      geom_abline(slope=1, intercept=0, color=referenceLine,
                  size=refLineThickness, alpha=0.5)
  }

  ### Add Labels

  if (is.null(xlab)){ #use colname unless supplied as argument
    CompPlot = CompPlot + xlab(xlabel)
  } else {
    CompPlot = CompPlot + xlab(xlab)
  }
  if (is.null(ylab)){
    CompPlot = CompPlot + ylab(ylabel)
  } else {
    CompPlot = CompPlot + ylab(ylab)
  }
  if (!is.null(title)){
    CompPlot = CompPlot +
      ggtitle(title)
  }

  #Set the font size before placing the legend
  if (tolower(themeStyle) == "bw") {
    CompPlot = CompPlot + theme_bw() + baseTheme(baseFontSize)
  } else {
    CompPlot = CompPlot + theme_grey() + baseTheme(baseFontSize)
  }

  CompPlot = setLegendPosition(CompPlot, legendPosition, themeStyle)

  #footnote
  if (!missing(footnote))
    CompPlot <- addFootnote (CompPlot, footnoteText=footnote,
                             footnoteSize=footnoteSize,
                             footnoteColor="black",
                             footnoteJust=footnoteJust,
                             yoffset = 0.05)

  return(CompPlot)
}

### Function comparePrep ###
#' Function comparePrep
#'
#' Takes two topTable dataframes and outputs a dataframe suitable for function
#' comparePlot (2 columns of LogRatio data and 2 columns of significant
#' measures). Filter the two topTable to contain just the intersecting genes
#' (present in both datasets). The two dataframes must have the same type of
#' gene IDs as rownames.
#'
#'
#' @param ttList A named list of 2 topTable dataframes (required).  The
#'   names are used as column names for the value columns in the output.
#' @param valueCol Name of column containing values to plot (default = "logFC")
#' @param significanceCol Name of column to use for significance (default = "P.Value")
#'
#' @return A data frame with 2 LogRatio measurements and 2 significance columns.  Columns 1 and 3 map
#' to sample 1 and columns 2 and 4 map to sample 2.  The returned dataframe is formatted as expected
#' by the comparePlot function.
#'
#'
#' @examples
#'
#'   #Retrieve the 1st two contrasts from a DGEobj
#'   ttList <- getType(dgeObj, "topTable")[1:2]
#'
#'   #capture the default logFC and P.Value
#'   compareDat <- comparePrep(ttList)
#'
#'   #switch to an FDR value for the significance measure
#'   compareDat <- comparePrep(ttList, significanceCol="adj.P.Val")
#'
#'   #draw the plot
#'   cPlot <- comparePlot(compareDat)
#'
#' @importFrom assertthat assert_that
#' @import magrittr
#'
#' @export
comparePrep <- function(ttList, valueCol="logFC", significanceCol="P.Value"){

  assertthat::assert_that(length(ttList) == 2,
                          !is.null(names(ttList)),
                          "data.frame" %in% class(ttList[[1]]),
                          "data.frame" %in% class(ttList[[2]]),
                          valueCol %in% colnames(ttList[[1]]),
                          valueCol %in% colnames(ttList[[2]]),
                          significanceCol %in% colnames(ttList[[1]]),
                          significanceCol %in% colnames(ttList[[2]]))

  ttNames <- names(ttList)
  tt1 <- ttList[[1]]
  tt2 <- ttList[[2]]

  commonIDs <- intersect(rownames(tt1), rownames(tt2))
  if (length(commonIDs) <1)
    stop("No common gene ids between the two dataframes")

  #filter both tables to the same set of genes in the same order.
  tt1 %<>% rownames_to_column(var="geneid") %>%
    filter(geneid %in% commonIDs) %>%
    arrange(geneid)
  tt2 %<>% rownames_to_column(var="geneid") %>%
    filter(geneid %in% commonIDs) %>%
    arrange(geneid)

  if (!all(tt1$geneid == tt2$geneid))
    stop ("Could NOT align the gene ids in the two topTable files")

  #assemble the return df
  df <- bind_cols(fc1=tt1[[valueCol]],
                         fc2=tt2[[valueCol]],
                         xp=tt1[[significanceCol]],
                         yp=tt2[[significanceCol]],
                         ) %>%
    as.matrix() %>%
    as.data.frame() %>%
    set_colnames(c(ttNames[1], ttNames[2], "xp", "yp")) %>%
    set_rownames(tt1$geneid)

  # colnames(df) <- c(ttNames[1], ttNames[2], "xp", "yp")
  # rownames(df) <- tt1$geneid
  return(df)

}
jrthompson54/DGE.Tools2 documentation built on May 12, 2021, 8:47 p.m.