R/vis.R

Defines functions metagenePlot .clacMetageneRegion .calcMetageneStat readGenomeDistPlot readLengthDistPlot .plot_theme

Documented in .calcMetageneStat .clacMetageneRegion metagenePlot .plot_theme readGenomeDistPlot readLengthDistPlot

# visualization

#' Plot theme
#'
#' @param gridX a logical variable indicating if drawing x-axis grid lines. (Default: FALSE).
#' @param gridY a logical variable indicating if drawing y-axis grid lines. (Default: TRUE).
#'
#' @return a ggplot theme.
#'
#' @importFrom ggplot2 theme element_text element_blank element_line
#'
.plot_theme = function(gridX=FALSE, gridY=TRUE) {
  gridMajorX = element_blank()
  gridMajorY = element_blank()
  if(gridX) {
    gridMajorX = element_line(size=0.1, color='black')
  }
  if(gridY) {
    gridMajorY = element_line(size=0.1, color='black')
  }

  theme = theme(
    plot.title=element_text(hjust=0.5),
    panel.background=element_blank(),
    axis.line = element_line(color='black'),
    panel.grid.major.x=gridMajorX,
    panel.grid.major.y=gridMajorY,
    strip.text.x=element_blank()
  )

  return(theme)
}

#' Read length distribution plot
#'
#' @description Plot read length distribution as a barplot. The \code{readLenDF} should be
#' output from \code{readLengthDist} function.
#'
#' @param readLenDF A \code{data.frame} of read length distribution containing three columns,
#' \code{readLen}, \code{count}, and \code{pct}. This dataframe should be output from
#' \code{readLengthDist} function. (Required).
#' @param color A character variable indicating the color of the read length distribution
#' bar plot. (Default: \code{'steelblue'}).
#' @param type A character variable indicating if the actual read counts for each read length
#' or the relative percentage should be plotted. Must be selected from \code{c('count',
#' 'pct')}. (Default: \code{'pct'}).
#'
#' @return A \code{ggplot} object showing the read length distribution.
#'
#' @export
#'
#' @importFrom methods is
#' @importFrom ggplot2 ggplot aes geom_bar position_dodge xlab ylab
#'
readLengthDistPlot = function(readLenDF, color='steelblue', type=c('count', 'pct')) {
  # sanity check
  if(!(!is.null(readLenDF) & is(readLenDF, 'data.frame') & ncol(readLenDF) == 3)) {
    stop('readLenDF must be a data.frame of three columns generated by readLengthDist.')
  }
  if(!(length(color) == 1 & all(is.character(color)))) {
    stop('color must be a character variable.')
  }
  if(!(all(type %in% c('count', 'pct')))) {
    stop('type must be a character variable selected from c(\'count\', \'pct\').')
  }
  readLenDF = as.data.frame(readLenDF)

  colnames(readLenDF) = c('readLen', 'count', 'pct')
  plotYLab = NULL
  if('pct' %in% type) {
    type = 'pct'
    plotYLab = 'Percentage of Reads'
  } else {
    type = 'count'
    plotYLab = 'Number of Reads'
  }

  # plot dataframe
  plotDF = readLenDF[, c('readLen', type)]
  colnames(plotDF) = c('readLen', 'stat')

  # barplot
  p = ggplot(plotDF, aes(x=readLen, y=stat)) + geom_bar(stat='identity', fill=color,
        position=position_dodge()) + xlab('Read Length') + ylab(plotYLab) + .plot_theme()

  return(p)
}

#' Genomic feature distribution plot of aligned reads
#'
#' @description Plot reads genomic feature distribution. The \code{featureDF} dataframe
#' should be output from \code{readGenomeDist} function.
#'
#' @param featureDF A \code{data.frame} of reads genomic feature distribution containing three
#' columns, \code{feature}, \code{count}, and \code{pct}. This dataframe should be output from
#' \code{readGenomeDist} function. (Required).
#' @param type A character variable indicating if the actual read counts for each read length
#' or the relative percentage should be plotted. Must be selected from \code{c('count',
#' 'pct')}. (Default: \code{'pct'}).
#'
#' @return A \code{ggplot} object showing the reads genomic feature distribution.
#'
#' @export
#'
#' @importFrom methods is
#' @importFrom ggplot2 ggplot aes geom_bar position_stack coord_flip xlab ylab theme
#'
readGenomeDistPlot = function(featureDF, type=c('count', 'pct')) {
  # sanity check
  if(!(!is.null(featureDF) & is(featureDF, 'data.frame') & ncol(featureDF) == 3)) {
    stop('featureDF must be a data.frame of three columns generated by readGenomeDist.')
  }
  if(!(all(type %in% c('count', 'pct')))) {
    stop('type must be a character variable selected from c(\'count\', \'pct\').')
  }
  featureDF = as.data.frame(featureDF)

  colnames(featureDF) = c('feature', 'count', 'pct')
  plotYLab = NULL
  if('pct' %in% type) {
    type = 'pct'
    plotYLab = 'Percentage of Reads'
  } else {
    type = 'count'
    plotYLab = 'Number of Reads'
  }

  # plot dataframe
  plotDF = featureDF[, c('feature', type)]
  plotDF$sample = rep('', nrow(plotDF))
  plotDF$feature = factor(plotDF$feature, levels=c('CDS', 'UTR', 'Intron', 'Intergenic'))
  colnames(plotDF) = c('feature', 'stat', 'sample')

  # barplot
  p = ggplot(plotDF, aes(x=sample, y=stat, fill=feature)) + geom_bar(stat='identity',
    position=position_stack(reverse=TRUE)) + coord_flip() + xlab('Sample') + ylab(plotYLab) +
    theme(legend.position='top') + .plot_theme(gridX=TRUE, gridY=FALSE)

  return(p)
}

#' Calculate metagene column-wise summary statistics
#'
#' @param metagene Either a single matrix or a list of two matrices.
#' @param metageneFun A character variable of either 'mean' or 'median' indicating how
#' metagene should be summarized.
#' @param mode A numeric variable of either 1 or 2.
#'
#' @return A data.frame with two columns, 'type' and 'value'. If mode == 1, 'type' column will
#' be filled with 'Region'. If mode == 2, 'type' column will be filled with 'Start' and 'End'.
#' 'value' column is the column summary statistics for each position.
#'
#' @importFrom matrixStats colMedians
#'
.calcMetageneStat = function(metagene, metageneFun, mode) {
  metageneStat = NULL
  if(mode == 1) {
    if(metageneFun == 'mean') {
      metageneStat = colMeans(metagene)
    } else if(metageneFun == 'median') {
      metageneStat = colMedians(metagene)
    }

    metageneStat = data.frame(type=rep('Region', length(metageneStat)), value=metageneStat)
    metageneStat$type = factor(metageneStat$type, levels=c('Region'))
  } else if(mode == 2) {
    metageneStartStat = NULL
    metageneEndStat = NULL

    if(metageneFun == 'mean') {
      metageneStartStat = colMeans(metagene$start)
      metageneEndStat = colMeans(metagene$end)
    } else if(metageneFun == 'median') {
      metageneStartStat = colMedians(metagene$start)
      metageneEndStat = colMedians(metagene$end)
    }

    metageneStartStat = data.frame(type=rep('Start', length(metageneStartStat)),
      value=metageneStartStat)
    metageneEndStat = data.frame(type=rep('End', length(metageneEndStat)),
      value=metageneEndStat)
    metageneStat = rbind(metageneStartStat, metageneEndStat)
    metageneStat$type = factor(metageneStat$type, levels=c('Start', 'End'))
  }

  return(metageneStat)
}

#' Calculate metagene region
#'
#' @param regionObj Either a GRanges object or a list. Must be output from calcMetagene
#' function.
#' @param mode A numeric variable of either 1 or 2.
#'
#' @return If mode == 1, return a numeric vector representing region indices 1, 2, ..., N where
#' N is the region width. If mode == 2, return a numeric vector combining two numeric vectors
#' representing CDS start and end region indices.
#'
#' @importFrom GenomicRanges width
#'
.clacMetageneRegion = function(regionObj, mode) {
  metageneRegion = NULL
  if(mode == 1) {
    regionWidth = unique(width(regionObj))
    if(length(regionWidth) != 1) {
      stop('Not all regions have equal widths.')
    }

    metageneRegion = seq(1, regionWidth)
  } else if(mode == 2) {
    cdsStartUpstream = regionObj$startFlankLength[1]
    cdsStartDnstream = regionObj$startFlankLength[2]
    cdsEndUpstream = regionObj$endFlankLength[1]
    cdsEndDnstream = regionObj$endFlankLength[2]

    start = c(seq(-1 * cdsStartUpstream, -1), seq(0, cdsStartDnstream - 1))
    end = c(seq(-1 * (cdsEndUpstream - 1), 0), seq(1, cdsEndDnstream))

    metageneRegion = c(start, end)
  }

  return(metageneRegion)
}

#' Metagene plot
#'
#' @description Plot metagene signal per position. The \code{metagene} must be output from
#' \code{calcMetagene} function.
#'
#' @param metagene A \code{list} containing three elements. The first element is metagne,
#' either one \code{matrix} if \code{regionGR} is set or a list of two \code{matrices} in CDS
#' start and end regions (with names of start and end). Each row is a transcript, each column
#' is a position, and a value represents read counts or relative frequency for a transcript at
#' a position. The second element is the \code{GRanges} for the metagene, either one range if
#' \code{regionGR} is set or a list of two ranges representing the CDS start and end regions.
#' This list also contains information of the transcripts selected and flanking sequence
#' lengths for CDS start and end. The third element is an internal variable indicating if
#' \code{regionGR} is specified or not (1 means \code{regionGR} is set and 2 means not set).
#' Check \code{calcMetagene} function to see more details of metagene calculation and output
#' object. The \code{metagene} must be output from \code{calcMetagene} function. (Required).
#' @param color A character variable if \code{metagene$mode == 1}, or a character vector
#' specifying the colors for CDS start and end regions if \code{metagene$mode == 2}. (Default:
#' \code{ifelse(metagene$mode == 1, 'steelblue', c('red1', 'steelblue'))}).
#' @param metageneFun A character variable indicating how metagene should be summarized. Must
#' be selected from \code{c('mean', 'median')}. If \code{'mean'}, column means will be taken.
#' If \code{'median'}, column medians will be taken. (Default: \code{'mean'}).
#'
#' @return A \code{ggplot} object showing the metagene signal per position. If
#' \code{metagene$mode == 1}, the x-axis is labeled as 1, 2, ..., N where N is the width of the
#' metagene region. If \code{metagene$mode == 2}, the CDS start region is labeled as -M, -(M-1)
#' , -(M-2), ..., -1, 0, 1, ..., (N-1) where M and N are values set by \code{cdsStartUpstream}
#' and \code{cdsStartDownstream} options in \code{calcMetagene} function. Position 0 in CDS
#' start region represents the first base of start codon; the CDS end region is labeled as
#' -(P-1), -(P-2), ..., -1, 0, 1, ..., Q where P and Q are values set by \code{cdsEndUpstream}
#' and \code{cdsEndDownstream}. Position 0 in CDS end region represents the last base of stop
#' codon.
#'
#' @export
#'
#' @importFrom methods is
#' @importFrom matrixStats colMedians
#' @importFrom ggplot2 ggplot aes geom_bar position_dodge facet_grid xlab ylab geom_vline
#' geom_text scale_fill_manual
#'
metagenePlot = function(metagene, color=ifelse(metagene$mode == 1, list('steelblue'),
  list(c('red1', 'steelblue')))[[1]], metageneFun=c('mean', 'median')) {
  # sanity check
  if(!(is.list(metagene) & !is.null(names(metagene)) & length(metagene) == 3 &
    all(names(metagene) == c('metagene', 'region', 'mode')))) {
    stop('metagene must be output from calcMetagene function.')
  }
  if(!(is.numeric(metagene$mode) & length(metagene$mode) == 1 &
    all(metagene$mode %in% c(1, 2)))) {
    stop('metagene must be output from calcMetagene function.')
  }
  if(metagene$mode == 1) {
    if(!(is.matrix(metagene$metagene) & is(metagene$region, 'GRanges'))) {
      stop('metagene must be output from calcMetagene function.')
    }
    if(!(length(color) == 1 & all(is.character(color)))) {
      stop('color must be a character variable.')
    }
  } else if(metagene$mode == 2) {
    if(!(is.list(metagene$metagene) & length(metagene$metagene) == 2 &
      !is.null(names(metagene$metagene)) & all(names(metagene$metagene) == c('start', 'end'))
      & is.matrix(metagene$metagene$start) & is.matrix(metagene$metagene$end))) {
      stop('metagene must be output from calcMetagene function.')
    }
    if(!(nrow(metagene$metagene$start) == nrow(metagene$metagene$end) &
      all(rownames(metagene$metagene$start) == rownames(metagene$metagene$end)))) {
      stop('metagene must be output from calcMetagene function.')
    }
    if(!(is.list(metagene$region) & !is.null(names(metagene$region)) &
      all(names(metagene$region) == c('start', 'end', 'tx', 'startFlankLength',
      'endFlankLength')))) {
      stop('metagene must be output from calcMetagene function.')
    }
    if(!(is.numeric(metagene$region$startFlankLength) &
      length(metagene$region$startFlankLength) == 2 &
      all(metagene$region$startFlankLength > 0))) {
      stop('metagene must be output from calcMetagene function.')
    }
    if(!(is.numeric(metagene$region$endFlankLength) &
      length(metagene$region$endFlankLength) == 2 &
      all(metagene$region$endFlankLength > 0))) {
      stop('metagene must be output from calcMetagene function.')
    }
    if(!(length(color) == 2 & all(is.character(color)))) {
      stop('color must be a character vector of length 2.')
    }
  }

  if(!(all(metageneFun %in% c('mean', 'median') & all(is.character(metageneFun))))) {
    stop('metageneFun must be a character variable selected from c(\'mean\', \'median\').')
  }
  if('mean' %in% metageneFun) {
    metageneFun = 'mean'
  } else {
    metageneFun = 'median'
  }

  metageneStat = .calcMetageneStat(metagene$metagene, metageneFun, mode=metagene$mode)
  metageneRegion = .clacMetageneRegion(metagene$region, mode=metagene$mode)

  # plot dataframe
  plotDF = metageneStat
  plotDF$position = metageneRegion

  # metagene plot
  p = NULL
  if(metagene$mode == 1) {
    metageneMaxPosition = plotDF$position[which.max(plotDF$value)]

    p = ggplot(plotDF, aes(x=position, y=value)) + geom_bar(stat='identity', fill=color,
      position=position_dodge()) + xlab('Position') + ylab('Normalized Read Counts') +
      geom_vline(aes(xintercept=metageneMaxPosition), size=0.2) +
      geom_text(mapping=aes(hjust=0.5, vjust=1, x=metageneMaxPosition, y=Inf,
      label=as.character(metageneMaxPosition))) + .plot_theme()
  } else if(metagene$mode == 2) {
    metageneStartMaxPosition = plotDF[plotDF$type == 'Start', 'position'][which.max(
      plotDF[plotDF$type == 'Start', 'value'])]
    metageneEndMaxPosition = plotDF[plotDF$type == 'End', 'position'][which.max(
      plotDF[plotDF$type == 'End', 'value'])]

    p = ggplot(plotDF, aes(x=position, y=value, fill=type)) + geom_bar(stat='identity',
      position=position_dodge()) + xlab('Position') + ylab('Normalized Read Counts') +
      facet_grid(. ~ type) + geom_vline(xintercept=0, size=0.25, linetype='dashed') +
      geom_vline(xintercept=0, size=0.25, linetype='dashed') +
      geom_vline(data=plotDF[plotDF$type == 'Start', ],
        aes(xintercept=metageneStartMaxPosition), size=0.2) +
      geom_vline(data=plotDF[plotDF$type == 'End', ], aes(xintercept=metageneEndMaxPosition),
        size=0.2) +
      geom_text(data=plotDF[plotDF$type == 'Start', ], mapping=aes(x=metageneStartMaxPosition,
        y=Inf, label=as.character(metageneStartMaxPosition)), hjust=0.5, vjust=1) +
      geom_text(data=plotDF[plotDF$type == 'End', ], mapping=aes(x=metageneEndMaxPosition,
        y=Inf, label=as.character(metageneEndMaxPosition)), hjust=0.5, vjust=1) +
      scale_fill_manual(values=color, name='', breaks=c('Start', 'End'),
        labels=c('Coding start', 'Coding end')) +
      theme(legend.position='top') + .plot_theme()
  }

  return(p)
}
nzhang89/RiboSeeker documentation built on April 15, 2022, 10:18 a.m.