R/plotting.R

Defines functions plotColumnCounts

Documented in plotColumnCounts

#' Plot categorized metacolumns of ranged data.
#' 
#' This function counts categorcal variables stored as metacolumn in \code{\link{GRanges-class}} object.
#'
#' @param gr A \code{\link{GRanges-class}} object with metadata columns to be summarized and plotted.
#' @param colName A metacolumn name to be sumarized and plotted.
#' @param facedID A metacolumn name to be used to split data in sub-plots.
#' @param colors A user defined set of colors used for plotting.
#' @return A \code{ggplot} object.
#' @author David Porubsky
#' @export

plotColumnCounts <- function(gr, colName='gen', facetID=NULL, colors=NULL) {
  plt.df <- as.data.frame(gr)
  ## Use user defined name of the column to plot
  if (is.null(facetID)) {
    data.tab <- plt.df %>% group_by(.dots=eval(colName)) %>% summarize(counts = length(eval(parse(text = colName))))
  } else {
    data.tab <- plt.df %>% group_by(.dots=c(eval(facetID), eval(colName) )) %>% summarize(counts = length(eval(parse(text = colName))))
  }
  
  ## Set colors
  col.categs <- length(unique(plt.df$ID))
  if (is.null(colors)) {
    col <- brewer.pal(n = col.categs, name = "Set1")
  } else if (length(colors) < col.categs) {
    message("Not enough colors specified!!! Please provide ", col.categs, "distict colors")
  } else {
    col <- colors
  }
  
  plt <- ggplot(data.tab) + geom_col(aes_string(x=eval(colName), y='counts', fill=eval(facetID)))
  plt <- plt + geom_text(data=data.tab, aes_string(x=eval(colName), y='counts', label='counts'), vjust=0)
  plt <- plt + scale_fill_manual(values = col, guide='none')
  plt <- plt + theme_bw() + xlab("")
  
  if (!is.null(facetID)) {
    plt <- plt + facet_grid(eval(parse(text = facetID)) ~ .)
  }  
  
  return(plt)
}


#' Plot categorized metacolumns of ranged data per chromosome.
#' 
#' This function counts categorcal variables stored as metacolumn in \code{\link{GRanges-class}} object per each chromosome.
#'
#' @param gr A \code{\link{GRanges-class}} object with metadata columns to be summarized and plotted.
#' @param colName A metacolumn name to be sumarized and plotted.
#' @param facedID A metacolumn name to be used to split data in sub-plots.
#' @return A \code{ggplot} object.
#' @author David Porubsky
#' @export

plotColumnCountsPerChr <- function(gr, colName='gen', facetID=NULL, normChrSize=FALSE) {
  plt.df <- as.data.frame(gr)
  ## Use user defined name of the column to plot
  if (is.null(facetID)) {
    data.tab <- plt.df %>% group_by(.dots=c('seqnames', eval(colName) )) %>% summarize(counts = length(eval(parse(text = colName))))
    #data.tab <- plt.df %>% group_by(.dots='seqnames') %>% summarize(counts = n()) # counts per chr only
  } else {
    data.tab <- plt.df %>% group_by(.dots=c('seqnames', eval(facetID), eval(colName) )) %>% summarize(counts = length(eval(parse(text = colName))))
  }
  
  if (normChrSize) {
      seq.len <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[seqlevels(gr)]
      data.tab <- data.tab %>% group_by(seqnames) %>% mutate(ChrSizeNorm=counts / seq.len[seqnames])
      
      plt <- ggplot(data.tab) + geom_col(aes_string(x='seqnames', y='ChrSizeNorm', fill=eval(colName)))
      plt <- plt + scale_fill_manual(values = brewer.pal(n = 9, name = "Set1"))
      plt <- plt + theme_bw() + xlab("")
  } else {
      plt <- ggplot(data.tab) + geom_col(aes_string(x='seqnames', y='counts', fill=eval(colName)))
      plt <- plt + scale_fill_manual(values = brewer.pal(n = 9, name = "Set1"))
      plt <- plt + theme_bw() + xlab("")
  }
      
  if (!is.null(facetID)) {
    plt <- plt + facet_grid(eval(parse(text = facetID)) ~ .)
  }  
  
  return(plt)
}


#' Plot distribution of total sizes of all inversions per genotype. 
#'
#' This function plots sum of widths of all ranged data (total bp) per genotype and per chromosome.
#'
#' @param gr A \code{\link{GRanges-class}} object with metadata columns to be summarized and plotted.
#' @return A \code{ggplot} object.
#' @author David Porubsky
#' @export

basesPerGenotypePerChr <- function(gr, normChrSize=FALSE) {
  plt.df <- as.data.frame(gr)
  data.tab <- plt.df %>% group_by(.dots=c("ID","seqnames","gen")) %>% summarize(counts = length(gen), inv.bases=sum(width))
  
  if (normChrSize) {
      seq.len <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[seqlevels(gr)]
      data.tab <- data.tab %>% group_by(seqnames) %>% mutate(ChrSizeNorm=inv.bases / seq.len[seqnames])
      plt <- ggplot(data.tab) + geom_col(aes(x=seqnames, y=ChrSizeNorm, fill=gen))
  } else {
      plt <- ggplot(data.tab) + geom_col(aes(x=seqnames, y=inv.bases, fill=gen))
  }    
      
  plt <- plt + scale_fill_manual(values = brewer.pal(n = 4, name = "Set1")[3:4], name="")
  plt <- plt + facet_grid(ID ~ .) + theme_bw()
  plt <- plt + xlab("Chromosomes") + ylab("Inverted bases (bp)")
  return(plt)
} 

#' Plot sorted size distribution of ranged data.
#' 
#' This function take \code{\link{GRanges-class}} object and plot width distribution of all ranges per variable stored in metacolumn field 'ID'
#'
#' @param gr A \code{\link{GRanges-class}} object with metadata columns to be summarized and plotted.
#' @param plotUniqueBases Set to \code{TRUE} if you want to plot size of unique bases per range.
#' @param violin Set to \code{TRUE} if you want plot violo_plot instead of dot_plot.
#' @param colors A user defined set of colors used for plotting.
#' @param title A user defined title of the plot.
#' @return A \code{ggplot} object.
#' @author David Porubsky
#' @export

rangesSizeDistribution <- function(gr, plotUniqueBases=FALSE, violin=FALSE, colors=NULL, title=NULL) {
  inv.sizes.ord <- order(width(gr), decreasing = FALSE)
  if (plotUniqueBases) {
    size.dist.df <- data.frame(x=1:length(inv.sizes.ord), size=width(gr)[inv.sizes.ord], uniqueBases=gr$TotalUniqueBases[inv.sizes.ord], ID=gr$ID[inv.sizes.ord], stringsAsFactors = FALSE)
  } else {
    size.dist.df <- data.frame(x=1:length(inv.sizes.ord), size=width(gr)[inv.sizes.ord], ID=gr$ID[inv.sizes.ord], stringsAsFactors = FALSE)
  }  
  
  ## Get median size distribution per ID
  size.dist.df <- size.dist.df %>% group_by(ID) %>% mutate(med_size = median(size))
  
  ## Set colors
  col.categs <- length(unique(size.dist.df$ID))
  if (is.null(colors)) {
    col <- brewer.pal(n = col.categs, name = "Set1")
  } else if (length(colors) < col.categs) {
    message("Not enough colors specified!!! Please provide ", col.categs, "distict colors")
  } else {
    col <- colors
  }
  
  if (violin) {
    plt <- ggplot(size.dist.df) + geom_violin(aes(x=ID, y=size, fill=ID), trim = FALSE) +
           #geom_dotplot(aes(x=ID, y=size), binaxis='y', stackdir='center', dotsize=0.05, binwidth = 1) +
           scale_fill_manual(values = col, name="", guide='none') +
           geom_text(aes(x = ID, y = med_size, label=med_size), color="white", vjust=-0.5) +
           xlab("") 
  } else {
    plt <- ggplot(size.dist.df) + geom_point(aes(x=x, y=size, color=ID)) +
           facet_grid(ID ~ .) +
           geom_hline(aes(yintercept = med_size, group=ID), color="black") +
           geom_text(aes(x = 1, y = med_size, label=med_size), color="black", vjust=-0.5) +
           scale_color_manual(values = col, name="", guide='none') +
           xlab("Size sorted inversions") 
    if (plotUniqueBases) {
      plt <- plt + geom_point(aes(x=x, y=uniqueBases), color='gray')
    }
  }
  
  plt <- plt + 
         scale_y_continuous(breaks=c(1000,10000,100000,1000000), labels = comma, trans = 'log10') +
         geom_hline(yintercept = c(10000, 1000000), linetype="dashed") +
         ylab("Inversion size (log10)")
  
  if (!is.null(title) & is.character(title)) {
    plt <- plt + ggtitle(title)
  }
  
  return(plt)
}  


#' Plots scatter of event counts to chromosome size
#'
#' @param gr A \code{\link{GRanges-class}} object with metadata columns to be summarized and plotted.
#' @param bsgenome A \code{\link{GBSgenome-class}} object to provide chromosome lengths for plotting.
#' @param colBy A metacolumn name to be used to color data by.
#' @param facetID A metacolumn name to be used to split data in sub-plots.
#' @param lm Set to TRUE if regression line should be added to the plot.
#' @return A \code{ggplot} object.
#' @author David Porubsky
#' @export

eventsPerChrSizeScatter <- function(gr, bsgenome, colBy=NULL, facetID=NULL, lm=FALSE) {
  if (any(is.na(seqlengths(gr))) & is.null(bsgenome)) {
    message("Chromosome lengths are missing. Please submit BSgenome object of the genome you want to plot.")
  }
  
  ## Load BSgenome
  if (class(bsgenome) != 'BSgenome') {
    if (is.character(bsgenome)) {
      suppressPackageStartupMessages(library(bsgenome, character.only=T))
      bsgenome <- as.object(bsgenome) # replacing string by object
    }
  }
  
  plt.df <- as.data.frame(gr)
  seq.len <- seqlengths(bsgenome)[seqlevels(gr)]
  data.tab <- plt.df %>% group_by(.dots='seqnames') %>% summarize(counts = n()) %>% mutate(ChrLen=seq.len[seqnames])
  
  if (lm) {
    colBy=NULL
    facetID=NULL
    message("Parameters 'colBy' and 'facetID' cannot be used when 'lm=TRUE'")
    l.mod = lm(counts ~ ChrLen, data=data.tab)
    suppressWarnings( conf.level <- predict(l.mod, interval="prediction", level = 0.95) )
    data.tab <- data.tab %>% mutate(resid=abs(resid(l.mod)), fitted=fitted(l.mod))
    data.tab <- cbind(data.tab, conf.level)
    
    r.sq <- paste0('R^2=', round(summary(l.mod)$r.squared, 3))
    r.sq.df <- as.data.frame(r.sq)
    
    plt <- data.tab %>% ggplot() + geom_line(aes_string(x='ChrLen', y='fitted')) +
      geom_line(aes_string(x='ChrLen', y='lwr'), color = "red", linetype = "dashed") +
      geom_line(aes_string(x='ChrLen', y='upr'), color = "red", linetype = "dashed") +
      geom_point(aes_string(x='ChrLen', y='counts', color='resid')) +
      geom_text(aes_string(x='ChrLen', y='counts', label='seqnames'), vjust=-0.5, hjust=-0.1) +
      geom_text(data = r.sq.df, aes(x=-Inf, y=Inf, label=r.sq), inherit.aes = F, hjust=-0.5, vjust=1) +
      scale_colour_gradient(low="blue", high="red") +
      scale_x_continuous(labels = comma) +
      labs(x="Chromosome size (bp)", y='counts', colour="Residuals")
  } else if (!is.null(colBy) & is.character(colBy)) {
    data.tab <- plt.df %>% group_by(.dots=c('seqnames', eval(colBy))) %>% summarize(counts = n()) %>% mutate(ChrLen=seq.len[seqnames])
    plt <- data.tab %>% ggplot(aes_string(x='ChrLen', y='counts', color=eval(colBy))) + 
      geom_point() + 
      geom_text(data=data.tab, aes_string(x='ChrLen', y='counts', label='seqnames'), vjust=-0.5, hjust=-0.1) +
      scale_x_continuous(labels = comma) +
      scale_color_manual(values = brewer.pal(n = 9, name = "Set1")) +
      xlab("Chromosome size (bp)")
  } else {
    plt <- data.tab %>% ggplot(aes_string(x='ChrLen', y='counts')) + 
      geom_point() + 
      geom_text(data=data.tab, aes_string(x='ChrLen', y='counts', label='seqnames'), vjust=-0.5, hjust=-0.1) +
      scale_x_continuous(labels = comma) +  
      xlab("Chromosome size (bp)")
  }
  ## Split the plot by facetID
  if (!is.null(facetID)) {
    plt <- plt + facet_grid(eval(parse(text = facetID)) ~ ., scales = 'free') #+ geom_smooth(method = "lm", se = FALSE)
  }  
  return(plt)
}


#' Plot genome-wide distribution of ranged data.
#' Ranges are color by the 'ID' column.
#'
#' @param gr A \code{\link{GRanges-class}} object with metadata columns to be summarized and plotted.
#' @param userTrack A user defined set of ranges defined \code{\link{GRanges-class}} object to be plotted over the genome-wide ideogram.
#' @param userTrackGeom A ggplot2 geom to be used to visualize 'userTrack'.
#' @param colors A user defined set of colors used for plotting.
#' @param bsgenome A \code{\link{BSgenome-class}} object to provide chromosome lengths for plotting.
#' @return A \code{ggplot} object.
#' @author David Porubsky
#' @export

genomewideRangesIdeo <- function(gr, userTrack=NULL, userTrackGeom='rect', colors=NULL, bsgenome=NULL) {
  if (any(is.na(seqlengths(gr))) & is.null(bsgenome)) {
    message("Chromosome lengths are missing. Please submit BSgenome object of the genome you want to plot.")
  }
  
  ## Load BSgenome
  if (class(bsgenome) != 'BSgenome') {
    if (is.character(bsgenome)) {
      bsgenome <- tryCatch({
        suppressPackageStartupMessages(library(bsgenome, character.only=TRUE))
        bsgenome <- eval(parse(text=bsgenome)) ## replacing string by object
      }, error = function(e) {return(NULL)})
    } else {
      bsgenome <- NULL
    }
  }
  
  ## Prepare ideo data for plotting
  seq.len <- seqlengths(bsgenome)[paste0('chr', c(1:22, 'X'))]
  ideo.df <- data.frame(seqnames=names(seq.len), length=seq.len)
  ideo.df$seqnames <- factor(ideo.df$seqnames, levels=paste0('chr', c(1:22, 'X')))
  
  ## Prepare ranged data for plotting
  gr$level <- GenomicRanges::disjointBins(gr)
  plt.df <- as.data.frame(gr)
  
  ## Set colors
  col.categs <- length(unique(plt.df$ID))
  if (is.null(colors)) {
    col <- brewer.pal(n = col.categs, name = "Set1")
  } else if (length(colors) < col.categs) {
    message("Not enough colors specified!!! Please provide ", col.categs, "distict colors")
  } else {
    col <- colors
  }
  
  plt <- ggplot(ideo.df) + geom_linerange(aes(x=-1, ymin=0, ymax=length), size=2, color="gray36")
  if (!is.null(userTrack)) {
    strand(userTrack) <- "*"
    userTrack <- reduce(userTrack)
    ## Remove seqlevels not present in the data or different than chr1:22 and X
    chroms2keep <- seqlevels(userTrack)[seqlevels(userTrack) %in% paste0('chr', c(1:22, 'X'))]
    userTrack <- keepSeqlevels(userTrack, chroms2keep, pruning.mode = 'coarse')
    userTrack.df <- as(userTrack, 'data.frame')
    if (userTrackGeom == 'rect') {
      plt <- plt + geom_linerange(data=userTrack.df, aes(x=-1, ymin=as.numeric(start), ymax=as.numeric(end)), size=2, color="darkgoldenrod1")
    } else if (userTrackGeom == 'point') {
      userTrack.df$midpoint <- userTrack.df$start + ((userTrack.df$end - userTrack.df$start)/2)
      plt <- plt + geom_point(data=userTrack.df, aes(x=-1, y=midpoint), size=1, color="darkgoldenrod1")
    }
  }
  plt <- plt + 
    coord_flip() + 
    facet_grid(seqnames ~ ., switch='y') +
    geom_linerange(data=plt.df , aes(x=level, ymin=start, ymax=end+250000, color=ID), size=1) +
    scale_color_manual(values =  col, name="") +
    scale_y_continuous(expand = c(0,0)) +
    theme_void() +
    theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank()) +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    theme(strip.text.y.left = element_text(angle = 0))
  return(plt)
}


#' Plot size distribution of ranges in all Venn partitions.
#'
#' @param venn A \code{data.frame} object as an output of \pkg{VennDiagram} package.
#' @param overlaps A \code{\link{GRanges-class}} object with overlaps determined by the same idx column ID.
#' @return A \code{ggplot} object.
#' @author David Porubsky

plotVennPartitions <- function(venn=NULL, overlaps=NULL) {
  partitions <- list()
  for (i in 1:nrow(venn)) {
    partition <- venn[i,]
    partition.idx <- unlist(partition$..values..)
    partition.ranges <- overlaps[overlaps$sub.group %in% partition.idx]
    partition.ranges$partitionID <- partition$..set..
    partitions[[i]] <- as.data.frame(partition.ranges)
  }
  
  plt.df <- do.call(rbind, partitions)
  plt <- ggplot(plt.df) + geom_boxplot(aes(x=ID, y=width, fill=ID))
  plt <- plt + scale_fill_manual(values = brewer.pal(n = 3, name = "Set1"), name="")
  plt <- plt + scale_y_continuous(breaks=c(1000,10000,100000,1000000), labels = comma, trans = 'log10')
  plt <- plt + geom_hline(yintercept = c(10000, 100000, 1000000), linetype="dashed")
  plt <- plt + xlab("") + ylab("Inversion size (log10)")
  plt <- plt + facet_grid(partitionID ~ .)
  return(plt)
}


#' Plot genotype distribution of ranges in all Venn partitions.
#'
#' @param venn A \code{data.frame} object as an output of \pkg{VennDiagram} package.
#' @param overlaps A \code{\link{GRanges-class}} object with overlaps determined by the same idx column ID.
#' @return A \code{ggplot} object.
#' @author David Porubsky

plotVennPartitionsGen <- function(venn=NULL, overlaps=NULL) {
  partitions <- list()
  for (i in 1:nrow(venn)) {
    partition <- venn[i,]
    partition.idx <- unlist(partition$..values..)
    partition.ranges <- overlaps[overlaps$idx %in% partition.idx]
    partition.ranges$partitionID <- partition$..set..
    partitions[[i]] <- as.data.frame(partition.ranges)
  }
  plt.df <- do.call(rbind, partitions)
  
  data.tab <- plt.df %>% group_by(.dots=c('partitionID', 'ID', 'gen')) %>% summarize(counts = length(gen))
  plt <- ggplot(data.tab, aes_string(x='ID', y='counts', group='gen')) 
  plt <- plt + geom_col(aes_string(fill='gen'), position = 'dodge')
  plt <- plt + geom_text(aes_string(label='counts'), vjust=0,  position=position_dodge(width = 1))
  plt <- plt + scale_fill_manual(values = brewer.pal(n = 3, name = "Set1"), name="")
  plt <- plt + xlab("")
  plt <- plt + facet_grid(partitionID ~ .)
  return(plt)
}


#' Plot number of overlaps between query and subject ranges.
#'
#' @param query.gr A \code{\link{GRanges-class}} object.
#' @param subject.gr A \code{\link{GRanges-class}} object. 
#' @param facedID A metacolumn name to be used to split data in sub-plots.
#' @return A \code{ggplot} object.
#' @author David Porubsky
#' @export

plotOverlapWithRanges <- function(query.gr=NULL, subject.gr=NULL, facetID=NULL) {
  counts <- countOverlaps(query.gr, subject.gr)
  query.gr$userTrackOverlapCount <- counts
  
  inv.sizes.ord <- order(width(query.gr), decreasing=TRUE)
  
  plt.df <- as.data.frame(query.gr[inv.sizes.ord])
  plt.df$x <- 1:nrow(plt.df) 
  
  plt <- ggplot(plt.df) + geom_col(aes_string(x='x', y='width'), color='black')
  plt <- plt + geom_col(aes_string(x='x', y='userTrackOverlapCount'), color='red', inherit.aes=FALSE)
  plt <- plt + scale_y_continuous(breaks=c(1000,10000,100000,1000000), labels = comma, trans = 'log10')
  plt <- plt + geom_hline(yintercept = c(10000, 1000000), linetype="dashed")
  plt <- plt + xlab("Size sorted inversions (bp)") + ylab("Gene count | Inversion size")
  
  return(plt)
} 


#' Plot genome-wide distribution of plus and minus reads.
#'
#' @param gr A \code{\link{GRanges-class}} object.
#' @param bin.len A size of bins to count directional reads in.
#' @param chromosomes A user defined set of chromosomes to be plotted.
#' @param colors A colors for plus and minus reads.
#' @param title User defined title of the exported plot.
#' @return A \code{ggplot} object.
#' @importFrom gtools mixedsort
#' @importFrom scales comma
#' @author David Porubsky
#' @export

plotCompositeIdeo <- function(gr=NULL, bin.len=200000, chromosomes=NULL, colors=c('#EFEDF5', '#68228B'), title=NULL) {
  ## Keep only user defined chromosomes
  if (!is.null(chromosomes) & is.character(chromosomes)) {
    gr <- GenomeInfoDb::keepSeqlevels(gr, value = chromosomes, pruning.mode = 'coarse')
    seqlevels(gr) <- chromosomes[chromosomes %in% as.character(unique(seqnames(gr)))]
  }
  ## Sort reads by position and by chromosome
  gr <- sort(gr, ignore.strand=TRUE)
  if (is.null(chromosomes)) {
    seqlevels(gr) <- gtools::mixedsort(seqlevels(gr))
  }
  #seqlengths(gr) <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[seqlevels(gr)]
  
  chrom.lengths <- seqlengths(gr)
  
  ## Bin the data
  chrom.lengths.floor <- floor(chrom.lengths/ bin.len) * bin.len
  binned.data <- unlist(tileGenome(chrom.lengths.floor, tilewidth = bin.len), use.names = FALSE)
  seqlengths(binned.data) <- chrom.lengths
  
  Watsonreads <- GenomicRanges::countOverlaps(binned.data, gr[strand(gr)=='-']) 
  Crickreads <- GenomicRanges::countOverlaps(binned.data, gr[strand(gr)=='+'])
  bothreads <- Watsonreads + Crickreads
  
  mcols(binned.data)$bothreads <- bothreads
  mcols(binned.data)$Watsonreads <- Watsonreads
  mcols(binned.data)$Crickreads <- Crickreads
  
  dfplot.reads <- as.data.frame(binned.data)
  
  #Crickreads.outlier <- stats::quantile(dfplot.reads$Crickreads, 0.99)
  #Watsonreads.outlier <- stats::quantile(dfplot.reads$Watsonreads, 0.99)
  outlier <- stats::quantile(dfplot.reads$Crickreads, 0.99)
  ## Set outlier bins to the limit
  dfplot.reads$Crickreads[dfplot.reads$Crickreads >= outlier] <- outlier
  dfplot.reads$Watsonreads[dfplot.reads$Watsonreads >= outlier] <- outlier
  
  my.theme <- theme(legend.position="none",
                    panel.grid.major = element_blank(),
                    panel.grid.minor = element_blank(),
                    panel.border = element_blank(),
                    panel.background = element_blank(),
                    axis.text.y=element_blank(),
                    axis.ticks.y=element_blank(),
                    strip.text.y.left = element_text(angle = 0))
                    #strip.text.y = element_text(angle = 180) ) 
  
  dfplot.reads$midpoint <- dfplot.reads$start + ( (dfplot.reads$end - dfplot.reads$start) %/% 2 )
  
  ## Construct ggplot
  dfplot.reads$mCrickreads <- -dfplot.reads$Crickreads
  ggplt <- ggplot(dfplot.reads)
  ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='mCrickreads', x='midpoint'), color=colors[1], size=1)
  ggplt <- ggplt + facet_grid(seqnames ~ ., switch = "y", scales = 'free')
  ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='Watsonreads', x='midpoint'), color=colors[2], size=1)
  ggplt <- ggplt +
    xlab("Genomic position") +
    ylab("Read counts") +
    scale_x_continuous(expand = c(0,0), labels = scales::comma) + my.theme
  
  if (!is.null(title) & is.character(title)) {
    ggplt <- ggplt + ggtitle(title)
  }
  
  return(ggplt)
}


#' Plot 'dotplot' of two sequences.
#'
#' This function specifically plots matched region of two sequences based on nucmer results.
#' 
#' @param nucmer.coords A coordinates from nucmer output. [RUN: nucmer --coords ...] 
#' @param genome.coords Set to \code{TRUE} if you want to work in genomic coordinates.
#' @param highlight.pos A set of postions to be highlighted on the x-axis.
#' @param title A character string to use as a title for the plot.
#' @param sd.track A segmental dulication track to be highlight at the plot.
#' @param shape A shape used to plot aligned sequences: Either 'segm' or 'point'.
#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object.
#' @author David Porubsky
#' @export
plotNucmerCoords <- function(nucmer.coords = NULL, genome.coord = TRUE, highlight.pos = NULL, title = NULL, sd.track = NULL, shape='segm') {
  
  ## Helper function
  remapCoord <- function(x = NULL, new.range = NULL) {
    offset <- (min(new.range) + x[1]) - 1
    dist <- cumsum(diff(x))
    new.x <- c(offset, (offset + dist))
    return(new.x)
  } 
  
  ## Read in coordinates from nucmer output
  coords <- read.table(nucmer.coords, skip=5, stringsAsFactors = FALSE)
  coords.df <- data.frame(s1.start=coords$V1,
                          s1.end=coords$V2,
                          s2.start=coords$V4,
                          s2.end=coords$V5,
                          s1.id=coords$V12,
                          s2.id=coords$V13, 
                          stringsAsFactors = FALSE)
  
  ## Translate sequence coordinates to genome-wide coordinates
  if (genome.coord) {
    s1.region <- unique(coords.df$s1.id)
    s2.region <- unique(coords.df$s2.id)
    s1.chr <- strsplit(s1.region, ":")[[1]][1]
    s1.region <- strsplit(s1.region, ":")[[1]][2]
    s2.region <- strsplit(s2.region, ":")[[1]][2]
    s1.range <- as.numeric( strsplit(s1.region, "-")[[1]] )
    s2.range <- as.numeric( strsplit(s2.region, "-")[[1]] )
    coords.df$s1.start <- remapCoord(x = coords.df$s1.start, new.range = s1.range)
    coords.df$s1.end <- remapCoord(x = coords.df$s1.end, new.range = s1.range)
    coords.df$s2.start <- remapCoord(x = coords.df$s2.start, new.range = s2.range)
    coords.df$s2.end <- remapCoord(x = coords.df$s2.end, new.range = s2.range)
  }
  
  ## Color segments by directionality
  coords.df$dir <- 'rev'
  forw.mask <- (coords.df$s1.start < coords.df$s1.end) & (coords.df$s2.start < coords.df$s2.end)
  coords.df$dir[forw.mask] <- 'forw'
  
  if (shape == 'segm') {
    ## Plot alignments
    plt <- ggplot2::ggplot(coords.df, aes(x=s1.start,xend=s1.end,y=s2.start,yend=s2.end, color=dir)) + 
      geom_segment()
  } else if (shape == 'point') {
    coords.df$s1.midpoint <- coords.df$s1.start + ((coords.df$s1.end - coords.df$s1.start)/2)
    coords.df$s2.midpoint <- coords.df$s2.start + ((coords.df$s2.end - coords.df$s2.start)/2)
    plt <- ggplot2::ggplot(coords.df, aes(x=s1.midpoint, y=s2.midpoint, color=dir)) + 
      geom_point()
  }  
  plt <- plt +  
    xlab(unique(coords.df$s1.id)) +
    ylab(unique(coords.df$s2.id)) +
    theme_bw() + 
    theme(aspect.ratio=1) + #force x and y axis to have the same proportion
    scale_color_manual(values = c('chartreuse4', 'darkgoldenrod2'))
    
  ## Highlight user defined postion on x axis
  if (!is.null(highlight.pos)) {
    plt <- plt + geom_vline(xintercept = highlight.pos, color='black', linetype = 2)
  }
  ## Highlight regions of SDs on x axis
  if (!is.null(sd.track) & genome.coord) {
    s1.gr <- GRanges(seqnames=s1.chr, ranges=IRanges(start=s1.range[1], end=s1.range[2]))
    sd.track <- subsetByOverlaps(sd.track, s1.gr)
    sd.track <- reduce(sd.track)
    if (length(sd.track) > 0) {
      sd.track.df <- as.data.frame(sd.track)
      # make sure that sd regions do not extend over x axis region
      sd.track.df$start[sd.track.df$start < start(s1.gr)] <- start(s1.gr)
      sd.track.df$end[sd.track.df$end > end(s1.gr)] <- end(s1.gr)
      
      plt <- plt + geom_rect(data=sd.track.df  ,aes(xmin=start, xmax=end, ymin=Inf, ymax=-Inf), fill='gray', alpha=0.25, inherit.aes = FALSE)
    }  
  }
  ## Highlight user defined postion on x axis
  if (!is.null(title)) {
    plt <- plt + ggtitle(title)
  }
  return(plt)
}


#' Plot read-pair coverage profile per breakpoint
#'
#' This function takes as an input aligned read-pairs in BAM and plots coverage profile per breakpoint.
#' Breakpoint positions are stored as part of a readName.
#' e.g. readName__chr1-109148195-109148196__chr1_invDup_gorilla_roi4_1
#'
#' @param plot.ambig Bamfile with aligned reads.
#' @param plot.pairs Bamfile with aligned reads.
#' @param view.range Bamfile with aligned reads.
#' @inheritParams exportBedGraph 
#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object.
#' @importFrom Rsamtools ScanBamParam scanBamFlag
#' @importFrom GenomicAlignments readGAlignmentPairs first last getDumpedAlignments
#' @importFrom tidyr separate
#' @author David Porubsky
#' @export 
#' 
plotReadPairCoverage <- function(bamfile, mapq=10, filt.flag=0, min.read.len=5000, plot.ambig=FALSE, plot.pairs=FALSE, blacklist=NULL, view.range=100000) {
  
  filename <- basename(bamfile)
  
  ## Load paired reads from BAM
  suppressWarnings( data.raw <- GenomicAlignments::readGAlignmentPairs(bamfile, param=Rsamtools::ScanBamParam(what=c('mapq', 'flag', 'qname'), flag=scanBamFlag(isDuplicate=FALSE))) ) 
  data.first <- as(GenomicAlignments::first(data.raw), 'GRanges')
  data.last <- as(GenomicAlignments::last(data.raw), 'GRanges')
  
  ## Plot ambiguous read pairs if TRUE
  if (plot.ambig) {
    data.dumped <- GenomicAlignments::getDumpedAlignments()
    data.dumped.gr <- as(data.dumped, 'GRanges')
  }  
  
  ## Filter by mapq
  if (mapq > 0) {
    mask <- data.first$mapq >= mapq &  data.last$mapq >= mapq
    data.first <- data.first[mask]
    data.last <- data.last[mask]
  }  
  ## Filter by flag
  if (filt.flag > 0) {
    bit.flag.first <- bitwAnd(filt.flag, data.first$flag)
    bit.flag.last <- bitwAnd(filt.flag, data.last$flag)
    mask <- bit.flag.first == 0 & bit.flag.last == 0
    data.first <- data.first[mask]
    data.last <- data.last[mask]
  }  
  ## Filter by read length
  mask <- width(data.first) >= min.read.len & width(data.last) >= min.read.len
  data.first <- data.first[mask]
  data.last <- data.last[mask]
  ## Filter out blacklisted regions
  if (!is.null(blacklist)) {
    hits.first <- findOverlaps(query = data.first, subject = blacklist)
    hits.last <- findOverlaps(query = data.last, subject = blacklist)
    filt <- c(queryHits(hits.first), queryHits(hits.last))
    if (length(filt) > 0) {
      data.first <- data.first[-filt]
      data.last <- data.last[-filt]
    }
  }  
  
  ## Create final data object
  data <- data.first
  data$mate <- data.last
  
  ## Get breakpoint ID
  data$ID <- sapply(data$qname, function(x) strsplit(x, "__")[[1]][2])
  breakpoints <- data.frame(break.id = unique(data$ID))
  ## Convert breakpoints to data.frame
  breakpoints.df <- tidyr::separate(breakpoints, break.id, sep = "-", into = c('chr','start','end'), convert = TRUE)
  
  ## Get region boundaries
  region.start <- min(breakpoints.df$start)
  region.end <- max(breakpoints.df$end)
  if (view.range > 0) {
    x.min <- region.start - view.range
    x.max <-region.end + view.range
  }
  
  ## Plot coverage profile per breakpoint
  data.grl <- split(data, data$ID)
  plt.data <- list()
  for (i in seq_along(data.grl)) {
    gr <- data.grl[[i]]
    ID <- unique(gr$ID)
    cov <- coverage(gr)
    cov.gr <- as(cov, 'GRanges')
    cov.gr$ID <- ID
    plt.data[[ID]] <- as.data.frame(cov.gr)
  }
  plt.df <- do.call(rbind, plt.data)
  
  ## Set random RGB colors
  #color.rgb <- sapply(1:length(data.grl), function(x) round(runif(3, 0, 255)))
  #color.rgb <- apply(color.rgb, 2, function(x) rgb(x[1], x[2], x[3], maxColorValue = 255))
  
  plots <- list()
  ## Prepare title
  title <- ggdraw() + draw_label(filename, fontface='bold')
  plots[[length(plots)+1]] <- title
  
  plt <- ggplot(plt.df) +
    geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=score, fill=ID)) + 
    geom_vline(xintercept = c(breakpoints.df$start)) +
    coord_cartesian(xlim = c(x.min, x.max)) +
    scale_fill_manual(values = brewer.pal(n = max(3, length(data.grl)), name = "Set1")) +
    theme(legend.position="bottom") +
    ggtitle("Breakpoint coverage")
  plots[[length(plots)+1]] <- plt
  
  if (plot.pairs) {
    plt.df <- as.data.frame(data)
    plt.df$level <- unlist(sapply(table(plt.df$ID), function(x) 1:x), use.names = FALSE)
    
    plt <- ggplot(plt.df) + 
      geom_segment(aes(x=start, xend=mate.start, y=level, yend=level), size=0.25) +
      geom_point(aes(x=start, y=level, color=strand), shape=108) +
      geom_point(aes(x=mate.start, y=level, color=mate.strand), shape=108) +
      geom_vline(xintercept = c(breakpoints.df$start)) +
      coord_cartesian(xlim = c(x.min, x.max)) + 
      theme(legend.position="bottom",
            axis.title.y=element_blank(),
            axis.text.y=element_blank(),
            axis.ticks.y=element_blank()) +
      ggtitle("Read-pair links") +
      xlab('')
    plots[[length(plots)+1]] <- plt
  }
  
  if (plot.ambig) {
    data.dumped.cov <- coverage(data.dumped.gr)
    data.dumped.cov.gr <- as(data.dumped.cov, "GRanges")
    plt.df <- as.data.frame(data.dumped.cov.gr)
    
    plt <- ggplot(plt.df) +
      geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=score, fill=ID), fill='orange') + 
      geom_vline(xintercept = c(breakpoints.df$start)) +
      coord_cartesian(xlim = c(x.min, x.max)) +
      theme(legend.position="bottom") +
      xlab('Genomic position (bp)') +
      ggtitle("Coverage of ambiguous read pairs")
    plots[[length(plots)+1]] <- plt
  }
  
  heights <- c(0.1, rep(1, length(plots)-1))
  
  final.plt <- cowplot::plot_grid(plotlist = plots, ncol = 1, rel_heights = heights)
  return(final.plt)
}


#' Plot BAM alignments around user defined genomic regions.
#'
#' @param bamfile Bamfile with aligned reads.
#' @param regions A \code{\link{GRanges-class}} object with genomic regions to process.
#' @param file A filename where final plot should be exported.
#' @return A \code{ggplot-class} object.
#' @importFrom dplyr group_by summarise "%>%"
#' @importFrom Rsamtools scanBamHeader ScanBamParam scanBamFlag
#' @importFrom GenomicAlignments readGAlignments cigarRangesAlongReferenceSpace cigarToRleList
#' @author David Porubsky
#' @export
#' 
plotAlignmentsPerRegion <- function(bamfile=NULL, regions=NULL, file=NULL) {
  ## Load BAM file
  message("Reading BAM file: ", basename(bamfile))
  data.raw <- GenomicAlignments::readGAlignments(bamfile, param=Rsamtools::ScanBamParam(what=c('cigar', 'mapq', 'flag', 'qname'), flag=Rsamtools::scanBamFlag(isDuplicate=FALSE)))
  frags <- as(data.raw, 'GRanges')
  
  ## Go over user defined genomic location and plot BAM alignments at these regions
  plots <- list()
  for (i in seq_along(regions)) {
    invDup.region <- regions[i]
    regions.ID <- as.character(invDup.region)
    message("    Working on region: ", regions.ID)
    ## Select fragments from a genomic region
    frags.region <- IRanges::subsetByOverlaps(frags, invDup.region)
    if (length(frags.region) > 0) {
      ## Parse cigar string
      cigar.iranges <- GenomicAlignments::cigarRangesAlongReferenceSpace(frags.region$cigar, flag = frags.region$flag, pos = start(frags.region))
      cigarRle <- GenomicAlignments::cigarToRleList(frags.region$cigar)
      ## Convert parsed cigar to GRanges
      cigar.gr <- GenomicRanges::GRanges(seqnames=as.character(seqnames(frags.region[1])), ranges=unlist(cigar.iranges))
      cigar.gr$cigar <- unlist(runValue(cigarRle))
      #qname.id <- sapply(frags.region$qname, function(x) strsplit(x, "\\.")[[1]][3])
      cigar.gr$qname <- factor(rep(frags.region$qname, lengths(cigar.iranges)), levels = unique(frags.region$qname))
      
      ## Restrict plotted region to the size of genomic regions of interest to right and left
      lookup.region <- primatR::resizeRanges(invDup.region, times = 1, bsgenome = BSgenome.Hsapiens.UCSC.hg38)
      cigar.gr <- IRanges::subsetByOverlaps(cigar.gr, lookup.region)
      cigar.grl <- split(cigar.gr, cigar.gr$qname)
      
      ## Construct plot ##
      ## Plot BAM alignements
      cigar.gr.df <- as.data.frame(cigar.gr)
      cigar.gr.df$level <- rep(1:length(unique(cigar.gr$qname)), lengths(cigar.grl))
      level.offset <- max(cigar.gr.df$level) + 1.5
      plt <- ggplot() + geom_rect(data=cigar.gr.df, aes(xmin=start, xmax=end, ymin=level, ymax=level+1, fill=cigar)) +
        scale_fill_manual(values = brewer.pal(n = 9, name = 'Set1'), drop=FALSE) +
        theme_bw() +
        ylab("") +
        xlab("Genomic Position (bp)") +
        ggtitle(regions.ID)
      ## Annotate read names
      text.label <- cigar.gr.df %>% group_by(qname) %>% summarise(x=min(start))
      text.label$y <- unique(cigar.gr.df$level)
      plt <- plt + geom_text(data=text.label, aes(x=x, y=y, label=qname), hjust=0, vjust=-0.5, inherit.aes = FALSE)
      ## Plot min & max range as gray bar
      max.region <- data.frame(start=min(c(start(cigar.gr), start(invDup.region))), 
                               end=max(c(end(cigar.gr), end(invDup.region))), y=level.offset)
      plt <- plt + geom_rect(data=max.region, aes(xmin=start, xmax=end, ymin=y+0.25, ymax=y+0.75), fill='gray', inherit.aes = FALSE)
      ## Plot region of interest as black bar
      invDup.region.df <- as.data.frame(invDup.region)
      invDup.region.df$y <- level.offset
      plt <- plt + geom_rect(data=invDup.region.df, aes(xmin=start, xmax=end, ymin=y, ymax=y+1), fill='black', inherit.aes = FALSE)
      plots[[length(plots) + 1]] <- plt
    } else {
      message("    No fragments assembled for this region!!!")
    }
  }
  if (!is.null(file)) {
    ## Export final plots in PDF
    message("Printing to PDF ...")
    grDevices::pdf(file, width=10, height=4)
    bquiet = lapply(plots, print)
    d <- grDevices::dev.off()
  }
  return(plots)
  message("DONE!!!")
}


#' Plot Strand-seq BAM alignments within a user defined set of genomic regions.
#'
#' @param bamfile Bamfile with aligned reads.
#' @param regions A \code{\link{GRanges-class}} object with genomic regions to process.
#' @inheritParams bam2GRanges
#' @return A \code{list} of \code{ggplot-class} object.
#' @author David Porubsky
#' @export
#' 
plotSSEQReadsPerRegion <- function(bamfiles=NULL, regions=NULL, min.mapq=10, pairedEndReads = TRUE) {
  ## Loop over each region
  bam.plots <- list()
  for (i in seq_along(bamfiles)) {
    ## Load BAM file
    bamfile <- bamfiles[i]
    bam.id <- basename(bamfile)
    message("Reading BAM file: ", basename(bamfile))
    frags <- primatR::bam2GRanges(bamfile = bamfile, region = regions, pairedEndReads = pairedEndReads, min.mapq = min.mapq)
    
    ## Filter by mapping quality
    if (min.mapq > 0) {
      frags <- frags[frags$mapq >= min.mapq]
    }
    
    region.plots <- list()
    for (j in seq_along(regions)) {
      region <- regions[j]
      region.id <- as.character(region)
      message('    Plotting region: ', region.id)
      region.frags <- IRanges::subsetByOverlaps(frags, region)
      region.frags.plus <- region.frags[strand(region.frags) == '+']
      region.frags.minus <- region.frags[strand(region.frags) == '-']
      region.frags.plus$level <- GenomicRanges::disjointBins(region.frags.plus)
      region.frags.minus$level <- GenomicRanges::disjointBins(region.frags.minus) * -1
      
      region.df <- as.data.frame(region)
      frags.df <- as.data.frame(c(region.frags.plus, region.frags.minus))
      
      plt <- ggplot2::ggplot() + 
        geom_linerange(data=region.df, aes(x=0, ymin=start, ymax=end), size=2, color='gray') + 
        geom_point(data=frags.df, aes_string(x='level', y='start', color='strand'), size=2, position = position_jitter(height = 1), alpha=0.5) + 
        scale_color_manual(values = c("paleturquoise4","sandybrown")) + 
        scale_y_continuous(name = region.id, labels = comma) +
        scale_x_continuous("Watson | Crick") +
        coord_flip() + 
        theme(strip.text.y = element_text(angle = 360)) +
        theme_minimal() +
        ggtitle(bam.id)
      region.plots[[j]] <- plt
    }
    bam.plots[[i]] <- region.plots
  }
  return(bam.plots)
}
daewoooo/primatR documentation built on March 28, 2024, 6:41 a.m.