R/PlotsDepthOfCoverage.R

Defines functions plotOverallCovHistogram plotDepthOfCoverageMegablock

Documented in plotDepthOfCoverageMegablock plotOverallCovHistogram

#' plot a tiled panel of chromosomal depths-of-coverage
#'
#' This method will display depths-of-coverage for the chromosomes contained in
#' the provided parsedBamFile
#' container
#'
#' @import ggplot2
#' @importFrom stats sd
#' @import RColorBrewer
#' @return ggplot2 image
#' @param coverageData data.frame of coverage data
#' @param colMax defines the number of columns
#' @param ... for downstream tuning
#'
#' @examples
#' demoBam <- system.file("extdata",
#'     "Ecoli_zymo_R10_filt_subs.bam",
#'     package = "nanopoRe")
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' bamSummarise(demoBam, blockSize=1000L)
#' plotDepthOfCoverageMegablock()
#'
#' @export
plotDepthOfCoverageMegablock <- function(coverageData=NULL, colMax = 4, ...) {

    if (is.null(coverageData)) {
        coverageData <- bamSummaryToCoverage(...)
    }

    coverageDF <- as.data.frame(coverageData, stringsAsFactors = FALSE)
    coverageDF$seqnames <- as.character(coverageDF$seqnames)

    suppressWarnings(posMatrix <- matrix(gtools::mixedsort(unique(
        coverageDF$seqnames)), ncol = colMax, byrow = TRUE))
    # data may be recycled ... remove duplicate values ...
    posMatrix[which(duplicated(posMatrix[seq(nrow(posMatrix) *
        ncol(posMatrix))]))] <- NA
    coverageDF <- cbind(coverageDF, as.data.frame(matrix(unlist(lapply(
        coverageDF$seqnames, function(x) { which(posMatrix == x, arr.ind = TRUE)
        })), ncol = 2, byrow = TRUE, dimnames = list(c(), c("row", "col")))))

    meanCov <- mean(coverageDF$binned_cov, na.rm = TRUE)

    coverageDF$start <- (as.numeric(coverageDF$start - 1))/1e+06
    # set ylim to mean + 2stdev?
    ylimit <- mean(coverageDF$binned_cov, na.rm = TRUE) + 2 * stats::sd(
        coverageDF$binned_cov, na.rm = TRUE)

    plotLegend <- paste("chr", gtools::mixedsort(unique(coverageDF$seqnames)))
    plotCols <- ceiling(length(plotLegend)/colMax)
    legendDF <- data.frame(x = Inf, y = Inf, lab = plotLegend, row = unlist(
        lapply(seq_len(plotCols), rep, times = colMax))[seq_along(plotLegend)],
        col = rep(seq(1, colMax), length.out = length(plotLegend)))

    suppressWarnings(megadepthplot <- ggplot(coverageDF, aes_string(
        "start", "binned_cov")) + geom_hline(yintercept = meanCov, size = 0.3,
        colour = "red") + geom_line(colour = brewer.pal(6, "Paired")[2]) +
        facet_grid(rows = vars(row), cols = vars(col), shrink = TRUE) +
        coord_cartesian(ylim=c(0, ylimit))+scale_x_continuous(labels = comma) +
        theme(axis.text.x = element_text(angle = 90, hjust = 1), strip.text.y =
        element_blank(), strip.text.x = element_blank()) + xlab(
        "Position along chromosome (Mb)") + ylab("Depth of Coverage (X)") +
        labs(title =
        "Plot showing depth of coverage vs position for chromosomes mapped") +
        geom_text(aes_string("x", "y", label = "lab"), data=legendDF, vjust=1,
        hjust = 1, size = 3.5) + theme(plot.title = element_text(size = 11)))

    return(megadepthplot)
}




#' plot a histogram of whole genome depth-of-coverage
#'
#' This method will plot a histogram of whole genome depth-of-coverage
#'
#' @importFrom utils head
#' @param bamFile is the path to the bamFile to plot
#' @param segments number of segments to split data into
#' @param ... such as FORCE=TRUE
#' @return ggplot2 plot
#'
#' @examples
#' demoBam <- system.file("extdata",
#'     "Ecoli_zymo_R10_filt_subs.bam",
#'     package = "nanopoRe")
#' referenceFasta <- system.file("extdata",
#'     "Escherichia_coli_complete_genome.fasta",
#'     package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' bamSummarise(demoBam, blockSize=1000L)
#' plotOverallCovHistogram()
#'
#' @export
plotOverallCovHistogram <- function(bamFile=NULL, segments = 50, ...) {

    coverage <- bamSummaryToCoverage(bamFile, ...)

    meanCov <- mean(coverage$binned_cov, na.rm = TRUE)

    coverageMatrix <- data.frame(table(coverage$binned_cov))
    coverageMatrix$bases <- as.numeric(coverageMatrix$Freq *
        mean(width(coverage)))
    coverageMatrix[, 1] <- as.numeric(levels(
        coverageMatrix[, 1]))[coverageMatrix[, 1]]
    # mask the regions with unmapped reads
    toRemove <- which(coverageMatrix[, 1] == 0)
    if (length(toRemove)>0) {
        coverageMatrix <- coverageMatrix[-toRemove, ]
    }
    breaks <- seq(0, 2 * meanCov, length.out = segments)
    binAssignments <- cut(
        coverageMatrix$Var1, breaks, include.lowest = TRUE, right = FALSE)

    scrapeBinnedBases <- function(level) {
        sum(coverageMatrix[which(binAssignments == level), "bases"])
    }
    binnedData <- unlist(lapply(levels(binAssignments), scrapeBinnedBases))
    binnedDf <- data.frame(coverage=utils::head(breaks, -1), bases=binnedData)


    plot <- ggplot(binnedDf, aes_string(x = "coverage", y = "bases")) +
        geom_vline(xintercept = meanCov, size = 0.3, colour = "red") +
        geom_bar(stat = "identity", fill = brewer.pal(6, "Paired")[2]) +
        scale_y_continuous(labels = comma) + labs(title =
        "Histogram showing distribution of depth-of-coverage across genome") +
        xlab("Coverage (x)") + ylab("Number of bases") +
        theme(plot.title = element_text(size = 11))

    return(ggplot2handler(plot))
}
sagrudd/nanopoRe documentation built on June 7, 2020, 10:20 p.m.