#' extract content from a BAM file
#'
#' This method will extract a block of BAM content from a given BAM file
#'
#' @param bamFile is the location to the BAM file to parse
#' @param yieldSize is the number of mapping observations to extract
#' @return list of summary observations
#'
#' @examples
#' # define the path to a BAM file
#' demoBam <- system.file("extdata",
#' "Ecoli_zymo_R10_filt_subs.bam",
#' package = "nanopoRe")
#' bamdata <- testBam(demoBam, 10)
#'
#' @export
testBam <- function(bamFile, yieldSize = 100L) {
bam = open(BamFile(bamFile, yieldSize = yieldSize))
what = c("qname", "flag", "rname", "strand", "pos", "qwidth", "mapq",
"qual", "cigar")
param = ScanBamParam(what = what, tag = c("NM", "MD"))
reads = scanBam(bam, param = param)[[1L]]
close(bam)
return(reads)
}
#' summarise mapping observations from a BAM file by chromosome
#'
#' This method will parse a BAM file and summarise the mapping observations in a
#' way that can be used for preparation of figures and confidence plots by
#' the nanopoRe package
#'
#' @usage bamSummariseByChr(chrId,
#' bamFile,
#' force = FALSE,
#' blockSize = 10000L,
#' index=NULL
#' )
#' @importFrom Rsamtools ScanBamParam
#' @importFrom Rsamtools BamFile
#' @importFrom Rsamtools scanBam
#' @param chrId is the chromosome to parse
#' @param bamFile is the location to the BAM file to parse
#' @param force logical value describing whether the analysis should be force
#' recalculated
#' @param blockSize the number of reads to process at the time as iterating
#' through
#' @param index path to the BAI index file - should be automatic in most cases
#' @return data.frame of per read summary observations
#'
#' @examples
#' # load reference genome
#' init()
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' # load reference BAM
#' demoBam <- system.file("extdata",
#' "Ecoli_zymo_R10_filt_subs.bam",
#' package = "nanopoRe")
#' # Rsamtools::BamFile has problems in picking up packaged .bam.bai
#' # we will specify it here rather then automatic detection
#' demoBamIdx <- system.file("extdata",
#' "Ecoli_zymo_R10_filt_subs.bam",
#' package = "nanopoRe")
#' bamSummariseByChr(chrId=getChromosomeIds()[1],
#' bamFile=demoBam,
#' blockSize=100L,
#' index=demoBamIdx)
#'
#' @export
bamSummariseByChr <- function(
chrId, bamFile, force = FALSE, blockSize = 10000L, index=NULL) {
bamSummaryResults <- file.path(getRpath(), paste0(sub("\\.[^.]*$", "",
basename(bamFile)), ".bamMetrics.chr", chrId, ".Rdata"))
message(paste0("targetRdata ", bamSummaryResults, "\n"))
if (file.exists(bamSummaryResults) & !force) {
return(readRDS(file = bamSummaryResults))
}
count <- 0
bamInfo <- data.frame()
bam <- NULL
if (!is.null(index)) {
bam = open(BamFile(bamFile, yieldSize = blockSize, index=index))
} else {
bam = open(BamFile(bamFile, yieldSize = blockSize))
}
what = c("qname", "flag", "rname", "strand", "pos", "qwidth", "mapq",
"qual", "cigar")
param = ScanBamParam(
which = GRanges(seqnames = chrId, ranges =
IRanges(start = 1, end = as.numeric(getSeqLengths(chrId)))),
what = what, tag = c("NM", "MD"))
repeat {
reads = scanBam(bam, param = param)[[1L]]
if (length(reads$qname) == 0L)
break
count = count + length(reads$qname)
bamInfo <- rbind(bamInfo, processBamChunk(reads))
}
close(bam)
saveRDS(bamInfo, file = bamSummaryResults, compress=FALSE)
return(bamInfo)
}
#' summarise mapping observations from a BAM file using parallel chr-by-chr
#'
#' This method will parse a BAM file and summarise the mapping observations in a
#' way that can be used for preparation of figures and confidence plots by
#' the nanopoRe package. This method is multithreaded and REQUIRES access to the
#' reference genome.
#'
#' @importFrom parallel detectCores
#' @importFrom pbmcapply pbmclapply
#' @usage parallelBamSummarise(bamFile, force = FALSE, blockSize=10000L,
#' mc.cores = min(parallel::detectCores() - 1, 24))
#' @param bamFile is the location to the BAM file to parse
#' @param force logical value describing whether the analysis should be
#' force recalculated
#' @param blockSize describes the size of BAM chunk to parse
#' @param mc.cores number of threads to use for the process
#' @return data.frame of per read summary observations
#'
#' @examples
#' demoBam <- system.file("extdata",
#' "Ecoli_zymo_R10_filt_subs.bam",
#' package = "nanopoRe")
#' referenceGenome <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceGenome)
#' loadReferenceGenome()
#' bamSummary <- parallelBamSummarise(
#' demoBam, force=FALSE, blockSize=10000L, mc.cores=2)
#'
#' @export
parallelBamSummarise <- function(
bamFile, force = FALSE, blockSize = 10000L, mc.cores = min(
parallel::detectCores() - 1, 24)) {
bamSummaryResults <- file.path(
getRpath(), paste0(
sub("\\.[^.]*$", "", basename(bamFile)), ".bamMetrics", ".Rdata"))
bamInfo <- NULL # the result container ...
if (file.exists(bamSummaryResults) && !force) {
#return(getCachedFileObject("BamTargetData", bamSummaryResults))
bamInfo <- readRDS(file = bamSummaryResults)
} else {
message(paste0("targetRdata ", bamSummaryResults, "\n"))
mcharv <- pbmclapply(
getChromosomeIds(), bamSummariseByChr, bamFile=bamFile, force=force,
blockSize=blockSize, mc.cores = mc.cores, mc.preschedule = FALSE,
mc.silent = FALSE)
bamInfo <- data.frame()
for (chr in getChromosomeIds()) {
bamInfo <- rbind(bamInfo, bamSummariseByChr(chr, bamFile))
}
saveRDS(bamInfo, file = bamSummaryResults, compress=FALSE)
}
setCachedObject("bamfile", new.env())
assign("bamInfo", bamInfo, envir = getCachedObject("bamfile"))
return(invisible(bamInfo))
}
#' summarise mapping observations from a BAM file
#'
#' This method will parse BAM file and summarise the mapping observations in a
#' way that can be used for preparation of figures and confidence plots by
#' the nanopoRe package. This is single threaded and DOES NOT require access to
#' reference genome.
#'
#' @importFrom Rsamtools ScanBamParam
#' @importFrom Rsamtools BamFile
#' @importFrom Rsamtools scanBam
#' @param bamFile is the location to the BAM file to parse
#' @param force logical value of whether analysis should be force recalculated
#' @param blockSize the number of reads to process in chunks
#' @return data.frame of per read summary observations
#'
#' @examples
#' demoBam <- system.file("extdata",
#' "Ecoli_zymo_R10_filt_subs.bam",
#' package = "nanopoRe")
#' bamSummary <- bamSummarise(demoBam, force=FALSE, blockSize=1000L)
#'
#' @export
bamSummarise <- function(bamFile, force = FALSE, blockSize = 50000L) {
bamSummaryResults <- file.path(
getRpath(), paste0(
sub("\\.[^.]*$", "", basename(bamFile)), ".bamMetrics", ".Rdata"))
bamInfo <- NULL # the result container ...
if (file.exists(bamSummaryResults) && !force) {
#return(getCachedFileObject("BamTargetData", bamSummaryResults))
bamInfo <- readRDS(file = bamSummaryResults)
} else {
count <- 0
bamInfo <- data.frame()
bam = open(BamFile(bamFile, yieldSize = blockSize))
what = c("qname", "flag", "rname", "strand", "pos", "qwidth", "mapq",
"qual", "cigar")
param = ScanBamParam(what = what, tag = c("NM", "MD"))
repeat {
reads = scanBam(bam, param = param)[[1L]]
if (length(reads$qname) == 0L)
break
count = count + length(reads$qname)
bamInfo <- rbind(bamInfo, processBamChunk(reads))
}
close(bam)
saveRDS(bamInfo, file = bamSummaryResults, compress=FALSE)
}
# and save the object into memory
setCachedObject("bamfile", new.env())
assign("bamInfo", bamInfo, envir = getCachedObject("bamfile"))
return(invisible(bamInfo))
}
getReadFlags <- function(bamChunk, flagMatrix) {
readFlag <- factor(rep("Primary", length(bamChunk$flag)), levels =
c("Primary", "Secondary", "Supplementary", "Unmapped"))
flagMatrix <- as.data.frame(bamFlagAsBitMatrix(as.integer(bamChunk$flag)))
readFlag[which(flagMatrix$isSecondaryAlignment == 1)] <- "Secondary"
readFlag[which(flagMatrix$isSupplementaryAlignment == 1)] <- "Supplementary"
readFlag[which(flagMatrix$isUnmappedQuery == 1)] <- "Unmapped"
return(readFlag)
}
#' @import Rsamtools
#' @importFrom GenomicAlignments cigarRangesAlongPairwiseSpace
#' @importFrom GenomicAlignments cigarRangesAlongQuerySpace
#' @importFrom GenomicAlignments cigarRangesAlongReferenceSpace
#' @importFrom IRanges width
processBamChunk <- function(bamChunk) {
bamChunk <- as.data.frame(bamChunk)
# separate out unmapped reads
flagMatrix <- as.data.frame(bamFlagAsBitMatrix(as.integer(bamChunk$flag)))
unmappedChunk <- bamChunk[which(flagMatrix$isUnmappedQuery == 1), ]
bamChunk <- bamChunk[which(flagMatrix$isUnmappedQuery == 0), ]
# simplify the read flags ...
readFlag <- getReadFlags(bamChunk, flagMatrix)
# calculate aln_len for alignment
ref_aln_len <- unlist(width(cigarRangesAlongReferenceSpace(bamChunk$cigar,
reduce.ranges = TRUE)))
endpos <- bamChunk$pos + ref_aln_len
# mean readq
readq <- unlist(lapply(as.character(bamChunk$qual), qualToMeanQ))
# get the actual query bases mapped ...
q_aln_len <- unlist(
IRanges::width(cigarRangesAlongQuerySpace(
bamChunk$cigar, after.soft.clipping = TRUE, reduce.ranges = TRUE)))
q_ins_len <- sum(IRanges::width(
cigarRangesAlongPairwiseSpace(bamChunk$cigar, ops = c("I"))))
q_del_len <- sum(IRanges::width(
cigarRangesAlongPairwiseSpace(bamChunk$cigar, ops = c("D"))))
q_match_len <- sum(IRanges::width(
cigarRangesAlongPairwiseSpace(bamChunk$cigar, ops = c("M"))))
q_mm_len <- bamChunk$tag.NM - q_ins_len - q_del_len
coverage = q_aln_len/bamChunk$qwidth
accuracy = (q_match_len - q_mm_len)/(q_match_len + q_ins_len + q_del_len)
identity = (q_match_len - q_mm_len)/(q_match_len)
parsed <- data.frame(qname=bamChunk$qname, readFlag=readFlag,
rname=bamChunk$rname, strand = bamChunk$strand, start = bamChunk$pos,
qwidth = bamChunk$qwidth, end = endpos, mapq = bamChunk$mapq,
readq = readq, coverage=coverage, accuracy=accuracy, identity=identity)
if (nrow(unmappedChunk) > 0) {
unmapParse <- data.frame(qname=unmappedChunk$qname,
readFlag = factor(rep("Unmapped", length(unmappedChunk$flag)),
levels = c("Primary", "Secondary", "Supplementary", "Unmapped")),
rname = NA, strand = factor(rep("*", length(unmappedChunk$strand)),
levels = c("+", "-", "*")), start = NA, end = NA,
qwidth = nchar(unmappedChunk$qual), mapq = NA,
readq = unlist(lapply(as.character(unmappedChunk$qual),
qualToMeanQ)), coverage = NA, accuracy = NA, identity = NA)
parsed <- rbind(parsed, unmapParse)
}
return(parsed)
}
#' get depth of coverage information from across the genome
#'
#' This method will return a tiled Granges object containing mean depth of
#' coverage information
#'
#' @usage bamSummaryToCoverage(bamFile=NULL,
#' tilewidth = 1e+05,
#' blocksize = 10000,
#' flag = 'Primary',
#' ...
#' )
#' @param bamFile - path to the bamFile to use
#' @param tilewidth - the size of the window to use for the tiling
#' @param blocksize to use for parsing the BAM file
#' @param flag mapping type for filter (Primary/Secondary/Supplementary)
#' @param ... such as FORCE=TRUE
#' @return GRanges object with mean depth of coverage data in binned_cov field
#'
#' @examples
#' demoBam <- system.file("extdata",
#' "Ecoli_zymo_R10_filt_subs.bam",
#' package = "nanopoRe")
#' referenceFasta <- system.file("extdata",
#' "Escherichia_coli_complete_genome.fasta",
#' package = "nanopoRe")
#' setReferenceGenome(referenceFasta)
#' bamSummarise(demoBam, blockSize=1000L)
#' bamSummaryToCoverage()
#'
#' @export
bamSummaryToCoverage <- function(
bamFile=NULL, tilewidth = 1e+05, blocksize = 10000, flag = "Primary", ...) {
bamSummary <- NULL
if (is.null(bamFile)) {
bamSummary <- get("bamInfo", envir = getCachedObject("bamfile"))
} else {
bamSummary <- bamSummarise(bamFile, blockSize=blocksize, ...)
}
primary <- bamSummary[which(bamSummary$readFlag == flag), ]
# depending on the genome used there may be a load of warnings here this is
# likely due to reads mapping beyond segment boundaries - warnings are
# masked here since they are expected
suppressWarnings(
grdata <- GRanges(seqnames = primary$rname, ranges = IRanges(
start = primary$start, end = primary$end),
strand = primary$strand,
seqlengths = getSeqLengths(levels(primary$rname))))
mapCoverage <- coverage(grdata)
bins <- tileGenome(
seqlengths(grdata), tilewidth=tilewidth, cut.last.tile.in.chrom = TRUE)
return(binnedAverage(bins, mapCoverage, "binned_cov"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.