#' @export
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom BiocParallel bpmapply bpisup bpstart bpstop SerialParam
correlateReads <- function(bam.files, max.dist=1000, cross=TRUE, param=readParam(), BPPARAM=SerialParam())
# Calculates the cross-correlation between reads of different strands (or autocorrelations between reads in general)
# Note that the BAM files must be sorted, and usually duplicate reads should be removed for best performance.
# I haven't used R's native acf/ccf() function because it doesn't handle the large inputs from BAM efficiently.
#
# written by Aaron Lun
# created 2 July 2012
{
max.dist <- as.integer(max.dist)
if (max.dist <=0) {
stop("maximum distance must be positive")
}
total.cor <- numeric(max.dist+1L)
total.read.num <- 0L
if (!bpisup(BPPARAM)) {
bpstart(BPPARAM)
on.exit(bpstop(BPPARAM))
}
bam.files <- .make_BamFiles(bam.files)
extracted.chrs <- .activeChrs(bam.files, param$restrict)
for (i in seq_along(extracted.chrs)) {
chr <- names(extracted.chrs)[i]
where <- GRanges(chr, IRanges(1L, extracted.chrs[i]))
total.len <- extracted.chrs[i] + 1L # Length of the conceptual vector to compute the correlations.
if (total.len < 2L) { # No way to compute variance if the vector's too small.
next
}
# Reading in the reads for the current chromosome for all the BAM files.
bp.out <- bpmapply(FUN=.correlate_reads, bam.file=bam.files,
MoreArgs=list(where=where, param=param, total.len=total.len),
BPPARAM=BPPARAM, SIMPLIFY=FALSE)
all.f <- lapply(bp.out, "[[", "forward")
all.r <- lapply(bp.out, "[[", "reverse")
if (!cross) {
all.f <- mapply("c", all.f, all.r, SIMPLIFY=FALSE)
}
forward.reads <- sum(vapply(bp.out, FUN="[[", i="nforward", FUN.VALUE=0L))
num.reads <- forward.reads + sum(vapply(bp.out, FUN="[[", i="nreverse", FUN.VALUE=0L))
# Assembling RLEs (with some protection from empties). We need reads for any correlation and
# reads on both strands to get cross-correlations. If we're doing cross-correlations, then
# we compare between strands; if we're doing autocorrelations, we compare within all reads.
if (num.reads==0L) {
next
}
if (cross && (forward.reads==0L || forward.reads==num.reads)) { # correlations undefined, so they don't contribute to total.read.num.
next
}
all.f <- rle(sort(do.call(c, all.f)))
if (cross) {
all.r <- rle(sort(do.call(c, all.r)))
} else {
all.r <- all.f
}
# We call the C++ function to compute correlations.
ccfs <- .Call(cxx_correlate_reads, all.f$values, all.f$lengths, all.r$values, all.r$lengths, max.dist, total.len)
# Returning some output. Note that the coefficient is weighted according to the number
# of reads on each chromosome, as described in Kharchenko et al. (2008).
total.read.num <- total.read.num + num.reads
total.cor <- total.cor + ccfs*num.reads
}
# Cleaning up and returning the correlations.
if (total.read.num) {
total.cor <- total.cor/total.read.num
}
return(total.cor)
}
.correlate_reads <- function(bam.file, where, param, total.len) {
if (param$pe=="both") {
reads <- .extractPE(bam.file, where=where, param=param, with.reads=TRUE)
} else {
reads <- .extractSE(bam.file, where=where, param=param)
}
forward.pos <- reads$forward$pos
forward.pos[forward.pos < 1L] <- 1L
reverse.pos <- reads$reverse$pos + reads$reverse$qwidth
reverse.pos[reverse.pos > total.len] <- total.len
num.reads <- length(forward.pos)+length(reverse.pos)
forward.reads <- length(forward.pos)
return(list(forward=forward.pos, reverse=reverse.pos,
nforward=length(forward.pos), nreverse=length(reverse.pos)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.