# Functional Genomics Center Zurich
# This code is distributed under the terms of the GNU General
# Public License Version 3, June 2007.
# The terms are available here: http://www.gnu.org/licenses/gpl.html
# www.fgcz.ch
### Filter ATAC-seq bam file; process by sample;
#### 1. remove duplicates
#### 2. chrM removal
#### 3. low mapping quality filtering (< 10)
#### 4. shift the 5' start
atacBamProcess <- function(input = NA, output = NA, param = NA) {
require(GenomicAlignments)
require(Rsamtools)
require(rtracklayer)
## if output is not an EzDataset, set it!
if (!is(output, "EzDataset")) {
output <- input$copy()
output$setColumn("BAM", paste0(
getwd(), "/", input$getNames(),
"_processed.bam"
))
output$setColumn("BAI", paste0(
getwd(), "/", input$getNames(),
"_processed.bam.bai"
))
output$dataRoot <- NULL
}
bamFile <- input$getFullPaths("BAM")
## For now, we only have paired-end ATAC-seq
stopifnot(param$paired)
## remove the duplicates in bam
noDupBam <- tempfile(pattern = "nodup_", tmpdir = ".", fileext = ".bam")
message("Remove duplicates...")
if(param$removeDuplicates){
dupBam(inBam = bamFile, outBam = noDupBam, operation = "remove", ram = param$ram)
} else{
outBam = noDupBam
file.copy(from=bamFile, to=outBam, overwrite=TRUE)
Rsamtools::indexBam(outBam)
}
noDupNoMTNOLowBam <- basename(output$getColumn("BAM"))
filteroutBam(
inBam = noDupBam, outBam = noDupNoMTNOLowBam,
cores = param$cores, chrs = c("M", "MT", "chrM", "chrMT"), mapQ = 10
)
file.remove(c(noDupBam, paste0(noDupBam, ".bai")))
if (param$shiftATAC) {
require(ATACseqQC)
what <- c("qname", "flag", "mapq", "isize", "seq", "qual", "mrnm")
tags <- c("AS", "XN", "XM", "XO", "XG", "NM", "MD", "YS", "YT")
flag <- scanBamFlag(
isSecondaryAlignment = FALSE,
isUnmappedQuery = FALSE,
isNotPassingQualityControls = FALSE
)
scanBamParam <- ScanBamParam(flag = flag, tag = tags, what = what)
message("Reading nodup noMT bam file...")
reads <- readGAlignmentPairs(file = noDupNoMTNOLowBam, param = scanBamParam)
file.remove(c(noDupNoMTNOLowBam, paste0(noDupNoMTNOLowBam, ".bai")))
## shiftAlignments
message("Shifting 5' start...")
firstReads <- ATACseqQC:::shiftReads(first(reads),
positive = 4L, negative = 5L
)
lastReads <- ATACseqQC:::shiftReads(last(reads),
positive = 4L, negative = 5L
)
rm(reads)
message("Exporting bam file...")
export(
GAlignmentPairs(first = firstReads, last = lastReads),
noDupNoMTNOLowBam
)
rm(firstReads)
rm(lastReads)
gc()
}
return(output)
}
### Make or remove duplicated in bam file
dupBam <- function(inBam, outBam, operation = c("mark", "remove"), ram = 20) {
operation <- match.arg(operation)
javaCall = paste0("java", " -Djava.io.tmpdir=. -Xmx", ram, "g")
cmd <- paste(javaCall, " -jar ", Sys.getenv("Picard_jar"),
"MarkDuplicates",
paste0("I=", inBam),
paste0("O=", outBam),
paste0("M=", tempfile()),
paste0(
"REMOVE_DUPLICATES=",
if_else(operation == "mark", "false", "true")
),
"> /dev/null 2>&1"
)
ezSystem(cmd)
Rsamtools::indexBam(outBam)
invisible(outBam)
}
### Filter bam by removing chrs, low mapQ
filteroutBam <- function(inBam, outBam, cores = ezThreads(), chrs = NULL, mapQ = NULL) {
require(Rsamtools)
cmd <- paste("samtools view -b", "-@", cores, "-o", outBam)
if (!is.null(mapQ)) {
cmd <- paste(cmd, "-q", mapQ)
}
cmd <- paste(cmd, inBam)
if (!is.null(chrs)) {
allChrs <- ezBamSeqNames(inBam)
keepChrs <- setdiff(allChrs, chrs)
cmd <- paste(cmd, paste(keepChrs, collapse = " "))
}
ezSystem(cmd)
indexBam(outBam)
invisible(outBam)
}
### Convert bam to bigwig file
bam2bw <- function(file,
destination = sub("\\.bam$", ".bw", file, ignore.case = TRUE),
paired = FALSE,
# mode=c("RNA-seq", "DNA-seq"), ## TODO: add strandness for RNA-seq
method = c("deepTools", "Bioconductor"),
cores = ezThreads()) {
method <- match.arg(method)
if (method == "Bioconductor") {
## Better compatability; single-base resolutio;
## Slower; higher RAM comsuption
require(rtracklayer)
require(GenomicAlignments)
if (paired) {
aligns <- readGAlignmentPairs(file)
} else {
aligns <- readGAlignments(file)
}
cov <- coverage(aligns)
export.bw(cov, destination)
} else if (method == "deepTools") {
cmd <- paste(
"bamCoverage", "--bam", file, "-o", destination,
"--binSize 10 -of bigwig", "-p", cores
)
ezSystem(cmd)
}
invisible(destination)
}
splitBamByRG <- function(inBam, mc.cores = ezThreads()) {
# The following Rsamtools requires mapped Bam file.
# require(Rsamtools)
# header <- scanBamHeader(inBam)
# readGroupNames <- header[[inBam]]$text[names(header[[inBam]]$text) == "@RG"]
# readGroupNames <- sub("ID:", "", sapply(readGroupNames, "[", 1))
cwd <- getwd()
inBam <- normalizePath(inBam)
# This is more general approach for uBam and mapped Bam.
tempDIR <- file.path(cwd, paste("samtools-split", Sys.getpid(), sep = "-"))
setwdNew(tempDIR)
cmd <- paste("samtools split -@", mc.cores, "-f %!", inBam)
ezSystem(cmd)
setwd(cwd)
readGroupNames <- list.files(path = tempDIR)
bamFns <- paste0(readGroupNames, ".bam")
file.rename(from = file.path(tempDIR, readGroupNames), to = bamFns)
names(bamFns) <- readGroupNames
# Clearning
unlink(tempDIR, recursive = TRUE)
return(bamFns)
}
mergeBamAlignments <- function(alignedBamFn, unmappedBamFn,
outputBamFn, fastaFn,
keepUnmapped = FALSE) {
## Use . as tmp dir. Big bam generates big tmp files.
cmd <- paste(
prepareJavaTools("picard"), "MergeBamAlignment",
paste0("ALIGNED=", alignedBamFn),
paste0("UNMAPPED=", unmappedBamFn),
paste0("O=", outputBamFn),
paste0("R=", fastaFn)
)
ezSystem(cmd)
if (!keepUnmapped) {
tempBam <- tempfile(
pattern = "noUnmapped", tmpdir = ".",
fileext = ".bam"
)
cmd <- paste(
"samtools view -F 4 -h -b", outputBamFn, ">",
tempBam
)
ezSystem(cmd)
file.remove(outputBamFn)
file.rename(from = tempBam, to = outputBamFn)
}
invisible(outputBamFn)
}
posSpecErrorBam <- function(bamGA, genomeFn) {
require(GenomicAlignments)
require(Hmisc)
require(BSgenome)
require(Rsamtools)
nMaxReads <- 100000
what <- c("qname", "seq")
stopifnot(all(what %in% colnames(mcols(bamGA))))
hasGap <- grepl("I|D|N", cigar(bamGA))
readLength <- qwidth(bamGA)
genomeFnIndixe <- FaFile(genomeFn)
genomeLengths <- seqlengths(genomeFnIndixe)
isOutOfRange <- start(bamGA) + readLength - 1 > genomeLengths[as.character(seqnames(bamGA))] |
start(bamGA) < readLength
# this is very conservative; needed because there might be clipped bases in the beginning
if (any(isOutOfRange)) {
ezWrite("#reads out of range: ", sum(isOutOfRange))
idx <- which(isOutOfRange)
idx <- idx[1:min(10, length(idx))]
badAlignments <- data.frame(
pos = start(bamGA)[idx],
cigar = cigar(bamGA)[idx],
width = readLength[idx]
)
print(badAlignments)
}
indexKeep <- which(!hasGap & !isOutOfRange)
# indexKeep <- which(!hasGap)
if (length(indexKeep) > nMaxReads) {
indexKeep <- sample(indexKeep, size = nMaxReads, replace = FALSE)
}
ezWrite(
"#alignments: ", length(bamGA),
" #valid alignments: ", sum(!hasGap & !isOutOfRange),
" #used:", length(indexKeep)
)
if (length(indexKeep) == 0) {
return(NULL)
}
bamGA <- bamGA[indexKeep]
bamGR <- granges(bamGA) ## used to record the actual mapped coordinates of seq
## adjust the start POS according to H and/or S
tempCigar <- str_extract(cigar(bamGA), "^(\\d+H)?(\\d+S)?\\d+M")
## get the number of H at the beginning
noOfH <- explodeCigarOpLengths(tempCigar, ops = "H")
stopifnot(!any(lengths(noOfH) > 1))
noOfH[lengths(noOfH) == 0] <- 0
noOfH <- as.integer(noOfH)
## get the number of S at the beginning
noOfS <- explodeCigarOpLengths(tempCigar, ops = "S")
stopifnot(!any(lengths(noOfS) > 1))
noOfS[lengths(noOfS) == 0] <- 0
noOfS <- as.integer(noOfS)
nBeginClipped <- noOfH + noOfS
start(bamGR) <- start(bamGR) - nBeginClipped
## add - to the begin and end of SEQ for revComplement
Xbegin <- makeNstr("-", noOfH)
tempCigar <- str_extract(cigar(bamGA), "\\d+[^HS](\\d+S)?(\\d+H)?$")
noOfH <- explodeCigarOpLengths(tempCigar, ops = "H")
stopifnot(!any(lengths(noOfH) > 1))
noOfH[lengths(noOfH) == 0] <- 0
noOfH <- as.integer(noOfH)
Xend <- makeNstr("-", noOfH)
noOfS <- explodeCigarOpLengths(tempCigar, ops = "S")
stopifnot(!any(lengths(noOfS) > 1))
noOfS[lengths(noOfS) == 0] <- 0
noOfS <- as.integer(noOfS)
nEndClipped <- noOfH + noOfS
mcols(bamGR)$seq <- DNAStringSet(paste0(Xbegin, mcols(bamGA)$seq, Xend))
end(bamGR) <- end(bamGR) + nEndClipped
indexNeg <- as.logical(strand(bamGA) == "-")
mcols(bamGR)$seq[indexNeg] <- reverseComplement(mcols(bamGR)$seq[indexNeg])
seqChar <- strsplit(as.character(mcols(bamGR)$seq), "")
readLength <- lengths(seqChar)
maxLength <- quantile(readLength, 0.95)
if (maxLength < max(readLength)) {
readLength[readLength > maxLength] <- maxLength
seqChar <- mapply(function(x, l) {
x[1:l]
}, seqChar, readLength)
bamGR <- resize(bamGR, width = readLength, fix = "start")
}
# referenceChar <- genome[bamGR]
referenceChar <- getSeq(genomeFnIndixe, bamGR)
stopifnot(all(width(referenceChar) == width(bamGR))) # check the position change is wright
referenceChar <- strsplit(as.character(referenceChar), "")
# assuming we have unique read length and set it to the maximal read length here.
nEndTrimmed <- maxLength - readLength
trimmedMatrix <- mapply(function(readLength, nEndTrimmed) {
rep(c(FALSE, TRUE), c(readLength, nEndTrimmed))
},
readLength, nEndTrimmed,
SIMPLIFY = FALSE
)
## build a clippedMatrix to record the clipped character
nNormal <- readLength - nBeginClipped - nEndClipped
clippedMatrixNoTrim <- mapply(function(nBeginClipped, nNormal, nEndClipped) {
rep(
c(TRUE, FALSE, TRUE),
c(nBeginClipped, nNormal, nEndClipped)
)
},
nBeginClipped, nNormal, nEndClipped,
SIMPLIFY = FALSE
)
matchMatrix <- mapply("==", referenceChar, seqChar, SIMPLIFY = FALSE)
clippedMatrixNoTrim[indexNeg] <- lapply(clippedMatrixNoTrim[indexNeg], rev)
clippedMatrix <- mapply(function(clippedMatrixNoTrim, nEndTrimmed) {
c(clippedMatrixNoTrim, rep(FALSE, nEndTrimmed))
},
clippedMatrixNoTrim, nEndTrimmed,
SIMPLIFY = FALSE
)
matchMatrix <- sapply(matchMatrix, function(x) {
x[1:maxLength]
})
clippedMatrix <- matrix(unlist(clippedMatrix), ncol = length(clippedMatrix))
clippedRate <- rowMeans(clippedMatrix, na.rm = TRUE)
trimmedMatrix <- matrix(unlist(trimmedMatrix), ncol = length(trimmedMatrix))
trimmedRate <- rowMeans(trimmedMatrix, na.rm = TRUE)
matchMatrix[clippedMatrix] <- NA
errorRate <- 1 - rowMeans(matchMatrix, na.rm = TRUE)
names(errorRate) <- 1:nrow(matchMatrix)
names(clippedRate) <- 1:nrow(matchMatrix)
names(trimmedRate) <- 1:nrow(matchMatrix)
return(list(
trimmedRate = trimmedRate, clippedRate = clippedRate,
errorRate = errorRate
))
}
### Create the MD tag for BAM file and replace matches with "=" in seq
calmdBam <- function(bamFns, mdBamFns = sub("\\.bam$", "_md.bam", bamFns),
genomeFn, mc.cores = 4L) {
stopifnot(length(bamFns) == length(mdBamFns))
cmdsCalmd <- paste("samtools calmd -b -e", bamFns, genomeFn, ">", mdBamFns)
cmdsIndex <- paste("samtools index", mdBamFns)
indicesExist <- file.exists(mdBamFns)
if (any(indicesExist)) {
warning(
"Some md Bam files already exist!",
paste(mdBamFns[indicesExist], collapse = " ")
)
cmdsCalmd <- cmdsCalmd[!indicesExist]
cmdsIndex <- cmdsIndex[!indicesExist]
}
ezMclapply(cmdsCalmd, ezSystem, mc.preschedule = FALSE, mc.cores = mc.cores)
ezMclapply(cmdsIndex, ezSystem, mc.preschedule = FALSE, mc.cores = mc.cores)
return(mdBamFns)
}
### position specific error with md bam
posSpecErrorMDBam <- function(bamGA) {
hasIndel <- grepl("I|D", cigar(bamGA))
bamGA <- bamGA[!hasIndel]
seqs <- mcols(bamGA)$seq
negStrand <- strand(bamGA) == "-"
seqs[negStrand] <- reverseComplement(seqs[negStrand])
consensusCount <- consensusMatrix(seqs)
errorRate <- 1 - consensusCount["-", ] / colSums(consensusCount)
return(errorRate)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.