R/param_estimator.R

Defines functions calcPhredScoreProfile findReads countPhredScores

Documented in calcPhredScoreProfile

countPhredScores <- function(qual) {
    qual <- as.character(qual)
    qual <- lapply(qual, function(x) unlist(strsplit(x, "")))
    qual <- lapply(qual, function(x) {names(x) <- seq(1, length(x)); return(x)})
    qual <- as.data.frame(do.call("bind_rows", qual))
    qualProb <- apply(qual, 2, table)
    qualProb <- as.matrix(do.call("bind_rows", qualProb))
    qualProb <- qualProb[, order(phred2ASCIIOffset(colnames(qualProb)))]
    qualProb[is.na(qualProb)] <- 0
    return(qualProb)
}


findReads <- function(bamFile,
                      tag = character(0),
                      what=scanBamWhat(),
                      which=IRangesList(),
                      isSupplementary=NA,
                      reverseComplement=NA,
                      mapqFilter=0) {

    f <- scanBamFlag(isDuplicate = FALSE,
                     isSupplementaryAlignment = isSupplementary)

    p <- ScanBamParam(flag = f,
                      tag = tag,
                      simpleCigar = FALSE,
                      reverseComplement = reverseComplement,
                      what = what,
                      which = which,
                      mapqFilter = mapqFilter)

    bam <- scanBam(bamFile, param = p)[[1]]
    return(bam)
}


calcPhredScoreProfile <- function(bamFilePath, mapqFilter=0, maxFileSize=1,
                                  targetRegions=NULL, subsampleRatio=NA,
                                  subsampleRegionLength=1e+5,
                                  disableSubsampling=FALSE, threads=1) {
    
    if(missing(bamFilePath)) stop("bamFilePath is required")
    
    bamFile <- BamFile(bamFilePath)
    seqInfo <- scanBamHeader(bamFile)$targets
    
    colnames(targetRegions) <- c("chr", "start", "end")
    targetRegions <- targetRegions[targetRegions$chr %in% names(seqInfo), ]
    
    if (length(targetRegions) != 0 & is(targetRegions, "GenomicRanges")) {
        targetRegions <- data.frame(seqnames(targetRegions), 
                                    start(targetRegions), 
                                    end(targetRegions))
    }
    
    if (file.info(bamFilePath)$size / 1e+9 <= maxFileSize |
        disableSubsampling) {
        if (length(targetRegions) == 0) {
            regions <- GRanges(names(seqInfo), IRanges(1, seqInfo))
            regions <- 
                unlist(slidingWindows(regions, width=50000, step = 50000))
        } else {
            colnames(targetRegions) <- c("chr", "start", "end")
            regions <- GRanges(targetRegions$chr, IRanges(targetRegions$start,
                                                          targetRegions$end))
        }
        if (file.info(bamFilePath)$size / 1e+9 <= maxFileSize) {
            message("BAM filesize smaller than ",
                    maxFileSize,
                    " gigabytes, calculate Phred quality score profile ",
                    "based on all reads.")
        } else {
            message("Subsampling disabled, ",
                    "calculate Phred quality score profile based on all reads.")
        }
    } else {
        if (is.na(subsampleRatio)) {
            subsampleRatio <-
                maxFileSize * 1e+9 / file.info(bamFilePath)$size
        }
        message("BAM filesize greater than ",
                maxFileSize,
                " gigabytes, calculation based on subsampled reads ",
                "(subsample ratio: ",
                subsampleRatio, ").")
        if (length(targetRegions) == 0) {
            nRegions <- seqInfo * subsampleRatio / subsampleRegionLength
            seqInfo <- seqInfo[nRegions >= 1]
            nRegions <- ceiling(nRegions[nRegions >= 1])
            if (length(seqInfo) == 0) {
                stop("subsampleRegionLength is too large ",
                     "with this subsampleRatio.")
            }
            subsampleRegions <- GRanges()
            for (i in seq_along(seqInfo)) {
                startPos <- sample(seq(1, seqInfo[i], subsampleRegionLength),
                                   nRegions[i], replace = FALSE)
                subsampleRegion <-
                    GRanges(seqnames = names(seqInfo[i]),
                            ranges = IRanges(startPos,
                                             startPos +
                                                 subsampleRegionLength - 1))

                suppressWarnings(subsampleRegions <-
                                     c(subsampleRegions, subsampleRegion))
            }
            regions <- sort(subsampleRegions)
        } else {
            regions <- GRanges(targetRegions$chr, IRanges(targetRegions$start,
                                                          targetRegions$end))
            regions <- regions[sample(seq_along(regions),
                                      round(length(regions) * subsampleRatio))]
        }
    }
    
    message("Get reads from BAM file in ", length(regions), " regions.", 
            "\nUsing ", threads, " thread(s).")
    cl <- makeCluster(threads)
    registerDoParallel(cl)

    readInfo <- foreach(i = seq_along(regions),
                        .combine = "c",
                        .inorder = TRUE,
                        .verbose = FALSE,
                        .errorhandling = "remove",
                        .packages = c("Biostrings", "dplyr",
                                      "GenomicRanges", "Rsamtools"),
                        .export = c("findReads", "countPhredScores")
    ) %dopar% {
            temp <- findReads(bamFile,
                              what = "qual",
                              which = regions[i],
                              reverseComplement=TRUE,
                              mapqFilter = mapqFilter)
            if (length(temp$qual) > 1) {
                temp$qual <- countPhredScores(temp$qual)
                return(temp)
            }
    }
    stopCluster(cl)
    quals <- unname(readInfo[names(readInfo) == "qual"])
    if (length(quals) == 0) {
        stop("No reads found.")
    }
    readLen <- max(unlist(lapply(quals, nrow)))
    PhredScores <- unique(unlist(lapply(quals, colnames)))
    PhredScores <- PhredScores[order(phred2ASCIIOffset(PhredScores))]
    for (i in seq_along(quals)) {
        qual <- quals[[i]]
        PhredScoreDiff <- setdiff(PhredScores, colnames(qual))
        if (length(PhredScoreDiff) > 0 ) {
            temp = matrix(0, nrow = nrow(qual), ncol = length(PhredScoreDiff))
            colnames(temp) <- PhredScoreDiff
            quals[[i]] <- cbind(qual, temp)
            quals[[i]] <- quals[[i]][, PhredScores]
        }
        rowDiff <- readLen - nrow(qual)
        if (rowDiff > 0) {
            temp = matrix(0, nrow = rowDiff, ncol = length(PhredScores))
            quals[[i]] <- rbind(quals[[i]], temp)
        }
    }
    qualProb <- Reduce("+", quals)
    qualProb <- qualProb / rowSums(qualProb)
    return(qualProb)
}
LanyingWei/SimFFPE documentation built on Nov. 22, 2020, 3:37 a.m.