R/bams2counts.R

chromRange <- function(i, chr="") {
  eval(parse(text=paste('RangesList("', chr, i,'"=IRanges(start=0, end=268435456L))',sep="")))
}

bam2fragments <- function(bamFile, X=FALSE, mapq=20) {
    # get position, mate position and insert size (fragment length)
    what <- c("pos","mpos","isize")
    # get first mate from properly paired reads which pass QC
    flag=scanBamFlag(isNotPassingQualityControls=FALSE, isPaired=TRUE, isFirstMateRead=TRUE, hasUnmappedMate=FALSE, isDuplicate=FALSE, isSecondaryAlignment=FALSE)
    bam <- list()
    # autosomes
    for(i in 1:22) {
        which <- chromRange(i)
        param <- ScanBamParam(flag=flag, which = which, what = what, mapqFilter=mapq)
        # scan the data
        bam[[i]] <- scanBam(bamFile, param=param)[[1]]
    }
    if (X) {
        # chromosome X
        which <- chromRange("X")
        param <- ScanBamParam(flag=flag, which = which, what = what)
        # scan the data
        bam[[23]] <- scanBam(bamFile, param=param)[[1]]
    }
    bam
}

fragments2dataframe <- function(nbam, tbam, iSizeLim=c(75,750), gbuild="hg19") {
    # normal fragment midpoints
    nfmid <- lapply(nbam, function(x, ll, ul) {
        x$isize <- abs(x$isize)
        sort((pmin(x$pos, x$mpos) + x$isize/2)[x$isize >= ll & x$isize <= ul])
    }, iSizeLim[1], iSizeLim[2])
    # tumor fragment midpoints
    tfmid <- lapply(tbam, function(x, ll, ul) {
        x$isize <- abs(x$isize)
        sort((pmin(x$pos, x$mpos) + x$isize/2)[x$isize >= ll & x$isize <= ul])
    }, iSizeLim[1], iSizeLim[2])
    # bin the mid points into interval of size 100
    binnedcounts <- list()
    nchr <- length(nbam)
    # load blacklisted regions data
    data(hgBlacklist, package="seqDNAcopy",envir=environment())
    blgbuild <- paste(gbuild, "bl", sep="")
    gbl <- get(blgbuild, envir=environment())
    for(i in 1:nchr) binnedcounts[[i]] <- frags2counts(nfmid[[i]], tfmid[[i]], gbl[[i]][,1], gbl[[i]][,2])
    # total number of bins with nonzero count
    nbins <- unlist(lapply(binnedcounts, function(x) {length(x$pos)}))
    # convert to matrix
    fcounts <- matrix(0, sum(nbins), 4)
    colnames(fcounts) <- c("chrom","pos","normal","tumor")
    fcounts[,1] <- rep(1:nchr, nbins)
    for(i in 1:nchr) {
        ii <- fcounts[,1]==i
        fcounts[ii,2] <- binnedcounts[[i]]$pos
        fcounts[ii,3] <- binnedcounts[[i]]$normal
        fcounts[ii,4] <- binnedcounts[[i]]$tumor
    }
    # return data
    as.data.frame(fcounts)
}

bams2counts <- function(nBamFile, tBamFile, GCcorrect=TRUE, gbuild="hg19", mapq=20, iSizeLim=c(75,750), X=FALSE) {
    # normal bam read data ("pos","mpos","isize")
    nbam <- bam2fragments(nBamFile, X, mapq)
    # tumor bam read data ("pos","mpos","isize")
    tbam <- bam2fragments(tBamFile, X, mapq)
    # get the fragment count in bins centered every 100 bases
    out <- fragments2dataframe(nbam, tbam, iSizeLim, gbuild)
    if (GCcorrect) {
        # gc percentage
        gcpct <- rep(NA_real_, nrow(out))
        # get GC percentages from pctGCdata package
        # loop thru chromosomes
        nchr <- max(out$chrom) # IMPACT doesn't have X so only 22
        for (i in 1:nchr) {
            ii <- which(out$chrom==i)
            # allow for chromosomes with no SNPs i.e. not targeted
            if (length(ii) > 0) {
                gcpct[ii] <- getGCpct(i, out$pos[ii], gbuild)
            }
        }
        # GC correction
        tscl <- sum(out$normal)/sum(out$tumor)
        # normal and tumor fragment counts by GC percent level
        ncount <- tapply(out$normal, gcpct, sum)
        tcount <- tapply(out$tumor, gcpct, sum)
        # log-ratio of tumor to normal counts as function of GC percentages
        # standardized by the ratio of library sizes
        logtn <- log2(tcount*tscl) - log2(ncount)
        # percent GC
        pctgc <- as.numeric(names(logtn))
        # weights for each GC percent level
        wts <- log2(tcount+ncount)
        # bins that have non-zero count for both tumor and normal
        ii <- which(is.finite(logtn))
        # gc bias estimated using loess
        gcb <- loess(logtn[ii] ~ pctgc[ii], weights=wts[ii], span=0.25)
        # GC corrected library scaled tumor counts
        jj <- match(gcpct, gcb$x)
        gcscl <- 2^{-gcb$fitted[jj]}
        # gcscl is NA if a bin has NA for gcpct; make it 1
        gcscl[is.na(gcscl)] <- 1
        # scale tumor counts by gcscl
        out$tumor <- gcscl*out$tumor
    }
    out
}
veseshan/seqDNAcopy documentation built on May 3, 2019, 6:11 p.m.