# countUniqueAndMultiReads.R
`countUniqueAndMultiReads` <- function( filein, readBufferSize=1000000, verbose=TRUE) {
# this tool ASSUMES raw unsorted BAM data!!
if ( ! file.exists( filein)) {
cat( "\nBAM file not found: ", filein)
return( list( "Unique"=0, "Multi"=0))
}
con <- bamReader( filein)
# read in the alignment file one buffer at a time
hasMore <- TRUE
nUniq <- nMult <- 0
repeat {
if ( ! hasMore) break
if (verbose) cat( ".")
chunk <- getNextChunk( con, n=readBufferSize, alignedOnly=TRUE)
nNow <- size(chunk)
if ( nNow < 1) break
if ( nNow < readBufferSize) hasMore <- FALSE
whoUniq <- which.unique.align(chunk)
whoMult <- which.multi.read(chunk)
nUniq <- nUniq + length( whoUniq)
nMult <- nMult + length( whoMult)
} # end of each buffer...
bamClose( con)
return( list( "Unique"=nUniq, "Multi"=nMult))
}
`countReadsBySpecies` <- function( filein, readBufferSize=1000000, verbose=TRUE) {
# this tools is FINE with SORTED BAM files..!!
# count up the reads in a BAM file by their organism
speciesSet <- getCurrentTargetSpecies()
NSP <- length( speciesSet)
nReads <- rep.int(0, NSP)
names( nReads) <- speciesSet
if ( ! file.exists( filein)) {
cat( "\nBAM file not found: ", filein)
return( nReads)
}
con <- bamReader( filein)
refData <- getRefData( con)
# pre fetch the set of all possible seqIDs, to speed up the calls
allSeqIDs <- ptrSeqIDs <- vector()
curSpecies <- getCurrentSpecies()
on.exit( setCurrentSpecies( curSpecies))
for ( i in 1:NSP) {
setCurrentSpecies( speciesSet[i])
smap <- getCurrentSeqMap()
ns <- nrow(smap)
allSeqIDs <- c( allSeqIDs, smap$SEQ_ID)
ptrSeqIDs <- c( ptrSeqIDs, rep.int( i, ns))
}
# read in the alignment file one buffer at a time
hasMore <- TRUE
repeat {
if ( ! hasMore) break
if (verbose) cat( ".")
chunk <- getNextChunk( con, n=readBufferSize, alignedOnly=TRUE)
nNow <- size(chunk)
if ( nNow < 1) break
if ( nNow < readBufferSize) hasMore <- FALSE
# we want reads, not alignments!
whoPrim <- which( ! secondaryAlign(chunk))
# look at those SeqIDs
refIDs <- refID(chunk)[whoPrim]
seqIDs <- refID2seqID( refIDs, refData=refData)
where <- match( seqIDs, allSeqIDs, nomatch=0)
myPtrs <- ptrSeqIDs[ where]
for ( i in 1:NSP) {
nNow <- sum( myPtrs == i)
nReads[i] <- nReads[i] + nNow
}
} # end of each buffer...
bamClose( con)
return( as.list( nReads))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.