# pipe.GatherGeneAlignments.R -- collect up the reads that align to some genes, and
# optionally repackage in their original FASTQ format
`pipe.GatherGeneAlignments` <- function( sampleID, genes, optionsFile="Options.txt",
results.path=NULL, tail.width=0,
stages=c("genomic", "splice"), mode=c("all","best.one"),
asFASTQ=FALSE, fastq.keyword="Genes", verbose=TRUE) {
# get needed paths, etc. from the options file
optT <- readOptionsTable( optionsFile)
if ( is.null( results.path)) {
results.path <- getOptionValue( optT, "results.path", notfound=".", verbose=F)
}
NG <- length( genes)
gmap <- getCurrentGeneMap()
smap <- getCurrentSeqMap()
where <- match( genes, gmap$GENE_ID, nomatch=0)
if ( any( where == 0)) {
cat( "\nSome genes not found in current species: ", genes[ where == 0])
where <- where[ where > 0]
genes <- genes[ where]
NG <- length( genes)
}
gptrs <- where
# determine the set of BAM files to visit
Stages <- c( "riboClear", "genomic", "splice")
if ( ! all( stages %in% Stages)) {
cat( "\nAllowed pipeline stages: ", Stages)
stop()
}
# decide if we want all reads or just best one from each location
mode <- match.arg( mode)
bamFiles <- vector()
if ( "riboClear" %in% stages) {
bamFiles <- c( bamFiles, file.path( results.path, "riboClear",
paste( sampleID, ".ribo.converted.bam", sep="")))
}
if ( "genomic" %in% stages) {
bamFiles <- c( bamFiles, file.path( results.path, "align",
paste( sampleID, ".genomic.bam", sep="")))
}
if ( "splice" %in% stages) {
bamFiles <- c( bamFiles, file.path( results.path, "splicing",
paste( sampleID, ".splice.converted.bam", sep="")))
}
outrefid <- outpos <- outncig <- outcig <- outflag <- outseq <- outqual <- vector()
outname <- outrev <- outsize <- outsid <- outgid <- outstage <- vector()
nout <- 0
for ( f in bamFiles) {
thisStage <- "genomic"
if ( regexpr( "ribo", f) > 0) thisStage <- "riboClear"
if ( regexpr( "splice", f) > 0) thisStage <- "splice"
# make sure we have that BAM file sorted and indexed
if (verbose) cat( "\nFile: ", basename(f))
bamf <- BAM.verifySorted( f, verbose=F, threads=2)
if ( is.null( bamf)) next
bamidx <- paste( bamf, "bai", sep=".")
reader <- bamReader( bamf, indexname=bamidx)
refData <- getRefData( reader)
# visit every gene we were given
for ( ig in 1:NG) {
sml <- gmap[ gptrs[ ig], ]
# extract the chunk of reads for this gene's loci
refid <- seqID2refID( sml$SEQ_ID, refData=refData)
start <- sml$POSITION - tail.width
end <- sml$END + tail.width
# quietly catch case of that chromosome not in the data
if (refid < 0) next
# force clip to this chromosome's limits
start <- max( start, 1)
end <- min( end, smap$LENGTH[ match( sml$SEQ_ID, smap$SEQ_ID)])
chunk <- bamRange( reader, coords=c(refid, start, end))
if ( size(chunk) < 1) next
smallDF <- as.data.frame( chunk)
if (asFASTQ) {
smallDF$seq <- readSeq( chunk)
smallDF$qual <- readQual( chunk)
}
smallDF$seqid <- sml$SEQ_ID
smallDF$geneid <- sml$GENE_ID
smallDF$stage <- thisStage
# splices have the reads broken by the splice junction, and a modified readID
# we need to rebuild the originals
if ( thisStage == "splice") {
smallDF <- rejoinSplicedReads( smallDF)
}
# if we want the raw reads, don't keep MARs
if (asFASTQ) {
dups <- which( duplicated( smallDF$name))
if ( length(dups) > 0) {
smallDF <- smallDF[ -dups, ]
}
}
if ( nrow(smallDF) && mode == "best.one") {
# for tools like denovo assembly, limit the read depth by keeping just one copy per
# location. Use count if possible, else random
posFac <- factor( smallDF$position)
keep <- tapply( 1:nrow(smallDF), posFac, function(x) {
if ( length(x) == 1) return(x)
seqTbl <- sort( table( smallDF$seq[x]), decreasing=T)
bestCnt <- seqTbl[1]
isBest <- which( seqTbl == bestCnt)
if ( length(isBest) > 1) isBest <- sample( isBest, size=1)
# return the first row that has this chosen sequence
bestHit <- match( names(seqTbl)[isBest], smallDF$seq[x])
return( x[bestHit])
})
smallDF <- smallDF[ keep, ]
}
if ( verbose) cat( "\n", sml$GENE_ID, "\tN_Alignments: ", nrow(smallDF))
if ( nrow(smallDF) < 1) next
now <- (nout + 1) : (nout + nrow(smallDF))
outrefid[now] <- smallDF$refid
outpos[now] <- smallDF$position
outncig[now] <- smallDF$nCigar
outcig[now] <- smallDF$cigar
outflag[now] <- smallDF$flag
outseq[now] <- smallDF$seq
outqual[now] <- smallDF$qual
outname[now] <- smallDF$name
outrev[now] <- smallDF$revstrand
outsize[now] <- smallDF$insertsize
outsid[now] <- smallDF$seqid
outgid[now] <- smallDF$geneid
outstage[now] <- smallDF$stage
nout <- max( now)
}
bamClose( reader)
}
if ( verbose) cat( "\nTotal Alignments: ", nout)
# put into chromosomal order
if (verbose) cat( "\nSorting alignments..")
ord <- order( outrefid, outpos)
outrefid <- outrefid[ ord]
outpos <- outpos[ ord]
outncig <- outncig[ ord]
outcig <- outcig[ ord]
outflag <- outflag[ ord]
outseq <- outseq[ ord]
outqual <- outqual[ ord]
outname <- outname[ ord]
outrev <- outrev[ ord]
outsize <- outsize[ ord]
outsid <- outsid[ ord]
outgid <- outgid[ ord]
outstage <- outstage[ ord]
out <- data.frame( "refid"=outrefid, "position"=outpos, "nCigar"=outncig, "cigar"=outcig,
"flag"=outflag, "seq"=outseq, "qual"=outqual, "name"=outname,
"revstrand"=outrev, "insertsize"=outsize, "seqid"=outsid, "geneid"=outgid,
"stage"=outstage, stringsAsFactors=FALSE)
if (nrow(out)) rownames(out) <- 1:nrow(out)
if (verbose) cat( " Done.")
if ( asFASTQ) {
if (verbose) cat( "\nConverting Alignments back to FASTQ..")
outfile <- paste( sampleID, fastq.keyword, "fastq.gz", sep=".")
outfile <- file.path( results.path, "fastq", outfile)
fqDF <- data.frame( "READ_ID"=out$name, "READ_SEQ"=out$seq, "SCORE"=out$qual,
stringsAsFactors=FALSE)
# there may be duplicate readIDs, that mapped to more than one location in the genome
# don't let them be written out more than once...
dups <- which( duplicated( fqDF$READ_ID))
if ( length(dups) > 0) {
if (verbose) cat( "\nDropping redundant MAR alignments from FASTQ: ", length(dups))
fqDF <- fqDF[ -dups, ]
}
writeFastq( fqDF, outfile, compress=T)
return( nrow(fqDF))
} else {
return( out)
}
}
`pipe.GatherRegionAlignments` <- function( sampleID, seqids, starts, stops,
optionsFile="Options.txt",
results.path=NULL, stages=c("genomic", "splice"),
asFASTQ=FALSE, fastq.keyword="Region", verbose=TRUE) {
# get needed paths, etc. from the options file
optT <- readOptionsTable( optionsFile)
if ( is.null( results.path)) {
results.path <- getOptionValue( optT, "results.path", notfound=".", verbose=F)
}
# determine the set of BAM files to visit
Stages <- c( "riboClear", "genomic", "splice")
if ( ! all( stages %in% Stages)) {
cat( "\nAllowed pipeline stages: ", Stages)
stop()
}
bamFiles <- vector()
if ( "riboClear" %in% stages) {
bamFiles <- c( bamFiles, file.path( results.path, "riboClear",
paste( sampleID, ".ribo.converted.bam", sep="")))
}
if ( "genomic" %in% stages) {
bamFiles <- c( bamFiles, file.path( results.path, "align",
paste( sampleID, ".genomic.bam", sep="")))
}
if ( "splice" %in% stages) {
bamFiles <- c( bamFiles, file.path( results.path, "splicing",
paste( sampleID, ".splice.converted.bam", sep="")))
}
# can have more than one region...
nRegions <- length( starts)
if (length(stops) != nRegions) stop( "'starts' and 'stops' must be of same length")
if (length(seqids) < nRegions) seqids <- rep( seqids, length.out=nRegions)
out <- data.frame()
for ( f in bamFiles) {
thisStage <- "genomic"
if ( regexpr( "ribo", f) > 0) thisStage <- "riboClear"
if ( regexpr( "splice", f) > 0) thisStage <- "splice"
# make sure we have that BAM file sorted and indexed
bamf <- BAM.verifySorted( f, verbose=FALSE, threads=2)
if ( is.null( bamf)) next
bamidx <- paste( bamf, "bai", sep=".")
reader <- bamReader( bamf, indexname=bamidx)
refData <- getRefData( reader)
# visit this region
for ( iregion in 1:nRegions) {
seqid <- seqids[iregion]
start <- starts[iregion]
stop <- stops[iregion]
# extract the chunk of reads for this gene's loci
refid <- seqID2refID( seqid, refData=refData)
chunk <- bamRange( reader, coords=c(refid, start, stop))
if ( size(chunk) < 1) next
smallDF <- as.data.frame( chunk)
if ( asFASTQ) {
smallDF$seq <- readSeq( chunk)
smallDF$qual <- readQual( chunk)
}
smallDF$stage <- thisStage
# splices have the reads broken by the splice junction, and a modified readID
# we need to rebuild the originals
if ( thisStage == "splice") {
smallDF <- rejoinSplicedReads( smallDF)
}
# if we want the raw reads, don't keep MARs
if (asFASTQ) {
dups <- which( duplicated( smallDF$name))
if ( length(dups) > 0) {
smallDF <- smallDF[ -dups, ]
}
}
if ( verbose) cat( "\n", basename(f), "\nSeqID, Start, Stop: ", seqid, start, stop,
"\tN_Alignments: ", nrow(smallDF))
if ( nrow(smallDF) < 1) next
out <- rbind( out, smallDF)
}
bamClose( reader)
}
if ( verbose) cat( "\nTotal Aignments: ", nrow(out))
# put into chromosomal order
ord <- order( out$seq, out$position)
out <- out[ ord, ]
rownames(out) <- 1:nrow(out)
if ( asFASTQ) {
if (verbose) cat( "\nConverting Alignments back to FASTQ..")
outfile <- paste( sampleID, fastq.keyword, "fastq.gz", sep=".")
outfile <- file.path( results.path, "fastq", outfile)
fqDF <- data.frame( "READ_ID"=out$name, "READ_SEQ"=out$seq, "SCORE"=out$qual,
stringsAsFactors=FALSE)
# there may be duplicate readIDs, that mapped to more than one location in the genome
# don't let them be written out more than once...
dups <- which( duplicated( fqDF$READ_ID))
if ( length(dups) > 0) {
if (verbose) cat( "\nDropping redundant MAR alignments from FASTQ: ", length(dups))
fqDF <- fqDF[ -dups, ]
}
writeFastq( fqDF, outfile, compress=T)
return( nrow(fqDF))
} else {
return( out)
}
}
`rejoinSplicedReads` <- function( tbl) {
# given a data frame of alignments from a splice BAM file, put the halve back together
if ( ! all( c( "position", "seq", "name", "qual") %in% colnames(tbl))) stop( "Not given a splice BAM alignment data frame")
posIn <- tbl$position
nameIn <- tbl$name
seqIn <- tbl$seq
qualIn <- tbl$qual
# all the first halves say 'splice1'
isFront <- grep( "::splice1", nameIn, fixed=T)
isBack <- grep( "::splice2", nameIn, fixed=T)
# grab all the partial items we will need
nameOut <- sub( "::splice[12]", "", nameIn)
nameFront <- nameOut[ isFront]
nameBack <- nameOut[ isBack]
# to be a usable read, we need to see both halves of the same readID
frontHitsBack <- match( nameFront, nameBack, nomatch=0)
keepers <- isFront[ frontHitsBack > 0]
keepBack <- isBack[ frontHitsBack]
# grab that subset of the given table as the result, then update the bits that need it
out <- tbl[ keepers, ]
out$name <- nameOut[ keepers]
out$seq <- paste( seqIn[keepers], seqIn[keepBack], sep="")
out$qual <- paste( qualIn[keepers], qualIn[keepBack], sep="")
# all done, just these pairs that got resolved go back
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.