#' @export
#' @importFrom Biostrings reverseComplement QualityScaledDNAStringSet DNAStringSet PhredQuality
#' @importFrom S4Vectors DataFrame metadata<-
#' @importFrom BiocParallel bpmapply SerialParam bpstart bpstop bpisup
#' @importFrom BiocGenerics rownames<-
#' @importFrom ShortRead FastqStreamer yield
adaptorAlign <- function(adaptor1, adaptor2, filepath, tolerance=250, gapOpening=5, gapExtension=1,
qual.type=c("phred", "solexa", "illumina"), number=1e5, BPPARAM=SerialParam())
# This function aligns both adaptors to the read sequence with the specified parameters,
# and returns the alignments that best match the sequence (with reverse complementing if necessary).
#
# written by Florian Bieberich
# with modifications by Aaron Lun
# created 7 November 2017
{
adaptor1 <- toupper(as.character(adaptor1))
adaptor2 <- toupper(as.character(adaptor2))
qual.type <- match.arg(qual.type)
qual.class <- .qual2class(qual.type)
all.args <- list(adaptor1=adaptor1, adaptor2=adaptor2, tolerance=tolerance,
subseq1=.setup_subseqs(adaptor1), subseq2=.setup_subseqs(adaptor2),
gap.opening=gapOpening, gap.extension=gapExtension)
# Looping across reads.
fhandle <- FastqStreamer(filepath, n=number)
on.exit(close(fhandle))
all.starts <- all.ends <- all.names <- all.rev <- all.widths <- list()
counter <- 1L
if (!bpisup(BPPARAM)) {
bpstart(BPPARAM)
on.exit(bpstop(BPPARAM), add=TRUE)
}
while (length(fq <- yield(fhandle))) {
reads <- .FASTQ2QSDS(fq, qual.class)
by.cores <- .parallelize(reads, BPPARAM)
out <- bpmapply(by.cores, FUN=.align_AA_internal, MoreArgs=all.args, BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)
all.names[[counter]] <- lapply(out, "[[", i="names")
all.widths[[counter]] <- lapply(out, "[[", i="width")
all.starts[[counter]] <- lapply(out, "[[", i="start")
all.ends[[counter]] <- lapply(out, "[[", i="end")
all.rev[[counter]] <- lapply(out, "[[", i="reversed")
counter <- counter + 1L
}
if (counter==1L) {
# Guarantee some value is returned.
out <- do.call(.align_AA_internal, c(list(reads=QualityScaledDNAStringSet(DNAStringSet(), PhredQuality(0))), all.args))
all.names[[counter]] <- out$names
all.widths[[counter]] <- out$width
all.starts[[counter]] <- out$start
all.ends[[counter]] <- out$end
all.rev[[counter]] <- out$reversed
}
align_start <- do.call(rbind, unlist(all.starts, recursive=FALSE))
align_end <- do.call(rbind, unlist(all.ends, recursive=FALSE))
details <- list(gapOpening=gapOpening, gapExtension=gapExtension)
metadata(align_start) <- c(list(sequence=adaptor1), details)
metadata(align_end) <- c(list(sequence=adaptor2), details)
# Adjusting the reverse coordinates for the read length.
old.start <- align_end$start
old.end <- align_end$end
all.widths <- unlist(all.widths)
align_end$start <- all.widths - old.start + 1L
align_end$end <- all.widths - old.end + 1L
all.names <- unlist(all.names)
rownames(align_start) <- rownames(align_end) <- all.names
output <- DataFrame(read.width=all.widths, adaptor1=I(align_start), adaptor2=I(align_end), reversed=unlist(all.rev), row.names=all.names)
metadata(output) <- list(filepath=filepath, qual.type=qual.type, tolerance=tolerance)
return(output)
}
#########################################
#########################################
#' @importFrom Biostrings reverseComplement
#' @importFrom BiocGenerics width
#' @importFrom XVector subseq
.get_front_and_back <- function(reads, tolerance)
# Extracting the front part of the read (on the forward strand)
# and the end of the read (on the reverse strand).
{
tolerance <- pmin(tolerance, width(reads))
reads.start <- subseq(reads, start = 1, width = tolerance)
reads.end <- subseq(reads, end = width(reads), width = tolerance)
reads.end <- reverseComplement(reads.end)
return(list(front=reads.start, back=reads.end))
}
.qual2class <- function(qual.type) {
paste0(toupper(substr(qual.type, 1, 1)), substr(qual.type, 2, nchar(qual.type)), "Quality")
}
#' @importFrom Biostrings quality
#' @importFrom methods as
#' @importFrom ShortRead FastqStreamer sread id
.FASTQ2QSDS <- function(fq, qual.class) {
seq <- sread(fq)
qual <- as(quality(fq), qual.class)
reads <- QualityScaledDNAStringSet(seq, qual)
names(reads) <- as.character(id(fq))
reads
}
.resolve_strand <- function(start.score, end.score, rc.start.score, rc.end.score)
# This determines whether the read orientation needs to be flipped so that
# adaptor 1 is at the start and adaptor 2 is at the end (see ?tuneAlignments
# for an explanation of what the 'final.score' means).
{
fscore <- pmax(start.score, 0) + pmax(end.score, 0)
rscore <- pmax(rc.start.score, 0) + pmax(rc.end.score, 0)
is.reverse <- fscore < rscore
final.score <- ifelse(is.reverse, rscore, fscore)
return(list(reversed=is.reverse, scores=final.score))
}
#' @importFrom BiocParallel SerialParam bpnworkers
#' @importFrom S4Vectors head
.parallelize <- function(reads, BPPARAM)
# Splits reads across cores. Note the as.list() hack,
# as bplapply doesn't acknowledge DNAStringSetList when BPPARAM is a BatchtoolsParam object.
{
n.cores <- bpnworkers(BPPARAM)
bounds <- seq(from=1L, to=length(reads), length.out=n.cores+1L)
ids <- findInterval(seq_along(reads), head(bounds, -1))
as.list(split(reads, ids))
}
.setup_subseqs <- function(adaptor) {
mpos <- gregexpr("[^ACTG]+", adaptor)[[1]]
if (length(mpos)==1L && mpos==-1L) {
list(starts=integer(0), ends=integer(0))
} else {
list(starts=as.integer(mpos), ends=as.integer(mpos) + attr(mpos, "match.length") - 1L)
}
}
#' @importFrom Biostrings quality pattern subject aligned unaligned
#' @importFrom BiocGenerics score start end
#' @importFrom S4Vectors DataFrame
#' @importFrom XVector subseq compact
#' @importFrom methods new
#' @importClassesFrom S4Vectors DataFrame
.align_and_extract <- function(adaptor, reads, gap.opening, gap.extension, subseq.starts, subseq.ends)
# Align reads and extract alignment strings.
# This requires custom code as the Biostrings getters for the full alignment strings are not efficient.
{
quals <- quality(reads)
out <- .Call(cxx_adaptor_align, reads, quals, .create_encoding_vector(quals),
gap.opening, gap.extension,
adaptor, subseq.starts - 1L, subseq.ends)
output <- DataFrame(score=out[[1]], start=out[[2]], end=out[[3]])
if (length(subseq.starts)) {
segments <- vector("list", length(out[[4]]))
for (i in seq_along(segments)) {
segments[[i]] <- compact(subseq(reads, start=out[[4]][[i]], width=out[[5]][[i]]))
}
names(segments) <- sprintf("Sub%i", seq_along(segments))
segments <- DataFrame(segments)
} else {
segments <- new("DataFrame", nrows=length(reads))
}
output$subseq <- segments
output
}
#' @importFrom BiocGenerics width
.align_AA_internal <- function(reads, adaptor1, adaptor2, tolerance, subseq1, subseq2, ...)
# Wrapper function to pass to bplapply, along with the sarlacc namespace.
{
reads.out <- .get_front_and_back(reads, tolerance)
reads.start <- reads.out$front
reads.end <- reads.out$back
# Performing the alignment of each adaptor to the start/end of the read.
cur.starts <- .align_and_extract(adaptor=adaptor1, reads=reads.start, subseq.starts=subseq1$starts, subseq.ends=subseq1$ends, ...)
cur.ends <- .align_and_extract(adaptor=adaptor2, reads=reads.end, subseq.starts=subseq2$starts, subseq.ends=subseq2$ends, ...)
cur.rc.starts <- .align_and_extract(adaptor=adaptor1, reads=reads.end, subseq.starts=subseq1$starts, subseq.ends=subseq1$ends, ...)
cur.rc.ends <- .align_and_extract(adaptor=adaptor2, reads=reads.start, subseq.starts=subseq2$starts, subseq.ends=subseq2$ends, ...)
# Standardizing the strand.
strand.out <- .resolve_strand(cur.starts$score, cur.ends$score, cur.rc.starts$score, cur.rc.ends$score)
is_reverse <- strand.out$reversed
cur.starts[is_reverse,] <- cur.rc.starts[is_reverse,]
cur.ends[is_reverse,] <- cur.rc.ends[is_reverse,]
list(names=names(reads), width=width(reads), start=cur.starts, end=cur.ends, reversed=is_reverse)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.