#' Build an ERVmap parameter object
#'
#' Build an object of the class \code{ERVmapParam}
#'
#' @param bfl A \code{BamFile} or \code{BamFileList} object, or a character
#' string vector of BAM filenames.
#'
#' @param teFeatures A \code{GRanges} or \code{GRangesList} object with the
#' transposable element (TE) annotated features to be quantified. Elements in
#' this object should have names, which are used as a grouping factor for
#' genomic ranges forming a common locus, unless other metadata column names
#' are specified in the \code{aggregateby} parameter.
#'
#' @param aggregateby Character vector with column names in the annotation
#' to be used to aggregate quantifications. By default, this is an empty
#' vector, which means that the names of the input \code{GRanges} or
#' \code{GRangesList} object given in the \code{teFeatures} parameter are used
#' to aggregate quantifications.
#'
#' @param ovMode Character vector indicating the overlapping mode. Available
#' options are: "ovUnion" (default) and "ovIntersectionStrict",
#' which implement the corresponding methods from HTSeq
#' (\url{https://htseq.readthedocs.io/en/release_0.11.1/count.html}).
#' Ambiguous alignments (alignments overlapping > 1 feature) are addressed
#' as in the original ERVmap algorithm.
#'
#' @param geneFeatures (Default NULL) A \code{GRanges} or
#' \code{GRangesList} object with the
#' gene annotated features to be quantified. Overlaps with unique reads are
#' first tallied with respect to these gene features. Elements should have
#' names indicating the gene name/id. In case that \code{geneFeatures}
#' is a \code{GRanges} and contains
#' a metadata column named \code{type}, only the elements with
#' \code{type} = \code{exon} are considered for the analysis. Then, exon
#' counts are summarized to the gene level. If NULL, gene expression is
#' not quantified.
#'
#' @param singleEnd (Default TRUE) Logical value indicating if reads are single
#' (\code{TRUE}) or paired-end (\code{FALSE}).
#'
#' @param strandMode (Default 1) Numeric vector which can take values 0, 1 or
#' 2.
#' The strand mode is a per-object switch on
#' \code{\link[GenomicAlignments:GAlignmentPairs-class]{GAlignmentPairs}}
#' objects that controls the behavior of the strand getter. See
#' \code{\link[GenomicAlignments:GAlignmentPairs-class]{GAlignmentPairs}}
#' class for further detail. If \code{singleEnd = TRUE}, then
#' \code{strandMode} is ignored.
#'
#' @param ignoreStrand (Default TRUE) A logical which defines if the strand
#' should be taken into consideration when computing the overlap between reads
#' and annotated features. When \code{ignore_strand = FALSE}, an aligned read
#' is considered to be overlapping an annotated feature as long as they
#' have a non-empty intersecting genomic range on the same strand, while when
#' \code{ignoreStrand = TRUE} the strand is not considered.
#'
#' @param fragments (Default not \code{singleEnd}) A logical; applied to
#' paired-end data only. When \code{fragments=TRUE}, the read-counting
#' method in the original ERVmap algorithm is applied: each mate of a
#' paired-end read is counted (including ambiguous and not properly paired
#' reads). When
#' \code{fragments=FALSE}, if the two mates of a paired-end read map to the
#' same element, they are counted as a single hit and singletons, reads with
#' unmapped pairs and other ambiguous or not properly paired fragments are
#' not counted (see "Pairing criteria" in
#' \code{\link[GenomicAlignments]{readGAlignments}()}).
#'
#' @param maxMismatchRate (Default 0.02) Numeric value storing the maximum
#' mismatch rate employed by the ERVmap algorithm to discard aligned reads
#' whose rate of sum of hard and soft clipping or whose rate of the edit
#' distance over the genome reference to the length of the read is above this
#' threshold.
#'
#' @param suboptimalAlignmentTag (Default "auto") Character string storing the
#' tag name in the BAM files that stores the suboptimal alignment score used in
#' the third filter of ERVmap; see
#' \href{https://doi.org/10.1073/pnas.1814589115}{Tokuyama et al. (2018)}.
#' The default, \code{suboptimalAlignmentTag="auto"}, first extracts the name
#' of the read mapper software from one or more BAM files. If BAM files were
#' generated by BWA, the suboptimal alignment scores are obtained from a tag
#' called \code{XS}. For other read mappers, the suboptimal alignment score
#' is considered to be missing since, except from BWA, no other aligner
#' provides a tag with suboptimal alignment scores. In this case, the available
#' secondary alignments are used to implement an analogous approach to that
#' of the third ERVmap filter. When \code{suboptimalAlignmentTag="none"}, it
#' also performs the latter approach even when the tag \code{XS} is available.
#' When this parameter is different from \code{"auto"} and \code{"none"}, a tag
#' with the given name is used to extract the suboptimal alignment score.
#'
#' @param suboptimalAlignmentCutoff (Default 5) Numeric value storing the
#' cutoff above which the difference between the alignment score and the
#' suboptimal alignment score is considered sufficiently large to retain the
#' alignment. When this value is set to \code{NA}, the filtering step based on
#' suboptimal alignment scores is skipped.
#'
#' @param geneCountMode (Default \code{"all"}) Character string indicating if
#' the ERVmap read filters applied to quantify TEs expression should also be
#' applied when quantifying gene expression (\code{"ervmap"}) or not
#' (\code{"all"}), in which case all primary alignments mapping to genes are
#' counted.
#'
#' @param verbose (Default \code{TRUE}) Logical value indicating whether to
#' report progress.
#'
#' @details
#' This is the constructor function for objects of the class
#' \code{ERVmapParam-class}. This type of object is the input to the
#' function \code{\link{qtex}()} for quantifying expression of transposable
#' elements using the ERVmap method
#' \href{https://doi.org/10.1073/pnas.1814589115}{Tokuyama et al. (2018)}. The
#' ERVmap algorithm processes reads following conservative filtering criteria
#' to provide reliable raw count data for each TE.
#'
#' @return A \linkS4class{ERVmapParam} object.
#'
#' @examples
#' bamfiles <- list.files(system.file("extdata", package="atena"),
#' pattern="*.bam", full.names=TRUE)
#' \dontrun{
#' rmskat <- annotaTEs(genome="dm6", parsefun=rmskatenaparser,
#' strict=FALSE, insert=500)
#' rmskLTR <- getLTRs(rmskat, relLength=0.8,
#' fullLength=TRUE,
#' partial=TRUE,
#' otherLTR=TRUE)
#' }
#'
#' ## DO NOT TYPE THIS INSTRUCTION, WHICH JUST LOADS A PRE-COMPUTED ANNOTATION
#' ## YOU SHOULD USE THE INSTRUCTIONS ABOVE TO FETCH ANNOTATIONS
#' rmskLTR <- readRDS(system.file("extdata", "rmskatLTRrlen80flenpartoth.rds",
#' package="atena"))
#'
#' ## build a parameter object for ERVmap
#' empar <- ERVmapParam(bamfiles,
#' teFeatures=rmskLTR,
#' singleEnd=TRUE,
#' ignoreStrand=TRUE,
#' suboptimalAlignmentCutoff=NA)
#' empar
#'
#' @references
#' Tokuyama M et al. ERVmap analysis reveals genome-wide transcription of human
#' endogenous retroviruses. PNAS. 2018;115(50):12565-12572. DOI:
#' \url{https://doi.org/10.1073/pnas.1814589115}
#'
#' @importFrom methods is new
#' @importFrom BiocGenerics path
#' @importFrom cli cli_alert_info cli_alert_success
#' @export
#' @rdname ERVmapParam-class
ERVmapParam <- function(bfl, teFeatures, aggregateby=character(0),
ovMode="ovUnion",
geneFeatures=NULL,
singleEnd=TRUE,
ignoreStrand=TRUE,
strandMode=1L,
fragments=!singleEnd,
maxMismatchRate=0.02,
suboptimalAlignmentTag="auto",
suboptimalAlignmentCutoff=5,
geneCountMode="all",
verbose=TRUE) {
if (verbose)
cli_alert_info("Locating BAM files")
bfl <- .checkBamFileListArgs(bfl, singleEnd, fragments)
if (!ovMode %in% c("ovUnion","ovIntersectionStrict"))
stop("'ovMode' should be one of 'ovUnion', 'ovIntersectionStrict'")
readmapper <- .checkBamReadMapper(path(bfl))
if (verbose)
cli_alert_info("Processing features")
teFeaturesobjname <- deparse(substitute(teFeatures))
geneFeaturesobjname <- deparse(substitute(geneFeatures))
features <- .processFeatures(teFeatures, teFeaturesobjname,
geneFeatures, geneFeaturesobjname,
aggregateby, aggregateexons=TRUE)
obj <- new("ERVmapParam", bfl=bfl, features=features,
aggregateby=aggregateby, ovMode=ovMode,
singleEnd=singleEnd, ignoreStrand=ignoreStrand,
strandMode=as.integer(strandMode), fragments=fragments,
maxMismatchRate=maxMismatchRate, readMapper=readmapper,
suboptimalAlignmentTag=suboptimalAlignmentTag,
suboptimalAlignmentCutoff=as.numeric(suboptimalAlignmentCutoff),
geneCountMode=geneCountMode)
if (verbose)
cli_alert_success("Parameter object successfully created")
obj
}
#' @param object A \linkS4class{ERVmapParam} object.
#'
#' @importFrom methods show
#' @importFrom GenomeInfoDb seqlevels
#'
#' @export
#' @aliases show,ERVmapParam-method
#' @rdname ERVmapParam-class
setMethod("show", "ERVmapParam",
function(object) {
cat(class(object), "object\n")
cat(sprintf("# BAM files (%d): %s\n", length(object@bfl),
.pprintnames(names(object@bfl))))
cat(sprintf("# features (%d): %s\n", length(features(object)),
ifelse(is.null(names(features(object))),
paste("on",
.pprintnames(seqlevels(features(object)))),
.pprintnames(names(features(object))))))
cat(sprintf("# %s, %s",
ifelse(object@singleEnd, "single-end", "paired-end"),
ifelse(object@ignoreStrand, "unstranded", "stranded")))
if (!object@ignoreStrand)
cat(sprintf(" (strandMode=%d)", object@strandMode))
if (!object@singleEnd)
cat(sprintf(", %s",
ifelse(object@fragments,
"counting each paired-end mate",
"counting both paired-end mates")))
cat("\n")
})
#' @importFrom BiocParallel SerialParam bplapply
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @export
#' @aliases qtex
#' @aliases qtex,ERVmapParam-method
#' @rdname qtex
setMethod("qtex", "ERVmapParam",
function(x, phenodata=NULL, mode=ovUnion, yieldSize=1e6L, verbose=1,
BPPARAM=SerialParam(progressbar=ifelse(verbose==1, TRUE, FALSE))) {
.checkPhenodata(phenodata, length(x@bfl))
cnt <- bplapply(x@bfl, .qtex_ervmap, empar=x, mode=x@ovMode,
yieldSize=yieldSize, verbose=verbose,
BPPARAM=BPPARAM)
cnt <- do.call("cbind", cnt)
cdata <- .createColumnData(cnt, phenodata)
colnames(cnt) <- rownames(cdata)
features <- .consolidateFeatures(x, rownames(cnt))
SummarizedExperiment(assays=list(counts=cnt),
rowRanges=features,
colData=cdata)
})
#' @importFrom BiocGenerics basename path
#' @importFrom Rsamtools ScanBamParam yieldSize yieldSize<-
#' @importFrom methods formalArgs
#' @importFrom S4Vectors mcols
#' @importFrom IRanges findOverlapPairs
#' @importFrom GenomicRanges pintersect
.qtex_ervmap <- function(bf, empar, mode, yieldSize=1000000, verbose) {
iste <- mcols(features(empar))$isTE
if (any(duplicated(names(features(empar)[iste]))))
stop(".qtex_ervmap: duplicated names in TE annotations.")
mode <- match.fun(mode)
readfun <- .getReadFunction(empar@singleEnd, empar@fragments)
## suboptimalAlignmentTag set to 'auto' implies following the original
## ERVmap algorithm; when the read mapper is BWA, it is assumed that
## the tag 'XS' stores the suboptimal alignment score.
soatag <- .getsoatag(empar@suboptimalAlignmentTag, empar@readMapper)
avtags <- .getavtags(empar, bf, soatag)
avgene <- !all(iste)
sbflags <- .setBamFlags(bf, empar, avtags, avgene)
avsoas <- .soasAvail(empar@suboptimalAlignmentCutoff, soatag, avtags, bf,
verbose)
if ((!is.na(empar@suboptimalAlignmentCutoff)) &
isTRUE(soatag %in% avtags)) {
empar@suboptimalAlignmentTag <- soatag
}
param <- ScanBamParam(flag=sbflags, what="flag", tag=avtags)
## 'ov' is a 'Hits' object with the overlaps between reads and features
ov <- Hits(nLnode=0, nRnode=length(features(empar)), sort.by.query=TRUE)
## 'ovdiscard' is a 'Hits' object with the overlaps of discarded reads
ovdiscard <- Hits(nLnode=0, nRnode=length(features(empar)[!iste]),
sort.by.query=TRUE)
## 'salnmask' is a logical mask flagging whether an alignment is secondary
salnmask <- logical(0)
salnbestAS <- integer(0)
## 'alnAS' stores the alignment score
alnAS <- integer(0)
## 'alnNH' stores the NH tag
alnNH <- integer(0)
## 'readids' stores unique read identifiers only once, while 'alnreadidx'
## stores for each read index of the corresponding unique read identifier
readids <- character(0)
alnreadidx <- integer(0) # index in 'readids' for alignm. passing filters
readidx <- integer(0) # index in 'readids' for all alignm. (passing & not)
thisalnAS <- integer(0)
alnreadids <- character(0)
n <- nfiltered <- 0
strand_arg <- "strandMode" %in% formalArgs(readfun)
yieldSize(bf) <- yieldSize
cntvec <- .ervmapQuantExpress(bf, empar, mode, readfun, iste, param,avsoas,
strand_arg, avgene, n, ov, ovdiscard,
salnmask, salnbestAS, alnAS, alnNH, readids,
alnreadidx, readidx, thisalnAS, alnreadids,
nfiltered, verbose, avtags)
cntvec
}
#' @importFrom methods formalArgs as
#' @importFrom IRanges findOverlapPairs
#' @importFrom GenomicRanges pintersect
#' @importFrom GenomicAlignments findOverlaps
#' @importFrom S4Vectors countQueryHits countSubjectHits
.ervmapQuantExpress <- function(bf, empar, mode, readfun, iste, param, avsoas,
strand_arg, avgene, n, ov, ovdiscard, salnmask,
salnbestAS, alnAS, alnNH, readids, alnreadidx,
readidx, thisalnAS, alnreadids, nfiltered,
verbose, avtags) {
open(bf)
while (length(alnreads <- do.call(readfun,
c(list(file=bf), list(param=param),
list(strandMode=empar@strandMode)[strand_arg],
list(use.names=(!avsoas || avgene)))))) {
alnreads <- .matchSeqinfo(alnreads, features(empar))
n <- n + length(alnreads)
thissalnmask <- .secondaryAlignmentMask(alnreads) ## second. aln mask
mask <- .pass2Filters(alnreads, empar) ## which alignments pass flt 1&2
if (!is.na(empar@suboptimalAlignmentCutoff)) {
thisalnAS <- .getAlignmentTagScore(alnreads, tag="AS")
mask <- .pass3Filter(mask, alnreads, empar, thisalnAS, thissalnmask,
avsoas)
}
if (avgene || (!is.na(empar@suboptimalAlignmentCutoff) & !avsoas)) {
alnreadids <- .getAlnreadids(alnreads, empar@fragments)
}
mask[thissalnmask] <- TRUE # Setting TRUE in mask to secondary alignments
alnreadsdiscard <- .getreadsdiscard(empar, alnreads, mask, thissalnmask)
alnreads <- .filteralnreads(empar,alnreads,mask,thissalnmask,avsoas,avgene)
thisov <- mode(alnreads, features(empar), ignoreStrand=empar@ignoreStrand,
inter.feature=FALSE)
ov <- .appendHits(ov, thisov)
if (avgene && empar@geneCountMode == "all") {
## calculate and store overlaps between the discarded reads & genes
thisovdiscard <- mode(alnreadsdiscard, features(empar)[!iste],
ignoreStrand=empar@ignoreStrand,
inter.feature=FALSE)
ovdiscard <- .appendHits(ovdiscard, thisovdiscard)
}
## Reading NH tag if secondary alignments are not available
alnNH <- .fetchNHtag(thissalnmask, avgene, avtags, alnreads)
salnmask <- c(salnmask, thissalnmask[mask]) # secondary alignment mask
if (!avsoas && !is.na(empar@suboptimalAlignmentCutoff)) {
## store best secondary alignment score
salnbestAS <- .findSuboptAlignScore(thisalnAS, thissalnmask,
alnreadids, salnbestAS)
## filter and store alignment scores
alnAS <- c(alnAS, thisalnAS[mask])
}
## filter now read identifiers
listreadids <- .filterReadIDs(avgene, empar, avsoas, alnreadids,
readids, readidx, alnreadidx, mask)
readids <- listreadids$readids
readidx <- listreadids$readidx
alnreadidx <- listreadids$alnreadidx
if (verbose > 1) {
nfiltered <- nfiltered + sum(mask & !thissalnmask)
message(sprintf("%s: %d alignments processed (%d are primary and pass filters).\n",
basename(path(bf)), n, nfiltered))
}
}
on.exit(close(bf))
if (length(ov) == 0) {
stop(".ervmapQuantExpress: no overlaps were found between reads and features")
}
## Expression quantification
cntvec <- .ervmapGetCounts(avsoas, salnmask, empar, avgene, ov, alnreadidx,
alnAS, salnbestAS, readidx, mask, alnNH, readids,
bf, ovdiscard, n, verbose, iste)
cntvec
}
## Function to know which tags are present in the BAM file (single-end).
## Input is a BamFile object. Returns a data.frame with tags as columns and the
## BAM file as row, indicating if each tag is present in the BAM file (TRUE) or
## not (FALSE).
#' @importFrom Rsamtools scanBamFlag ScanBamParam yieldSize yieldSize<-
#' @importFrom S4Vectors mcols
.get_tags_in_BAM_singleend <- function(bf, soatag="XS") {
yieldSize(bf) <- 1000
sbflags <- scanBamFlag(isUnmappedQuery = FALSE,
isDuplicate = FALSE,
isNotPassingQualityControls = FALSE,
isSupplementaryAlignment = FALSE,
## XS is only set by BWA for primary alignments
isSecondaryAlignment=FALSE)
param <- ScanBamParam(flag = sbflags,
tag = c("nM", "NM", "AS", "NH", soatag))
tags <- param@tag
tags_df <- data.frame(matrix(nrow = length(bf), ncol = length(tags)))
colnames(tags_df) <- tags
open(bf)
r_test <- readGAlignments(bf, param = param)
close(bf)
# Testing, for each sample, if the files contain the different tags
tags_df[, "NH"] <- ifelse(all(is.na(mcols(r_test)$NH)), FALSE, TRUE)
tags_df[, "NM"] <- ifelse(all(is.na(mcols(r_test)$NM)), FALSE, TRUE)
tags_df[, "nM"] <- ifelse(all(is.na(mcols(r_test)$nM)), FALSE, TRUE)
tags_df[, "AS"] <- ifelse(all(is.na(mcols(r_test)$AS)), FALSE, TRUE)
if (!is.null(soatag))
tags_df[, soatag] <- ifelse(all(grepl("\\d",
(mcols(r_test)[[soatag]]))),
TRUE, FALSE)
return(tags_df)
}
## Function to know which tags are present in the BAM file (paired-end).
## Input is a BamFile object. Returns a data.frame with tags as columns and the
## BAM file as row, indicating if each tag is present in the BAM file (TRUE) or
## not (FALSE).
#' @importFrom Rsamtools scanBamFlag ScanBamParam yieldSize yieldSize<- asMates asMates<-
#' @importFrom S4Vectors mcols
.get_tags_in_BAM_pairedend <- function(bf, empar, soatag="XS") {
yieldSize(bf) <- 1000
asMates(bf) <- TRUE
sbflags <- scanBamFlag(isUnmappedQuery = FALSE,
isProperPair = TRUE,
isDuplicate = FALSE,
isNotPassingQualityControls = FALSE,
isSupplementaryAlignment = FALSE,
## XS is only set by BWA for primary alignments
isSecondaryAlignment=FALSE)
param <- ScanBamParam(flag = sbflags,
tag = c("nM", "NM", "AS", "NH", soatag))
tags <- param@tag
tags_df <- data.frame(matrix(nrow = length(bf), ncol = length(tags)))
colnames(tags_df) <- tags
open(bf)
r_test <- readGAlignmentPairs(bf, param = param,
strandMode = empar@strandMode)
close(bf)
# Testing, for each sample, if the files contain the different tags
tags_df[, "NH"] <- ifelse(all(is.na(mcols(first(r_test))$NH)), FALSE, TRUE)
tags_df[, "NM"] <- ifelse(all(is.na(mcols(first(r_test))$NM)), FALSE, TRUE)
tags_df[, "nM"] <- ifelse(all(is.na(mcols(first(r_test))$nM)), FALSE, TRUE)
tags_df[, "AS"] <- ifelse(all(is.na(mcols(first(r_test))$AS)), FALSE, TRUE)
if (!is.null(soatag))
tags_df[, soatag] <- ifelse(all(grepl("\\d",
(mcols(first(r_test))[[soatag]]))),
TRUE, FALSE)
return(tags_df)
}
#' @importFrom Rsamtools scanBamFlag
.setBamFlags <- function(bf, empar, avtags, avgene) {
sbflags <- scanBamFlag()
if (empar@singleEnd == TRUE | empar@fragments == TRUE) {
## if we use the 'XS' tag then do not read secondary alignments
if ("XS" %in% avtags && empar@readMapper == "bwa" &&
empar@suboptimalAlignmentTag %in% c("auto", "XS") && !avgene)
sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
isDuplicate=FALSE,
isNotPassingQualityControls=FALSE,
isSupplementaryAlignment=FALSE,
isSecondaryAlignment=FALSE)
else ## otherwise do not exclude secondary alignments
sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
isDuplicate=FALSE,
isNotPassingQualityControls=FALSE,
isSupplementaryAlignment=FALSE)
} else {
## if we use the 'XS' tag then do not read secondary alignments
if ("XS" %in% avtags && empar@readMapper == "bwa" &&
empar@suboptimalAlignmentTag %in% c("auto", "XS") && !avgene)
sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
isDuplicate=FALSE,
isNotPassingQualityControls=FALSE,
isSupplementaryAlignment=FALSE,
isSecondaryAlignment=FALSE,
isProperPair=TRUE)
else ## otherwise do not exclude secondary alignments
sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
isDuplicate=FALSE,
isNotPassingQualityControls=FALSE,
isSupplementaryAlignment=FALSE,
isProperPair=TRUE)
}
sbflags
}
#' @importFrom S4Vectors queryHits subjectHits
#' @importFrom Matrix rowSums
#' @importFrom sparseMatrixStats rowMaxs colSums2 rowSums2
.suboptimalAlignmentFilteringCounts <- function(empar, ov, alnreadidx, alnAS,
salnmask, salnbestAS) {
## suboptimal alignment filtering using the available secondary
## alignments. according to Tokuyama et al. (2018) pg. 12571.,
## reads/fragments are kept if the difference between the alignment
## score from BWA (AS) and the suboptimal alignment score from BWA
## (field XS) >= 5. here we calculate the suboptimal alignment score
## by searching for the secondary alignment with the highest score.
## fetch all different read identifiers from the overlapping alignments
rd_idx <- sort(unique(alnreadidx[queryHits(ov)]))
## fetch all different transcripts from the overlapping alignments
tx_idx <- sort(unique(subjectHits(ov)))
## build matrices with primary alignment flags, alignment scores (AS)
## and best secondary alignment AS
palnmat <- .buildOvValuesMatrix(empar, ov, !salnmask, alnreadidx,
rd_idx, tx_idx)
asmat <- .buildOvValuesMatrix(empar, ov, alnAS, alnreadidx,
rd_idx, tx_idx)
salnbestasmat <- .buildOvValuesMatrix(empar, ov, salnbestAS, alnreadidx,
rd_idx, tx_idx)
## AS from primary alignments
asprimaryaln <- rowMaxs(palnmat * asmat)
## fetch maximum AS from secondary alignments
salnmaxas <- rowMaxs(salnbestasmat)
salnmaxas[salnmaxas == 0] <- -99
mask <- ((asprimaryaln - salnmaxas) >= empar@suboptimalAlignmentCutoff)
palnmat <- palnmat[mask, ]
cntvec <- rep(0, length(features(empar)))
cntvec[tx_idx] <- colSums2(palnmat)
cntvec
}
#' @importFrom S4Vectors mcols first second
#' @importFrom Rsamtools bamFlagTest
.secondaryAlignmentMask <- function(aln) {
mask <- NULL
if (is(aln, "GAlignments"))
mask <- bamFlagTest(mcols(aln)$flag, "isSecondaryAlignment")
else if (is(aln, "GAlignmentPairs")) {
mask <- bamFlagTest(mcols(first(aln))$flag, "isSecondaryAlignment") |
bamFlagTest(mcols(second(aln))$flag, "isSecondaryAlignment")
} else if (is(aln, "GAlignmentsList")) {
mask <- bamFlagTest(mcols(unlist(aln, use.names=FALSE))$flag,
"isSecondaryAlignment")
} else
stop(sprintf(".secondaryAlignmentMask: wrong class %s\n", class(aln)))
mask
}
#' @importFrom S4Vectors mcols first second
.getAlignmentTagScore <- function(aln, tag, pairedsummaryfun=pmin.int) {
if (is(aln, "GAlignments"))
score <- as.integer(mcols(aln)[[tag]])
else if (is(aln, "GAlignmentPairs")) {
## take the minimum score from both mates
score <- pairedsummaryfun(as.integer(mcols(first(aln))[[tag]]),
mcols(second(aln))[[tag]])
} else if (is(aln, "GAlignmentsList")) {
score <- as.integer(mcols(unlist(aln, use.names=FALSE))[[tag]])
} else
stop(sprintf(".getAlignmentTagScore: wrong class %s\n", class(aln)))
as.integer(score)
}
#' @importFrom S4Vectors mcols
.fetchNMtag <- function(aln) {
nmtag <- "NM"
if (is(aln, "GAlignments")) {
if (is.null(mcols(aln)[[nmtag]]) || all(is.na(mcols(aln)[[nmtag]]))) {
nmtag <- "nM"
if (is.null(mcols(aln)[[nmtag]]) || all(is.na(mcols(aln)[[nmtag]])))
stop("no NM or nM tag in BAM file.")
}
} else if (is(aln, "GAlignmentPairs")) {
if (is.null(mcols(first(aln))[[nmtag]]) ||
all(is.na(mcols(first(aln))[[nmtag]]))) {
nmtag <- "nM"
if (is.null(mcols(first(aln))[[nmtag]]) ||
all(is.na(mcols(first(aln))[[nmtag]])))
stop("no NM or nM tag in BAM file.")
}
} else if (is(aln, "GAlignmentsList")) {
if (is.null(mcols(unlist(aln))[[nmtag]]) ||
all(is.na(mcols(unlist(aln))[[nmtag]]))) {
nmtag <- "nM"
if (is.null(mcols(unlist(aln))[[nmtag]]) ||
all(is.na(mcols(unlist(aln))[[nmtag]])))
stop("no NM or nM tag in BAM file.")
}
} else
stop(sprintf(".fetchNMtag: wrong class %s\n", class(aln)))
nmtag
}
#' @importFrom S4Vectors mcols first second
#' @importFrom IRanges relist
.getAlignmentMismatches <- function(aln) {
mm <- NULL
nmtag <- .fetchNMtag(aln)
if (is(aln, "GAlignments"))
mm <- mcols(aln)[[nmtag]]
else if (is(aln, "GAlignmentPairs")) {
## take the celing of the mean of mismatches from both mates
mm1 <- as.integer(mcols(first(aln))[[nmtag]])
mm2 <- as.integer(mcols(second(aln))[[nmtag]])
mm1[is.na(mm1)] <- mm2[is.na(mm1)]
mm2[is.na(mm2)] <- mm1[is.na(mm2)]
mm <- ceiling((mm1 + mm2) / 2)
} else if (is(aln, "GAlignmentsList")) {
mm <- mcols(unlist(aln, use.names=FALSE))[[nmtag]]
} else
stop(sprintf(".getAlignmentMismatches: wrong class %s\n", class(aln)))
as.integer(mm)
}
#' @importFrom S4Vectors mcols first second
#' @importFrom IRanges relist
#' @importFrom GenomicAlignments qwidth
#' @importClassesFrom GenomicAlignments GAlignments
.getAlignmentQueryWidth <- function(aln) {
qw <- NULL
if (is(aln, "GAlignments"))
qw <- qwidth(aln)
else if (is(aln, "GAlignmentPairs")) {
## take the ceiling of the mean of query widths from both mates
qw1 <- qwidth(first(aln))
qw2 <- qwidth(second(aln))
qw1[is.na(qw1)] <- qw2[is.na(qw1)]
qw2[is.na(qw2)] <- qw1[is.na(qw2)]
qw <- ceiling((qw1 + qw2) / 2)
} else if (is(aln, "GAlignmentsList")) {
qw <- qwidth(unlist(aln, use.names=FALSE))
} else
stop(sprintf(".getAlignmentQueryWidth: wrong class %s\n", class(aln)))
as.integer(qw)
}
#' @importFrom S4Vectors mcols first second List
#' @importFrom GenomicAlignments cigar explodeCigarOpLengths
.getAlignmentSumClipping <- function(aln) {
sc <- NULL
if (is(aln, "GAlignments"))
sc <- vapply(explodeCigarOpLengths(cigar(aln), ops=c("H", "S")),
FUN.VALUE = integer(1), FUN = sum)
else if (is(aln, "GAlignmentPairs")) {
## take the ceiling of the mean of query widths from both mates
sc1 <- vapply(explodeCigarOpLengths(cigar(first(aln)),ops=c("H", "S")),
FUN.VALUE = integer(1), FUN = sum)
sc2 <- vapply(explodeCigarOpLengths(cigar(second(aln)),
ops=c("H", "S")),
FUN.VALUE = integer(1), FUN = sum)
sc1[is.na(sc1)] <- sc2[is.na(sc1)]
sc2[is.na(sc2)] <- sc1[is.na(sc2)]
sc <- ceiling((sc1 + sc2) / 2)
} else if (is(aln, "GAlignmentsList")) {
sc <- vapply(explodeCigarOpLengths(cigar(unlist(aln, use.names=FALSE)),
ops=c("H", "S")),
FUN.VALUE = integer(1), FUN = sum)
} else
stop(sprintf(".getAlignmentSumClipping: wrong class %s\n", class(aln)))
as.integer(sc)
}
#' @importFrom S4Vectors queryHits subjectHits
#' @importFrom Matrix Matrix
# .buildOvValuesMatrix <- function(ov, values, aridx, ridx, fidx) {
# stopifnot(class(values) %in% c("logical", "integer", "numeric")) ## QC
# ovmat <- Matrix(do.call(class(values), list(1)),
# nrow=length(ridx), ncol=length(fidx))
# mt1 <- match(aridx[queryHits(ov)], ridx)
# mt2 <- match(subjectHits(ov), fidx)
# ovmat[cbind(mt1, mt2)] <- values[queryHits(ov)]
#
# ovmat
# }
## Obtains the name of the tag containing the suboptimal alignment score, if
## present
.getsoatag <- function(suboptimalAlignmentTag, readMapper) {
soatag <- NULL
if (suboptimalAlignmentTag == "auto" && readMapper == "bwa") {
soatag <- "XS"
} else if (nchar(suboptimalAlignmentTag) == 2) {
soatag <- suboptimalAlignmentTag
}
soatag
}
## Calls internal functions to obtain the name of the available tags depending
## on whether the data is single- or paired-end
#' @importFrom BiocGenerics basename path
.getavtags <- function(empar, bf, soatag) {
avtags <- character(0)
if (empar@singleEnd)
avtags <- .get_tags_in_BAM_singleend(bf, soatag=soatag)
else
avtags <- .get_tags_in_BAM_pairedend(bf, empar=empar, soatag=soatag)
avtags <- colnames(avtags)[unlist(avtags)]
if (!empar@suboptimalAlignmentTag %in% c("auto", "none", avtags))
stop(sprintf("suboptimal alignment tag '%s' not available in BAM file %s.",
empar@suboptimalAlignmentTag, basename(path(bf))))
avtags
}
## Looks if the suboptimal alignment score tag reported by the user is present
## in the BAM file.
#' @importFrom BiocGenerics basename path
.soasAvail <- function(suboptimalAlignmentCutoff, soatag, avtags, bf, verbose) {
avsoas <- FALSE
if (!is.na(suboptimalAlignmentCutoff)) {
if (isTRUE(soatag %in% avtags)) {
avsoas <- TRUE
if (verbose > 1) {
fstr <- "%s: using suboptimal alignment scores from tag '%s'.\n"
message(sprintf(fstr, basename(path(bf)), soatag))
}
} else if (verbose > 1) {
fstr <- paste("%s: suboptimal alignment filtering based on",
"secondary alignments.\n")
message(sprintf(fstr, basename(path(bf))))
}
} else if (verbose > 1) {
msg <- paste("suboptimal alignment cutoff value is 'NA', skipping",
"suboptimal alignment filtering.\n")
message(msg)
}
avsoas
}
## Obtains the read identifier of the alignment and, when fragments = TRUE,
## sets a different name to each mate.
.getAlnreadids <- function(alnreads, fragments) {
alnreadids <- character(0)
alnreadids <- names(alnreads)
if (fragments) ## with fragments each mate gets a different read identifier
alnreadids <- paste(rep(make.names(names(alnreads)),lengths(alnreads)),
sequence(lengths(alnreads)), sep=".")
alnreadids
}
## Find the suboptimal alignment score from the AS of the secondary alignments.
.findSuboptAlignScore <- function(thisalnAS, thissalnmask, alnreadids,
salnbestAS) {
maxsaasbyreadid <- split(thisalnAS[thissalnmask], alnreadids[thissalnmask])
maxsaasbyreadid <- vapply(maxsaasbyreadid, FUN.VALUE = integer(1), FUN=max)
mt <- match(names(maxsaasbyreadid), names(salnbestAS))
if (any(is.na(mt))) {
tmpas <- rep(NA_integer_, sum(is.na(mt)))
names(tmpas) <- names(maxsaasbyreadid)[is.na(mt)]
salnbestAS <- c(salnbestAS, tmpas)
}
mt <- match(names(maxsaasbyreadid), names(salnbestAS))
stopifnot(all(!is.na(mt))) ## QC
salnbestAS[mt] <- pmax.int(salnbestAS[mt], maxsaasbyreadid, na.rm=TRUE)
salnbestAS
}
#' @importFrom Matrix rowSums
#' @importFrom sparseMatrixStats rowSums2
.adjustcommonTEgene <- function(palnmatfilt, readidx, mask, ov, maskthird,
empar, alnreadidx, rd_idx, tx_idx, alnNH,
iste) {
## Identifying multi-mapping reads
if (length(alnNH) > 0) {
maskMulti <- alnNH > 2
} else {
maskMulti <- duplicated(readidx) | duplicated(readidx, fromLast = TRUE)
maskMulti <- maskMulti[mask]
}
maskMultimat <- .buildOvValuesMatrix(empar, ov, maskMulti, alnreadidx,
rd_idx, tx_idx)
maskMultimat <- maskMultimat[maskthird, ]
maskUniqmat <- .buildOvValuesMatrix(empar, ov, !maskMulti, alnreadidx,
rd_idx, tx_idx)
maskUniqmat <- maskUniqmat[maskthird, ]
istex <- iste[tx_idx]
## which reads overlap both genes and TEs?
idx <- (rowSums2(palnmatfilt[,istex]) > 0) &
(rowSums2(palnmatfilt[,!istex])>0)
## of these, keep only the overlap with the TE in case of multimapping read
palnmatfilt[idx,istex, drop = FALSE] <- as(as(as(
maskMultimat[idx,istex, drop=FALSE]*palnmatfilt[idx,istex, drop=FALSE],
"lMatrix"), "generalMatrix"), "CsparseMatrix")
## of these, keep only the overlap with the gene in case of a unique read
palnmatfilt[idx,!istex, drop = FALSE] <- as(as(as(
maskUniqmat[idx, !istex, drop=FALSE]*palnmatfilt[idx, !istex, drop=FALSE],
"lMatrix"), "generalMatrix"), "CsparseMatrix")
# sum((rowSums(palnmatfilt[,istex]) > 0) & (rowSums(palnmatfilt[,!istex]) > 0)) # [1] 0 (QC)
## when multi-mapping / unique reads cannot be identified because there is
## no NH and no secondary alignments in the BAM file, 'maskMulti' is FALSE
## for all reads, which causes all reads aligned to overlapping genes and
## TEs to be assigned to genes.
palnmatfilt
}
#' @importFrom S4Vectors queryHits subjectHits
#' @importFrom Matrix rowSums
#' @importFrom sparseMatrixStats rowMaxs colSums2 rowSums2
.countbymatrix <- function(empar, ov, alnreadidx, salnmask, alnAS, salnbestAS,
avgene, applysoasfilter, readidx, mask, alnNH,
iste) {
## fetch all different read identifiers from the overlapping alignments
rd_idx <- sort(unique(alnreadidx[queryHits(ov)]))
## fetch all different transcripts from the overlapping alignments
tx_idx <- sort(unique(subjectHits(ov)))
## build matrix with overlaps of primary alignments
palnmat <- .buildOvValuesMatrix(empar, ov, !salnmask, alnreadidx,
rd_idx, tx_idx)
maskthird <- rep(TRUE, nrow(palnmat))
if (isTRUE(applysoasfilter)) {
## build matrices with alignment scores (AS) and best secondary AS
asmat <- .buildOvValuesMatrix(empar, ov, alnAS, alnreadidx, rd_idx,
tx_idx)
salnbestasmat <- .buildOvValuesMatrix(empar, ov, salnbestAS,
alnreadidx, rd_idx, tx_idx)
## AS from primary alignments
asprimaryaln <- rowMaxs(palnmat * asmat)
## fetch maximum AS from secondary alignments
salnmaxas <- rowMaxs(salnbestasmat)
salnmaxas[salnmaxas == 0] <- -99
maskthird <- ((asprimaryaln - salnmaxas) >=
empar@suboptimalAlignmentCutoff)
}
cntvec <- rep(0, length(features(empar)))
if (isTRUE(avgene)) {
palnmatadj <- .adjustcommonTEgene(palnmatfilt = palnmat[maskthird, ],
readidx, mask, ov, maskthird, empar,
alnreadidx, rd_idx, tx_idx, alnNH,iste)
cntvec[tx_idx] <- colSums2(palnmatadj)
## counting counts of discarded reads mapping to genes
if (empar@geneCountMode == "all" && applysoasfilter) {
cntvecdisc <- rep(0, length(features(empar)))
cntvecdisc[tx_idx] <- colSums2(palnmat[!maskthird, ])
cntvec <- cntvec + cntvecdisc
}
} else {
cntvec[tx_idx] <- colSums2(palnmat[maskthird, ])
}
cntvec
}
.filteralnreads <- function(empar, alnreads, mask, thissalnmask,
avsoas, avgene) {
if (empar@fragments) {
if ((avsoas || is.na(empar@suboptimalAlignmentCutoff)) && !avgene) {
alnreads <- unlist(alnreads)[mask & !thissalnmask]
} else {
alnreads <- unlist(alnreads)[mask]
}
} else {
if ((avsoas || is.na(empar@suboptimalAlignmentCutoff)) && !avgene) {
alnreads <- alnreads[mask & !thissalnmask]
} else {
alnreads <- alnreads[mask]
}
}
alnreads
}
#' @importFrom BiocGenerics basename path
.ervmapGetCounts <- function(avsoas, salnmask, empar, avgene, ov, alnreadidx,
alnAS, salnbestAS, readidx, mask, alnNH, readids,
bf, ovdiscard, n, verbose, iste) {
## if suboptimal alignment scores are available or they are not but
## we do not have secondary alignments either, then simply count filtered
## reads. In the latter case, we issue a warning.
if (avsoas || !any(salnmask) || is.na(empar@suboptimalAlignmentCutoff)) {
if (avgene) {
cntvec <- .countbymatrix(empar, ov, alnreadidx, salnmask, alnAS,
salnbestAS, avgene, applysoasfilter = FALSE,
readidx, mask, alnNH, iste)
} else {
cntvec <- countSubjectHits(ov)
}
if (!is.na(empar@suboptimalAlignmentCutoff) && !avsoas) {
msg <- paste("suboptimal alignment scores and secondary",
"alignments not available: skipping suboptimal",
"alignment filtering.")
warning(msg)
}
} else {
salnbestAS <- salnbestAS[match(readids[alnreadidx], names(salnbestAS))]
salnbestAS[is.na(salnbestAS)] <- -99L
rm(readids)
gc()
cntvec <- .countbymatrix(empar, ov, alnreadidx, salnmask, alnAS,
salnbestAS, avgene, applysoasfilter = TRUE,
readidx, mask, alnNH, iste)
if (verbose > 1) {
fstr <- paste("%s: %d alignments processed (%d are primary and",
"pass subptimal alignment filtering).\n")
message(sprintf(fstr, basename(path(bf)), n, sum(cntvec)))
}
}
names(cntvec) <- names(features(empar))
if (isTRUE(avgene) && empar@geneCountMode == "all") {
## Adding counts of reads not passing ERVmap filters to genes
cntvec[!iste] <- cntvec[!iste] + countSubjectHits(ovdiscard)
}
if (length(empar@aggregateby) > 0) {
if (avgene) {
f <- .factoraggregateby(features(empar)[iste,], empar@aggregateby)
f <- c(f, names(features(empar))[!iste])
} else {
f <- .factoraggregateby(features(empar), empar@aggregateby)
}
stopifnot(length(f) == length(cntvec)) ## QC
cntvec <- tapply(cntvec, f, sum, na.rm=TRUE)
}
cntvec
}
.getreadsdiscard <- function(empar, alnreads, mask, thissalnmask) {
if (empar@fragments) {
alnreadsdiscard <- unlist(alnreads)[!mask & !thissalnmask]
} else {
alnreadsdiscard <- alnreads[!mask & !thissalnmask]
}
alnreadsdiscard
}
.fetchNHtag <- function(thissalnmask, avgene, avtags, alnreads) {
if (!any(thissalnmask) & avgene) {
if (isTRUE("NH" %in% avtags)) {
thisNH <- .getAlignmentTagScore(alnreads, tag="NH")
alnNH <- c(alnNH,thisNH)
return(alnNH)
} else {
warning("secondary alignments and NH tag not available in BAM file. Unique reads cannot be differentiated from multi-mapping reads. All reads aligned to overlapping regions between genes and TEs will be considered as gene counts.")
}
}
}
.pass2Filters <- function(alnreads, empar) {
aqw <- .getAlignmentQueryWidth(alnreads)
anm <- .getAlignmentMismatches(alnreads)
asc <- .getAlignmentSumClipping(alnreads)
## Tokuyama et al. (2018) pg. 12571. reads/fragments are kept
## if: (1) the ratio of sum of hard and soft clipping to the
## sequence read length is < 0.02; (2) the ratio of the edit
## distance to the sequence read length is < 0.02
mask <- ((asc / aqw) < empar@maxMismatchRate) &
((anm / aqw) < empar@maxMismatchRate)
mask
}
.pass3Filter <- function(mask, alnreads, empar, thisalnAS, thissalnmask,
avsoas) {
if (avsoas) {
sas <- .getAlignmentTagScore(alnreads, tag=empar@suboptimalAlignmentTag)
## (3) the difference between the alignment score from BWA (AS)
## and the suboptimal alignment score from BWA (field XS) >= 5,
mask <- mask & (thisalnAS - sas >= empar@suboptimalAlignmentCutoff)
} else if (!any(thissalnmask)) {
stop("The BAM file does not contain secondary alignments nor suboptimal alignment scores. Either set suboptimalAlignmentCutoff=NA or provide a BAM file with secondary alignments or suboptimal alignment scores.")
}
mask
}
.filterReadIDs <- function(avgene, empar, avsoas, alnreadids,
readids, readidx, alnreadidx, mask) {
if (avgene || (!is.na(empar@suboptimalAlignmentCutoff) & !avsoas)) {
## store unique read identifiers and what alignment comes from what read
## First multiple alignments from the same read in the same chunk are
## identified
alnreadids_uniq <- unique(alnreadids)
mt <- match(alnreadids, alnreadids_uniq)
l <- length(alnreadids_uniq)
alnreadids_uniq <- c(alnreadids_uniq, alnreadids_uniq[is.na(mt)])
mt[is.na(mt)] <- (l+1):(l+sum(is.na(mt)))
chunkidx <- mt
# Second, the index vectors are created
mt <- match(alnreadids_uniq, readids)
l <- length(readids)
# readids contains just once every readID of all reads in the BAM file
readids <- c(readids, alnreadids_uniq[is.na(mt)])
mt[is.na(mt)] <- (l+1):(l+sum(is.na(mt)))
# index for all alignments in the BAM file
readidx <- c(readidx, mt[chunkidx])
# index for all alignments with mask == TRUE.
alnreadidx <- c(alnreadidx, mt[chunkidx][mask])
}
list(readids = readids, readidx = readidx, alnreadidx = alnreadidx)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.