R/ERVmap.R

Defines functions .filterReadIDs .pass3Filter .pass2Filters .fetchNHtag .getreadsdiscard .ervmapGetCounts .filteralnreads .countbymatrix .adjustcommonTEgene .findSuboptAlignScore .getAlnreadids .soasAvail .getavtags .getsoatag .getAlignmentSumClipping .getAlignmentQueryWidth .getAlignmentMismatches .fetchNMtag .getAlignmentTagScore .secondaryAlignmentMask .suboptimalAlignmentFilteringCounts .setBamFlags .get_tags_in_BAM_pairedend .get_tags_in_BAM_singleend .ervmapQuantExpress .qtex_ervmap ERVmapParam

Documented in ERVmapParam

#' Build an ERVmap parameter object
#'
#' Build an object of the class \code{ERVmapParam}
#'
#' @param bfl A \code{BamFile} or \code{BamFileList} object, or a character
#' string vector of BAM filenames.
#'
#' @param teFeatures A \code{GRanges} or \code{GRangesList} object with the
#' transposable element (TE) annotated features to be quantified. Elements in 
#' this object should have names, which are used as a grouping factor for 
#' genomic ranges forming a common locus, unless other metadata column names 
#' are specified in the \code{aggregateby} parameter.
#'
#' @param aggregateby Character vector with column names in the annotation
#' to be used to aggregate quantifications. By default, this is an empty
#' vector, which means that the names of the input \code{GRanges} or 
#' \code{GRangesList} object given in the \code{teFeatures} parameter are used
#' to aggregate quantifications.
#' 
#' @param ovMode Character vector indicating the overlapping mode. Available
#' options are: "ovUnion" (default) and "ovIntersectionStrict",
#' which implement the corresponding methods from HTSeq
#' (\url{https://htseq.readthedocs.io/en/release_0.11.1/count.html}).
#' Ambiguous alignments (alignments overlapping > 1 feature) are addressed
#' as in the original ERVmap algorithm.
#'
#' @param geneFeatures (Default NULL) A \code{GRanges} or 
#' \code{GRangesList} object with the
#' gene annotated features to be quantified. Overlaps with unique reads are 
#' first tallied with respect to these gene features. Elements should have 
#' names indicating the gene name/id. In case that \code{geneFeatures} 
#' is a \code{GRanges} and contains
#' a metadata column named \code{type}, only the elements with 
#' \code{type} = \code{exon} are considered for the analysis. Then, exon
#' counts are summarized to the gene level. If NULL, gene expression is
#' not quantified.
#'
#' @param singleEnd (Default TRUE) Logical value indicating if reads are single
#' (\code{TRUE}) or paired-end (\code{FALSE}).
#'
#' @param strandMode (Default 1) Numeric vector which can take values 0, 1 or 
#' 2.
#'   The strand mode is a per-object switch on
#'   \code{\link[GenomicAlignments:GAlignmentPairs-class]{GAlignmentPairs}}
#'   objects that controls the behavior of the strand getter. See
#'   \code{\link[GenomicAlignments:GAlignmentPairs-class]{GAlignmentPairs}}
#'   class for further detail. If \code{singleEnd = TRUE}, then
#'   \code{strandMode} is ignored.
#'
#' @param ignoreStrand (Default TRUE) A logical which defines if the strand
#' should be taken into consideration when computing the overlap between reads
#' and annotated features. When \code{ignore_strand = FALSE}, an aligned read
#' is considered to be overlapping an annotated feature as long as they
#' have a non-empty intersecting genomic range on the same strand, while when
#' \code{ignoreStrand = TRUE} the strand is not considered.
#'
#' @param fragments (Default not \code{singleEnd}) A logical; applied to
#' paired-end data only. When \code{fragments=TRUE}, the read-counting
#' method in the original ERVmap algorithm is applied: each mate of a
#' paired-end read is counted (including ambiguous and not properly paired 
#' reads). When
#' \code{fragments=FALSE}, if the two mates of a paired-end read map to the
#' same element, they are counted as a single hit and singletons, reads with
#' unmapped pairs and other ambiguous or not properly paired fragments are 
#' not counted (see "Pairing criteria" in 
#' \code{\link[GenomicAlignments]{readGAlignments}()}).
#'
#' @param maxMismatchRate (Default 0.02) Numeric value storing the maximum
#' mismatch rate employed by the ERVmap algorithm to discard aligned reads
#' whose rate of sum of hard and soft clipping or whose rate of the edit
#' distance over the genome reference to the length of the read is above this
#' threshold.
#'
#' @param suboptimalAlignmentTag (Default "auto") Character string storing the
#' tag name in the BAM files that stores the suboptimal alignment score used in
#' the third filter of ERVmap; see
#' \href{https://doi.org/10.1073/pnas.1814589115}{Tokuyama et al. (2018)}.
#' The default, \code{suboptimalAlignmentTag="auto"}, first extracts the name
#' of the read mapper software from one or more BAM files. If BAM files were
#' generated by BWA, the suboptimal alignment scores are obtained from a tag
#' called \code{XS}. For other read mappers, the suboptimal alignment score
#' is considered to be missing since, except from BWA, no other aligner
#' provides a tag with suboptimal alignment scores. In this case, the available
#' secondary alignments are used to implement an analogous approach to that
#' of the third ERVmap filter. When \code{suboptimalAlignmentTag="none"}, it
#' also performs the latter approach even when the tag \code{XS} is available.
#' When this parameter is different from \code{"auto"} and \code{"none"}, a tag
#' with the given name is used to extract the suboptimal alignment score.
#'
#' @param suboptimalAlignmentCutoff (Default 5) Numeric value storing the
#' cutoff above which the difference between the alignment score and the
#' suboptimal alignment score is considered sufficiently large to retain the
#' alignment. When this value is set to \code{NA}, the filtering step based on
#' suboptimal alignment scores is skipped.
#' 
#' @param geneCountMode (Default \code{"all"}) Character string indicating if 
#' the ERVmap read filters applied to quantify TEs expression should also be 
#' applied when quantifying gene expression (\code{"ervmap"}) or not 
#' (\code{"all"}), in which case all primary alignments mapping to genes are 
#' counted.
#'
#' @param verbose (Default \code{TRUE}) Logical value indicating whether to
#' report progress.
#'
#' @details
#' This is the constructor function for objects of the class
#' \code{ERVmapParam-class}. This type of object is the input to the
#' function \code{\link{qtex}()} for quantifying expression of transposable
#' elements using the ERVmap method
#' \href{https://doi.org/10.1073/pnas.1814589115}{Tokuyama et al. (2018)}. The
#' ERVmap algorithm processes reads following conservative filtering criteria
#' to provide reliable raw count data for each TE.
#' 
#' @return A \linkS4class{ERVmapParam} object.
#'
#' @examples
#' bamfiles <- list.files(system.file("extdata", package="atena"),
#'                        pattern="*.bam", full.names=TRUE)
#' \dontrun{
#' rmskat <- annotaTEs(genome="dm6", parsefun=rmskatenaparser,
#'                     strict=FALSE, insert=500)
#' rmskLTR <- getLTRs(rmskat, relLength=0.8,
#'                    fullLength=TRUE,
#'                    partial=TRUE,
#'                    otherLTR=TRUE)
#' }
#'
#' ## DO NOT TYPE THIS INSTRUCTION, WHICH JUST LOADS A PRE-COMPUTED ANNOTATION
#' ## YOU SHOULD USE THE INSTRUCTIONS ABOVE TO FETCH ANNOTATIONS
#' rmskLTR <- readRDS(system.file("extdata", "rmskatLTRrlen80flenpartoth.rds",
#'                                package="atena"))
#'
#' ## build a parameter object for ERVmap
#' empar <- ERVmapParam(bamfiles,
#'                      teFeatures=rmskLTR,
#'                      singleEnd=TRUE,
#'                      ignoreStrand=TRUE,
#'                      suboptimalAlignmentCutoff=NA)
#' empar
#'
#' @references
#' Tokuyama M et al. ERVmap analysis reveals genome-wide transcription of human
#' endogenous retroviruses. PNAS. 2018;115(50):12565-12572. DOI:
#' \url{https://doi.org/10.1073/pnas.1814589115}
#'
#' @importFrom methods is new
#' @importFrom BiocGenerics path
#' @importFrom cli cli_alert_info cli_alert_success
#' @export
#' @rdname ERVmapParam-class
ERVmapParam <- function(bfl, teFeatures, aggregateby=character(0),
                        ovMode="ovUnion",
                        geneFeatures=NULL,
                        singleEnd=TRUE,
                        ignoreStrand=TRUE,
                        strandMode=1L,
                        fragments=!singleEnd,
                        maxMismatchRate=0.02,
                        suboptimalAlignmentTag="auto",
                        suboptimalAlignmentCutoff=5,
                        geneCountMode="all",
                        verbose=TRUE) {
    
    if (verbose)
        cli_alert_info("Locating BAM files")
    bfl <- .checkBamFileListArgs(bfl, singleEnd, fragments)
    
    if (!ovMode %in% c("ovUnion","ovIntersectionStrict"))
      stop("'ovMode' should be one of 'ovUnion', 'ovIntersectionStrict'")
    
    readmapper <- .checkBamReadMapper(path(bfl))
    
    if (verbose)
        cli_alert_info("Processing features")
    teFeaturesobjname <- deparse(substitute(teFeatures))
    geneFeaturesobjname <- deparse(substitute(geneFeatures))
    features <- .processFeatures(teFeatures, teFeaturesobjname,
                                 geneFeatures, geneFeaturesobjname,
                                 aggregateby, aggregateexons=TRUE)
    
    obj <- new("ERVmapParam", bfl=bfl, features=features,
               aggregateby=aggregateby, ovMode=ovMode,
               singleEnd=singleEnd, ignoreStrand=ignoreStrand,
               strandMode=as.integer(strandMode), fragments=fragments,
               maxMismatchRate=maxMismatchRate, readMapper=readmapper,
               suboptimalAlignmentTag=suboptimalAlignmentTag,
               suboptimalAlignmentCutoff=as.numeric(suboptimalAlignmentCutoff),
               geneCountMode=geneCountMode)
    if (verbose)
        cli_alert_success("Parameter object successfully created")
    obj
}

#' @param object A \linkS4class{ERVmapParam} object.
#'
#' @importFrom methods show
#' @importFrom GenomeInfoDb seqlevels
#'
#' @export
#' @aliases show,ERVmapParam-method
#' @rdname ERVmapParam-class
setMethod("show", "ERVmapParam",
        function(object) {
            cat(class(object), "object\n")
            cat(sprintf("# BAM files (%d): %s\n", length(object@bfl),
                        .pprintnames(names(object@bfl))))
            cat(sprintf("# features (%d): %s\n", length(features(object)),
                        ifelse(is.null(names(features(object))),
                               paste("on",
                                     .pprintnames(seqlevels(features(object)))),
                               .pprintnames(names(features(object))))))
            cat(sprintf("# %s, %s",
                        ifelse(object@singleEnd, "single-end", "paired-end"),
                        ifelse(object@ignoreStrand, "unstranded", "stranded")))
            if (!object@ignoreStrand)
                cat(sprintf(" (strandMode=%d)", object@strandMode))
            if (!object@singleEnd)
                cat(sprintf(", %s",
                            ifelse(object@fragments, 
                                "counting each paired-end mate",
                                "counting both paired-end mates")))
            cat("\n")
        })

#' @importFrom BiocParallel SerialParam bplapply
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @export
#' @aliases qtex
#' @aliases qtex,ERVmapParam-method
#' @rdname qtex
setMethod("qtex", "ERVmapParam",
        function(x, phenodata=NULL, mode=ovUnion, yieldSize=1e6L, verbose=1,
            BPPARAM=SerialParam(progressbar=ifelse(verbose==1, TRUE, FALSE))) {
            .checkPhenodata(phenodata, length(x@bfl))
            
            cnt <- bplapply(x@bfl, .qtex_ervmap, empar=x, mode=x@ovMode,
                            yieldSize=yieldSize, verbose=verbose,
                            BPPARAM=BPPARAM)
            cnt <- do.call("cbind", cnt)
            cdata <- .createColumnData(cnt, phenodata)
            colnames(cnt) <- rownames(cdata)
            
            features <- .consolidateFeatures(x, rownames(cnt))
            
            SummarizedExperiment(assays=list(counts=cnt),
                                 rowRanges=features,
                                 colData=cdata)
        })



#' @importFrom BiocGenerics basename path
#' @importFrom Rsamtools ScanBamParam yieldSize yieldSize<-
#' @importFrom methods formalArgs
#' @importFrom S4Vectors mcols
#' @importFrom IRanges findOverlapPairs
#' @importFrom GenomicRanges pintersect
.qtex_ervmap <- function(bf, empar, mode, yieldSize=1000000, verbose) {
    iste <- mcols(features(empar))$isTE
    if (any(duplicated(names(features(empar)[iste]))))
        stop(".qtex_ervmap: duplicated names in TE annotations.")

    mode <- match.fun(mode)
    readfun <- .getReadFunction(empar@singleEnd, empar@fragments)
    
    ## suboptimalAlignmentTag set to 'auto' implies following the original
    ## ERVmap algorithm; when the read mapper is BWA, it is assumed that
    ## the tag 'XS' stores the suboptimal alignment score.
    soatag <- .getsoatag(empar@suboptimalAlignmentTag, empar@readMapper)
    avtags <- .getavtags(empar, bf, soatag)
    avgene <- !all(iste)
    sbflags <- .setBamFlags(bf, empar, avtags, avgene)
    avsoas <- .soasAvail(empar@suboptimalAlignmentCutoff, soatag, avtags, bf,
                         verbose)
    if ((!is.na(empar@suboptimalAlignmentCutoff)) & 
        isTRUE(soatag %in% avtags)) {
            empar@suboptimalAlignmentTag <- soatag
    }
    param <- ScanBamParam(flag=sbflags, what="flag", tag=avtags)
    
    ## 'ov' is a 'Hits' object with the overlaps between reads and features
    ov <- Hits(nLnode=0, nRnode=length(features(empar)), sort.by.query=TRUE)
    ## 'ovdiscard' is a 'Hits' object with the overlaps of discarded reads
    ovdiscard <- Hits(nLnode=0, nRnode=length(features(empar)[!iste]),
                      sort.by.query=TRUE)
    ## 'salnmask' is a logical mask flagging whether an alignment is secondary
    salnmask <- logical(0)
    salnbestAS <- integer(0)
    ## 'alnAS' stores the alignment score
    alnAS <- integer(0)
    ## 'alnNH' stores the NH tag
    alnNH <- integer(0)
    ## 'readids' stores unique read identifiers only once, while 'alnreadidx'
    ## stores for each read index of the corresponding unique read identifier
    readids <- character(0) 
    alnreadidx <- integer(0) # index in 'readids' for alignm. passing filters
    readidx <- integer(0) # index in 'readids' for all alignm. (passing & not)
    thisalnAS <- integer(0)
    alnreadids <- character(0)
    n <- nfiltered <- 0
    
    strand_arg <- "strandMode" %in% formalArgs(readfun)
    yieldSize(bf) <- yieldSize
    cntvec <- .ervmapQuantExpress(bf, empar, mode, readfun, iste, param,avsoas,
                                  strand_arg, avgene, n, ov, ovdiscard,
                                  salnmask, salnbestAS, alnAS, alnNH, readids,
                                  alnreadidx, readidx, thisalnAS, alnreadids,
                                  nfiltered, verbose, avtags)
    cntvec
}


#' @importFrom methods formalArgs as
#' @importFrom IRanges findOverlapPairs
#' @importFrom GenomicRanges pintersect
#' @importFrom GenomicAlignments findOverlaps
#' @importFrom S4Vectors countQueryHits countSubjectHits
.ervmapQuantExpress <- function(bf, empar, mode, readfun, iste, param, avsoas,
                                strand_arg, avgene, n, ov, ovdiscard, salnmask,
                                salnbestAS, alnAS, alnNH, readids, alnreadidx,
                                readidx, thisalnAS, alnreadids, nfiltered,
                                verbose, avtags) {
    open(bf)
    while (length(alnreads <- do.call(readfun,
                                c(list(file=bf), list(param=param),
                                 list(strandMode=empar@strandMode)[strand_arg],
                                 list(use.names=(!avsoas || avgene)))))) {
        alnreads <- .matchSeqinfo(alnreads, features(empar))
        n <- n + length(alnreads)
        thissalnmask <- .secondaryAlignmentMask(alnreads) ## second. aln mask
        mask <- .pass2Filters(alnreads, empar) ## which alignments pass flt 1&2
    
        if (!is.na(empar@suboptimalAlignmentCutoff)) {
            thisalnAS <- .getAlignmentTagScore(alnreads, tag="AS")
            mask <- .pass3Filter(mask, alnreads, empar, thisalnAS, thissalnmask,
                                avsoas)
        }
    
        if (avgene || (!is.na(empar@suboptimalAlignmentCutoff) & !avsoas)) {
            alnreadids <- .getAlnreadids(alnreads, empar@fragments)
        }
    
        mask[thissalnmask] <- TRUE # Setting TRUE in mask to secondary alignments
        alnreadsdiscard <- .getreadsdiscard(empar, alnreads, mask, thissalnmask)
        alnreads <- .filteralnreads(empar,alnreads,mask,thissalnmask,avsoas,avgene)
        thisov <- mode(alnreads, features(empar), ignoreStrand=empar@ignoreStrand,
                       inter.feature=FALSE)
        ov <- .appendHits(ov, thisov)
    
        if (avgene && empar@geneCountMode == "all") {
            ## calculate and store overlaps between the discarded reads & genes
            thisovdiscard <- mode(alnreadsdiscard, features(empar)[!iste],
                                  ignoreStrand=empar@ignoreStrand, 
                                  inter.feature=FALSE)
            ovdiscard <- .appendHits(ovdiscard, thisovdiscard)
        }
    
        ## Reading NH tag if secondary alignments are not available
        alnNH <- .fetchNHtag(thissalnmask, avgene, avtags, alnreads)
        salnmask <- c(salnmask, thissalnmask[mask]) # secondary alignment mask
    
        if (!avsoas && !is.na(empar@suboptimalAlignmentCutoff)) {
            ## store best secondary alignment score
            salnbestAS <- .findSuboptAlignScore(thisalnAS, thissalnmask, 
                                                alnreadids, salnbestAS)
            ## filter and store alignment scores 
            alnAS <- c(alnAS, thisalnAS[mask])
        }
    
        ## filter now read identifiers
        listreadids <- .filterReadIDs(avgene, empar, avsoas, alnreadids,
                                      readids, readidx, alnreadidx, mask)
        readids <- listreadids$readids
        readidx <- listreadids$readidx
        alnreadidx <- listreadids$alnreadidx
    
        if (verbose > 1) {
            nfiltered <- nfiltered + sum(mask & !thissalnmask)
            message(sprintf("%s: %d alignments processed (%d are primary and pass filters).\n",
                            basename(path(bf)), n, nfiltered))
        }
    }
    on.exit(close(bf))
    if (length(ov) == 0) {
        stop(".ervmapQuantExpress: no overlaps were found between reads and features")
    }
    ## Expression quantification
    cntvec <- .ervmapGetCounts(avsoas, salnmask, empar, avgene, ov, alnreadidx,
                               alnAS, salnbestAS, readidx, mask, alnNH, readids,
                               bf, ovdiscard, n, verbose, iste)
    cntvec
}


## Function to know which tags are present in the BAM file (single-end).
## Input is a BamFile object. Returns a data.frame with tags as columns and the
## BAM file as row, indicating if each tag is present in the BAM file (TRUE) or
## not (FALSE).

#' @importFrom Rsamtools scanBamFlag ScanBamParam yieldSize yieldSize<-
#' @importFrom S4Vectors mcols
.get_tags_in_BAM_singleend <- function(bf, soatag="XS") {
    
    yieldSize(bf) <- 1000
    sbflags <- scanBamFlag(isUnmappedQuery = FALSE,
                            isDuplicate = FALSE,
                            isNotPassingQualityControls = FALSE,
                            isSupplementaryAlignment = FALSE,
                            ## XS is only set by BWA for primary alignments
                            isSecondaryAlignment=FALSE)
    
    param <- ScanBamParam(flag = sbflags,
                            tag = c("nM", "NM", "AS", "NH", soatag))
    
    tags <- param@tag
    tags_df <- data.frame(matrix(nrow = length(bf), ncol = length(tags)))
    colnames(tags_df) <- tags
    
    open(bf)
    r_test <- readGAlignments(bf, param = param)
    close(bf)
    
    # Testing, for each sample, if the files contain the different tags
    tags_df[, "NH"] <- ifelse(all(is.na(mcols(r_test)$NH)), FALSE, TRUE)
    tags_df[, "NM"] <- ifelse(all(is.na(mcols(r_test)$NM)), FALSE, TRUE)
    tags_df[, "nM"] <- ifelse(all(is.na(mcols(r_test)$nM)), FALSE, TRUE)
    tags_df[, "AS"] <- ifelse(all(is.na(mcols(r_test)$AS)), FALSE, TRUE)
    if (!is.null(soatag))
        tags_df[, soatag] <- ifelse(all(grepl("\\d", 
                                                (mcols(r_test)[[soatag]]))),
                                    TRUE, FALSE)
    
    return(tags_df)
}



## Function to know which tags are present in the BAM file (paired-end).
## Input is a BamFile object. Returns a data.frame with tags as columns and the
## BAM file as row, indicating if each tag is present in the BAM file (TRUE) or
## not (FALSE).

#' @importFrom Rsamtools scanBamFlag ScanBamParam yieldSize yieldSize<- asMates asMates<-
#' @importFrom S4Vectors mcols
.get_tags_in_BAM_pairedend <- function(bf, empar, soatag="XS") {
    yieldSize(bf) <- 1000
    asMates(bf) <- TRUE
    sbflags <- scanBamFlag(isUnmappedQuery = FALSE,
                            isProperPair = TRUE,
                            isDuplicate = FALSE,
                            isNotPassingQualityControls = FALSE,
                            isSupplementaryAlignment = FALSE,
                            ## XS is only set by BWA for primary alignments
                            isSecondaryAlignment=FALSE)
    
    param <- ScanBamParam(flag = sbflags,
                            tag = c("nM", "NM", "AS", "NH", soatag))
    
    tags <- param@tag
    tags_df <- data.frame(matrix(nrow = length(bf), ncol = length(tags)))
    colnames(tags_df) <- tags
    
    open(bf)
    r_test <- readGAlignmentPairs(bf, param = param, 
                                    strandMode = empar@strandMode)
    close(bf)
    
    # Testing, for each sample, if the files contain the different tags
    tags_df[, "NH"] <- ifelse(all(is.na(mcols(first(r_test))$NH)), FALSE, TRUE)
    tags_df[, "NM"] <- ifelse(all(is.na(mcols(first(r_test))$NM)), FALSE, TRUE)
    tags_df[, "nM"] <- ifelse(all(is.na(mcols(first(r_test))$nM)), FALSE, TRUE)
    tags_df[, "AS"] <- ifelse(all(is.na(mcols(first(r_test))$AS)), FALSE, TRUE)
    if (!is.null(soatag))
        tags_df[, soatag] <- ifelse(all(grepl("\\d", 
                                            (mcols(first(r_test))[[soatag]]))),
                                    TRUE, FALSE)
    
    return(tags_df)
}




#' @importFrom Rsamtools scanBamFlag
.setBamFlags <- function(bf, empar, avtags, avgene) {
    sbflags <- scanBamFlag()
    
    if (empar@singleEnd == TRUE | empar@fragments == TRUE) {
        ## if we use the 'XS' tag then do not read secondary alignments
        if ("XS" %in% avtags && empar@readMapper == "bwa" &&
            empar@suboptimalAlignmentTag %in% c("auto", "XS") && !avgene)
            sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
                                    isDuplicate=FALSE,
                                    isNotPassingQualityControls=FALSE,
                                    isSupplementaryAlignment=FALSE,
                                    isSecondaryAlignment=FALSE)
        
        else ## otherwise do not exclude secondary alignments
            sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
                                    isDuplicate=FALSE,
                                    isNotPassingQualityControls=FALSE,
                                    isSupplementaryAlignment=FALSE)
    } else {
        ## if we use the 'XS' tag then do not read secondary alignments
        if ("XS" %in% avtags && empar@readMapper == "bwa" &&
            empar@suboptimalAlignmentTag %in% c("auto", "XS") && !avgene)
            sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
                                   isDuplicate=FALSE,
                                   isNotPassingQualityControls=FALSE,
                                   isSupplementaryAlignment=FALSE,
                                   isSecondaryAlignment=FALSE,
                                   isProperPair=TRUE)
        
        else ## otherwise do not exclude secondary alignments
            sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
                                   isDuplicate=FALSE,
                                   isNotPassingQualityControls=FALSE,
                                   isSupplementaryAlignment=FALSE,
                                   isProperPair=TRUE)
    }
    sbflags
}




#' @importFrom S4Vectors queryHits subjectHits
#' @importFrom Matrix rowSums
#' @importFrom sparseMatrixStats rowMaxs colSums2 rowSums2
.suboptimalAlignmentFilteringCounts <- function(empar, ov, alnreadidx, alnAS,
                                                salnmask, salnbestAS) {
    ## suboptimal alignment filtering using the available secondary
    ## alignments. according to Tokuyama et al. (2018) pg. 12571.,
    ## reads/fragments are kept if the difference between the alignment
    ## score from BWA (AS) and the suboptimal alignment score from BWA
    ## (field XS) >= 5. here we calculate the suboptimal alignment score
    ## by searching for the secondary alignment with the highest score.
    
    ## fetch all different read identifiers from the overlapping alignments
    rd_idx <- sort(unique(alnreadidx[queryHits(ov)]))
    
    ## fetch all different transcripts from the overlapping alignments
    tx_idx <- sort(unique(subjectHits(ov)))
    
    ## build matrices with primary alignment flags, alignment scores (AS) 
    ## and best secondary alignment AS
    palnmat <- .buildOvValuesMatrix(empar, ov, !salnmask, alnreadidx,
                                    rd_idx, tx_idx)
    asmat <- .buildOvValuesMatrix(empar, ov, alnAS, alnreadidx,
                                    rd_idx, tx_idx)
    salnbestasmat <- .buildOvValuesMatrix(empar, ov, salnbestAS, alnreadidx,
                                            rd_idx, tx_idx)
    
    ## AS from primary alignments
    asprimaryaln <- rowMaxs(palnmat * asmat)
    ## fetch maximum AS from secondary alignments
    salnmaxas <- rowMaxs(salnbestasmat)
    salnmaxas[salnmaxas == 0] <- -99
    mask <- ((asprimaryaln - salnmaxas) >= empar@suboptimalAlignmentCutoff)
    
    palnmat <- palnmat[mask, ]
    cntvec <- rep(0, length(features(empar)))
    cntvec[tx_idx] <- colSums2(palnmat)
    
    cntvec
}

#' @importFrom S4Vectors mcols first second
#' @importFrom Rsamtools bamFlagTest
.secondaryAlignmentMask <- function(aln) {
    mask <- NULL
    if (is(aln, "GAlignments"))
        mask <- bamFlagTest(mcols(aln)$flag, "isSecondaryAlignment")
    else if (is(aln, "GAlignmentPairs")) {
        mask <- bamFlagTest(mcols(first(aln))$flag, "isSecondaryAlignment") |
            bamFlagTest(mcols(second(aln))$flag, "isSecondaryAlignment")
    } else if (is(aln, "GAlignmentsList")) {
        mask <- bamFlagTest(mcols(unlist(aln, use.names=FALSE))$flag,
                            "isSecondaryAlignment")
    } else
        stop(sprintf(".secondaryAlignmentMask: wrong class %s\n", class(aln)))
    
    mask
}

#' @importFrom S4Vectors mcols first second
.getAlignmentTagScore <- function(aln, tag, pairedsummaryfun=pmin.int) {
    if (is(aln, "GAlignments"))
        score <- as.integer(mcols(aln)[[tag]])
    else if (is(aln, "GAlignmentPairs")) {
        ## take the minimum score from both mates
        score <- pairedsummaryfun(as.integer(mcols(first(aln))[[tag]]), 
                                    mcols(second(aln))[[tag]])
    } else if (is(aln, "GAlignmentsList")) {
        score <- as.integer(mcols(unlist(aln, use.names=FALSE))[[tag]])
    } else
        stop(sprintf(".getAlignmentTagScore: wrong class %s\n", class(aln)))
    
    as.integer(score)
}


#' @importFrom S4Vectors mcols
.fetchNMtag <- function(aln) {
    nmtag <- "NM"
    if (is(aln, "GAlignments")) {
        if (is.null(mcols(aln)[[nmtag]]) || all(is.na(mcols(aln)[[nmtag]]))) {
            nmtag <- "nM"
            if (is.null(mcols(aln)[[nmtag]]) || all(is.na(mcols(aln)[[nmtag]])))
                stop("no NM or nM tag in BAM file.")
        }
    } else if (is(aln, "GAlignmentPairs")) {
        if (is.null(mcols(first(aln))[[nmtag]]) || 
            all(is.na(mcols(first(aln))[[nmtag]]))) {
            nmtag <- "nM"
            if (is.null(mcols(first(aln))[[nmtag]]) || 
                all(is.na(mcols(first(aln))[[nmtag]])))
                stop("no NM or nM tag in BAM file.")
        }
    } else if (is(aln, "GAlignmentsList")) {
        if (is.null(mcols(unlist(aln))[[nmtag]]) || 
            all(is.na(mcols(unlist(aln))[[nmtag]]))) {
            nmtag <- "nM"
            if (is.null(mcols(unlist(aln))[[nmtag]]) || 
                all(is.na(mcols(unlist(aln))[[nmtag]])))
                stop("no NM or nM tag in BAM file.")
        }
    } else
        stop(sprintf(".fetchNMtag: wrong class %s\n", class(aln)))
    
    nmtag
}

#' @importFrom S4Vectors mcols first second
#' @importFrom IRanges relist
.getAlignmentMismatches <- function(aln) {
    mm <- NULL
    nmtag <- .fetchNMtag(aln)
    
    if (is(aln, "GAlignments"))
        mm <- mcols(aln)[[nmtag]]
    else if (is(aln, "GAlignmentPairs")) {
        ## take the celing of the mean of mismatches from both mates
        mm1 <- as.integer(mcols(first(aln))[[nmtag]])
        mm2 <- as.integer(mcols(second(aln))[[nmtag]])
        mm1[is.na(mm1)] <- mm2[is.na(mm1)]
        mm2[is.na(mm2)] <- mm1[is.na(mm2)]
        mm <- ceiling((mm1 + mm2) / 2)
    } else if (is(aln, "GAlignmentsList")) {
        mm <- mcols(unlist(aln, use.names=FALSE))[[nmtag]]
    } else
        stop(sprintf(".getAlignmentMismatches: wrong class %s\n", class(aln)))
    
    as.integer(mm)
}

#' @importFrom S4Vectors mcols first second
#' @importFrom IRanges relist
#' @importFrom GenomicAlignments qwidth
#' @importClassesFrom GenomicAlignments GAlignments
.getAlignmentQueryWidth <- function(aln) {
    qw <- NULL
    if (is(aln, "GAlignments"))
        qw <- qwidth(aln)
    else if (is(aln, "GAlignmentPairs")) {
        ## take the ceiling of the mean of query widths from both mates
        qw1 <- qwidth(first(aln))
        qw2 <- qwidth(second(aln))
        qw1[is.na(qw1)] <- qw2[is.na(qw1)]
        qw2[is.na(qw2)] <- qw1[is.na(qw2)]
        qw <- ceiling((qw1 + qw2) / 2)
    } else if (is(aln, "GAlignmentsList")) {
        qw <- qwidth(unlist(aln, use.names=FALSE))
    } else
        stop(sprintf(".getAlignmentQueryWidth: wrong class %s\n", class(aln)))
    
    as.integer(qw)
}

#' @importFrom S4Vectors mcols first second List
#' @importFrom GenomicAlignments cigar explodeCigarOpLengths
.getAlignmentSumClipping <- function(aln) {
    sc <- NULL
    if (is(aln, "GAlignments"))
        sc <- vapply(explodeCigarOpLengths(cigar(aln), ops=c("H", "S")), 
                    FUN.VALUE = integer(1), FUN = sum)
    else if (is(aln, "GAlignmentPairs")) {
        ## take the ceiling of the mean of query widths from both mates
        sc1 <- vapply(explodeCigarOpLengths(cigar(first(aln)),ops=c("H", "S")), 
                    FUN.VALUE = integer(1), FUN = sum)
        sc2 <- vapply(explodeCigarOpLengths(cigar(second(aln)), 
                                            ops=c("H", "S")), 
                        FUN.VALUE = integer(1), FUN = sum)
        sc1[is.na(sc1)] <- sc2[is.na(sc1)]
        sc2[is.na(sc2)] <- sc1[is.na(sc2)]
        sc <- ceiling((sc1 + sc2) / 2)
    } else if (is(aln, "GAlignmentsList")) {
        sc <- vapply(explodeCigarOpLengths(cigar(unlist(aln, use.names=FALSE)),
                                            ops=c("H", "S")),
                        FUN.VALUE = integer(1), FUN = sum)
    } else
        stop(sprintf(".getAlignmentSumClipping: wrong class %s\n", class(aln)))
    
    as.integer(sc)
}

#' @importFrom S4Vectors queryHits subjectHits
#' @importFrom Matrix Matrix
# .buildOvValuesMatrix <- function(ov, values, aridx, ridx, fidx) {
#     stopifnot(class(values) %in% c("logical", "integer", "numeric")) ## QC
#     ovmat <- Matrix(do.call(class(values), list(1)),
#                     nrow=length(ridx), ncol=length(fidx))
#     mt1 <- match(aridx[queryHits(ov)], ridx)
#     mt2 <- match(subjectHits(ov), fidx)
#     ovmat[cbind(mt1, mt2)] <- values[queryHits(ov)]
#     
#     ovmat
# }

## Obtains the name of the tag containing the suboptimal alignment score, if
## present
.getsoatag <- function(suboptimalAlignmentTag, readMapper) {
    soatag <- NULL
    if (suboptimalAlignmentTag == "auto" && readMapper == "bwa") {
        soatag <- "XS"
    } else if (nchar(suboptimalAlignmentTag) == 2) {
        soatag <- suboptimalAlignmentTag
    }
    soatag
}

## Calls internal functions to obtain the name of the available tags depending
## on whether the data is single- or paired-end
#' @importFrom BiocGenerics basename path
.getavtags <- function(empar, bf, soatag) {
    avtags <- character(0)
    if (empar@singleEnd)
        avtags <- .get_tags_in_BAM_singleend(bf, soatag=soatag)
    else
        avtags <- .get_tags_in_BAM_pairedend(bf, empar=empar, soatag=soatag)
    
    avtags <- colnames(avtags)[unlist(avtags)]
    if (!empar@suboptimalAlignmentTag %in% c("auto", "none", avtags))
        stop(sprintf("suboptimal alignment tag '%s' not available in BAM file %s.",
                    empar@suboptimalAlignmentTag, basename(path(bf))))
    avtags
}


## Looks if the suboptimal alignment score tag reported by the user is present
## in the BAM file.
#' @importFrom BiocGenerics basename path
.soasAvail <- function(suboptimalAlignmentCutoff, soatag, avtags, bf, verbose) {
    avsoas <- FALSE 
    if (!is.na(suboptimalAlignmentCutoff)) {
        if (isTRUE(soatag %in% avtags)) {
            avsoas <- TRUE
            if (verbose > 1) {
                fstr <- "%s: using suboptimal alignment scores from tag '%s'.\n"
                message(sprintf(fstr, basename(path(bf)), soatag))
            }
        } else if (verbose > 1) {
            fstr <- paste("%s: suboptimal alignment filtering based on",
                          "secondary alignments.\n")
            message(sprintf(fstr, basename(path(bf))))
        }
    } else if (verbose > 1) {
        msg <- paste("suboptimal alignment cutoff value is 'NA', skipping",
                     "suboptimal alignment filtering.\n")
        message(msg)
    }
    
    avsoas
}


## Obtains the read identifier of the alignment and, when fragments = TRUE,
## sets a different name to each mate.
.getAlnreadids <- function(alnreads, fragments) {
    alnreadids <- character(0)
    alnreadids <- names(alnreads)
    if (fragments) ## with fragments each mate gets a different read identifier
        alnreadids <- paste(rep(make.names(names(alnreads)),lengths(alnreads)),
                            sequence(lengths(alnreads)), sep=".")
    alnreadids
}


## Find the suboptimal alignment score from the AS of the secondary alignments.
.findSuboptAlignScore <- function(thisalnAS, thissalnmask, alnreadids,
                                    salnbestAS) {
    maxsaasbyreadid <- split(thisalnAS[thissalnmask], alnreadids[thissalnmask])
    maxsaasbyreadid <- vapply(maxsaasbyreadid, FUN.VALUE = integer(1), FUN=max)
    mt <- match(names(maxsaasbyreadid), names(salnbestAS))
    if (any(is.na(mt))) {
        tmpas <- rep(NA_integer_, sum(is.na(mt)))
        names(tmpas) <- names(maxsaasbyreadid)[is.na(mt)]
        salnbestAS <- c(salnbestAS, tmpas)
    }
    mt <- match(names(maxsaasbyreadid), names(salnbestAS))
    stopifnot(all(!is.na(mt))) ## QC
    salnbestAS[mt] <- pmax.int(salnbestAS[mt], maxsaasbyreadid, na.rm=TRUE)
    salnbestAS
}



#' @importFrom Matrix rowSums
#' @importFrom sparseMatrixStats rowSums2
.adjustcommonTEgene <- function(palnmatfilt, readidx, mask, ov, maskthird, 
                                empar, alnreadidx, rd_idx, tx_idx, alnNH,
                                iste) {
    
    ## Identifying multi-mapping reads
    if (length(alnNH) > 0) {
        maskMulti <- alnNH > 2
    } else {
        maskMulti <- duplicated(readidx) | duplicated(readidx, fromLast = TRUE)
        maskMulti <- maskMulti[mask]
    }
    
    maskMultimat <- .buildOvValuesMatrix(empar, ov, maskMulti, alnreadidx,
                                         rd_idx, tx_idx)
    maskMultimat <- maskMultimat[maskthird, ]
    maskUniqmat <- .buildOvValuesMatrix(empar, ov, !maskMulti, alnreadidx,
                                        rd_idx, tx_idx)
    maskUniqmat <- maskUniqmat[maskthird, ]
    
    istex <- iste[tx_idx]
    ## which reads overlap both genes and TEs?
    idx <- (rowSums2(palnmatfilt[,istex]) > 0) &
        (rowSums2(palnmatfilt[,!istex])>0)
    ## of these, keep only the overlap with the TE in case of multimapping read
    palnmatfilt[idx,istex, drop = FALSE] <- as(as(as(
        maskMultimat[idx,istex, drop=FALSE]*palnmatfilt[idx,istex, drop=FALSE],
        "lMatrix"), "generalMatrix"), "CsparseMatrix")
    ## of these, keep only the overlap with the gene in case of a unique read
    palnmatfilt[idx,!istex, drop = FALSE] <- as(as(as(
        maskUniqmat[idx, !istex, drop=FALSE]*palnmatfilt[idx, !istex, drop=FALSE],
        "lMatrix"), "generalMatrix"), "CsparseMatrix")
    # sum((rowSums(palnmatfilt[,istex]) > 0) & (rowSums(palnmatfilt[,!istex]) > 0)) # [1] 0 (QC) 
    
    ## when multi-mapping / unique reads cannot be identified because there is
    ## no NH and no secondary alignments in the BAM file, 'maskMulti' is FALSE
    ## for all reads, which causes all reads aligned to overlapping genes and
    ## TEs to be assigned to genes.
    
    palnmatfilt
}


#' @importFrom S4Vectors queryHits subjectHits
#' @importFrom Matrix rowSums
#' @importFrom sparseMatrixStats rowMaxs colSums2 rowSums2
.countbymatrix <- function(empar, ov, alnreadidx, salnmask, alnAS, salnbestAS, 
                            avgene, applysoasfilter, readidx, mask, alnNH, 
                            iste) {
    ## fetch all different read identifiers from the overlapping alignments
    rd_idx <- sort(unique(alnreadidx[queryHits(ov)]))
    ## fetch all different transcripts from the overlapping alignments
    tx_idx <- sort(unique(subjectHits(ov)))
    ## build matrix with overlaps of primary alignments
    palnmat <- .buildOvValuesMatrix(empar, ov, !salnmask, alnreadidx,
                                    rd_idx, tx_idx)
    
    maskthird <- rep(TRUE, nrow(palnmat))
    
    if (isTRUE(applysoasfilter)) {
        ## build matrices with alignment scores (AS) and best secondary AS
        asmat <- .buildOvValuesMatrix(empar, ov, alnAS, alnreadidx, rd_idx,
                                        tx_idx)
        salnbestasmat <- .buildOvValuesMatrix(empar, ov, salnbestAS,
                                                alnreadidx, rd_idx, tx_idx)
        ## AS from primary alignments
        asprimaryaln <- rowMaxs(palnmat * asmat)
        ## fetch maximum AS from secondary alignments
        salnmaxas <- rowMaxs(salnbestasmat)
        salnmaxas[salnmaxas == 0] <- -99
        maskthird <- ((asprimaryaln - salnmaxas) >= 
                        empar@suboptimalAlignmentCutoff)
    }
    
    cntvec <- rep(0, length(features(empar)))
    
    if (isTRUE(avgene)) {
        palnmatadj <- .adjustcommonTEgene(palnmatfilt = palnmat[maskthird, ], 
                                        readidx, mask, ov, maskthird, empar,
                                        alnreadidx, rd_idx, tx_idx, alnNH,iste)
        cntvec[tx_idx] <- colSums2(palnmatadj)
        ## counting counts of discarded reads mapping to genes
        if (empar@geneCountMode == "all" && applysoasfilter) {
            cntvecdisc <- rep(0, length(features(empar)))
            cntvecdisc[tx_idx] <- colSums2(palnmat[!maskthird, ])
            cntvec <- cntvec + cntvecdisc
        }
        
    } else {
        cntvec[tx_idx] <- colSums2(palnmat[maskthird, ])
    }
    
    cntvec
}


.filteralnreads <- function(empar, alnreads, mask, thissalnmask,
                                avsoas, avgene) {
    
    if (empar@fragments) {
        if ((avsoas || is.na(empar@suboptimalAlignmentCutoff)) && !avgene) {
            alnreads <- unlist(alnreads)[mask & !thissalnmask]
        } else {
            alnreads <- unlist(alnreads)[mask]
        }
    } else {
        if ((avsoas || is.na(empar@suboptimalAlignmentCutoff)) && !avgene) {
            alnreads <- alnreads[mask & !thissalnmask]
        } else {
            alnreads <- alnreads[mask]
        }
    }
    alnreads
}

#' @importFrom BiocGenerics basename path
.ervmapGetCounts <- function(avsoas, salnmask, empar, avgene, ov, alnreadidx,
                            alnAS, salnbestAS, readidx, mask, alnNH, readids,
                            bf, ovdiscard, n, verbose, iste) {
    ## if suboptimal alignment scores are available or they are not but
    ## we do not have secondary alignments either, then simply count filtered
    ## reads. In the latter case, we issue a warning.
    if (avsoas || !any(salnmask) || is.na(empar@suboptimalAlignmentCutoff)) {
    
        if (avgene) {
            cntvec <- .countbymatrix(empar, ov, alnreadidx, salnmask, alnAS,
                                salnbestAS, avgene, applysoasfilter = FALSE,
                                readidx, mask, alnNH, iste)
        } else {
            cntvec <- countSubjectHits(ov)
        }
        if (!is.na(empar@suboptimalAlignmentCutoff) && !avsoas) {
            msg <- paste("suboptimal alignment scores and secondary",
                         "alignments not available: skipping suboptimal",
                         "alignment filtering.")
            warning(msg)
        }
    
    } else {
        salnbestAS <- salnbestAS[match(readids[alnreadidx], names(salnbestAS))]
        salnbestAS[is.na(salnbestAS)] <- -99L
        rm(readids)
        gc()
        cntvec <- .countbymatrix(empar, ov, alnreadidx, salnmask, alnAS,
                                salnbestAS, avgene, applysoasfilter = TRUE,
                                readidx, mask, alnNH, iste)
        if (verbose > 1) {
            fstr <- paste("%s: %d alignments processed (%d are primary and",
                          "pass subptimal alignment filtering).\n")
            message(sprintf(fstr, basename(path(bf)), n, sum(cntvec)))
        }
    }
    names(cntvec) <- names(features(empar))
    
    if (isTRUE(avgene) && empar@geneCountMode == "all") {
        ## Adding counts of reads not passing ERVmap filters to genes 
        cntvec[!iste] <- cntvec[!iste] + countSubjectHits(ovdiscard)
    }
    
    if (length(empar@aggregateby) > 0) {
        if (avgene) {
            f <- .factoraggregateby(features(empar)[iste,], empar@aggregateby)
            f <- c(f, names(features(empar))[!iste])
        } else {
            f <- .factoraggregateby(features(empar), empar@aggregateby)
        }
        
        stopifnot(length(f) == length(cntvec)) ## QC
        cntvec <- tapply(cntvec, f, sum, na.rm=TRUE)
    }
    cntvec
}


.getreadsdiscard <- function(empar, alnreads, mask, thissalnmask) {
    
    if (empar@fragments) {
        alnreadsdiscard <- unlist(alnreads)[!mask & !thissalnmask] 
    } else {
        alnreadsdiscard <- alnreads[!mask & !thissalnmask]
    }
    alnreadsdiscard
}


.fetchNHtag <- function(thissalnmask, avgene, avtags, alnreads) {
    
    if (!any(thissalnmask) & avgene) {
        if (isTRUE("NH" %in% avtags)) {
            thisNH <- .getAlignmentTagScore(alnreads, tag="NH")
            alnNH <- c(alnNH,thisNH)
            return(alnNH)
        } else {
            warning("secondary alignments and NH tag not available in BAM file. Unique reads cannot be differentiated from multi-mapping reads. All reads aligned to overlapping regions between genes and TEs will be considered as gene counts.")
        }
    }
}


.pass2Filters <- function(alnreads, empar) {
    
    aqw <- .getAlignmentQueryWidth(alnreads)
    anm <- .getAlignmentMismatches(alnreads)
    asc <- .getAlignmentSumClipping(alnreads)
    ## Tokuyama et al. (2018) pg. 12571. reads/fragments are kept
    ## if: (1) the ratio of sum of hard and soft clipping to the
    ## sequence read length is < 0.02; (2) the ratio of the edit
    ## distance to the sequence read length is < 0.02
    mask <- ((asc / aqw) < empar@maxMismatchRate) & 
        ((anm / aqw) < empar@maxMismatchRate)
    
    mask
}


.pass3Filter <- function(mask, alnreads, empar, thisalnAS, thissalnmask, 
                            avsoas) {
    if (avsoas) {
        sas <- .getAlignmentTagScore(alnreads, tag=empar@suboptimalAlignmentTag)
        ## (3) the difference between the alignment score from BWA (AS)
        ## and the suboptimal alignment score from BWA (field XS) >= 5,
        mask <- mask & (thisalnAS - sas >= empar@suboptimalAlignmentCutoff)
    } else if (!any(thissalnmask)) {
        stop("The BAM file does not contain secondary alignments nor suboptimal alignment scores. Either set suboptimalAlignmentCutoff=NA or provide a BAM file with secondary alignments or suboptimal alignment scores.")
    }
    
    mask
}


.filterReadIDs <- function(avgene, empar, avsoas, alnreadids,
                            readids, readidx, alnreadidx, mask) {
    
    if (avgene || (!is.na(empar@suboptimalAlignmentCutoff) & !avsoas)) {
        
        ## store unique read identifiers and what alignment comes from what read
        ## First multiple alignments from the same read in the same chunk are 
        ## identified
        alnreadids_uniq <- unique(alnreadids)
        mt <- match(alnreadids, alnreadids_uniq) 
        l <- length(alnreadids_uniq)
        alnreadids_uniq <- c(alnreadids_uniq, alnreadids_uniq[is.na(mt)])
        mt[is.na(mt)] <- (l+1):(l+sum(is.na(mt)))
        chunkidx <- mt
        # Second, the index vectors are created
        mt <- match(alnreadids_uniq, readids) 
        l <- length(readids)
        # readids contains just once every readID of all reads in the BAM file
        readids <- c(readids, alnreadids_uniq[is.na(mt)]) 
        mt[is.na(mt)] <- (l+1):(l+sum(is.na(mt)))
        # index for all alignments in the BAM file
        readidx <- c(readidx, mt[chunkidx]) 
        # index for all alignments with mask == TRUE.
        alnreadidx <- c(alnreadidx, mt[chunkidx][mask]) 
    
    }
    
    list(readids = readids, readidx = readidx, alnreadidx = alnreadidx)
}
functionalgenomics/atena documentation built on May 7, 2024, 10:33 a.m.