R/TEtranscripts.R

Defines functions .getScanBamFlag_tt .adjustalnmultiovTE .summarizeCounts .countUniqueRead .countMultiReadsGenes .correctPreference .ttLlh .ttFixedPointFun .ttMstep .ttEstep .correctForTxEffectiveLength .buildOvAlignmentsMatrix .ttEMstep .ttQuantExpress .getAveLen .qtex_tetranscripts TEtranscriptsParam

Documented in TEtranscriptsParam

#' Build a TEtranscripts parameter object
#'
#' Build an object of the class \code{TEtranscriptsParam}
#'
#' @param bfl a character string vector of BAM file names.
#'
#' @param teFeatures A \code{GRanges} or \code{GRangesList} object with the
#' TE annotated features to be quantified. Elements in this object should have
#' names, which are used as a grouping factor for genomic ranges forming a
#' common locus, unless other metadata column names are specified in the
#' \code{aggregateby} parameter.
#'
#' @param aggregateby Character vector with column names from the annotation
#' to be used to aggregate quantifications. By default, this is an empty
#' vector, which means that the names of the input \code{GRanges} or
#' \code{GRangesList} object given in the \code{teFeatures} parameter are used
#' to aggregate quantifications.
#' 
#' @param ovMode Character vector indicating the overlapping mode. Available
#' options are: "ovUnion" (default) and "ovIntersectionStrict",
#' which implement the corresponding methods from HTSeq
#' (\url{https://htseq.readthedocs.io/en/release_0.11.1/count.html}).
#' Ambiguous alignments (alignments overlapping > 1 feature) are addressed
#' as in the original TEtranscripts method.
#'
#' @param geneFeatures (Default NULL) A \code{GRanges} or 
#' \code{GRangesList} object with the
#' gene annotated features to be quantified. Following the TEtranscripts
#' algorithm, overlaps with unique reads are first tallied with respect to
#' these gene features. Elements should have names indicating the gene name/id.
#' In case that \code{geneFeatures} is a \code{GRanges} and contains a 
#' metadata column named
#' \code{type}, only the elements with \code{type} = \code{exon} are considered
#' for the analysis. Then, exon counts are summarized to the gene level. If
#' NULL, gene expression is not quantified.
#'
#' @param singleEnd (Default TRUE) Logical value indicating if reads are single
#' (\code{TRUE}) or paired-end (\code{FALSE}).
#'
#' @param strandMode (Default 1) Numeric vector which can take values 0, 1 or
#' 2.
#' The strand mode is a per-object switch on
#' \code{\link[GenomicAlignments:GAlignmentPairs-class]{GAlignmentPairs}}
#' objects that controls the behavior of the strand getter. See
#' \code{\link[GenomicAlignments:GAlignmentPairs-class]{GAlignmentPairs}}
#' class for further detail. If \code{singleEnd = TRUE}, then \code{strandMode}
#' is ignored.
#'
#' @param ignoreStrand (Default FALSE) Logical value that defines if the strand
#' should be taken into consideration when computing the overlap between reads
#' and annotated features. When \code{ignoreStrand = FALSE}, an aligned read
#' is considered to be overlapping an annotated feature as long as they
#' have a non-empty intersecting genomic range on the same strand, while when
#' \code{ignoreStrand = TRUE} the strand is not considered.
#'
#' @param fragments (Default TRUE) Logical value applied to paired-end data
#' only. In both cases (\code{fragments=FALSE} and \code{fragments=TRUE}), the
#' read-counting method discards not properly paired reads. Moreover,
#' when \code{fragments=FALSE}, only non-ambiguous properly paired reads are
#' counted. When \code{fragments=TRUE}, ambiguous reads are also counted 
#' (see "Pairing criteria" in \code{\link[GenomicAlignments]{readGAlignments}()}). 
#' \code{fragments=TRUE} is equivalent to
#' the behavior of the TEtranscripts algorithm. For further details see
#' \code{\link[GenomicAlignments]{summarizeOverlaps}()}.
#' 
#' @param tolerance A positive numeric scalar storing the minimum tolerance
#' above which the SQUAREM algorithm (Du and Varadhan, 2020) keeps iterating.
#' Default is \code{1e-4} and this value is passed to the \code{tol} parameter
#' of the \code{\link[SQUAREM]{squarem}()} function.
#'
#' @param maxIter A positive integer scalar storing the maximum number of
#' iterations of the SQUAREM algorithm (Du and Varadhan, 2020). Default
#' is 100 and this value is passed to the \code{maxiter} parameter of the
#' \code{\link[SQUAREM]{squarem}()} function.
#'
#' @param verbose (Default \code{TRUE}) Logical value indicating whether to
#' report progress.
#'
#' @details
#' This is the constructor function for objects of the class
#' \code{TEtranscriptsParam-class}. This type of object is the input to the
#' function \code{\link{qtex}()} for quantifying expression of transposable
#' elements using the TEtranscripts method
#' \href{https://doi.org/10.1093/bioinformatics/btv422}{Jin et al. (2015)}. The
#' TEtranscripts algorithm quantifies TE expression by using an EM algorithm
#' to optimally distribute ambiguously mapped reads.
#' 
#' @return A \linkS4class{TEtranscriptsParam} object.
#'
#' @examples
#' bamfiles <- list.files(system.file("extdata", package="atena"),
#'                        pattern="*.bam", full.names=TRUE)
#' \dontrun{
#' ## use the following two instructions to fetch annotations, they are here
#' ## commented out to enable running this example quickly when building and
#' ## checking the package
#' rmskat <- annotaTEs(genome="dm6", parsefun=rmskatenaparser,
#'                     strict=FALSE, insert=500)
#' rmskLTR <- getLTRs(rmskat, relLength=0.8,
#'                    fullLength=TRUE,
#'                    partial=TRUE,
#'                    otherLTR=TRUE)
#' }
#'
#' ## DO NOT TYPE THIS INSTRUCTION, WHICH JUST LOADS A PRE-COMPUTED ANNOTATION
#' ## YOU SHOULD USE THE INSTRUCTIONS ABOVE TO FETCH ANNOTATIONS
#' rmskLTR <- readRDS(system.file("extdata", "rmskatLTRrlen80flenpartoth.rds",
#'                                package="atena"))
#'
#'
#' library(TxDb.Dmelanogaster.UCSC.dm6.ensGene)
#' txdb <- TxDb.Dmelanogaster.UCSC.dm6.ensGene
#' txdb_genes <- genes(txdb)
#'
#' ## build a parameter object for TEtranscripts
#' ttpar <- TEtranscriptsParam(bamfiles,
#'                             teFeatures=rmskLTR,
#'                             geneFeatures=txdb_genes,
#'                             singleEnd=TRUE,
#'                             ignoreStrand=TRUE,
#'                             aggregateby="repName")
#' ttpar
#'
#' @references
#' Jin Y et al. TEtranscripts: a package for including transposable elements
#' in differential expression analysis of RNA-seq datasets.
#' Bioinformatics. 2015;31(22):3593-3599. DOI:
#' \url{https://doi.org/10.1093/bioinformatics/btv422}
#'
#' @importFrom methods is new
#' @importFrom Rsamtools BamFileList
#' @importFrom cli cli_alert_info cli_alert_success
#' @export
#' @rdname TEtranscriptsParam-class
TEtranscriptsParam <- function(bfl, teFeatures, aggregateby=character(0),
                               ovMode="ovUnion",
                               geneFeatures=NULL,
                               singleEnd=TRUE,
                               ignoreStrand=FALSE,
                               strandMode=1L,
                               fragments=TRUE,
                               tolerance=0.0001,
                               maxIter=100L,
                               verbose=TRUE) {

    if (verbose)
        cli_alert_info("Locating BAM files")
    bfl <- .checkBamFileListArgs(bfl, singleEnd, fragments)

    if (!ovMode %in% c("ovUnion","ovIntersectionStrict"))
      stop("'ovMode' should be one of 'ovUnion', 'ovIntersectionStrict'")
    
    if (verbose)
        cli_alert_info("Processing features")
    teFeaturesobjname <- deparse(substitute(teFeatures))
    geneFeaturesobjname <- deparse(substitute(geneFeatures))
    features <- .processFeatures(teFeatures, teFeaturesobjname,
                                 geneFeatures, geneFeaturesobjname,
                                 aggregateby, aggregateexons=TRUE)

    obj <- new("TEtranscriptsParam", bfl=bfl, features=features,
               aggregateby=aggregateby, ovMode=ovMode,
               singleEnd=singleEnd, ignoreStrand=ignoreStrand,
               strandMode=as.integer(strandMode), fragments=fragments,
               tolerance=tolerance, maxIter=as.integer(maxIter))
    if (verbose)
        cli_alert_success("Parameter object successfully created")
    obj
}

#' @param object A \linkS4class{TEtranscriptsParam} object.
#'
#' @importFrom GenomeInfoDb seqlevels
#' @export
#' @aliases show,TEtranscriptsParam-method
#' @rdname TEtranscriptsParam-class
setMethod("show", "TEtranscriptsParam",
        function(object) {
            cat(class(object), "object\n")
            cat(sprintf("# BAM files (%d): %s\n", length(object@bfl),
                        .pprintnames(names(object@bfl))))
            cat(sprintf("# features (%s length %d): %s\n",
                        class(object@features),
                        length(object@features),
                        ifelse(is.null(names(object@features)),
                                paste("on",
                                    .pprintnames(seqlevels(object@features))),
                                .pprintnames(names(object@features)))))
            if (length(object@aggregateby) > 0)
                cat(sprintf("# aggregated by: %s\n",
                            paste(object@aggregateby, collapse=", ")))
            cat(sprintf("# %s; %s",
                        ifelse(object@singleEnd, "single-end", "paired-end"),
                        ifelse(object@ignoreStrand, "unstranded", "stranded")))
            if (!object@ignoreStrand)
                cat(sprintf(" (strandMode=%d)", object@strandMode))
            if (!object@singleEnd)
                cat(sprintf("; %s",
                        ifelse(object@fragments, "counting properly paired, same-strand pairs, singletons, reads with unmapped pairs and other fragments",
                                "counting properly paired reads")))
            cat("\n")
        })

#' @importFrom BiocParallel SerialParam bplapply
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @export
#' @aliases qtex
#' @aliases qtex,TEtranscriptsParam-method
#' @rdname qtex
setMethod("qtex", "TEtranscriptsParam",
        function(x, phenodata=NULL, mode=ovUnion, yieldSize=1e6L,
                    BPPARAM=SerialParam(progressbar=TRUE)) {
            .checkPhenodata(phenodata, length(x@bfl))

            cnt <- bplapply(x@bfl, .qtex_tetranscripts, ttpar=x, mode=x@ovMode,
                            yieldSize=yieldSize, BPPARAM=BPPARAM)
            cnt <- do.call("cbind", cnt)
            cdata <- .createColumnData(cnt, phenodata)
            colnames(cnt) <- rownames(cdata)

            features <- .consolidateFeatures(x, rownames(cnt))

            SummarizedExperiment(assays=list(counts=cnt),
                                 rowRanges=features,
                                 colData=cdata)
        })

#' @importFrom stats setNames aggregate median
#' @importFrom Rsamtools scanBamFlag ScanBamParam yieldSize yieldSize<-
#' @importFrom GenomicRanges width
#' @importFrom GenomicAlignments readGAlignments readGAlignmentsList
#' @importFrom GenomicAlignments readGAlignmentPairs
#' @importFrom S4Vectors Hits queryHits subjectHits
#' @importFrom Matrix Matrix rowSums colSums t which
#' @importFrom IRanges ranges
.qtex_tetranscripts <- function(bf, ttpar, mode, yieldSize=1e6L) {
    mode <- match.fun(mode)
    readfun <- .getReadFunction(ttpar@singleEnd, ttpar@fragments)
    sbflags <- .getScanBamFlag_tt(ttpar@singleEnd)
    param <- ScanBamParam(flag=sbflags, what="flag", tag="AS")
    
    iste <- mcols(features(ttpar))$isTE
    if (any(duplicated(names(features(ttpar)[iste]))))
        stop(".qtex_tetranscripts: duplicated names in TE annotations.")

    ov <- Hits(nLnode=0, nRnode=length(ttpar@features), sort.by.query=TRUE)
    alnreadids <- character(0)
    avgreadlen <- integer(0)
    d <- numeric(0)
    salnmask <- logical(0)
    strand_arg <- "strandMode" %in% formalArgs(readfun)
    yieldSize(bf) <- yieldSize
    open(bf)
    while (length(alnreads <- do.call(readfun, 
                                c(list(file = bf), list(param=param),
                                list(strandMode=ttpar@strandMode)[strand_arg],
                                list(use.names=TRUE))))) {
        alnreads <- .matchSeqinfo(alnreads, ttpar@features)
        avgreadlen <- c(avgreadlen, .getAveLen(ttpar, alnreads))
        if (is(alnreads, "GAlignmentPairs")) {
            d <- c(d, abs(start(first(alnreads)) - start(second(alnreads))))
        } else if (is(alnreads, "GAlignmentsList")) {
            d <- c(d, max(abs(diff(start(alnreads)))))
            d[d < 0] <- 0
        }
        salnmask <- c(salnmask, any(.secondaryAlignmentMask(alnreads)))
        alnreadids <- c(alnreadids, names(alnreads))
        thisov <- mode(alnreads, ttpar@features,
                       ignoreStrand=ttpar@ignoreStrand, inter.feature=FALSE)
        ov <- .appendHits(ov, thisov)
    }
    on.exit(close(bf))
    .checkOvandsaln(ov, salnmask)
    ## get uniquely aligned-reads
    maskuniqaln <- .getMaskUniqueAln(alnreadids)
    if (ttpar@singleEnd == TRUE) {
        # unique + multi-mapping reads (only once) are considered
        avgreadlen <- mean(avgreadlen[!duplicated(alnreadids)])
    } else {
        # only unique reads are considered
        avgreadlen <- mean(avgreadlen[(d <= 500) & maskuniqaln])
    }
    ## fetch all different read identifiers from the overlapping alignments
    readids <- unique(alnreadids[queryHits(ov)])
    ## fetch all different transcripts from the overlapping alignments
    tx_idx <- sort(unique(subjectHits(ov)))
    
    cntvec <- .ttQuantExpress(ov, alnreadids, readids, tx_idx, ttpar, iste,
                              maskuniqaln, avgreadlen)
    ## Original TEtranscripts coerces fractional counts to integer
    setNames(as.integer(cntvec), names(cntvec))
}

#' @importFrom GenomicAlignments extractAlignmentRangesOnReference
#' @importFrom GenomicAlignments cigar start qwidth first second
.getAveLen <- function(ttpar, alnreads) {
    if (is(alnreads, "GAlignments")) {
        avgreadlenaln <- qwidth(alnreads)
    } else if (is(alnreads, "GAlignmentPairs")) {
        d <- abs(start(first(alnreads)) - start(second(alnreads)))
        d2 <- qwidth(second(alnreads))
        avgreadlenaln <- d + d2
    } else if (is(alnreads, "GAlignmentsList")) {
        d <- max(abs(diff(start(alnreads))))
        d[d<0] <- 0
        l <- lengths(alnreads)
        d2 <- unlist(qwidth(alnreads))[cumsum(l)]
        avgreadlenaln <- d + d2
    } else {
        stop(sprintf(".getAveLen: wrong class %s\n", class(alnreads)))
    }
    avgreadlenaln
}

.ttQuantExpress <- function(ov, alnreadids, readids, tx_idx, ttpar, iste,
                            maskuniqaln, avgreadlen) {
    
    ## build a matrix representation of the overlapping alignments
    ovalnmat <- .buildOvAlignmentsMatrix(ov, alnreadids, readids, tx_idx)
    
    mt <- match(readids, alnreadids)
    multigcnt <- rep(0L, length(ttpar@features))
    istex <- as.vector(iste[tx_idx])
    
    if (!all(iste)) {
        ## Correcting for preference of unique/multi-mapping reads to genes or TEs
        ovalnmat <- .correctPreference(ovalnmat, maskuniqaln, mt, istex)
    }
    
    # Getting counts from unique reads
    uniqcnt <- .countUniqueRead(ttpar, ovalnmat, maskuniqaln, mt, tx_idx, istex)
    
    if (!all(iste) & any(!maskuniqaln)) {
        ## Getting gene counts where a multimapping read maps to > 1 gene
        multigcnt <- .countMultiReadsGenes(ttpar, ovalnmat, maskuniqaln, mt,
                                           iste, istex, tx_idx, readids, 
                                           alnreadids, ov, uniqcnt)
    }
    
    # Adjusting 'ovalnmat' when alignment from multimapping read maps > 1 TE
    ovalnmat <- .adjustalnmultiovTE(iste, ov, alnreadids, readids, tx_idx, ovalnmat)
    
    cntvec <- rep(0L, length(ttpar@features))
    cntvec <- .ttEMstep(maskuniqaln, mt, ovalnmat, istex, tx_idx, readids, ttpar,
                        avgreadlen, cntvec)
    ## Summarize counts of unique and multimapping reads
    cntvec <- .summarizeCounts(iste, cntvec, uniqcnt, multigcnt, ttpar)
    cntvec
}



#' @importFrom Matrix Matrix sparseMatrix t which
#' @importFrom sparseMatrixStats colSums2 rowSums2
#' @importFrom SQUAREM squarem
#' @importFrom IRanges ranges
.ttEMstep <- function(maskuniqaln, mt, ovalnmat, istex, tx_idx, readids, ttpar,
                      avgreadlen, cntvec) {
if (sum(!maskuniqaln[mt]) > 0) { ## multi-mapping reads
    ## TEtranscripts doesn't use uniquely-aligned reads to inform the
    ## procedure of distributing multiple-mapping reads, as explained in
    ## Jin et al. (2015) pg. 3594, "to reduce potential bias to certain TEs."
    ## Hence, once counted, we discard unique alignments
    ovalnmat <- ovalnmat[!maskuniqaln[mt], istex]
    yesov <- rowSums2(ovalnmat)>0
    ovalnmat <- ovalnmat[yesov,]
    readids <- readids[!maskuniqaln[mt]][yesov]
    ## the Qmat matrix stores row-wise the probability that read i maps to
    ## a transcript j, assume uniform probabilities by now
    Qmat <- Matrix(0, nrow=length(readids), ncol=length(tx_idx[istex]),
                    dimnames=list(readids, NULL))
    Qmat[which(ovalnmat>0, arr.ind=TRUE)] <- 1
    # Qmat <- Qmat / rowSums2(ovalnmat)
    
    ## Pi, corresponding to rho in Equations (1), (2) and (3) in Jin et al.
    ## (2015) stores probabilities of expression for each transcript, corrected
    ## for its effective length as defined in Eq. (1) of Jin et al. (2015)
    # Pi <- colSums2(Qmat)
    # Pi <- colSums2(ovalnmat / rowSums2(ovalnmat))
    Pi <- colSums2(ovalnmat) / sum(ovalnmat)
    
    if (is(ttpar@features,"GRangesList")) {
        elen <- as.numeric(sum(width(ttpar@features[tx_idx][istex]))) - avgreadlen+1
    } else {
        elen <- width(ttpar@features[tx_idx][istex]) - avgreadlen + 1
    }
    Pi <- .correctForTxEffectiveLength(Pi, elen)
    
    ## use the SQUAREM algorithm to achieve faster EM convergence
    emres <- squarem(par=Pi, Q=Qmat, elen=elen,
                    fixptfn=.ttFixedPointFun,
                    control=list(tol=ttpar@tolerance, maxiter=ttpar@maxIter))
    Pi <- emres$par
    Pi[Pi < 0] <- 0 ## Pi estimates are sometimes negatively close to zero
    Pi <- Pi / sum(Pi)
    ## use the estimated transcript expression probabilities
    ## to finally distribute ambiguously mapping reads
    # probmassbyread <- as.vector(ovalnmat %*% Pi) 
    probmassbyread <- as.vector(Qmat %*% Pi)
    
    # Version 1
    # cntvecovtx <- rowSums(t(ovalnmat / probmassbyread) * Pi, na.rm=TRUE)
    # Version 2
    # wh <- which(ovalnmat, arr.ind=TRUE)
    cntvecovtx <- rep(0, ncol(ovalnmat)) #cntvecovtx <- rep(0, length(tx_idx))
    # x <- tapply(Pi[wh[, "col"]] / probmassbyread[wh[, "row"]], wh[, "col"],
    #             FUN=sum, na.rm=TRUE)
    wh <- which(Qmat==1, arr.ind=TRUE)
    x <- tapply(Pi[wh[, "col"]] / probmassbyread[wh[, "row"]], wh[, "col"],
                FUN=sum, na.rm=TRUE)
    
    cntvecovtx[as.integer(names(x))] <- x
    cntvec[tx_idx][istex] <- cntvecovtx
}
cntvec
}


## private function .buildOvAlignmentsMatrix()
## builds a matrix representation of the overlapping alignments
## with reads on the rows and transcripts on the columns and
## a cell (i, j) set to TRUE if read i aligns to transcript j.
## therefore, when a read i aligns more than once to the same
## transcript j this is represented only once in the matrix
## parameters: ov - Hits object with the overlaps between
##                  alignments and features
##             arids - alignment read identifiers
##             rids - unique read identifiers
##             fidx - features index

#' @importFrom Matrix sparseMatrix
#' @importFrom S4Vectors queryHits subjectHits
.buildOvAlignmentsMatrix <- function(ov, arids, rids, fidx) {
    mt1 <- match(arids[queryHits(ov)], rids)
    mt2 <- match(subjectHits(ov), fidx)
    positions <- cbind(mt1, mt2)
    oamat <- sparseMatrix(positions[,1], positions[,2], x=TRUE, 
                        dims = c(length(rids), length(fidx)))
    oamat
}

## private function .correctForTxEffectiveLength()
## corrects input transcript expression probabilities by
## the transcript effective length as defined in Eq. (1) of
## Jin et al. (2015)
## parameters: x - transcript expression probabilities
##             elen - effective length of each transcript
.correctForTxEffectiveLength <- function(x, elen) {
    x[elen > 0] <- x[elen > 0] / elen[elen > 0]
    x[elen <= 0] <- 0
    if (sum(x) > 0) {
        x <- x / sum(x)
    }
    x
}

## private function .ttEstep()
## E-step of the EM algorithm of TEtranscripts
.ttEstep <- function(Q, Pi) {
    ## this first part is a sparse optimization of doing
    ## X <- t(t(Q) * Pi)
    X <- Q
    j <- rep(seq_len(ncol(X)), diff(X@p))
    X@x <- X@x * Pi[j]
    ## End of first part
    X <- X[rowSums2(X)>0,, drop=FALSE]
    X <- X / rowSums2(X)
    X
}

## private function .ttEstep()
## M-step of the EM algorithm of TEtranscripts
.ttMstep <- function(X) {
    Pi <- colSums2(X) / sum(X@x)
    Pi
}

## private function .ttFixedPointFun()
## fixed point function of the EM algorithm of TEtranscripts
.ttFixedPointFun <- function(Pi, Q, elen) {
    X <- .ttEstep(Q, Pi)
    Pi2 <- .ttMstep(X)
    Pi2 <- .correctForTxEffectiveLength(Pi2, elen)
    Pi2
}

## private function .ttLlh()
## log-likelihood function of the TEtranscripts model
## (currently not used)
.ttLlh <- function(X, Pi, Q) {
    z <- t(t(Q) * Pi)
    mask <- z == 0
    z[mask] <- NA
    sum(X * log(z), na.rm=TRUE)
}


## private function .correctPreference()
## Corrects ovalnmat for preference of unique/multi-mapping reads to
## genes/TEs, respectively
.correctPreference <- function(ovalnmat, maskuniqaln, mt, istex) {
    indx <- (rowSums2(ovalnmat[,istex]) > 0) & (rowSums2(ovalnmat[,!istex]) > 0)
    
    ## Assigning unique reads mapping to both a TE and a gene as gene counts
    # Which unique reads overlap to both genes and TEs?
    idxu <- indx & maskuniqaln[mt]
    if (any(idxu)) {
        # ovalnmat[idxu,istex] <- FALSE
        whu <- which(ovalnmat[idxu,istex], arr.ind = TRUE)
        if (is(whu, "integer")) {
            whu <- as.matrix(data.frame(row = 1, col = whu))
        }
        whudf <- cbind(which(idxu)[whu[,"row"]], which(istex)[whu[,"col"]])
        ovalnmat[whudf] <- FALSE
    }
    
    ## Removing overlaps of multi-mapping reads to genes if at least one
    ## alignment of the read overlaps a TE
    idxm <- indx & !maskuniqaln[mt]
    if (any(idxm)) {
        # ovalnmat[idxm,!istex] <- FALSE
        whm <- which(ovalnmat[idxm,!istex], arr.ind = TRUE)
        if (is(whm, "integer")) {
            whm <- as.matrix(data.frame(row = 1, col = whm))
        }
        whmdf <- cbind(which(idxm)[whm[,"row"]], which(!istex)[whm[,"col"]])
        ovalnmat[whmdf] <- FALSE
    }
    
    ovalnmat
}


## private function .countMultiReadsGenes()
## Counts multi-mapping reads mapping to multiple genes by counting fraction
## counts
.countMultiReadsGenes <- function(ttpar, ovalnmat, maskuniqaln, mt, iste,
                                  istex, tx_idx, readids, alnreadids, ov,
                                  uniqcnt) {
    ovalnmat_multig <- ovalnmat[!maskuniqaln[mt], !istex, drop=FALSE]
    yesg <- rowSums2(ovalnmat_multig)>0
    
    ## Computing counts for reads with multiple alignments mapping to different
    ## reads and also for multimapping reads with only 1 alignment mapping to
    ## a gene
    ovalnmat_multig <- ovalnmat_multig[yesg,,drop=FALSE]
    
    # Getting the num of different alignments mapping to a gene for each read
    alnreadids_multig <-alnreadids[unique(queryHits(
                                            ov[!iste[subjectHits(ov)]]))]
    nalnperread <- table(alnreadids_multig) # getting only overlaps from genes
    readids_multig <- readids[!maskuniqaln[mt]][yesg]
    mt_multig <- match(readids_multig, names(nalnperread))
    
    # Counts provided by unique reads for genes to which multimapping 
    # reads map to
    matmultiguniqc <- t(t(ovalnmat_multig)*uniqcnt[tx_idx][!istex])
    rsum <- rowSums2(matmultiguniqc)
    rsum0 <- which(rsum == 0)
    
    # Reads for which the genes to which the read aligns have > 0 counts
    # matmultiguniqc[!rsum0,, drop=FALSE] <- matmultiguniqc[!rsum0,,drop=FALSE]/
    #     rsum[!rsum0]
    j <- matmultiguniqc@i + 1
    matmultiguniqc@x[!(j %in% rsum0)] <- matmultiguniqc@x[!(j %in% rsum0)] / 
        rsum[j[!(j %in% rsum0)]]
    
    
    # Reads for which the genes to which the read aligns have 0 counts
    # matmultiguniqc[rsum0,, drop=FALSE] <- (ovalnmat_multig[rsum0,, drop=FALSE] /
    #     rowSums2(ovalnmat_multig[rsum0,, drop=FALSE]))
    
    matmultiguniqc@x[(j %in% rsum0)] <- ovalnmat_multig@x[(j %in% rsum0)] / 
        rowSums2(ovalnmat_multig)[j[j %in% rsum0]]
    
    # Adjusting for number of alignments
    matmultiguniqc <- matmultiguniqc/as.numeric(nalnperread[mt_multig])
    
    multigcnt <- rep(0L, length(ttpar@features))
    multigcnt[tx_idx][!istex] <- colSums2(matmultiguniqc)
    multigcnt
}

## private function .countUniqueRead()
## Counts unique reads mapping to TEs and genes (if present)
.countUniqueRead <- function(ttpar, ovalnmat, maskuniqaln, mt, tx_idx, istex) {
    uniqcnt <- rep(0L, length(ttpar@features))
    ovalnmatuniq_g <- ovalnmat[maskuniqaln[mt], !istex, drop=FALSE]
    ovalnmatuniq_te <- ovalnmat[maskuniqaln[mt], istex, drop=FALSE]
    ovmultig <- rowSums2(ovalnmatuniq_g) > 1
    ovmultite <- rowSums2(ovalnmatuniq_te) > 1
    
    # Addressing gene counts: count divided by the number of different genes
    if (any(ovmultig)) {
        # Counting reads overlapping only 1 element
        uniqcnt[tx_idx][!istex] <- colSums2(ovalnmatuniq_g[!ovmultig,,drop=FALSE])
        # Counting reads overlapping more than 1 element
        mg <- ovalnmatuniq_g[ovmultig,,drop=FALSE] /
            rowSums2(ovalnmatuniq_g[ovmultig,,drop=FALSE])
        uniqcnt[tx_idx][!istex] <- uniqcnt[tx_idx][!istex] + colSums2(mg)
    
    } else {
        uniqcnt[tx_idx][!istex] <- colSums2(ovalnmatuniq_g)
    }
    
    # Addressing TE counts: count divided by the number of different TEs
    # proportionally to the expression level of each TE provided by unique counts
    if (any(ovmultite)) {
        # Counting reads overlapping only 1 element
        uniqcnt[tx_idx][istex] <- colSums2(ovalnmatuniq_te[!ovmultite,,drop=FALSE])
        # Counting reads overlapping more than 1 element
        mte <- t(t(
            ovalnmatuniq_te[ovmultite,,drop=FALSE])*uniqcnt[tx_idx][istex])
        nocounts <- which(rowSums2(mte) == 0)
        mte[nocounts,,drop=FALSE] <- ovalnmatuniq_te[
            ovmultite,,drop=FALSE][nocounts,]
        mte <- mte/rowSums2(mte)
        uniqcnt[tx_idx][istex] <- uniqcnt[tx_idx][istex] + colSums2(mte)
    
    } else {
        uniqcnt[tx_idx][istex] <- colSums2(ovalnmatuniq_te)
    }
    
    uniqcnt
}


## private function .summarizeCounts()
## Counts unique reads mapping to TEs and genes (if present)
.summarizeCounts <- function(iste, cntvec, uniqcnt, multigcnt, ttpar) {
    
    ## add multi-mapping and unique-mapping counts
    if (!all(iste)) {
        cntvec <- cntvec + uniqcnt + multigcnt
    } else {
        cntvec <- cntvec + uniqcnt
    }
    names(cntvec) <- names(ttpar@features)
    
    cntvec_t <- cntvec[iste]
    ## aggregate TE quantifications if necessary
    if (length(ttpar@aggregateby) > 0) {
        f <- .factoraggregateby(ttpar@features[iste], ttpar@aggregateby)
        stopifnot(length(f) == length(cntvec_t)) ## QC
        cntvec_t <- tapply(cntvec_t, f, sum, na.rm=TRUE)
    }
    
    ## aggregating exon counts to genes
    if (!all(iste)) {
        cntvec <- c(cntvec_t, cntvec[!iste])
    } else {
        cntvec <- cntvec_t
    }
    
    cntvec <- round(cntvec, digits = 8)
    cntvec
}

## private function .adjustalnmultiovTE()
## Adjusts ovalnmat when a multimapping reads have alignments mapping to > 1
## TE, by dividing the count value by the number of different TEs the
## alignment maps to

#' @importFrom S4Vectors queryHits subjectHits
.adjustalnmultiovTE <- function(iste, ov, alnreadids, readids, tx_idx,
                                ovalnmat) {
    isteov <- iste[subjectHits(ov)]
    multialn <- !(.getMaskUniqueAln(queryHits(ov)))
    ovte_multialn <- ov[isteov & multialn]
    mt1_tem <- match(alnreadids[queryHits(ovte_multialn)], readids)
    mt2_tem <- match(subjectHits(ovte_multialn), tx_idx)
    pos_tem <- cbind(mt1_tem, mt2_tem)
    novperaln <- table(queryHits(ovte_multialn))
    mt_multig <- match(queryHits(ovte_multialn), names(novperaln))
    ovalnmat <- as(as(as(ovalnmat, "dMatrix"), "generalMatrix"), "CsparseMatrix")
    ovalnmat[pos_tem] <- ovalnmat[pos_tem] / as.numeric(novperaln[mt_multig])
    ovalnmat
}

#' @importFrom Rsamtools scanBamFlag
.getScanBamFlag_tt <- function(singleEnd) {
    if (singleEnd == TRUE) {
        sbflags <- scanBamFlag(isUnmappedQuery=FALSE, isDuplicate=FALSE,
                               isNotPassingQualityControls=FALSE)
    } else {
        sbflags <- scanBamFlag(isUnmappedQuery=FALSE, isDuplicate=FALSE,
                               isNotPassingQualityControls=FALSE,
                               isProperPair=TRUE)
    }
  sbflags
}
functionalgenomics/atena documentation built on Nov. 4, 2024, 7:33 p.m.