R/Telescope.R

Defines functions .getScanBamFlag_ts .getOverlapLength .getAlignmentLength .getAlignmentASScoreTS .tsFixedPointFun .tsMstepTheta .tsMstepPi .tsEstep .tssummarizeCounts .correctPreferenceTS .rescaleAS .reassign .checkreassignModes .tsEMstep .getNoFeatureOv .qtex_telescope TelescopeParam

Documented in TelescopeParam

#' Build a Telescope parameter object
#'
#' Build an object of the class \code{TelescopeParam}.
#'
#' @param bfl A \code{BamFile} or \code{BamFileList} object, or a character
#' string vector of BAM filenames.
#'
#' @param teFeatures A \code{GRanges} or \code{GRangesList} object. Elements
#' in this object should have names, which are used as a grouping factor for
#' genomic ranges forming a common locus (equivalent to "locus" column in
#' Telescope). This grouping is performed previous to TE expression
#' quantification, unlike the aggregation of quantifications performed when
#' the \code{aggregateby} parameter is specified, which is performed after
#' individual TE instances are quantified.
#'
#' @param aggregateby Character vector with column names from the annotation
#' to be used to aggregate quantifications. By default, this is an empty
#' vector, which means that the names of the input \code{GRanges} or
#' \code{GRangesList} object given in the \code{teFeatures} parameter are used
#' to aggregate quantifications.
#' 
#' @param ovMode Character vector indicating the overlapping mode. Available
#' options are: "ovUnion" (default) and "ovIntersectionStrict",
#' which implement the corresponding methods from HTSeq
#' (\url{https://htseq.readthedocs.io/en/release_0.11.1/count.html}).
#' Ambiguous alignments (alignments overlapping > 1 feature) are addressed
#' as in the original Telescope method: the overlap with the longest
#' overlapping length is kept.
#'
#' @param geneFeatures (Default NULL) A \code{GRanges} or 
#' \code{GRangesList} object with the
#' gene annotated features to be quantified. The TEtranscripts approach for
#' gene expression quantification is used, in which overlaps with multi-mapping
#' reads are preferentially assigned to TEs. Elements should have names
#' indicating the gene name/id. In case that \code{geneFeatures} 
#' is a \code{GRanges} and contains a
#' metadata column named \code{type}, only the elements with
#' \code{type} = \code{exon} are considered for the analysis. Then, exon counts
#' are summarized to the gene level. If NULL, gene expression is not
#' quantified.
#'
#' @param singleEnd (Default TRUE) Logical value indicating if reads are single
#' (\code{TRUE}) or paired-end (\code{FALSE}).
#'
#' @param strandMode (Default 1) Numeric vector which can take values 0, 1 or
#' 2.
#' The strand mode is a per-object switch on
#' \code{\link[GenomicAlignments:GAlignmentPairs-class]{GAlignmentPairs}}
#' objects that controls the behavior of the strand getter. See
#' \code{\link[GenomicAlignments:GAlignmentPairs-class]{GAlignmentPairs}}
#' class for further detail. If \code{singleEnd = TRUE}, then \code{strandMode}
#' is ignored.
#'
#' @param ignoreStrand (Default FALSE) A logical which defines if the strand
#' should be taken into consideration when computing the overlap between reads
#' and annotated features. When \code{ignoreStrand = FALSE}, an aligned read
#' is considered to be overlapping an annotated feature as long as they
#' have a non-empty intersecting genomic range on the same strand, while when
#' \code{ignoreStrand = TRUE} the strand is not considered.
#'
#' @param fragments (Default FALSE) A logical; applied to paired-end data only.
#' When \code{fragments=FALSE}, the read-counting method only counts
#' ‘mated pairs’ from opposite strands (non-ambiguous properly paired reads), 
#' while when \code{fragments=TRUE} same-strand pairs, singletons, reads with 
#' unmapped pairs and other ambiguous or not properly paired fragments
#' are also counted (see "Pairing criteria" in 
#' \code{\link[GenomicAlignments]{readGAlignments}()}). \code{fragments=TRUE} 
#' is equivalent to the original Telescope algorithm. For further details see
#' \code{\link[GenomicAlignments]{summarizeOverlaps}()}.
#'
#' @param minOverlFract (Default 0.2) A numeric scalar. \code{minOverlFract}
#' is multiplied by the read length and the resulting value is used to
#' discard alignments for which the overlapping length (number of base
#' pairs the alignment and the feature overlap) is lower. When no minimum 
#' overlap is required, set \code{minOverlFract = 0}.
#'
#' @param pi_prior (Default 0) A positive integer scalar indicating the prior
#' on pi. This is equivalent to adding n unique reads.
#'
#' @param theta_prior (Default 0) A positive integer scalar storing the prior
#' on Q. Equivalent to adding n non-unique reads.
#'
#' @param em_epsilon (Default 1e-7) A numeric scalar indicating the EM
#' Algorithm Epsilon cutoff.
#'
#' @param maxIter A positive integer scalar storing the maximum number of
#' iterations of the EM SQUAREM algorithm (Du and Varadhan, 2020). Default
#' is 100 and this value is passed to the \code{maxiter} parameter of the
#' \code{\link[SQUAREM]{squarem}()} function.
#' 
#' @param reassign_mode (Default 'exclude') Character vector indicating
#' reassignment mode after EM step. 
#' Available methods are 'exclude' (reads with more than one best
#' assignment are excluded from the final counts), 'choose' (when reads have
#' more than one best assignment, one of them is randomly chosen), 'average'
#' (the read count is divided evenly among the best assignments) and 'conf'
#' (only assignments that exceed a certain threshold -defined by 
#' \code{conf_prob} parameter- are accepted, then the read count is
#' proportionally divided among the assignments above \code{conf_prob}).
#' 
#' @param conf_prob (Default 0.9) Minimum probability for high confidence
#' assignment.
#' 
#' @param verbose (Default \code{TRUE}) Logical value indicating whether to
#' report progress.
#'
#' @details
#' This is the constructor function for objects of the class
#' \code{TelescopeParam-class}. This type of object is the input to the
#' function \code{\link{qtex}()} for quantifying expression of transposable
#' elements, which will call the Telescope algorithm
#' \href{https://doi.org/10.1371/journal.pcbi.1006453}{Bendall et al. (2019)}
#' with this type of object.
#'
#' @return A \linkS4class{TelescopeParam} object.
#'
#' @examples
#' bamfiles <- list.files(system.file("extdata", package="atena"),
#'                        pattern="*.bam", full.names=TRUE)
#' \dontrun{
#' ## use the following two instructions to fetch annotations, they are here
#' ## commented out to enable running this example quickly when building and
#' ## checking the package
#' rmskat <- annotaTEs(genome="dm6", parsefun=rmskatenaparser,
#'                     strict=FALSE, insert=500)
#' rmskLTR <- getLTRs(rmskat, relLength=0.8,
#'                    fullLength=TRUE,
#'                    partial=TRUE,
#'                    otherLTR=TRUE)
#' }
#'
#' ## DO NOT TYPE THIS INSTRUCTION, WHICH JUST LOADS A PRE-COMPUTED ANNOTATION
#' ## YOU SHOULD USE THE INSTRUCTIONS ABOVE TO FETCH ANNOTATIONS
#' rmskLTR <- readRDS(system.file("extdata", "rmskatLTRrlen80flenpartoth.rds",
#'                                package="atena"))
#'
#' ## build a parameter object for Telescope
#' tspar <- TelescopeParam(bfl=bamfiles,
#'                         teFeatures=rmskLTR,
#'                         singleEnd=TRUE,
#'                         ignoreStrand=TRUE)
#' tspar
#'
#' @references
#' Bendall et al. Telescope: characterization of the retrotranscriptome by
#' accurate estimation of transposable element expression.
#' PLOS Comp. Biol. 2019;15(9):e1006453. DOI:
#' \url{https://doi.org/10.1371/journal.pcbi.1006453}
#'
#' @importFrom methods is new
#' @importFrom Rsamtools BamFileList
#' @importFrom S4Vectors mcols
#' @importFrom cli cli_alert_info cli_alert_success
#' @export
#' @rdname TelescopeParam-class
TelescopeParam <- function(bfl, teFeatures, aggregateby=character(0),
                            ovMode="ovUnion",
                            geneFeatures=NULL,
                            singleEnd=TRUE,
                            strandMode=1L,
                            ignoreStrand=FALSE,
                            fragments=FALSE,
                            minOverlFract=0.2,
                            pi_prior=0L,
                            theta_prior=0L,
                            em_epsilon=1e-7,
                            maxIter=100L,
                            reassign_mode="exclude",
                            conf_prob=0.9,
                            verbose=TRUE) {

    if (verbose)
        cli_alert_info("Locating BAM files")
    bfl <- .checkBamFileListArgs(bfl, singleEnd, fragments)
    
    if (!reassign_mode %in% c("exclude","choose","average","conf"))
      stop("'reassign_mode' should be one of 'exclude', 'choose', 'average' or 'conf'")
    
    if (!ovMode %in% c("ovUnion","ovIntersectionStrict"))
      stop("'ovMode' should be one of 'ovUnion', 'ovIntersectionStrict'")
    
    if (verbose)
        cli_alert_info("Processing features")
    features <- .processFeatures(teFeatures, deparse(substitute(teFeatures)),
                                geneFeatures,deparse(substitute(geneFeatures)),
                                aggregateby, aggregateexons=TRUE)
    
    obj <- new("TelescopeParam", bfl=bfl, features=features,
                aggregateby=aggregateby, ovMode=ovMode,
                singleEnd=singleEnd,ignoreStrand=ignoreStrand,
                strandMode=as.integer(strandMode), fragments=fragments,
                minOverlFract=minOverlFract, pi_prior=pi_prior,
                theta_prior=theta_prior, em_epsilon=em_epsilon,
                maxIter=as.integer(maxIter), reassign_mode=reassign_mode,
                conf_prob=conf_prob)
    if (verbose)
        cli_alert_success("Parameter object successfully created")
    obj
}

#' @param object A \linkS4class{TelescopeParam} object.
#'
#' @importFrom GenomeInfoDb seqlevels
#' @export
#' @aliases show,TelescopeParam-method
#' @rdname TelescopeParam-class
setMethod("show", "TelescopeParam",
        function(object) {
            cat(class(object), "object\n")
            cat(sprintf("# BAM files (%d): %s\n", length(object@bfl),
                        .pprintnames(names(object@bfl))))
            cat(sprintf("# features (%s length %d): %s\n",
                        class(object@features),
                        length(object@features),
                        ifelse(is.null(names(object@features)),
                                paste("on",
                                    .pprintnames(seqlevels(object@features))),
                                .pprintnames(names(object@features)))))
            if (length(object@aggregateby) > 0)
                cat(sprintf("# aggregated by: %s\n",
                            paste(object@aggregateby, collapse=", ")))
            cat(sprintf("# %s; %s",
                        ifelse(object@singleEnd, "single-end", "paired-end"),
                        ifelse(object@ignoreStrand, "unstranded", "stranded")))
            if (!object@ignoreStrand)
                cat(sprintf(" (strandMode=%d)", object@strandMode))
            if (!object@singleEnd)
                cat(sprintf("; %s",
                        ifelse(object@fragments, "counting properly paired, same-strand pairs, singletons, reads with unmapped pairs and other fragments",
                                "counting properly paired reads")))
            cat("\n")
        })

#' @importFrom BiocParallel SerialParam bplapply
#' @importFrom S4Vectors DataFrame
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @export
#' @aliases qtex
#' @aliases qtex,TelescopeParam-method
#' @rdname qtex
setMethod("qtex", "TelescopeParam",
          function(x, phenodata=NULL, mode=ovUnion, yieldSize=1e6L,
                   auxiliaryFeatures=FALSE,
                   BPPARAM=SerialParam(progressbar=TRUE)) {
            .checkPhenodata(phenodata, length(x@bfl))
            
            cnt <- bplapply(x@bfl, .qtex_telescope, tspar=x, mode=x@ovMode,
                            yieldSize=yieldSize, BPPARAM=BPPARAM)
            cnt <- do.call("cbind", cnt)
            cdata <- .createColumnData(cnt, phenodata)
            colnames(cnt) <- rownames(cdata)
            fnames <- rownames(cnt)[-nrow(cnt)]
            whnofeat <- 1L
            if (!auxiliaryFeatures) {
                cnt <- cnt[-nrow(cnt), ]
                whnofeat <- integer(0)
            }
            features <- .consolidateFeatures(x, fnames, whnofeat)
            SummarizedExperiment(assays=list(counts=cnt),
                                 rowRanges=features,
                                 colData=cdata)
        })


#' @importFrom stats setNames
#' @importFrom Rsamtools scanBamFlag ScanBamParam yieldSize yieldSize<-
#' @importFrom BiocGenerics path
#' @importFrom GenomicAlignments readGAlignments readGAlignmentsList
#' @importFrom GenomicAlignments readGAlignmentPairs
#' @importFrom S4Vectors mcols Hits queryHits subjectHits
#' @importFrom Matrix Matrix t which
#' @importFrom sparseMatrixStats rowSums2 colSums2
#' @importFrom SQUAREM squarem
.qtex_telescope <- function(bf, tspar, mode, yieldSize=1e6L) {
    mode <- match.fun(mode)
    readfun <- .getReadFunction(tspar@singleEnd, tspar@fragments)
    .checkreassignModes(tspar)
    sbflags <- .getScanBamFlag_ts(tspar@singleEnd, tspar@fragments)
    param <- ScanBamParam(flag=sbflags, what="flag", tag="AS")
    ## iste <- as.vector(attributes(tspar@features)$isTE[,1])
    iste <- mcols(features(tspar))$isTE
    if (any(duplicated(names(features(tspar)[iste]))))
        stop(".qtex_telescope: duplicated names in TE annotations.")

    ov <- Hits(nLnode=0, nRnode=length(tspar@features), sort.by.query=TRUE)
    alnreadids <- character(0)
    asvalues <- integer()
    alen <- numeric(0)
    mreadlen <- numeric(0)
    salnmask <- logical(0)
    strand_arg <- "strandMode" %in% formalArgs(readfun)
    yieldSize(bf) <- yieldSize
    open(bf)
    while (length(alnreads <- do.call(readfun,
                                c(list(file = bf), list(param=param),
                                list(strandMode=tspar@strandMode)[strand_arg],
                                list(use.names=TRUE))))) {
        alnreads <- .matchSeqinfo(alnreads, tspar@features)
        alnreadids <- c(alnreadids, names(alnreads))
        asvalues <- c(asvalues, .getAlignmentASScoreTS(alnreads, tag = "AS"))
        readlen <- .getAlignmentLength(alnreads)
        alen <- c(alen, readlen)
        salnmask <- c(salnmask, any(.secondaryAlignmentMask(alnreads)))
        thisov <- mode(alnreads, tspar@features,
                       ignoreStrand=tspar@ignoreStrand, inter.feature=FALSE)
        
        # Selecting the overlaps with min overlap higher than threshold
        ovlength <- .getOverlapLength(alnreads, thisov, tspar)
        yesminov <- ovlength > tspar@minOverlFract*readlen[queryHits(thisov)]
        thisov <- thisov[yesminov]
        ovlength <- ovlength[yesminov]
        
        # Selecting the best overlap for alignments overlapping > 1 feature
        multiov <- (duplicated(queryHits(thisov)) |
                        duplicated(queryHits(thisov), fromLast = TRUE))
        if (any(multiov)) {
          int <- ovlength[multiov]
          intmax <- aggregate(int, by = list(queryHits(thisov)[multiov]),
                              FUN = which.max)
          whpos <- which(!duplicated(queryHits(thisov)[multiov]))
          whpos <- whpos - 1
          intmax <- intmax[!duplicated(intmax$Group.1),]
          thisov <- thisov[-which(multiov)[-(whpos + intmax$x)]]
        }

        ov <- .appendHits(ov, thisov)
    }
    # close(bf)
    on.exit(close(bf))
    .checkOvandsaln(ov, salnmask)
    maskuniqaln <- .getMaskUniqueAln(alnreadids)
    ## fetch all different read identifiers from the overlapping alignments
    readids <- unique(alnreadids[queryHits(ov)])
    ## Adding "no_feature" overlaps to 'ov'
    ov <- .getNoFeatureOv(maskuniqaln, ov, alnreadids)
    mt <- match(readids, alnreadids)
    readids <- unique(alnreadids[queryHits(ov)]) # updating 'readids'
    cntvec <- .tsEMstep(tspar, alnreadids, readids, ov, asvalues,
                        iste, maskuniqaln, mt, alen)
    setNames(as.integer(cntvec), names(cntvec))
}

## private function .getNoFeatureOv(), obtains the overlaps to "no_feature"
#' @importFrom S4Vectors Hits queryHits subjectHits nRnode nLnode from to
.getNoFeatureOv <- function(maskuniqaln, ov, alnreadids) {
    if (!all(maskuniqaln)) {
        maskmultialn_ov <- !(maskuniqaln[queryHits(ov)])
        alnreadidx_all <- match(alnreadids, unique(alnreadids))
        ovid <- unique(alnreadidx_all[unique(queryHits(ov)[maskmultialn_ov])])
        alnall <- which(alnreadidx_all %in% ovid)
        nofeat <- setdiff(alnall, queryHits(ov)[maskmultialn_ov])
        from <- nofeat
        to <- rep(nRnode(ov) + 1, length(nofeat))
        nofeat_hits <- Hits(from = from, to = to, nLnode = nLnode(ov),
                            nRnode = max(to), sort.by.query = TRUE)
        
        # Adding overlaps to "no_feature" to 'ov'
        hits1 <- ov
        hits2 <- nofeat_hits
        
        hits <- c(Hits(from=from(hits1), to=to(hits1),
                        nLnode=nLnode(hits1),
                        nRnode=nRnode(hits2), sort.by.query=FALSE),
                    Hits(from=from(hits2), to=to(hits2),
                        nLnode=nLnode(hits2),
                        nRnode=nRnode(hits2), sort.by.query=FALSE))
        
        ovnofeat <- Hits(from = from(hits), to = to(hits), nLnode = nLnode(hits),
                        nRnode = nRnode(hits), sort.by.query=TRUE)
    } else {
      ovnofeat <- ov
    }
    return(ovnofeat)
}

#' @importFrom S4Vectors Hits queryHits subjectHits
#' @importFrom Matrix Matrix t which
#' @importFrom SQUAREM squarem
#' @importClassesFrom Matrix lgCMatrix
.tsEMstep <- function(tspar, alnreadids, readids, ov, asvalues,
                      iste, maskuniqaln, mt, alen) {
    ## initialize vector of counts derived from multi-mapping reads
    cntvec <- rep(0L, length(tspar@features) + 1)
    
    alnreadidx <- match(alnreadids, readids)
    rd_idx <- sort(unique(alnreadidx[queryHits(ov)]))
    
    ## fetch all different transcripts from the overlapping alignments
    tx_idx <- sort(unique(subjectHits(ov)))
    if (all(maskuniqaln)) {
        istex <- as.vector(iste[tx_idx])
    } else {
        istex <- as.vector(iste[tx_idx])[-length(tx_idx)] # removing no_feature
    }
    # asvalues <- (asvalues-min(asvalues)+1) / (max(asvalues)+1 - min(asvalues))
    asvalues <- .rescaleAS(asvalues, alen = alen)
    QmatTS <- .buildOvValuesMatrix(tspar, ov, asvalues, alnreadidx,
                                    rd_idx, tx_idx)
    
    if (!all(iste)) { ## Correcting for preference of reads to genes/TEs
        if (all(maskuniqaln)) {
          QmatTS <- .correctPreferenceTS(QmatTS, maskuniqaln, mt, istex)
          stopifnot(!any(rowSums2(QmatTS[,istex]) > 0 &
                           rowSums2(QmatTS[,!istex]) > 0))
        } else {
            QmatTS_nofeat <- QmatTS[,ncol(QmatTS), drop = FALSE]
            QmatTS <- .correctPreferenceTS(QmatTS[,-ncol(QmatTS)], maskuniqaln,
                                           mt, istex)
            stopifnot(!any(rowSums2(QmatTS[,istex]) > 0 &
                               rowSums2(QmatTS[,!istex]) > 0))
            QmatTS <- cbind(QmatTS, QmatTS_nofeat)
        }
    }
    
    
    # --- EM-step --- 
    # QmatTS <- QmatTS / rowSums2(QmatTS)
    QmatTS@x <- QmatTS@x / rowSums2(QmatTS)[QmatTS@i +1]
    
    # Getting 'y' indicator of unique and multi-mapping status from QmatTS 
    # maskmulti <- ifelse(rowSums2(QmatTS > 0) == 1, 0, 1)
    maskmulti <- rowSums2(QmatTS > 0) > 1
    
    # Telescope (Bendall et al.(2019)) defines the initial π estimate uniformly
    PiTS <- rep(1 / length(tx_idx), length(tx_idx))
    
    # Telescope defines an additional reassignment parameter (θ) as uniform
    Theta <- rep(1 / length(tx_idx), length(tx_idx))
    
    ## The SQUAREM algorithm to run the EM procedure
    a <- as.numeric(tspar@pi_prior) # 0
    b <- as.numeric(tspar@theta_prior) # 0
    Thetaenv <- new.env()
    assign("Theta", Theta, envir=Thetaenv)
    tsres <- squarem(par=PiTS, Thetaenv=Thetaenv, Q=QmatTS,maskmulti=maskmulti,
                    a=a, b=b, fixptfn=.tsFixedPointFun,
                    control=list(tol=tspar@em_epsilon, maxiter=tspar@maxIter))
    PiTS <- tsres$par
    PiTS[PiTS < 0] <- 0 ## Pi estimates are sometimes negatively close to zero
    # --- end EM-step ---
    
    Theta <- get("Theta", envir=Thetaenv)
    X <- .tsEstep(QmatTS, Theta, maskmulti, PiTS)
    cntvec <- .reassign(X, tspar@reassign_mode, tspar@conf_prob, cntvec, tx_idx)
    
    names(cntvec) <- c(names(tspar@features), "no_feature")
    nofeat <- cntvec["no_feature"]
    cntvec <- .tssummarizeCounts(cntvec[-length(cntvec)], iste, tspar)
    cntvec <- c(cntvec, nofeat)
    cntvec
}



## private function .checkreassignModes()
## Checks that 'reassign_mode' is valid
.checkreassignModes <- function(tspar) {
  if (!(tspar@reassign_mode %in% c("exclude","choose","average","conf")))
    stop("'reassign_mode' must be one of: 'exclude', 'choose', 'average' or 'conf'")
}


## private function .reassign()
## Implements 4 different reassigning methods from Telescope (exclude, choose,
## average and conf)
#' @importFrom sparseMatrixStats rowSums2 colSums2
#' @importFrom Matrix summary
#' @importFrom stats runif
.reassign <- function(X, reassign_mode, conf_prob, cntvec, tx_idx) {
  
  if (reassign_mode %in% c("exclude", "choose", "average")) {
    maxbyrow <- rowMaxs(X)
    # Older versions
    # Xind <- X == maxbyrow
    # Xind <- (X / maxbyrow) == 1
    Xind <- as(as(as(X, "lMatrix"), "generalMatrix"), "CsparseMatrix")
    Xind@x <- (X@x /maxbyrow[X@i+1]) == 1
    nmaxbyrow <- rowSums2(Xind)
    cntvec[tx_idx] <- colSums2(Xind[nmaxbyrow == 1, ])
  
    if (reassign_mode == "choose" & any(nmaxbyrow > 1)) {
      Xind2 <- Xind[nmaxbyrow > 1, ]
      Xind2_s <- summary(Xind2)
      Xind2_s <- Xind2_s[Xind2_s$x == TRUE,]
      Xind2_s <- Xind2_s[order(Xind2_s[,"i"]),]
      bestov <- as.vector(table(Xind2_s[,"i"]))
      # selected_ov <- vapply(bestov, FUN = function(x) sample(x = x, size = 1), 
      #                       FUN.VALUE = integer(1L))
      selected_ov <- ceiling(runif(n=nrow(Xind2))*rowSums2(Xind2))
      ovcol <- table(Xind2_s[c(0,cumsum(bestov)[-length(bestov)]) + selected_ov,"j"])
      whcol <- as.integer(names(ovcol))
      cntvec[tx_idx][whcol] <- cntvec[tx_idx][whcol] + as.integer(ovcol)
    }
    
    if (reassign_mode == "average" & any(nmaxbyrow > 1)) {
      Xind2 <- Xind[nmaxbyrow > 1, ]
      cntvec[tx_idx] <- cntvec[tx_idx] + colSums2(Xind2/rowSums2(Xind2))
    }
    
  } else if (reassign_mode == "conf") {
    whbelow <- X@x < conf_prob
    X@x[whbelow] <- 0
    X_conf <- X[rowSums2(X) > 0,]
    # X_conf@i indicates row and is 0-based
    # Faster sparse implementation of X_conf/rowSums2(X_conf)
    X_conf@x <- X_conf@x / rowSums2(X_conf)[X_conf@i +1] 
    cntvec[tx_idx] <- colSums2(X_conf)
    
  } else {
    stop("'reassign_mode' should be one of 'exclude', 'choose', 'average' or 'conf'")
  }
  
  cntvec
}


.rescaleAS <- function(asvalues, alen) {
  # asrescale <- rescale(asvalues, c(1, max(asvalues) - min(asvalues) + 1))
  asrescale <- asvalues - min(asvalues) + 1
  asrescale <- asrescale + alen
  asrescale <- asrescale/max(asrescale)
  asrescale <- expm1(asrescale*100)
  asrescale
}


## private function .correctPreferenceTS()
## Corrects QmatTS for preference of unique/multi-mapping reads to
## genes/TEs, respectively, in Telescope
.correctPreferenceTS <- function(QmatTS, maskuniqaln, mt, istex) {
    indx <- (rowSums2(QmatTS[,istex]) > 0) & (rowSums2(QmatTS[,!istex]) > 0)
    
    ## Assigning unique reads mapping to both a TE and a gene as gene counts
    # Which unique reads overlap to both genes and TEs?
    idxu <- indx & maskuniqaln[mt]
    if (any(idxu)) {
        # QmatTS[idxu,istex] <- FALSE
        whu <- which(QmatTS[idxu,istex] > 0, arr.ind = TRUE)
        if (is(whu, "integer")) {
            whu <- as.matrix(data.frame(row = 1, col = whu))
        }
        whudf <- cbind(which(idxu)[whu[,"row"]], which(istex)[whu[,"col"]])
        QmatTS[whudf] <- FALSE
    }
    
    ## Removing overlaps of multi-mapping reads to genes if at least one
    ## alignment of the read overlaps a TE
    idxm <- indx & !maskuniqaln[mt]
    if (any(idxm)) {
        # QmatTS[idxm,!istex] <- FALSE
        whm <- which(QmatTS[idxm,!istex] > 0, arr.ind = TRUE)
        if (is(whm, "integer")) {
          whm <- as.matrix(data.frame(row = 1, col = whm))
        }
        whmdf <- cbind(which(idxm)[whm[,"row"]], which(!istex)[whm[,"col"]])
        QmatTS[whmdf] <- FALSE
    }
    
    QmatTS
}

.tssummarizeCounts <- function(cntvec, iste, tspar) {
    cntvec_t <- cntvec[iste]
    ## aggregate TE quantifications if necessary
    if (length(tspar@aggregateby) > 0) {
        f <- .factoraggregateby(tspar@features[iste], tspar@aggregateby)
        stopifnot(length(f) == length(cntvec_t)) ## QC
        cntvec_t <- tapply(cntvec_t, f, sum, na.rm=TRUE)
    }
    
    ## aggregating exon counts to genes
    if (!all(iste)) {
        cntvec <- c(cntvec_t, cntvec[!iste])
    } else {
        cntvec <- cntvec_t
    }
    
    cntvec
}




## private function .tsEstep()
## E-step of the EM algorithm of Telescope
.tsEstep <- function(Q, Theta, maskmulti, Pi) {
    # X <- t(t(Q) * Pi)
    # # X[maskmulti, ] <- t(t(X[maskmulti, ]) * Theta)
    # # quicker computation of previous line
    # wh <- which(X*maskmulti > 0, arr.ind = TRUE)
    # X[wh] <- t(t(X[wh]) * Theta[wh[, "col"]])
    X <- Q
    j <- rep(seq_len(ncol(X)), diff(X@p))
    X@x <- X@x * Pi[j]
    #wh <- which(X@x * maskmulti[X@i + 1] > 0)
    whmulti <- as.integer(maskmulti[X@i + 1])
    wh <- which(X@x * whmulti > 0)
    j <- rep(seq_len(ncol(X)), diff(X@p))
    X@x[wh] <- X@x[wh] * Theta[j][wh]
    X <- X[rowSums2(X) > 0, , drop = FALSE]
    X <- X/rowSums2(X)
    X
}


## private function .tsMstepPi()
## M-step of the EM algorithm of Telescope
.tsMstepPi <- function(X, a) {
  Pi <- (colSums2(X) + a)/(sum(X@x) + a * ncol(X))
    Pi
}

## private function .tsMstepTheta()
## Update the estimate of the MAP value of θ
.tsMstepTheta <- function(X, maskmulti, b) {
    Theta <- ((colSums2(X[maskmulti, , drop=FALSE]) + b) / 
        (sum(maskmulti) + b*ncol(X)))
}

## private function .tsFixedPointFun()
## fixed point function of the EM algorithm of Telescope
.tsFixedPointFun <- function(Pi, Thetaenv, Q, maskmulti, a, b) {
    Theta <- get("Theta", envir=Thetaenv)
    X <- .tsEstep(Q, Theta, maskmulti, Pi)
    Pi2 <- .tsMstepPi(X, a)
    Theta2 <- .tsMstepTheta (X, maskmulti, b)
    assign("Theta", Theta2, envir=Thetaenv)
    
    Pi2
}


#' @importFrom S4Vectors mcols first second
.getAlignmentASScoreTS <- function(aln, tag) {
  if (is(aln, "GAlignments"))
    score <- as.integer(mcols(aln)[[tag]])
  else if (is(aln, "GAlignmentPairs")) {
    ## take the sum of the score of each mate
    score <- as.integer(mcols(first(aln))[[tag]]) + 
        as.integer(mcols(second(aln))[[tag]])
  } else if (is(aln, "GAlignmentsList")) {
    l <- lengths(aln)
    score <- aggregate(mcols(unlist(aln, use.names = FALSE))[[tag]],
                        by = list(rep(seq_along(l),l)), FUN = sum)
    score <- score$x
    mate_status <- mcols(aln)$mate_status == "mated"
    score[!mate_status] <- unlist(lapply(aln[!mate_status], 
                                  function(x) mean(mcols(x)[,tag])*2))
                                  # multiplied by 2 since there are 2 mates
  } else {
    stop(sprintf(".getAlignmentTagScore: wrong class %s\n", class(aln)))
  }
  as.integer(score)
}

#' @importFrom S4Vectors first second
#' @importFrom GenomicRanges granges
#' @importFrom GenomicAlignments seqnames
.getAlignmentLength <- function(alnreads) {
  if (is(alnreads, "GAlignments"))
    readlen <- qwidth(alnreads)
  else if (is(alnreads, "GAlignmentPairs")) {
      ## take the length of the region comprised by the 2 mates
      readlen <- width(granges(alnreads))
  } else if (is(alnreads, "GAlignmentsList")) {
      # readlen <- width(granges(alnreads, ignore.strand=TRUE))
      # # In case of reads with space between the two mates, the read length
      # # assigned corresponds to twice the maximum alignment length, to account
      # # for the length of the two mates, but not the region between them
      # maxlen <- max(qwidth(unlist(alnreads)))
      # readlen[readlen>maxlen] <- maxlen*2
      
      lsum <- sum(width(alnreads))
      # unmated alignments
      unmat <- mcols(alnreads)$mate_status == "unmated"
      # pairs with discordant chr
      unmat <- unmat | lengths(unique(seqnames(alnreads)))>1
      # Unmated alignments the assigned read length is the median read length
      lsum[unmat] <- median(width(unlist(alnreads)))
      ltogether <- integer(length = length(alnreads))
      ltogether[!unmat] <- width(granges(alnreads[!unmat], ignore.strand=TRUE))
      ltogether[unmat] <- median(width(unlist(alnreads)))
      ltogether[ltogether > lsum] <- lsum[ltogether > lsum]
      readlen <- ltogether
      
  } else {
    stop(sprintf(".getAlignmentLength: wrong class %s\n", class(alnreads)))
  }
  readlen
}


#' @importFrom S4Vectors Hits queryHits subjectHits
#' @importFrom GenomicRanges pintersect
#' @importFrom GenomeInfoDb seqlevels<- seqlevels
.getOverlapLength <- function(alnreads, thisov, tspar) {
    features <- tspar@features
    seqlev <- unique(c(seqlevels(features), seqlevels(alnreads)))
    seqlevels(features) <- seqlev
    seqlevels(alnreads) <- seqlev
    
    if (is(alnreads, "GAlignmentsList")) {
        l <- lengths(alnreads)[queryHits(thisov)]
        features_ov <- rep(features[subjectHits(thisov)], l)
        ovlength <- width(pintersect(GRanges(unlist(alnreads[queryHits(thisov)])),
                                      features_ov,
                                      ignore.strand = tspar@ignoreStrand,
                                      strict.strand=FALSE))
        if (is(ovlength, "CompressedIntegerList")) {
          ovlength <- max(ovlength)
        }
        ovlength_ag <- aggregate(ovlength, by = list(rep(seq_along(l), l)), 
                                  FUN = max)
        ovlength <- ovlength_ag$x
    } else {
        ovlength <- width(pintersect(GRanges(alnreads[queryHits(thisov)]),
                                      features[subjectHits(thisov)],
                                      ignore.strand = tspar@ignoreStrand,
                                      strict.strand=FALSE))
        if (is(ovlength, "CompressedIntegerList")) {
          ovlength <- max(ovlength)
        }
    }
    ovlength
}

#' @importFrom Rsamtools scanBamFlag
.getScanBamFlag_ts <- function(singleEnd, fragments) {
  if (singleEnd == TRUE | fragments == TRUE) {
    sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
                           isDuplicate=FALSE,
                           isNotPassingQualityControls=FALSE)
  } else {
    sbflags <- scanBamFlag(isUnmappedQuery=FALSE,
                           isDuplicate=FALSE,
                           isNotPassingQualityControls=FALSE,
                           isProperPair=TRUE)
  }
  sbflags
}
functionalgenomics/atena documentation built on May 7, 2024, 10:33 a.m.