groHMM: GRO-seq Analysis Pipeline

Documented in pausingIndex

###########################################################################
##
##   Copyright 2013, 2014 Charles Danko and Minho Chae.
##
##   This program is part of the groHMM R package
##
##   groHMM is free software: you can redistribute it and/or modify it
##   under the terms of the GNU General Public License as published by
##   the Free Software Foundation, either version 3 of the License, or
##   (at your option) any later version.
##
##   This program is distributed in the hope that it will be useful, but
##   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
##   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
##   for more details.
##
##   You should have received a copy of the GNU General Public License along
##   with this program.  If not, see <http://www.gnu.org/licenses/>.
##
##########################################################################

################################################################################
##
##  4/24/2012
##
##  Thanks to Andre Martins for the following two functions, useful for
##  calculating the confidence interval for a ratio of Poisson random variables.
##
##  Based on: Ederer F, Mantel N (1974); Confidence Limits on the Ratio of Two
## Poisson Variables.  AMERICAN JOURNAL OP EPIDEMIOLOGY.
##
###############################################################################
approx_ratio_CI <- function(x1, x2, alpha=0.05) {
    t <- qnorm(1 - alpha/2)
    n <- x1 + x2
    zp <- (t^2/2 + x1 + 1/2)^2 - (n + t^2) / n * (x1 + 1/2)^2
    zn <- (t^2/2 + x1 - 1/2)^2 - (n + t^2) / n * (x1 - 1/2)^2

    a <- (t^2/2 + x1 + 1/2 + sqrt(zp)) / (n + t^2/2 - x1 - 1/2 - sqrt(zp))
    b <- (t^2/2 + x1 - 1/2 - sqrt(zn)) / (n + t^2/2 - x1 + 1/2 + sqrt(zn))

    return(c(b, a))
}

approx_ratios_CI <- function(num.counts, denom.counts, alpha=0.05) {
    stopifnot(length(num.counts) == length(denom.counts))
    N <- length(num.counts)

    result <- matrix(data=0, nrow=N, ncol=2)

    for (i in 1:N)
        result[i, ] <- approx_ratio_CI(num.counts[i], denom.counts[i], alpha)

    return(result)
}

#' Returns the pausing index for different genes.  TODO: DESCRIBE THE PAUSING
#' INDEX.
#'
#' @param features A GRanges object representing a set of genomic coordinates.
#' @param reads A GRanges object representing a set of mapped reads.
#' @param size The size of the moving window.
#' @param up Distance upstream of each f to align and histogram.
#' @param down Distance downstream of each f to align and histogram (NULL).
#' @param BPPARAM Registered backend for BiocParallel.
#' Default: BiocParallel::bpparam()
#' @return Returns a data.frame of the pausing indices for the input genes.
#' @author Charles G. Danko and Minho Chae.
#' @return Returns the pausing index for different genes.
#' @examples
#' library(GenomicAlignments)
#' features <- GRanges("chr7", IRanges(2394474,2420377), strand="+")
#' reads <- as(readGAlignments(system.file("extdata", "S0mR1.bam",
#'             package="groHMM")), "GRanges")
#' ## Not run:
#' # pi <- pausingIndex(features, reads)
##
##  Arguments:
##  f   -> data.frame of: CHR, START, END, STRAND.
##  p   -> data.frame of: CHR, START, END, STRAND.
##  size    -> The size of the moving window.
##  up  -> Distance upstream of each f to align and histogram.
##  down    -> Distance downstream of each f to align and histogram (NULL).
##
##  TODO:
##  (1) Write C function.
##  (2) Implement: ...
##      (a)  promWind-> As an alternative to up/down, specify a specific window
##      in which to check for the pause-peak.
##      Uses the same format as "f", expect that features are mapped with
##      respect to the start of f, where 0 indicates the start of
##      transcription, and negative numbers specify upstream sequence.
##      This is likely useful for rate; perhaps for identifying internal
##      paused-peaks...
pausingIndex <- function(features, reads, size=50, up=1000, down=1000,
    BPPARAM=bpparam()) {
    ## make sure reads are sorted
    reads <- reads[order(as.character(seqnames(reads)), start(reads)), ]
    f <- data.frame(chrom=as.character(seqnames(features)),
        start=as.integer(start(features)), end=as.integer(end(features)),
        strand=as.character(strand(features)))
    if ("symbol" %in% names(mcols(features))){
        f <- cbind(f, symbol=features$symbol)
    } else {
        f <- cbind(f, symbol=GeneID <- as.character(seq_len(NROW(f))))
    }
    p <- data.frame(
        chrom=as.character(seqnames(reads)), start=as.integer(start(reads)),
        end=as.integer(end(reads)), strand=as.character(strand(reads)))

    C <- sort(as.character(unique(f[[1]])))
    Pause <- rep(0, NROW(f))
    Body  <- rep(0, NROW(f))
    Fish  <- rep(0, NROW(f))
    GeneID <- rep("", NROW(f))
    CIl  <- rep(0, NROW(f))
    CIu  <- rep(0, NROW(f))

    ## Pass back information for the fisher test...
    PauseCounts <- rep(0, NROW(f))
    BodyCounts  <- rep(0, NROW(f))
    UpCounts    <- rep(0, NROW(f))
    UgCounts    <- rep(0, NROW(f))

    size        <- as.integer(size)
    up          <- as.integer(up)
    down        <- as.integer(down)

    ###### Calculate PLUS and MINUS index, for DRY compliance.
    PLUS_INDX <- which(f[[4]] == "+")
    MINU_INDX <- which(f[[4]] == "-")

    ###### Identify TSS -- Start for '+' strand, End for '-' strand.
    c_tss_indx <- rep(0, NROW(f))
    c_tss_indx[PLUS_INDX] <- 2
    c_tss_indx[MINU_INDX] <- 3
    c_tss <- unlist(lapply(c(1:NROW(f)), function(x) {
        f[x, c_tss_indx[x]]
    }
    ))

    ###### Now calculate left and right position for gene body, based
    ### on '+' or '-'.
    ### Calculate gene end.  Gene start is contiguous with the coordinates
    ###for the promoter.
    c_gene_end_indx <- rep(0, NROW(f))
    c_gene_end_indx[PLUS_INDX] <- 3
    c_gene_end_indx[MINU_INDX] <- 2
    c_gene_end <- unlist(lapply(c(1:NROW(f)), function(x) {
        f[x, c_gene_end_indx[x]]
    }
    ))

    ### Assign left and right.
    gLEFT   <- rep(0, NROW(c_tss))
    gRIGHT  <- rep(0, NROW(c_tss))
    gLEFT[PLUS_INDX]    <- c_tss[PLUS_INDX] + down
    gRIGHT[PLUS_INDX]   <- c_gene_end[PLUS_INDX]
    gLEFT[MINU_INDX]    <- c_gene_end[MINU_INDX]
    gRIGHT[MINU_INDX]   <- c_tss[MINU_INDX] - down

    ## Run parallel version.
    mcp <- bplapply(
        c(1:NROW(C)), pausingIndex_foreachChrom, C=C, f=f, p=p, gLEFT=gLEFT,
        gRIGHT=gRIGHT, c_tss=c_tss, size=size, up=up, down=down,
        BPPARAM=BPPARAM)

    ## Unlist and re-order values for printing in a nice data.frame.
    for (i in 1:NROW(C)) {
        ## Which KG?  prb?
        indxF <- which(as.character(f[[1]]) == C[i])
        indxPrb <- which(as.character(p[[1]]) == C[i])

        if ( (NROW(indxF) >0) & (NROW(indxPrb) >0)) {
            Pause[indxF][mcp[[i]][["ord"]]]     <- mcp[[i]][["Pause"]]
            Body[indxF][mcp[[i]][["ord"]]]  <- mcp[[i]][["Body"]]
            Fish[indxF][mcp[[i]][["ord"]]]  <- mcp[[i]][["Fish"]]
            GeneID[indxF][mcp[[i]][["ord"]]]    <- mcp[[i]][["GeneID"]]

            PauseCounts[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["PauseCounts"]]
            BodyCounts[indxF][mcp[[i]][["ord"]]]  <- mcp[[i]][["BodyCounts"]]
            UpCounts[indxF][mcp[[i]][["ord"]]]    <- mcp[[i]][["UpCounts"]]
            UgCounts[indxF][mcp[[i]][["ord"]]]    <- mcp[[i]][["UgCounts"]]

            CIl[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["CIl"]]
            CIu[indxF][mcp[[i]][["ord"]]] <- mcp[[i]][["CIu"]]
        }
    }

    return(data.frame(
        Pause=Pause, Body=Body, Fisher=Fish, GeneID=GeneID, CIlower=CIl,
        CIupper=CIu, PauseCounts=PauseCounts, BodyCounts=BodyCounts,
        uPCounts=UpCounts, uGCounts=UgCounts))
}

pausingIndex_foreachChrom <- function(i, C, f, p, gLEFT, gRIGHT, c_tss, size,
    up, down) {
    ## Which KG?  prb?
    indxF   <- which(as.character(f[[1]]) == C[i])
    indxPrb <- which(as.character(p[[1]]) == C[i])

    if ( (NROW(indxF) >0) & (NROW(indxPrb) >0)) {
        ## Order -- Make sure, b/c this is one of our main assumptions.
        ## Otherwise violated for DBTSS.
        Ford <- order(f[indxF, 2])
        Pord <- order(p[indxPrb, 2])

        ## Type coersions.
        FeatureStart    <- as.integer(gLEFT[indxF][Ford])
        FeatureEnd  <- as.integer(gRIGHT[indxF][Ford])
        FeatureStr  <- as.character(f[indxF, 4][Ford])
        FeatureTSS  <- as.integer(c_tss[indxF][Ford])

        PROBEStart  <- as.integer(p[indxPrb, 2][Pord])
        PROBEEnd    <- as.integer(p[indxPrb, 3][Pord])
        PROBEStr    <- as.character(p[indxPrb, 4][Pord])

        ## Set dimensions.
        dim(FeatureStart)   <- c(NROW(FeatureStart), NCOL(FeatureStart))
        dim(FeatureEnd)     <- c(NROW(FeatureEnd),   NCOL(FeatureEnd))
        dim(FeatureTSS)     <- c(NROW(FeatureTSS),   NCOL(FeatureTSS))
        dim(FeatureStr)     <- c(NROW(FeatureStr),   NCOL(FeatureStr))
        dim(PROBEStart)     <- c(NROW(PROBEStart),   NCOL(PROBEStart))
        dim(PROBEEnd)       <- c(NROW(PROBEEnd),     NCOL(PROBEEnd))
        dim(PROBEStr)       <- c(NROW(PROBEStr),     NCOL(PROBEStr))

        ## Run the calculations on the gene.
        ## Calculate the maximal 50 bp window.
        HPause <- .Call(
            "NumberOfReadsInMaximalSlidingWindow", FeatureTSS, FeatureStr,
            PROBEStart, PROBEEnd, PROBEStr, size, up, down, PACKAGE="groHMM")

        ## Run the calculate on the gene body...
        HGeneBody <- .Call(
            "CountReadsInFeatures", FeatureStart, FeatureEnd, FeatureStr,
            PROBEStart, PROBEEnd, PROBEStr, PACKAGE="groHMM")

        ## Get size of gene body
        Difference <- FeatureEnd-FeatureStart
        Difference[Difference < 0] <- 0
        ## Genes < 1kb, there is no suitable area in the body of the gene.

        ## Now use Fisher's Exact.
        ## Make uniform reads.
        Up <- round(HPause + HGeneBody) * size / (size + Difference)
        Ug <- round(HPause + HGeneBody) * Difference / (size + Difference)
        HFish <- unlist(lapply(c(1:NROW(Ford)), function(x) {
            fisher.test(
                data.frame(
                    c(HPause[x], round(Up[x])),
                    c(HGeneBody[x], round(Ug[x]))
                )
            )$p.value
        } ))

        ## Make return values.
        Pause_c     <- as.double(HPause / size)
        Body_c  <- as.double( (HGeneBody + 1) / Difference)
        ## 6-5-2012 -- Add a pseudocount here,
        ## forcing at least 1 read in the gene body.
        Fish_c  <- as.double(HFish)
        GeneID_c    <- as.character(f[indxF, 5][Ford])

        PauseCounts_c <- HPause
        BodyCounts_c  <- HGeneBody
        UpCounts_c    <- round(Up)
        UgCounts_c    <- round(Ug)

        aCI <- approx_ratios_CI(HPause, HGeneBody)
        scaleFactor <- Difference/size ## Body/ pause, must be 1/ PI units.
        CIl_c <- as.double(aCI[, 1]*scaleFactor)
        CIu_c <- as.double(aCI[, 2]*scaleFactor)

        return(list(
            Pause=Pause_c, Body=Body_c, Fish=Fish_c, GeneID=GeneID_c,
            PauseCounts=PauseCounts_c, BodyCounts=BodyCounts_c,
            UpCounts=UpCounts_c, UgCounts=UgCounts_c, CIl=CIl_c, CIu=CIu_c,
            ord=Ford))
    }
    return(integer(0))
}