groHMM: GRO-seq Analysis Pipeline

Documented in detectTranscripts

###########################################################################
##
##   Copyright 2013, 2014 Charles Danko and Minho Chae.
##
##   This program is part of the groHMM R package
##
##   groHMM is free software: you can redistribute it and/or modify it
##   under the terms of the GNU General Public License as published by
##   the Free Software Foundation, either version 3 of the License, or
##   (at your option) any later version.
##
##   This program is distributed in the hope that it will be useful, but
##   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
##   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
##   for more details.
##
##   You should have received a copy of the GNU General Public License along
##   with this program.  If not, see <http://www.gnu.org/licenses/>.
##
##########################################################################

#' detectTranscripts detects transcripts de novo using a two-state hidden
#' Markov model (HMM).
#'
#' Read counts can be specified as either a GRanges object (reads), or using a
#' fixed-step wiggle-format passed in a list (Fp and Fm).
#' Either reads or BOTH Fp and Fm must be specified.
#'
#' Reference: Hah N, Danko CG, Core L, Waterfall JJ, Siepel A, Lis JT,
#' Kraus WL. A rapid, extensive, and transient transcriptional response to
#' estrogen signaling in breast cancer cells. Cell. 2011 May 13;145(4):622-34.
#' doi: 10.1016/j.cell.2011.03.042.
#'
#' @param reads A GRanges object representing a set of mapped reads.
#' @param Fp Wiggle-formatted read counts on "+" strand. Optionally, Fp and Fm
#' represent list() filled with a vector of counts for each chromosome.
#' Can detect transcripts starting from a fixed-step wiggle.
#' @param Fm Wiggle-formatted read counts on "-" strand.
#' @param LtProbA Log probability of t... .  Default: -5. One of these is just
#' an initialization, and the final value is set by EM.  The other is a holdout
#' parameter.
#' @param LtProbB Log probability of t... .  Default: -200.
#' @param UTS Variance in read counts of the untranscribed sequence.
#' Default: 5.
#' @param size Log probability of t... .  Default: -5.
#' @param threshold Threshold change in total likelihood, below which EM exits.
#' @return Returns a list of emisParams, transParams, viterbiStates, and
#' transcripts.  The transcript element is a GRanges object representing the
#' predicted genomic coordinates of transcripts on both the + and - strand.
#' @author Charles G. Danko and Minho Chae
#' @examples
#' library(GenomicAlignments)
#' S0mR1 <- as(readGAlignments(system.file("extdata", "S0mR1.bam",
#'                package="groHMM")), "GRanges")
#' ## Not run:
#' # hmmResult <- detectTranscripts(S0mR1, LtProbB=-200, UTS=5, threshold=1)
#' # txHMM <- hmmResult$transcripts
## CGD: TODO: Test switch over to gamma, rather than dGamma?!

detectTranscripts <- function(reads=NULL, Fp=NULL, Fm=NULL, LtProbA=-5,
    LtProbB=-200, UTS=5, size=50, threshold=0.1) {

    stopifnot(!is.null(reads) | (!is.null(Fp) & !is.null(Fm)))

    ## Allow equivalent form of Fp and Fm to be specified in the function
    ## automatically.
    if (is.null(Fp) & is.null(Fm)) {
        Fp <- windowAnalysis(reads=reads, strand="+", windowSize=size)
        Fm <- windowAnalysis(reads=reads, strand="-", windowSize=size)
    }

    nFp <- NROW(Fp)
    CHRp <- as.character(names(Fp))
    CHRm <- as.character(names(Fm))

    size <- as.integer(size)
    ANS <- NULL

    ## Set up initial HMM variables.
    HMM <- list()
    HMM$nstates <- as.integer(2)
    HMM$ePrDist <- c("dgamma", "dgamma")
    ## CGD: 3-3-13: Still legacy. Switch to integrating gamma between read
    ## and read+1

    HMM$iProb <- as.double(log(c(1.0, 0.0)))
                                    ## Non-transcribed,  transcribed.
    HMM$ePrVars <- as.list(data.frame(c(UTS, 1/UTS, -1), c(0.5, 10, -1)))
    HMM$tProb <- as.list(data.frame(c(log(1-exp(LtProbA)), LtProbA),
        c(LtProbB, log(1-exp(LtProbB))) ))

    ## Cast counts to a real, and combine +/- strand into one list variable.
    ##  Treat like separate training sequences (they really are).
    FT <- list()    # MHC; 7/2/2014, bioconductor complains F thinking as False
    for (i in seq_along(Fp)) FT[[i]] <- as.double(Fp[[i]]+1)
    ## CGD: 3-3-13: Still legacy.  Switch to integrating gamma between read and
    ## read+1

    for (i in seq_along(Fm)) FT[[i+nFp]] <- as.double(Fm[[i]]+1)
    ## CGD: 3-3-13: Still legacy.  Switch to integrating gamma between read and
    ## read+1

    ## In case the above command copies, rather than points ...
    ## free unused memory.
    remove(Fp)
    remove(Fm)

    ## Run EM algorithm.
    BWem <- .Call(
        "RBaumWelchEM", HMM$nstates, FT, as.integer(1),
        HMM$ePrDist, HMM$ePrVars, HMM$tProb, HMM$iProb,
        as.double(threshold), c(TRUE, FALSE), c(FALSE, TRUE),
        as.integer(1), FALSE, PACKAGE="groHMM")
    ## Update Transitions, Emissions.

    ## Translate these into transcript positions.
    for (i in seq_along(CHRp)) {
        ans <- .Call(
            "getTranscriptPositions", as.double(BWem[[3]][[i]]),
            as.double(0.5), size, PACKAGE="groHMM")
        Nrep <- NROW(ans$Start)
        ANS <- rbind(
            ANS,
            data.frame(
                chrom=rep(CHRp[i], Nrep), start=ans$Start, end=ans$End,
                strand=rep("+", Nrep)))
    }

    for (i in seq_along(CHRm)) {
        ans <- .Call(
            "getTranscriptPositions", as.double(BWem[[3]][[i+nFp]]),
            as.double(0.5), size, PACKAGE="groHMM")
        Nrep <- NROW(ans$Start)
        ANS <- rbind(
            ANS,
            data.frame(
                chrom =rep(CHRm[i], NROW(ans$Start)), start=ans$Start,
                end=ans$End, strand =rep("-", Nrep)))
    }

    BWem[[4]] <- GRanges(
        seqnames = Rle(ANS$chrom),
        ranges = IRanges(ANS$start, ANS$end-1),
        strand = Rle(strand(ANS$strand)), type=Rle("tx", NROW(ANS)),
        ID=paste(ANS$chrom, "_", ANS$start, ANS$strand, sep=""))
    names(BWem) <- c(
        "emisParams", "transParams", "viterbiStates", "transcripts")

    return(BWem)
}