groHMM: GRO-seq Analysis Pipeline

Documented in breakTranscriptsOnGenes combineTranscripts

###########################################################################
##
##   Copyright 2013, 2014 Charles Danko and Minho Chae.
##
##   This program is part of the groHMM R package
##
##   groHMM is free software: you can redistribute it and/or modify it
##   under the terms of the GNU General Public License as published by
##   the Free Software Foundation, either version 3 of the License, or
##   (at your option) any later version.
##
##   This program is distributed in the hope that it will be useful, but
##   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
##   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
##   for more details.
##
##   You should have received a copy of the GNU General Public License along
##   with this program.  If not, see <http://www.gnu.org/licenses/>.
##
##########################################################################
plot2Ranges <- function(tr, gr, main = "NA", col = "black", sep = 0.5, ...) {
    ## nocov start
    height <- 1
    xlim <- c(min(min(start(tr), min(start(gr)))), max(max(end(tr)),
                max(end(gr))))

    plot.new()
    ybottom <- seq(1, length(tr) + length(gr))
    plot.window(xlim, c(0, max(ybottom + height)))

    if (length(tr) > 0) {
        tbottom <- seq(1, length(tr))
        rect(start(tr) - sep, tbottom, end(tr) + sep, tbottom + height - sep,
                col = "gray", ...)
    }

    bSymbol <- FALSE
    if ("symbol" %in% colnames(elementMetadata(gr)))
    bSymbol <- TRUE

    if (length(gr) > 0) {
        gbottom <- seq(1, length(gr))
        rect(
            start(gr) - sep, gbottom, end(gr) + sep, gbottom + height - sep,
            col=rgb(red=255, green=0, blue=0, alpha=150, maxColorValue=255))
        if (bSymbol)
            text(
                c(start(gr) + end(gr)) / 2, gbottom + 0.2,
                elementMetadata(gr)$symbol)
    }
    title(main)
    axis(1)
} # nocov end



#
# Break an Interval by break points with minimum gap
#------------------------------------------------------------------------------
breakInterval <- function(gr, brPos, gap=5, strand="+") {
    result <- rep(gr, length(brPos) + 1)
    if (strand == "+") {
        for (i in seq_along(brPos)) {
            end(result[i, ]) <- brPos[i] - gap
            start(result[i + 1, ]) <- brPos[i]
        }
    } else {
        for (i in seq_along(brPos)) {
            end(result[i, ]) <- brPos[i]
            start(result[i + 1, ]) <- brPos[i] + gap
        }
    }
    return(result)
}

#' breakTranscriptsOnGenes Breaks transcripts on genes
#'
#' Breaks transcripts when they are overlapped with multiple well annotated
#' genes.
#'
#' @param tx GRanges of transcripts.
#' @param annox GRanges of non-overlapping annotations for reference.
#' @param strand Takes "+" or "-" Default: "+"
#' @param geneSize Numeric. Minimum gene size in annox to be used as reference.
#' Default: 5000
#' @param threshold Numeric. Ratio of overlapped region relative to a gene
#' width.
#' Transcripts only greater than this threshold are subjected to be broken.
#' Default: 0.8
#' @param gap Numeric.  Gap (bp) between broken transcripts.  Default: 5
#' @param plot Logical.  If set to TRUE, show each step in a plot.
#' Default: FALSE
#' @author Minho Chae and Charles G. Danko
#' @return Returns GRanges object of broken transcripts.
#' @examples
#' library(GenomicRanges)
#' tx <- GRanges("chr7", IRanges(1000, 30000), strand="+")
#' annox <- GRanges(
#'     "chr7", IRanges(start=c(1000, 20000), width=c(10000,10000)), strand="+")
#' bPlus <- breakTranscriptsOnGenes(tx, annox, strand="+")
breakTranscriptsOnGenes <- function(tx, annox, strand="+", geneSize=5000,
    threshold=0.8, gap=5, plot=FALSE) {
    ## Validate inputs
    tx <- .normArgRanges(tx, errorOnEmpty = TRUE)
    annox <- .normArgRanges(annox)
    if (!isDisjoint(annox)) {
        stop("Annotations cannot have any overlaps. ",
             "Did you run ", sQuote("makeConsensusAnnotations"), "?")
    }

    tx <- tx[strand(tx) == strand, ]
    annox <- annox[strand(annox) == strand, ]
    mcols(tx)$status <- "NA"

    annox <- annox[width(annox) > geneSize, ]
    ol <- findOverlaps(tx, annox)

    ol.df <- data.frame(trans = queryHits(ol), gene = subjectHits(ol),
        ratio = width(pintersect(tx[queryHits(ol), ], annox[subjectHits(ol), ]))
        / width(annox[subjectHits(ol), ]))

    # Keep only major overlappings
    ol.df <- ol.df[ol.df$ratio > threshold, ]

    # Keep only multiple genes on a transcript cases
    dupTrans <- unique(ol.df$trans[duplicated(ol.df$trans)])
    ol.df <- ol.df[ol.df$trans %in% dupTrans, ]

    ol.tab <- table(ol.df$trans)
    ol.tabCS <- cumsum(ol.tab)

    bT <- NULL

    for (i in seq_along(ol.tab)) {
        txNo <- as.integer(names(ol.tab[i]))
        aNo <- ol.df$gene[(ol.tabCS[i] - ol.tab[i] + 1):ol.tabCS[i]]

        if (strand == "+") {
            # No need of break position for the first annotation
            brPos <- start(annox[aNo[-1], ])
        } else {
            # No need of break position for the last annotation
            brPos <- end(annox[aNo[-length(aNo)], ])
        }

        frags <- breakInterval(tx[txNo, ], brPos=brPos, gap=gap, strand=strand)

        if (is.null(bT)) {
            bT <- frags
        } else {
            bT <- c(bT, frags)
        }
        if (plot) {
            ## nocov start
            par(mfrow = c(2, 1))
            plot2Ranges(tx[txNo, ], annox[aNo, ], main="Before")
            plot2Ranges(frags, annox[aNo, ], main="After")
            Sys.sleep(5)
        } # nocov end
    }

    okTrans <- setdiff(1:length(tx), dupTrans)

    if (length(bT) == 0) {
        message("No join errors detected in transcripts")
        all <- tx[okTrans, ]
        return(all[order(as.character(seqnames(all)), start(all)), ])
    }

    mcols(bT)$status <- "broken"
    mcols(bT)$ID <- paste(seqnames(bT), "_", start(bT), strand(bT), sep="")
    all <- c(tx[okTrans, ], bT)
    message(length(unique(ol.df$trans)),
            " transcripts are broken into ", length(bT))

    return(all[order(as.character(seqnames(all)), start(all)), ])
}

#' combineTranscripts Combines transcripts.
#'
#' Combines transcripts  that are within the same gene annotation, combining
#' smaller transcripts for genes
#'  with low regulation into a single transcript representing the gene.
#'
#' @param tx GRanges of transcripts.
#' @param annox GRanges of non-overlapping annotations for reference.
#' @param geneSize Numeric. Minimum gene size in annotations to be used as
#' reference.
#' Default: 1000
#' @param threshold Numeric. Ratio of overlapped region relative to transcript
#' width.
#' Transcripts only greater than this threshold are subjected to be combined.
#' Default: 0.8
#' @param plot Logical.  If set to TRUE, show each step in a plot.
#' Default: FALSE
#' @return Returns GRanges object of combined transcripts.
#' @author Minho Chae and Charles G. Danko
#' @examples
#' library(GenomicRanges)
#' tx <- GRanges("chr7", IRanges(start=c(1000, 20000), width=c(10000,10000)),
#'     strand="+")
#' annox <- GRanges("chr7", IRanges(1000, 30000), strand="+")
#' combined <- combineTranscripts(tx, annox)
combineTranscripts <- function(tx, annox, geneSize = 1000, threshold = 0.8,
    plot = FALSE) {
    ## Validate inputs
    tx <- .normArgRanges(tx, errorOnEmpty = TRUE)
    annox <- .normArgRanges(annox)

    annox <- annox[width(annox) > geneSize, ]
    ol <- findOverlaps(tx, annox)
    ol.df <- data.frame(trans = queryHits(ol), gene = subjectHits(ol),
    ratio = width(pintersect(tx[queryHits(ol), ], annox[subjectHits(ol), ]))
                / width(tx[queryHits(ol), ]))

    # Keep only major overlappings
    ol.df <- ol.df[ol.df$ratio > threshold, ]

    # Keep only multiple transcripts on a gene cases
    dupGenes <- ol.df$gene[which(duplicated(ol.df$gene))]
    ol.df <- ol.df[ol.df$gene %in% dupGenes, ]

    uniqGene <- unique(ol.df$gene)
    N <- length(uniqGene)
    cT <- GRanges(seqnames=Rle(rep("chr1", N)),
        ranges = IRanges(rep(1, N), rep(max(end(tx)), N)),
        strand = rep("+", N),
        type = Rle(rep("tx", N)))
    seqlevels(cT) <- seqlevels(tx)

    for (i in seq_along(uniqGene)) {
        block <- ol.df[uniqGene[i] == ol.df$gene, ]
        strand(cT[i, ]) <- strand(tx[block[1, "trans"], ])
        seqnames(cT[i, ]) <- seqnames(tx[block[1, "trans"], ])
        start(cT[i, ]) <- start(tx[block[1, "trans"], ])
        end(cT[i, ]) <- end(tx[block[NROW(block), "trans"], ])
        if (plot) {
            ## nocov start
            # plot before and after the combine
            par(mfrow = c(2, 1))
            plot2Ranges(tx[ol.df[ol.df$gene == uniqGene[i], "trans"], ],
                annox[uniqGene[i], ], main = "Before")
            plot2Ranges(cT[i, ], annox[uniqGene[i], ], main = "After")
            Sys.sleep(5)
        } # nocov end
    }

    okTrans <- setdiff(seq_along(tx), ol.df$trans)

    if (length(cT) == 0) {
        message("No break errors detected in transcripts")
        all <- tx[okTrans, ]
        return(all[order(as.character(seqnames(all)), start(all)), ])
    }

    message(NROW(ol.df), " transcripts are combined to ", NROW(cT))
    mcols(cT)$ID <- paste(seqnames(cT), "_", start(cT), strand(cT), sep="")
    mcols(cT)$status <- "combined"

    all <- c(tx[okTrans, ], cT)
    return(all[order(as.character(seqnames(all)), start(all)), ])
}