###########################################################################
##
## Copyright 2013, 2014 Charles Danko and Minho Chae.
##
## This program is part of the groHMM R package
##
## groHMM is free software: you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
## or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
## for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program. If not, see <http://www.gnu.org/licenses/>.
##
##########################################################################
plot2Ranges <- function(tr, gr, main = "NA", col = "black", sep = 0.5, ...) {
## nocov start
height <- 1
xlim <- c(min(min(start(tr), min(start(gr)))), max(max(end(tr)),
max(end(gr))))
plot.new()
ybottom <- seq(1, length(tr) + length(gr))
plot.window(xlim, c(0, max(ybottom + height)))
if (length(tr) > 0) {
tbottom <- seq(1, length(tr))
rect(start(tr) - sep, tbottom, end(tr) + sep, tbottom + height - sep,
col = "gray", ...)
}
bSymbol <- FALSE
if ("symbol" %in% colnames(elementMetadata(gr)))
bSymbol <- TRUE
if (length(gr) > 0) {
gbottom <- seq(1, length(gr))
rect(
start(gr) - sep, gbottom, end(gr) + sep, gbottom + height - sep,
col=rgb(red=255, green=0, blue=0, alpha=150, maxColorValue=255))
if (bSymbol)
text(
c(start(gr) + end(gr)) / 2, gbottom + 0.2,
elementMetadata(gr)$symbol)
}
title(main)
axis(1)
} # nocov end
#
# Break an Interval by break points with minimum gap
#------------------------------------------------------------------------------
breakInterval <- function(gr, brPos, gap=5, strand="+") {
result <- rep(gr, length(brPos) + 1)
if (strand == "+") {
for (i in seq_along(brPos)) {
end(result[i, ]) <- brPos[i] - gap
start(result[i + 1, ]) <- brPos[i]
}
} else {
for (i in seq_along(brPos)) {
end(result[i, ]) <- brPos[i]
start(result[i + 1, ]) <- brPos[i] + gap
}
}
return(result)
}
#' breakTranscriptsOnGenes Breaks transcripts on genes
#'
#' Breaks transcripts when they are overlapped with multiple well annotated
#' genes.
#'
#' @param tx GRanges of transcripts.
#' @param annox GRanges of non-overlapping annotations for reference.
#' @param strand Takes "+" or "-" Default: "+"
#' @param geneSize Numeric. Minimum gene size in annox to be used as reference.
#' Default: 5000
#' @param threshold Numeric. Ratio of overlapped region relative to a gene
#' width.
#' Transcripts only greater than this threshold are subjected to be broken.
#' Default: 0.8
#' @param gap Numeric. Gap (bp) between broken transcripts. Default: 5
#' @param plot Logical. If set to TRUE, show each step in a plot.
#' Default: FALSE
#' @author Minho Chae and Charles G. Danko
#' @return Returns GRanges object of broken transcripts.
#' @examples
#' library(GenomicRanges)
#' tx <- GRanges("chr7", IRanges(1000, 30000), strand="+")
#' annox <- GRanges(
#' "chr7", IRanges(start=c(1000, 20000), width=c(10000,10000)), strand="+")
#' bPlus <- breakTranscriptsOnGenes(tx, annox, strand="+")
breakTranscriptsOnGenes <- function(tx, annox, strand="+", geneSize=5000,
threshold=0.8, gap=5, plot=FALSE) {
## Validate inputs
tx <- .normArgRanges(tx, errorOnEmpty = TRUE)
annox <- .normArgRanges(annox)
if (!isDisjoint(annox)) {
stop("Annotations cannot have any overlaps. ",
"Did you run ", sQuote("makeConsensusAnnotations"), "?")
}
tx <- tx[strand(tx) == strand, ]
annox <- annox[strand(annox) == strand, ]
mcols(tx)$status <- "NA"
annox <- annox[width(annox) > geneSize, ]
ol <- findOverlaps(tx, annox)
ol.df <- data.frame(trans = queryHits(ol), gene = subjectHits(ol),
ratio = width(pintersect(tx[queryHits(ol), ], annox[subjectHits(ol), ]))
/ width(annox[subjectHits(ol), ]))
# Keep only major overlappings
ol.df <- ol.df[ol.df$ratio > threshold, ]
# Keep only multiple genes on a transcript cases
dupTrans <- unique(ol.df$trans[duplicated(ol.df$trans)])
ol.df <- ol.df[ol.df$trans %in% dupTrans, ]
ol.tab <- table(ol.df$trans)
ol.tabCS <- cumsum(ol.tab)
bT <- NULL
for (i in seq_along(ol.tab)) {
txNo <- as.integer(names(ol.tab[i]))
aNo <- ol.df$gene[(ol.tabCS[i] - ol.tab[i] + 1):ol.tabCS[i]]
if (strand == "+") {
# No need of break position for the first annotation
brPos <- start(annox[aNo[-1], ])
} else {
# No need of break position for the last annotation
brPos <- end(annox[aNo[-length(aNo)], ])
}
frags <- breakInterval(tx[txNo, ], brPos=brPos, gap=gap, strand=strand)
if (is.null(bT)) {
bT <- frags
} else {
bT <- c(bT, frags)
}
if (plot) {
## nocov start
par(mfrow = c(2, 1))
plot2Ranges(tx[txNo, ], annox[aNo, ], main="Before")
plot2Ranges(frags, annox[aNo, ], main="After")
Sys.sleep(5)
} # nocov end
}
okTrans <- setdiff(1:length(tx), dupTrans)
if (length(bT) == 0) {
message("No join errors detected in transcripts")
all <- tx[okTrans, ]
return(all[order(as.character(seqnames(all)), start(all)), ])
}
mcols(bT)$status <- "broken"
mcols(bT)$ID <- paste(seqnames(bT), "_", start(bT), strand(bT), sep="")
all <- c(tx[okTrans, ], bT)
message(length(unique(ol.df$trans)),
" transcripts are broken into ", length(bT))
return(all[order(as.character(seqnames(all)), start(all)), ])
}
#' combineTranscripts Combines transcripts.
#'
#' Combines transcripts that are within the same gene annotation, combining
#' smaller transcripts for genes
#' with low regulation into a single transcript representing the gene.
#'
#' @param tx GRanges of transcripts.
#' @param annox GRanges of non-overlapping annotations for reference.
#' @param geneSize Numeric. Minimum gene size in annotations to be used as
#' reference.
#' Default: 1000
#' @param threshold Numeric. Ratio of overlapped region relative to transcript
#' width.
#' Transcripts only greater than this threshold are subjected to be combined.
#' Default: 0.8
#' @param plot Logical. If set to TRUE, show each step in a plot.
#' Default: FALSE
#' @return Returns GRanges object of combined transcripts.
#' @author Minho Chae and Charles G. Danko
#' @examples
#' library(GenomicRanges)
#' tx <- GRanges("chr7", IRanges(start=c(1000, 20000), width=c(10000,10000)),
#' strand="+")
#' annox <- GRanges("chr7", IRanges(1000, 30000), strand="+")
#' combined <- combineTranscripts(tx, annox)
combineTranscripts <- function(tx, annox, geneSize = 1000, threshold = 0.8,
plot = FALSE) {
## Validate inputs
tx <- .normArgRanges(tx, errorOnEmpty = TRUE)
annox <- .normArgRanges(annox)
annox <- annox[width(annox) > geneSize, ]
ol <- findOverlaps(tx, annox)
ol.df <- data.frame(trans = queryHits(ol), gene = subjectHits(ol),
ratio = width(pintersect(tx[queryHits(ol), ], annox[subjectHits(ol), ]))
/ width(tx[queryHits(ol), ]))
# Keep only major overlappings
ol.df <- ol.df[ol.df$ratio > threshold, ]
# Keep only multiple transcripts on a gene cases
dupGenes <- ol.df$gene[which(duplicated(ol.df$gene))]
ol.df <- ol.df[ol.df$gene %in% dupGenes, ]
uniqGene <- unique(ol.df$gene)
N <- length(uniqGene)
cT <- GRanges(seqnames=Rle(rep("chr1", N)),
ranges = IRanges(rep(1, N), rep(max(end(tx)), N)),
strand = rep("+", N),
type = Rle(rep("tx", N)))
seqlevels(cT) <- seqlevels(tx)
for (i in seq_along(uniqGene)) {
block <- ol.df[uniqGene[i] == ol.df$gene, ]
strand(cT[i, ]) <- strand(tx[block[1, "trans"], ])
seqnames(cT[i, ]) <- seqnames(tx[block[1, "trans"], ])
start(cT[i, ]) <- start(tx[block[1, "trans"], ])
end(cT[i, ]) <- end(tx[block[NROW(block), "trans"], ])
if (plot) {
## nocov start
# plot before and after the combine
par(mfrow = c(2, 1))
plot2Ranges(tx[ol.df[ol.df$gene == uniqGene[i], "trans"], ],
annox[uniqGene[i], ], main = "Before")
plot2Ranges(cT[i, ], annox[uniqGene[i], ], main = "After")
Sys.sleep(5)
} # nocov end
}
okTrans <- setdiff(seq_along(tx), ol.df$trans)
if (length(cT) == 0) {
message("No break errors detected in transcripts")
all <- tx[okTrans, ]
return(all[order(as.character(seqnames(all)), start(all)), ])
}
message(NROW(ol.df), " transcripts are combined to ", NROW(cT))
mcols(cT)$ID <- paste(seqnames(cT), "_", start(cT), strand(cT), sep="")
mcols(cT)$status <- "combined"
all <- c(tx[okTrans, ], cT)
return(all[order(as.character(seqnames(all)), start(all)), ])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.