#' Call continuous intervals of nascent transcription from NET-seq or GRO-seq data
#'
#' @param cov \code{GRanges} object.
#' @param min_signal Positive integer.
#' @param max_gapwidth Non-negative integer.
#' @param min_width Positive integer.
#' @section Details:
#' The input file is expected to be returned by the \code{load_BAM_files(mode = "nascent")} function. It contains sequencing coverage
#' of the chromosomes by nascent RNA-seq reads. Continuous intervals of coverage (with values not less than \code{min_signal})
#' are merged if the distance between them does not exceed \code{max_gapwidth} bp.
#' Finally, the merged intervals are filtered to have length not less than \code{min_width}.
#' @return List of length 2:
#' \enumerate{
#' \item \code{GRanges} object containing merged intervals of nascent transcription.
#' \item \code{GRanges} object containing low coverage sub-intervals (gaps) within the merged transcribed intervals.
#' }
#' @export
call_transcribed_intervals <- function(cov, min_signal = 3, max_gapwidth = 250, min_width = 500) {
stopifnot(class(cov) == "GRanges")
stopifnot(is.numeric(min_signal) && length(min_signal) == 1 && min_signal %% 1 == 0 && min_signal > 0)
stopifnot(is.numeric(max_gapwidth) && length(max_gapwidth) == 1 && max_gapwidth %% 1 == 0 && max_gapwidth >= 0)
stopifnot(is.numeric(min_width) && length(min_width) == 1 && min_width %% 1 == 0 && min_width > 0)
cov_above <- cov[BiocGenerics::score(cov) >= min_signal]
cov_below <- cov[BiocGenerics::score(cov) < min_signal]
transcribed <- cov_above %>% GenomicRanges::reduce(min.gapwidth = max_gapwidth) %>% `[`(BiocGenerics::width(.) >= min_width)
gaps <- cov_below %>% GenomicRanges::reduce() %>% IRanges::subsetByOverlaps(transcribed)
message("Called ", length(transcribed), " continuous intervals of nascent transcription;")
return(list("transcribed" = transcribed, "gaps" = gaps))
}
# ---------------------------------------------------------------------------------------------------------
#' Classify intervals of nascent transcription and add them to the gene model
#'
#' @param hml_genes,nascent,tss,pas \code{GRanges} objects.
#' @param reads_free \code{GrangesList} object, or NULL.
#' @param gaps \code{GRanges} object, or NULL.
#' @param trim_offset Non-negative integer.
#' @param min_score_2 Non-negative numeric.
#' @param min_lncrna_width Positive integer.
#' @param extend_along_nascent Logical.
#' @param extension_flanks Integer vector of length 2.
#' @section Details:
#' \code{hml_genes} contains the called genes (the first element in the list returned by the \code{call_transcripts_and_genes()} function).
#' \code{nascent} contains continuous intervals of nascent transcription (the first element in the list returned by the \code{call_transcribed_intervals()} function).
#' \code{tss} and \code{pas} are returned by \code{call_TCs()} on 5'- and 3'-tag sequencing data, respectively.
#' \code{gaps} contains intervals of low coverage within the continuous intervals of nascent transcription
#' (the second element in the list returned by the \code{call_transcribed_intervals()} function).
#' /code{reads_free} contains long reads which remain unused during the transcript calling procedure and are located outside of the called genes
#' (the fifth element in the list returned by the \code{call_transcripts_and_genes()} function).\cr
#' The intervals of nascent transcription are first classified by overlap with the called genes (on the same strand). The intervals which start upstream
#' and/or end downstream from a called genes, are considered upstream transcribed intervals and readthrough (RT) tails, respectively.
#' Borders of the called genes are extended to include such gene-associated intervals of nascent transcription. At that, the original gene coordinates
#' (which correspond to the mature RNA molecule) are saved as the "thick" mcols. If \code{extend_along_nascent == TRUE}, then the original ("thick") coordinates
#' of MC and LC genes can be further extended towards strong TSS and/or PAS (with scores not less than \code{min_score_2}) which are found within \code{extension_flanks}
#' windows centered at starts and ends, respectively, of the associated nascent interval.\cr
#' Other nascent intervals which are not associated with any mature transcript, are considered antisense or intergenic lncRNAs. They are filtered to have length
#' not less than \code{min_lncrna_width}.
#' @return List of length 3:
#' \enumerate{
#' \item \code{GRanges} object which contains intervals covered by mature RNA molecules (i.e. the original coordinates of HC, MC and LC genes,
#' with the exception that the borders of MC and LC genes could have been extended towards nearby strong TSS and/or PAS).
#' \item \code{GRanges} object which contains HC, MC and LC genes extended to include the gene-associated intervals of nascent transcription
#' (the interval of mature transcription moved to the "thick" mcols).
#' \item \code{GRanges} object which contains lncRNAs called from the nascent-only transcribed intervals.
#' }
#' @export
process_nascent_intervals <- function(hml_genes, nascent, tss, pas, reads_free = NULL, gaps = NULL, trim_offset = 20, min_score_2 = 5, min_lncrna_width = 500,
extend_along_nascent = TRUE, extension_flanks = c("tss" = -50, "pas" = 0)) {
stopifnot(class(hml_genes) == "GRanges")
stopifnot(class(nascent) == "GRanges")
stopifnot(class(tss) == "GRanges")
stopifnot(class(pas) == "GRanges")
stopifnot(BiocGenerics::grepl("GRangesList", class(reads_free)) || is.null(reads_free))
stopifnot(class(gaps) == "GRanges" || is.null(gaps))
stopifnot(is.numeric(trim_offset) && length(trim_offset) == 1 && trim_offset %% 1 == 0 && trim_offset >= 0)
stopifnot(is.numeric(min_score_2) && length(min_score_2) == 1 && min_score_2 >= 0)
stopifnot(is.numeric(min_lncrna_width) && length(min_lncrna_width) == 1 && min_lncrna_width %% 1 == 0 && min_lncrna_width > 0)
stopifnot(is.logical(extend_along_nascent) && length(extend_along_nascent) == 1)
stopifnot(is.numeric(extension_flanks) && length(extension_flanks) == 2 && extension_flanks[[1]] %% 1 == 0 && extension_flanks[[2]] %% 1 == 0)
genic <- GenomicRanges::reduce(hml_genes)
message("Calling and trimming tails...")
# Subtract called genes from transcribed intervals:
if (is.null(gaps)) {
nascent_2 <- GenomicRanges::setdiff(nascent, hml_genes)
} else {
# Extend hml_genes by overlapping gaps (to avoid considering a downstream lncRNA as an RT tail):
gaps_over_hml <- gaps[gaps %over% hml_genes]
hml_genes_ext <- GenomicRanges::union(hml_genes, gaps_over_hml)
nascent_2 <- GenomicRanges::setdiff(nascent, hml_genes_ext)
}
# Find nascent tails:
tails <- nascent_2[suppressWarnings(GenomicRanges::flank(nascent_2, 1)) %over% hml_genes]
# Trim tails by outer borders of used TSS:
tss_used <- tss[tss %over% GenomicRanges::resize(genic, 1, "start")]
tails <- trim_by_down_or_upstream_features(tails, tss_used, mode = "down", offset = trim_offset)
# Trim tails by strong unused TSS:
tss_strong <- tss[tss %outside% genic & BiocGenerics::score(tss) >= min_score_2]
tails <- trim_by_down_or_upstream_features(tails, tss_strong, mode = "down", offset = trim_offset)
# Subtract tails from the transcribed intervals:
nascent_3 <- GenomicRanges::setdiff(nascent_2, tails)
if (!is.null(gaps)) {
# Also trim tails by the first low coverage interval (without offset):
tails <- trim_by_down_or_upstream_features(tails, gaps, mode = "down", offset = 0)
}
message("Calling and trimming heads...")
# Find heads (transcribed intervals immediately upstream from called genes):
heads <- nascent_3[suppressWarnings(GenomicRanges::flank(nascent_3, 1, start = FALSE)) %over% hml_genes]
# Trim heads by outer borders of used PAS:
pas_used <- pas[pas %over% GenomicRanges::resize(genic, 1, "end")]
heads <- trim_by_down_or_upstream_features(heads, pas_used, mode = "up", offset = trim_offset)
# Trim heads by strong unused PAS:
pas_strong <- pas[pas %outside% genic & BiocGenerics::score(pas) >= min_score_2]
heads <- trim_by_down_or_upstream_features(heads, pas_strong, mode = "up", offset = trim_offset)
# The remaining transcribed intervals can be regarded as lncRNAs:
nascent_4 <- GenomicRanges::setdiff(nascent_3, heads)
if (isTRUE(extend_along_nascent)) {
message("Extending MC and LC genes along heads and tails towards unused TSS and PAS...")
# Extend MC and LC genes without TSS along plaNET-seq intervals towards nearby strong TSS (if any):
wo_tss <- GenomicRanges::resize(hml_genes, 1, "start") %outside% tss
results <- hml_genes[wo_tss] %>% extend_genes_along_nascent_intervals(tss_strong, heads, mode = "tss", flanks = extension_flanks)
hml_genes <- c(hml_genes[!wo_tss], results[[1]]) %>% BiocGenerics::sort()
heads <- results[[2]]
# Extend MC and LC genes without PAS along plaNET-seq intervals towards nearby strong PAS (if any):
wo_pas <- GenomicRanges::resize(hml_genes, 1, "end") %outside% pas
results <- hml_genes[wo_pas] %>% extend_genes_along_nascent_intervals(pas_strong, tails, mode = "pas", flanks = extension_flanks)
hml_genes <- c(hml_genes[!wo_pas], results[[1]]) %>% BiocGenerics::sort()
tails <- results[[2]]
# Observe that now an LC gene may behave like MC or HC (i.e. start in TSS and/or end in PAS);
# Similarly, an MC gene may behave like HC;
}
# Return trimmed heads to the lncRNA pool:
lncrna <- c(nascent_4, heads) %>% GenomicRanges::sort()
# Decorate genes with RT tails:
message(length(tails), " RT tails;")
S4Vectors::mcols(hml_genes)$thick <- GenomicRanges::granges(hml_genes) %>% unname()
hml_genes_with_fl <- hml_genes %>% decorate_genes(tails, mode = "down")
if (!is.null(reads_free)) {
# Merge lncRNAs if connected by a free_read:
range_free <- reads_free %>% range() %>% BiocGenerics::unlist(use.names = FALSE)
range_over_lncrna <- range_free[GenomicRanges::countOverlaps(range_free, lncrna) >= 2] %>% GenomicRanges::reduce()
if (length(range_over_lncrna) > 0) {
hits <- GenomicRanges::findOverlaps(range_over_lncrna, lncrna)
lncrna_p1 <- lncrna[S4Vectors::subjectHits(hits)] %>% S4Vectors::split(S4Vectors::queryHits(hits)) %>% range() %>% BiocGenerics::unlist() %>% unname()
lncrna_p2 <- lncrna[-S4Vectors::subjectHits(hits)]
lncrna <- c(lncrna_p1, lncrna_p2) %>% BiocGenerics::sort()
}
}
# Ensure that all lncRNAs have at least <trim_offset> gap with the nearest genes (both up- and downstream):
lncrna <- lncrna %>% trim_by_down_or_upstream_features(hml_genes, mode = "down", offset = trim_offset) %>%
trim_by_down_or_upstream_features(hml_genes, mode = "up", offset = trim_offset)
# Filter lncRNAs by width:
lncrna <- lncrna %>% `[`(BiocGenerics::width(.) >= min_lncrna_width)
# Prepare the output:
S4Vectors::mcols(lncrna)$score <- 0
S4Vectors::mcols(lncrna)$type <- "Nascent"
lncrna_id <- sprintf(paste0("%0", nchar(length(lncrna)), "i"), 1:length(lncrna))
S4Vectors::mcols(lncrna)$name <- BiocGenerics::paste(S4Vectors::mcols(lncrna)$type, lncrna_id, sep = "_")
message(length(lncrna), " lncRNAs called;")
out <- list("hml_genes" = hml_genes, "hml_genes_with_flanks" = hml_genes_with_fl, "lncRNA" = lncrna)
return(out)
}
# -----------------------------------------------------------------------------------------------------------
extend_genes_along_nascent_intervals <- function(genes, tc, nascent, mode = "tss", flanks) {
stopifnot(mode %in% c("tss", "pas"))
# mode == "tss": <nascent> and <TC> are expected to be <heads> and <tss_strong>, respectively;
# mode == "pas": <nascent> and <TC> are expected to be <tails> and <pas_strong>, respectively;
# This function searches for unused strong TSS/PAS within the whole head/tail;
# The <heads> may be additionally extended by <flanks[[1]]>, <tails> = by <flanks[[2]]>;
h1 <- GenomicRanges::flank(nascent, 1, start = !isTRUE(mode == "tss")) %>% GenomicRanges::findOverlaps(genes)
if (length(h1) == 0) {
warning("\tDid not find any ", ifelse(mode == "tss", "heads", "tails"), " among the nascent intervals!")
return(list("genes" = genes, "nascent" = nascent))
}
out_1 <- genes[-S4Vectors::subjectHits(h1)]
genes <- genes[S4Vectors::subjectHits(h1)]
win <- nascent[S4Vectors::queryHits(h1)] %>% GenomicRanges::resize(BiocGenerics::width(.) + abs(flanks[[mode]]), fix = ifelse(mode == "tss", "end", "start"))
h2 <- GenomicRanges::findOverlaps(win, tc)
if (length(h2) == 0) {
message("\tDid not find any unused strong ", stringr::str_to_upper(mode), " inside of the ", ifelse(mode == "tss", "heads", "tails"), ";")
return(list("genes" = genes, "nascent" = nascent))
}
# In case of multiple TC per window, choose the strongest one:
scores <- BiocGenerics::score(tc)[S4Vectors::subjectHits(h2)]
best <- BiocGenerics::tapply(scores, S4Vectors::queryHits(h2), find_uniq_max, simplify = FALSE) %>% BiocGenerics::unlist() %>% unname()
h2 <- h2[best]
out_2 <- genes[-S4Vectors::queryHits(h2)]
genes <- genes[S4Vectors::queryHits(h2)]
tc_par <- tc[S4Vectors::subjectHits(h2)]
used <- S4Vectors::queryHits(h1)[S4Vectors::queryHits(h2)]
# Extend gene border until the TC summit:
message("\t", length(tc_par), " genes found their missing ", stringr::str_to_upper(mode), ";")
IRanges::ranges(genes) <- S4Vectors::mcols(tc_par)$thick %>% GenomicRanges::punion(GenomicRanges::resize(genes, 1, fix = ifelse(mode == "tss", "end", "start")), fill.gap = TRUE) %>% IRanges::ranges() %>% unname()
# Trim the consumed nascent interval:
nascent_out <- GenomicRanges::setdiff(nascent, nascent[used])
genes_out <- c(out_1, out_2, genes) %>% BiocGenerics::sort()
return(list("genes" = genes_out, "nascent" = nascent_out))
}
# --------------------------------------------------------------------------------------------------------------
decorate_genes <- function(genes, planet, mode) {
stopifnot(mode %in% c("up", "down"))
if (mode == "up") {
hits <- suppressWarnings(GenomicRanges::flank(genes, 1) %>% GenomicRanges::trim() %>% GenomicRanges::findOverlaps(planet))
} else {
hits <- suppressWarnings(GenomicRanges::flank(genes, 1, start = FALSE) %>% GenomicRanges::trim() %>% GenomicRanges::findOverlaps(planet))
}
out_1 <- genes[-S4Vectors::queryHits(hits)]
par1 <- genes[S4Vectors::queryHits(hits)]
par2 <- planet[S4Vectors::subjectHits(hits)]
if (mode == "up") {
IRanges::ranges(par1) <- GenomicRanges::punion(GenomicRanges::resize(par2, 1, "start"), GenomicRanges::resize(par1, 1, "end"), fill.gap = TRUE) %>% IRanges::ranges() %>% unname()
} else {
IRanges::ranges(par1) <- GenomicRanges::punion(GenomicRanges::resize(par1, 1, "start"), GenomicRanges::resize(par2, 1, "end"), fill.gap = TRUE) %>% IRanges::ranges() %>% unname()
}
out <- c(out_1, par1) %>% BiocGenerics::sort()
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.