#' Call transcript and gene models from corrected full-length RNA-seq reads
#'
#' @param long_reads \code{GRangesList} object.
#' @param skip_minor_tx Numeric in the range (0, 1), or NULL.
#' @param max_overlap_called Numeric in the range [0, 1).
#' @param min_read_width Positive integer.
#' @param min_overlap_fusion Numeric in the range (0, 1].
#' @param clust_threshold Numeric in the range (0, 1].
#' @section Details:
#' The input \code{GRangesList} object is returned by the \code{detect_alignment_errors()} function.\cr
#' Long reads either marked as truncated by \code{extend_long_reads_to_TSS_and_PAS()}, or containing a misaligned exon (as revealed by \code{detect_alignment_errors()}), are skipped from the transcript calling procedure.
#' The remaining long reads are collapsed into transcripts. The transcripts are classified into high confidence (HC), medium confidence (MC) and low confidence (LC) groups:
#' \itemize{
#' \item HC transcripts are called from reads which start in a TSS and end in a PAS;
#' \item MC transcripts are called from TSS-only or PAS-only reads which do not overlap with any HC transcript by more than \code{max_overlap_called} fraction of either read or transcript length;
#' \item LC transcripts are called from reads which neither start in a TSS nor end in a PAS, and do not overlap with any HC or MC transcript by mode than \code{max_overlap_called}.
#' }
#' This iterative procedure of transcript calling ensures that highly expressed HC loci are not contaminated with less reliable MC or LC transcripts.
#' The MC/LC transcripts are not guaranteed to be full-length. To decrease the risk of picking up products of partial RNA degradation, MC and LC transcripts
#' can be called only from reads longer than \code{min_read_length} bp.\cr
#' The called HC, MC and LC transcripts are clustered into HC, MC and LC genes, respectively. A pair of transcripts of the same type having overlap (intersect/union) above the \code{clust_threshold}
#' are considered belonging to the same gene.\cr
#' Within each gene, the minor transcripts (collectively representing up to \code{skip_minor_tx} fraction of the reads) are skipped from further consideration. To suppress this behavior, set \code{skip_minor_tx = NULL}.\cr
#' Finally, transcripts which overlap at least two other disjoint transcripts by at least \code{min_overlap_fusion} fraction of their lengths, are considered fusion transcripts.
#' @return List of length 5:
#' \enumerate{
#' \item \code{GRanges} object (HC, MC and LC genes);
#' \item \code{GRangesList} object (HC, MC and LC transcripts);
#' \item \code{GRanges} object (fusion genes);
#' \item \code{GRangesList} object (fusion transcripts);
#' \item \code{GRangesList} object (unused long reads outside of the called genes and transcripts).
#' }
#' @export
call_transcripts_and_genes <- function(long_reads, skip_minor_tx = 0.01, max_overlap_called = 0.1, min_read_width = 1000, min_overlap_fusion = 0.5, clust_threshold = 0.8) {
stopifnot(BiocGenerics::grepl("GRangesList", class(long_reads)))
stopifnot(is.numeric(skip_minor_tx) && length(skip_minor_tx) == 1 && skip_minor_tx > 0 && skip_minor_tx < 1)
stopifnot(is.numeric(max_overlap_called) && length(max_overlap_called) == 1 && max_overlap_called >= 0 && max_overlap_called < 1)
stopifnot(is.numeric(min_read_width) && length(min_read_width) == 1 && min_read_width %% 1 == 0 && min_read_width > 0)
stopifnot(is.numeric(min_overlap_fusion) && length(min_overlap_fusion) == 1 && min_overlap_fusion > 0 && min_overlap_fusion <= 1)
stopifnot(is.numeric(clust_threshold) && length(clust_threshold) == 1 && clust_threshold > 0 && clust_threshold <= 1)
all_exons <- BiocGenerics::unlist(long_reads, use.names = FALSE)
# Skip truncated reads, as well as reads containing alignment errors:
mc <- S4Vectors::mcols(all_exons)
bad_id <- mc$read_id[!mc$complete] %>% BiocGenerics::unique()
message(length(bad_id), " truncated reads;")
if (!is.null(mc$aln_error)) {
bad_id_2 <- mc$read_id[mc$aln_error] %>% BiocGenerics::unique()
message(length(bad_id_2), " reads with alignment errors;")
bad_id <- c(bad_id, bad_id_2) %>% BiocGenerics::unique()
}
message(length(bad_id), " reads skipped from transcript calling;")
to_skip <- mc$read_id %in% bad_id
exons_unused <- all_exons[to_skip]
reads_unused <- S4Vectors::split(exons_unused, S4Vectors::mcols(exons_unused)$read_id)
good_exons <- all_exons[!to_skip]
# Call HC transcripts (from TSS+PAS reads):
lgl <- S4Vectors::mcols(good_exons)$over_tc == "both"
res_hc <- good_exons[lgl] %>% call_tx_and_find_fusion(prefix = "HC", min_overlap_fusion = min_overlap_fusion, clust_threshold = clust_threshold, skip = TRUE)
hc_tx <- res_hc[[1]]
fusion_tx <- res_hc[[2]]
#good_exons <- good_exons[!lgl] %>% update_free_reads(c(hc_tx, fusion_tx), max_overlap_called = max_overlap_called, min_read_width = min_read_width, clust_threshold = clust_threshold)
good_exons <- good_exons[!lgl] %>% update_free_reads(hc_tx, max_overlap_called = max_overlap_called, min_read_width = min_read_width, clust_threshold = clust_threshold)
# Call MC transcripts (from TSS-only or PAS-only reads):
lgl <- S4Vectors::mcols(good_exons)$over_tc != "no"
res_mc <- good_exons[lgl] %>% call_tx_and_find_fusion(prefix = "MC", known_tx = hc_tx, known_fusion = fusion_tx, min_overlap_fusion = min_overlap_fusion, clust_threshold = clust_threshold, skip = TRUE)
hc_mc_tx <- res_mc[[1]]
fusion_tx <- res_mc[[2]]
#good_exons <- good_exons[!lgl] %>% update_free_reads(c(hc_mc_tx, fusion_tx), max_overlap_called = max_overlap_called, min_read_width = min_read_width, clust_threshold = clust_threshold)
good_exons <- good_exons[!lgl] %>% update_free_reads(hc_mc_tx, max_overlap_called = max_overlap_called, min_read_width = min_read_width, clust_threshold = clust_threshold)
# Call LC transcripts (from noTSS-noPAS reads):
res_lc <- good_exons %>% call_tx_and_find_fusion(prefix = "LC", known_tx = hc_mc_tx, known_fusion = fusion_tx, min_overlap_fusion = min_overlap_fusion, clust_threshold = clust_threshold, skip = TRUE)
hml_tx <- res_lc[[1]]
fusion_tx <- res_lc[[2]]
#exons_free <- exons_unused %>% update_free_reads(c(hml_tx, fusion_tx), max_overlap_called = max_overlap_called, min_read_width = min_read_width, clust_threshold = clust_threshold)
exons_free <- exons_unused %>% update_free_reads(hml_tx, max_overlap_called = max_overlap_called, min_read_width = min_read_width, clust_threshold = clust_threshold)
reads_free <- S4Vectors::split(exons_free, S4Vectors::mcols(exons_free)$read_id)
# Split hml_tx into HC, MC and LC transcripts:
tx_spl <- split_tx_by_type(hml_tx)
hc_tx <- tx_spl[[1]]
mc_tx <- tx_spl[[2]]
lc_tx <- tx_spl[[3]]
# Call HC, MC and LC genes:
hc_genes <- call_genes(hc_tx, clust_threshold = clust_threshold)
mc_genes <- call_genes(mc_tx, clust_threshold = clust_threshold) # observe that some MC genes may behave as HC or LC (i.e. appear as TSS+PAS or noTSS-noPAS)
lc_genes <- call_genes(lc_tx, clust_threshold = clust_threshold)
# Skip minor transcripts within each gene:
if (!is.null(skip_minor_tx)) {
message("Skipping minor transcripts...")
out_hc <- skip_minor_transcripts(hc_genes, hc_tx, q = skip_minor_tx)
hc_tx <- out_hc[[1]]
message("\t", length(out_hc[[2]]), " minor HC transcripts skipped;")
out_mc <- skip_minor_transcripts(mc_genes, mc_tx, q = skip_minor_tx)
mc_tx <- out_mc[[1]]
message("\t", length(out_mc[[2]]), " minor MC transcripts skipped;")
out_lc <- skip_minor_transcripts(lc_genes, lc_tx, q = skip_minor_tx)
lc_tx <- out_lc[[1]]
message("\t", length(out_lc[[2]]), " minor LC transcripts skipped;")
}
# Recalculate genes (to update revmap) and generate unique gene IDs:
hml_genes <- c(hc_tx, mc_tx, lc_tx) %>% sort_grl() %>% call_HC_MC_LC_genes(clust_threshold = clust_threshold, generate_ids = TRUE)
# Generate unique transcript IDs:
hc_tx <- hml_genes[S4Vectors::mcols(hml_genes)$type == "HC"] %>% generate_tx_id(hc_tx, .)
mc_tx <- hml_genes[S4Vectors::mcols(hml_genes)$type == "MC"] %>% generate_tx_id(mc_tx, .)
lc_tx <- hml_genes[S4Vectors::mcols(hml_genes)$type == "LC"] %>% generate_tx_id(lc_tx, .)
# Combine HC, MC and LC transcripts:
message("Final stats:")
message("\t", length(hc_tx), " HC transcripts in ", length(hc_genes), " HC genes;")
message("\t", length(mc_tx), " MC transcripts in ", length(mc_genes), " MC genes;")
message("\t", length(lc_tx), " LC transcripts in ", length(lc_genes), " LC genes;")
hml_tx <- c(hc_tx, mc_tx, lc_tx) %>% sort_grl()
# Find more fusion reads among unused reads by overlap with called HC, MC and LC genes:
fusion_idx <- reads_unused %>% range() %>% BiocGenerics::unlist() %>% find_fusion_tx(hml_genes, skip = FALSE)
reads_fusion <- reads_unused %>% `[`(fusion_idx) %>% BiocGenerics::unlist(use.names = FALSE) %>% `[`(S4Vectors::mcols(.)$complete)
if (length(reads_fusion) > 0) {
reads_fusion <- reads_fusion %>% S4Vectors::split(S4Vectors::mcols(.)$read_id) %>% sort_grl()
# Convert them to new fusion transcripts:
reads_fusion_dedup <- deduplicate_grl(reads_fusion)
reads_fusion_unl <- BiocGenerics::unlist(reads_fusion_dedup, use.names = FALSE)
S4Vectors::mcols(reads_fusion_unl)$type <- "Fusion"
reads_fusion_dedup <- BiocGenerics::relist(reads_fusion_unl, reads_fusion_dedup)
fusion_tx <- c(fusion_tx, reads_fusion_dedup)
}
if (length(fusion_tx) > 0) {
fusion_tx <- fusion_tx %>% unname() %>% sort_grl()
fusion_tx_unl <- unlist(fusion_tx, use.names = FALSE)
S4Vectors::mcols(fusion_tx_unl)$grp <- rep(1:length(fusion_tx), times = S4Vectors::elementNROWS(fusion_tx))
fusion_tx <- BiocGenerics::relist(fusion_tx_unl, fusion_tx)
# Group fusion transcripts into "fusion TUs":
fusion_tu <- fusion_tx %>% call_tu(clust_threshold = clust_threshold) %>% skip_nested()
message("\t", length(fusion_tx), " fusion transcripts in ", length(fusion_tu), " fusion genes;")
S4Vectors::mcols(fusion_tu)$type <- "Fusion"
fusion_tu_id <- sprintf(paste0("%0", nchar(length(fusion_tu)), "i"), 1:length(fusion_tu))
S4Vectors::mcols(fusion_tu)$name <- BiocGenerics::paste(S4Vectors::mcols(fusion_tu)$type, "TU", fusion_tu_id, sep = "_")
# Generate unique IDs for fusion transcripts:
fusion_tx <- generate_tx_id(fusion_tx, fusion_tu)
} else {
fusion_tu <- GRangesList()
}
S4Vectors::mcols(hml_genes)$revmap <- NULL
S4Vectors::mcols(fusion_tu)$revmap <- NULL
out <- list("hml_genes" = hml_genes, "hml_tx" = hml_tx, "fusion_genes" = fusion_tu, "fusion_tx" = fusion_tx, "reads_free" = reads_free)
return(out)
}
# --------------------------------------------------------------------------------------------------------------------
call_transcripts <- function(all_exons, prefix) {
all_exons <- all_exons[BiocGenerics::order(S4Vectors::mcols(all_exons)$exon_id)] # in fact, they are assumed to be already ordered...
reads <- S4Vectors::split(all_exons, S4Vectors::mcols(all_exons)$read_id)
tx <- deduplicate_grl(reads)
tx_unl <- BiocGenerics::unlist(tx, use.names = FALSE)
S4Vectors::mcols(tx_unl)$type <- prefix
tx <- BiocGenerics::relist(tx_unl, tx)
return(tx)
}
#------------------------------------------------------------------------------------------------------------------
find_free_reads <- function(exons, genes, max_overlap_called, min_read_width) {
exons <- exons[BiocGenerics::order(S4Vectors::mcols(exons)$exon_id)]
reads <- S4Vectors::split(exons, S4Vectors::mcols(exons)$read_id)
gr <- reads %>% range() %>% BiocGenerics::unlist()
over <- gr %over% genes
hits <- GenomicRanges::findOverlaps(gr, genes)
par1 <- gr[S4Vectors::queryHits(hits)]
par2 <- genes[S4Vectors::subjectHits(hits)]
overlap <- GenomicRanges::pintersect(par1, par2) %>% BiocGenerics::width()
best <- BiocGenerics::tapply(overlap, S4Vectors::queryHits(hits), find_uniq_max, simplify = FALSE) %>% BiocGenerics::unlist() %>% unname()
best_overlap <- overlap[best]
hits <- hits[best]
par1 <- par1[best]
par2 <- par2[best]
weak <- best_overlap / BiocGenerics::width(par1) <= max_overlap_called & best_overlap / BiocGenerics::width(par2) <= max_overlap_called & BiocGenerics::width(par1) >= min_read_width
idx <- c(BiocGenerics::which(!over), S4Vectors::queryHits(hits)[weak]) %>% BiocGenerics::sort()
message("\t", length(idx), " reads remain free;")
out <- reads[idx] %>% BiocGenerics::unlist(use.names = FALSE)
return(out)
}
#------------------------------------------------------------------------------------------------------------------
find_fusion_tx <- function(gr1, gr2 = NULL, min_overlap_fusion = 0.5, skip = TRUE) {
if (is.null(gr2)) {
gr2 <- gr1
hits <- GenomicRanges::findOverlaps(gr1, gr2)
hits <- hits[S4Vectors::queryHits(hits) != S4Vectors::subjectHits(hits)]
} else {
hits <- GenomicRanges::findOverlaps(gr1, gr2)
}
if (length(hits) == 0) {
return(integer(0))
}
# Skip weak overlaps:
par1 <- gr1[S4Vectors::queryHits(hits)] # fusion candidates
par2 <- gr2[S4Vectors::subjectHits(hits)]
valid <- BiocGenerics::width(GenomicRanges::pintersect(par1, par2)) / BiocGenerics::width(par2) >= min_overlap_fusion
hits <- hits[valid]
if (length(hits) == 0) {
return(integer(0))
}
# Check that the fusion candidate covers at least 2 non-overlapping TUs:
grl <- gr2[S4Vectors::subjectHits(hits)] %>% S4Vectors::split(S4Vectors::queryHits(hits))
group <- rep(1:length(grl), times = S4Vectors::elementNROWS(grl))
good <- grl %>% GenomicRanges::reduce() %>% S4Vectors::elementNROWS() %>% `>=`(2) %>% `[`(group)
hits <- hits[good]
if (length(hits) == 0) {
return(integer(0))
}
# Skip fusion candidates which are transcribed higher than the overlapped TUs:
if (isTRUE(skip)) {
stopifnot(!is.null(BiocGenerics::score(gr1)) && !is.null(BiocGenerics::score(gr2)))
par1 <- gr1[S4Vectors::queryHits(hits)]
par2 <- gr2[S4Vectors::subjectHits(hits)]
grl <- S4Vectors::split(par2, S4Vectors::queryHits(hits))
group <- rep(1:length(grl), times = S4Vectors::elementNROWS(grl))
score_sum <- BiocGenerics::score(par2) %>% S4Vectors::split(S4Vectors::queryHits(hits)) %>% BiocGenerics::lapply(sum) %>% BiocGenerics::unlist() %>% unname() %>% `[`(group)
bad <- BiocGenerics::score(par1) > score_sum
hits <- hits[!bad]
if (length(hits) == 0) {
return(integer(0))
}
}
fusion_idx <- hits %>% S4Vectors::queryHits() %>% BiocGenerics::unique()
return(fusion_idx)
}
#------------------------------------------------------------------------------------------------------------------
call_tu <- function(tx, clust_threshold) {
gr <- tx %>% range() %>% BiocGenerics::unlist()
hits <- GenomicRanges::findOverlaps(gr, gr)
hits <- hits[S4Vectors::queryHits(hits) < S4Vectors::subjectHits(hits)]
par1 <- gr[S4Vectors::queryHits(hits)]
par2 <- gr[S4Vectors::subjectHits(hits)]
valid <- BiocGenerics::width(GenomicRanges::pintersect(par1, par2)) / BiocGenerics::width(GenomicRanges::punion(par1, par2)) >= clust_threshold # nested transcripts remain unmerged
hits <- hits[valid]
used_idx <- c(S4Vectors::queryHits(hits), S4Vectors::subjectHits(hits)) %>% BiocGenerics::unique()
used <- vector("logical", length(gr))
used[used_idx] <- TRUE
out_1 <- gr[!used] %>% unname()
if (length(out_1) > 0) {
S4Vectors::mcols(out_1)$revmap <- seq(1, length(gr))[!used] %>% S4Vectors::split(1:length(.)) %>% unname() %>% as("IntegerList")
}
# Merge tx with at least one strong overlap:
mat <- hits %>% as.matrix() %>% unname()
if (nrow(mat) > 0) {
clust <- cluster_indexes(mat)
group <- rep(1:length(clust), times = S4Vectors::elementNROWS(clust))
out_2 <- gr[BiocGenerics::unlist(clust)] %>% S4Vectors::split(group) %>% GenomicRanges::reduce() %>% BiocGenerics::unlist() %>% unname()
} else {
out_2 <- GRanges()
}
if (length(out_2) > 0) {
S4Vectors::mcols(out_2)$revmap <- clust %>% as("IntegerList")
}
out <- c(out_1, out_2) %>% BiocGenerics::sort()
# Calculate TU support (summary score of the merged transcripts):
if (!is.null(BiocGenerics::score(BiocGenerics::unlist(tx)))) {
tx_score <- BiocGenerics::unlist(tx) %>% BiocGenerics::score() %>% BiocGenerics::tapply(rep(1:length(tx), times = S4Vectors::elementNROWS(tx)), BiocGenerics::unique) %>% BiocGenerics::unlist() %>% unname()
revmap <- S4Vectors::mcols(out)$revmap
BiocGenerics::score(out) <- tx_score[BiocGenerics::unlist(revmap)] %>% S4Vectors::split(rep(1:length(revmap), times = S4Vectors::elementNROWS(revmap))) %>% BiocGenerics::lapply(sum) %>% BiocGenerics::unlist() %>% unname()
}
return(out)
}
# -----------------------------------------------------------------------------------------------------------------
skip_nested <- function(gr) {
hits <- GenomicRanges::findOverlaps(gr, gr, type = "within") %>% as.matrix()
c1 <- hits[, 1]
c2 <- hits[, 2]
good <- c1 <= c2
mat <- BiocGenerics::cbind(ifelse(good, c1, c2), ifelse(good, c2, c1))
mat <- mat[mat[, 1] < mat[, 2], , drop = FALSE]
if (nrow(mat) == 0) {
return(gr)
}
used <- c(mat[, 1], mat[, 2]) %>% BiocGenerics::unique()
out <- gr[-used]
if (length(out) > 0) {
if (!is.null(S4Vectors::mcols(gr)$revmap)) {
S4Vectors::mcols(out) <- BiocGenerics::subset(S4Vectors::mcols(out), select = c("revmap", "score"))
} else {
S4Vectors::mcols(out) <- BiocGenerics::subset(S4Vectors::mcols(out), select = "score")
}
} else {
S4Vectors::mcols(out) <- NULL
}
if (nrow(mat) > 0) {
clust <- cluster_indexes(mat)
group <- rep(1:length(clust), times = S4Vectors::elementNROWS(clust))
gr_2 <- gr[BiocGenerics::unlist(clust)]
out_2 <- gr_2 %>% S4Vectors::split(group) %>% GenomicRanges::reduce() %>% BiocGenerics::unlist() %>% unname()
if (!is.null(S4Vectors::mcols(gr)$revmap)) {
rm_2 <- S4Vectors::mcols(gr_2)$revmap %>% BiocGenerics::tapply(group, function(x) { BiocGenerics::Reduce(c, x) }) %>% as("IntegerList") %>% unname()
S4Vectors::mcols(out_2)$revmap <- rm_2
}
sc_2 <- BiocGenerics::score(gr_2) %>% BiocGenerics::tapply(group, sum) %>% BiocGenerics::unlist() %>% unname()
BiocGenerics::score(out_2) <- sc_2
out <- c(out, out_2)
}
return(out)
}
# ------------------------------------------------------------------------------------------------------------------
generate_tx_id <- function(tx, genes) {
stopifnot(!is.null(S4Vectors::mcols(genes)$revmap))
stopifnot(tx %>% BiocGenerics::unlist() %>% S4Vectors::mcols() %>% .$grp %>% is.null() %>% `!`)
revmap <- S4Vectors::mcols(genes)$revmap
tx <- BiocGenerics::unlist(tx, use.names = FALSE) %>% `[`(BiocGenerics::order(S4Vectors::mcols(.)$grp)) %>% S4Vectors::split(S4Vectors::mcols(.)$grp) %>% `[`(BiocGenerics::unlist(revmap))
max_num <- S4Vectors::elementNROWS(tx) %>% max() %>% nchar()
tx_num <- BiocGenerics::lapply(S4Vectors::elementNROWS(revmap), function(x) { sprintf(paste0("%0", max_num, "i"), seq(1, x)) }) %>% BiocGenerics::unlist()
names(tx) <- S4Vectors::mcols(genes)$name %>% rep(times = S4Vectors::elementNROWS(revmap)) %>% BiocGenerics::paste("tx", tx_num, sep = "_")
return(tx)
}
# ------------------------------------------------------------------------------------------------------------------
# <skip> fusion candidates which are transcribed higher than the overlapped TUs
call_tx_and_find_fusion <- function(input_exons, prefix, known_tx = GenomicRanges::GRangesList(), known_fusion = GenomicRanges::GRangesList(), min_overlap_fusion, clust_threshold, skip) {
message("Calling ", prefix, " transcripts...")
new_tx <- call_transcripts(input_exons, prefix = prefix) %>% BiocGenerics::sort() %>% sort_grl()
message("\t", length(new_tx), " ", prefix, " transcripts called;"); flush.console()
# Combine known and new transcripts:
all_tx <- c(known_tx, new_tx) %>% sort_grl()
# Call all TUs:
all_tu <- all_tx %>% call_tu(clust_threshold = clust_threshold)
# Find fusion transcripts on the combined data:
idx1 <- find_fusion_tx(all_tu, min_overlap_fusion = min_overlap_fusion, skip = skip)
idx2 <- S4Vectors::mcols(all_tu)$revmap %>% `[`(idx1) %>% BiocGenerics::unlist()
message("\t", length(idx2), " fusion transcripts detected;"); flush.console()
# Append novel fusion transcripts to the existing ones:
if (length(idx2) > 0) {
new_fusion <- all_tx[idx2]
new_unl <- BiocGenerics::unlist(new_fusion, use.names = FALSE)
S4Vectors::mcols(new_unl)$type <- "Fusion"
new_fusion <- BiocGenerics::relist(new_unl, new_fusion)
all_tx <- all_tx[-idx2]
known_fusion <- c(known_fusion, new_fusion)
}
out <- list("all_tx" = all_tx, "known_fusion" = known_fusion)
return(out)
}
# ------------------------------------------------------------------------------------------------------------------
update_unused_reads <- function(all_exons, used_id) {
unused_exons <- all_exons[!S4Vectors::mcols(all_exons)$read_id %in% used_id]
message("\t", S4Vectors::mcols(unused_exons)$read_id %>% BiocGenerics::unique() %>% length(), " reads remain unused;"); flush.console()
return(unused_exons)
}
# --------------------------------------------------------------------------------------------------------------------
update_free_reads <- function(exons, known_tx, max_overlap_called, min_read_width, clust_threshold) {
known_genes <- known_tx %>% call_tu(clust_threshold = clust_threshold) %>% skip_nested()
out <- find_free_reads(exons, known_genes, max_overlap_called = max_overlap_called, min_read_width = min_read_width)
return(out)
}
# -------------------------------------------------------------------------------------------------------------------
skip_minor_transcripts <- function(genes, tx, q) {
stopifnot(!is.null(BiocGenerics::score(BiocGenerics::unlist(tx))))
revmap <- S4Vectors::mcols(genes)$revmap
revmap_unl <- BiocGenerics::unlist(revmap)
grp1 <- rep(1:length(revmap), times = S4Vectors::elementNROWS(revmap))
grp2 <- rep(1:length(tx), times = S4Vectors::elementNROWS(tx))
score_tx <- tx %>% BiocGenerics::unlist() %>% BiocGenerics::score() %>% BiocGenerics::tapply(grp2, BiocGenerics::unique) %>% BiocGenerics::unlist() %>% unname() %>% `[`(revmap_unl)
good <- score_tx %>% S4Vectors::split(grp1) %>% BiocGenerics::lapply(quantile_trim, q = q, value = "lgl") %>% BiocGenerics::unlist() %>% unname()
skip_idx <- revmap_unl[!good]
#message(length(skip_idx), " transcripts were skipped due to low expression;")
if (length(skip_idx) > 0) {
out <- list("good_tx" = tx[-skip_idx], "bad_tx" = tx[skip_idx])
} else {
out <- list("good_tx" = tx, "bad_tx" = GenomicRanges::GRangesList())
}
return(out)
}
# ------------------------------------------------------------------------------------------------------------
call_genes <- function(tx, clust_threshold, type = NULL) {
genes <- tx %>% call_tu(clust_threshold = clust_threshold) %>% skip_nested()
if (!is.null(type) && class(type) == "character") {
S4Vectors::mcols(genes)$type <- type
}
return(genes)
}
# ------------------------------------------------------------------------------------------------------------
split_tx_by_type <- function(hml_tx) {
hml_tx_unl <- BiocGenerics::unlist(hml_tx, use.names = FALSE)
S4Vectors::mcols(hml_tx_unl)$grp <- rep(1:length(hml_tx), times = S4Vectors::elementNROWS(hml_tx))
hc_tx <- hml_tx_unl %>% `[`(S4Vectors::mcols(.)$type == "HC") %>% S4Vectors::split(S4Vectors::mcols(.)$grp)
mc_tx <- hml_tx_unl %>% `[`(S4Vectors::mcols(.)$type == "MC") %>% S4Vectors::split(S4Vectors::mcols(.)$grp)
lc_tx <- hml_tx_unl %>% `[`(S4Vectors::mcols(.)$type == "LC") %>% S4Vectors::split(S4Vectors::mcols(.)$grp)
out <- list("hc_tx" = hc_tx, "mc_tx" = mc_tx, "lc_tx" = lc_tx)
return(out)
}
# -----------------------------------------------------------------------------------------------------------
call_HC_MC_LC_genes <- function(hml_tx, clust_threshold, generate_ids = FALSE) {
# Split hml_tx into HC, MC and LC transcripts:
tx_spl <- split_tx_by_type(hml_tx)
# Call the genes:
hc_genes <- tx_spl[[1]] %>% call_genes(clust_threshold = clust_threshold, type = "HC")
mc_genes <- tx_spl[[2]] %>% call_genes(clust_threshold = clust_threshold, type = "MC")
lc_genes <- tx_spl[[3]] %>% call_genes(clust_threshold = clust_threshold, type = "LC")
hml_genes <- c(hc_genes, mc_genes, lc_genes) %>% BiocGenerics::sort()
if (isTRUE(generate_ids)) {
# Generate unique gene IDs:
gene_id <- sprintf(paste0("%0", nchar(length(hml_genes)), "i"), 1:length(hml_genes))
S4Vectors::mcols(hml_genes)$name <- BiocGenerics::paste(S4Vectors::mcols(hml_genes)$type, "gene", gene_id, sep = "_")
}
return(hml_genes)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.