#' Annotate hybrids
#'
#' Annotates hybrids.dt using iCount \code{regions.gtf} file (generated by \code{iCount segment})
#'
#' @param hybrids.dt Hybrids data.table
#' @param regions.gr iCount regions.gtf file as GRanges
#' @return \code{hybrids.dt} annotated with \code{region}, \code{gene_id}, \code{gene_name} and \code{biotype} columns
#' @export
#' @import data.table
annotate_hybrids <- function(hybrids.dt, regions.gr) {
# ==========
# Left
# ==========
L.gr <- toscatools::convert_to_granges(hybrids.dt, arm = "L", genomic = TRUE)
# Sync seqlevels
common.seqlevels <- unique(c(
GenomeInfoDb::seqlevels(L.gr),
GenomeInfoDb::seqlevels(regions.gr)
))
GenomeInfoDb::seqlevels(L.gr) <- common.seqlevels
GenomeInfoDb::seqlevels(regions.gr) <- common.seqlevels
ol <- GenomicRanges::findOverlaps(GenomicRanges::resize(L.gr, width = 1, fix = "center"), regions.gr)
stopifnot(all(!duplicated(S4Vectors::queryHits(ol))))
match.gr <- regions.gr[S4Vectors::subjectHits(ol)]
hybrids.dt[S4Vectors::queryHits(ol), `:=`(
L_region = match.gr$type,
L_gene_id = match.gr$gene_id,
L_gene_name = match.gr$gene_name,
L_biotype = match.gr$biotype
)]
# ==========
# Right
# ==========
R.gr <- toscatools::convert_to_granges(hybrids.dt, arm = "R", genomic = TRUE)
# Sync seqlevels
common.seqlevels <- unique(c(
GenomeInfoDb::seqlevels(R.gr),
GenomeInfoDb::seqlevels(regions.gr)
))
GenomeInfoDb::seqlevels(R.gr) <- common.seqlevels
GenomeInfoDb::seqlevels(regions.gr) <- common.seqlevels
ol <- GenomicRanges::findOverlaps(GenomicRanges::resize(R.gr, width = 1, fix = "center"), regions.gr)
stopifnot(all(!duplicated(S4Vectors::queryHits(ol))))
match.gr <- regions.gr[S4Vectors::subjectHits(ol)]
hybrids.dt[S4Vectors::queryHits(ol), `:=`(
R_region = match.gr$type,
R_gene_id = match.gr$gene_id,
R_gene_name = match.gr$gene_name,
R_biotype = match.gr$biotype
)]
# ==========
# rRNA
# ==========
hybrids.dt[grep("rRNA|rDNA", L_seqnames), `:=`(
L_gene_id = L_seqnames,
L_region = "rRNA",
L_gene_name = L_seqnames,
L_biotype = "rRNA"
)]
hybrids.dt[grep("rRNA|rDNA", R_seqnames), `:=`(
R_gene_id = R_seqnames,
R_region = "rRNA",
R_gene_name = R_seqnames,
R_biotype = "rRNA"
)]
# ==========
# tRNA
# ==========
hybrids.dt[grep("tRNA", L_seqnames), `:=`(
L_gene_id = L_seqnames,
L_region = "tRNA",
L_gene_name = L_seqnames,
L_biotype = "tRNA"
)]
hybrids.dt[grep("tRNA", R_seqnames), `:=`(
R_gene_id = R_seqnames,
R_region = "tRNA",
R_gene_name = R_seqnames,
R_biotype = "tRNA"
)]
# ==========
# Bug work-around for iCount missing a couple of intergenic regions
# ==========
hybrids.dt[is.na(L_region), `:=`(
L_region = "intergenic",
L_gene_id = ".",
L_gene_name = "None",
L_biotype = ""
)]
hybrids.dt[is.na(R_region), `:=`(
R_region = "intergenic",
R_gene_id = ".",
R_gene_name = "None",
R_biotype = ""
)]
stopifnot(all(!is.na(c(hybrids.dt$L_region, hybrids.dt$R_region))))
return(hybrids.dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.