#'
#' @title Additional result annotations
#'
#' @description These functions work on the result table and add additional
#' annotations to the reported introns: the type of potential impact on
#' splicing (e.g. exon skipping, exon truncation, ...), potential occurence
#' of frameshift, overlap with UTR regions as well as a flag for introns
#' that are located in blacklist regions of the genome.
#'
#' \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the
#' fds for each intron based on the overlap of the intron's location with
#' the reference annotation. Has to be run before the result table is
#' created so that the new column can be included in it (see examples).
#'
#' \code{\link{annotatePotentialImpact}} annotates each intron in the results
#' table with the type of potential impact on splicing and potential
#' occurence of frameshift (likely, unlikely, inconclusive). Can also
#' calculate overlap with annotated UTR regions. Potential impact can be:
#' annotatedIntron_increasedUsage, annotatedIntron_reducedUsage,
#' exonTruncation, exonElongation, exonTruncation&Elongation,
#' exonSkipping, splicingBeyondGene,
#' multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene,
#' complex (everything else).
#' Splice sites (theta metric) annotations indicate how the splice site is
#' located with respect to the reference annotation. The annotated types
#' are: annotatedSpliceSite, exonicRegion, intronicRegion.
#'
#' \code{\link{flagBlacklistRegions}} flags introns in the results table on
#' whether or not they are located in a blacklist region of the genome. By
#' default, the blacklist regions as reported in
#' \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from
#' \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here}
#' are used.
#'
#' @param fds A FraserDataSet
#' @param txdb A txdb object providing the reference annotation.
#' @param result A result table as generated by FRASER, including the column
#' \code{annotatedJunction} as generated by the function
#' \code{annotateIntronReferenceOverlap}.
#' @param addPotentialImpact Logical, indicating if the type of the potential
#' impact should be added to the results table. Defaults to \code{TRUE}.
#' @param addUTRoverlap Logical, indicating if the overlap with UTR regions
#' should checked and added to the results table. Defaults to \code{TRUE}.
#' @param minoverlap Integer value defining the number of base pairs around the
#' splice site that need to overlap with UTR or blacklist region,
#' respectivly, to be considered matching. Defaults to 5 bp.
#' @param blacklist_regions A BED file that contains the blacklist regions.
#' If \code{NULL} (default), the BED files that are packaged with FRASER
#' are used (see Details for more information).
#' @param assemblyVersion Indicates the genome assembly version of the intron
#' coordinates. Only used if blacklist_regions is NULL. For other versions,
#' please provide the BED file containing the blacklist regions directly.
#' @param BPPARAM For controlling parallelization behavior. Defaults to
#' \code{bpparam()}.
#' @return An annotated FraserDataSet or results table, respectively
#'
#' @name potentialImpactAnnotations
#' @rdname potentialImpactAnnotations
#'
#' @examples
#' # get data, fit and compute p-values and z-scores
#' fds <- createTestFraserDataSet()
#'
#' # load reference annotation
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#'
#' # add basic annotations for overlap with the reference annotation
#' # run this function before creating the results table
#' fds <- annotateIntronReferenceOverlap(fds, txdb)
#'
#' # extract results: for this small example dataset, no cutoffs used
#' # to get some results
#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
#'
#' # annotate the type of potential impact on splicing and UTR overlap
#' res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds)
#'
#' # annotate overlap with blacklist regions
#' res <- flagBlacklistRegions(result=res, assemblyVersion="hg19")
#'
#' # show results table containing additional annotations
#' res
#'
NULL
#' @describeIn potentialImpactAnnotations This method calculates basic annotations
#' based on overlap with the reference annotation (start, end, none, both)
#' for the full fds. The overlap type is added as a new column
#' \code{annotatedJunction} in \code{mcols(fds)}.
#' @export
annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
message("loading introns ...")
#seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
introns <- unique(unlist(intronsByTranscript(txdb)))
# reduce the introns to only the actually expressed introns
fds_known <- fds[unique(to(findOverlaps(introns,
rowRanges(fds, type = "j"), type = "equal"))),]
anno_introns <- as.data.table(rowRanges(fds_known,
type="j"))[,.(seqnames, start, end, strand)]
# calculate extra columns with mean/median intron expression count
# add the new columns
sampleCounts <- as.matrix(K(fds_known, type = "j"))
anno_introns[, meanCount := rowMeans(sampleCounts)]
anno_introns[, medianCount := rowMedians(sampleCounts)]
# order by medianCount (highest first)
setorderv(anno_introns, "medianCount", order=-1)
anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns,
keep.extra.columns = TRUE)
# get all fds junctions
fds_junctions <- rowRanges(fds, type = "j")
# Do the annotation just for the intron with highest median expression
message("start calculating basic annotations ...")
overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first")
annotations <- bplapply(seq_len(length(fds_junctions)),
function(i, overlaps, fds_junctions, anno_introns_ranges){
# only select first intron as already ordered by medianCount beforehand
overlap <- overlaps[i]
if(is.na(overlap)) return("none") #no overlap with any intron
hit_equal <- from(findOverlaps(fds_junctions[i],
anno_introns_ranges[overlap],
type="equal"))
if(length(hit_equal) > 0) return("both")
hit_start <- from(findOverlaps(fds_junctions[i],
anno_introns_ranges[overlap],
type="start"))
if(length(hit_start) > 0) return("start")
hit_end <- from(findOverlaps(fds_junctions[i],
anno_introns_ranges[overlap],
type="end"))
if(length(hit_end) > 0) return("end")
# overlaps but no start/end match
return("none")
}, overlaps=overlaps, fds_junctions=fds_junctions,
anno_introns_ranges=anno_introns_ranges, BPPARAM=BPPARAM)
annotations <- unlist(annotations)
rowRanges(fds)$annotatedJunction <- annotations
mcols(fds, type="ss")$annotatedJunction <- "not computed"
message("basic annotations done")
return(fds)
}
#' @describeIn potentialImpactAnnotations This method annotates the splice event
#' type to junctions in the given results table.
#' @export
annotatePotentialImpact <- function(result, txdb, fds, addPotentialImpact=TRUE,
addUTRoverlap=TRUE, minoverlap=5,
BPPARAM=bpparam()){
# convert to data.table if not already
if(!is.data.table(result)){
annoResult <- as.data.table(result)
} else{
annoResult <- result
}
# Create basic annotation of overlap with reference
if(!("annotatedJunction" %in% colnames(annoResult))){
stop("Column 'annotatedJunction' not found in the results table!\n",
"Please run 'fds <- annotateIntronReferenceOverlap(fds, txdb)' ",
"first and then extract \nthe results table using the ",
"'results(fds, ...)' function before calling this function.")
}
# Calculate splice types and frameshift
if(isTRUE(addPotentialImpact)){
annoResult <- addPotentialImpactLabels(annoResult, fds, txdb)
annoResult[potentialImpact == "singleExonSkipping",
potentialImpact := "exonSkipping"]
}
# Add UTR labels
if(isTRUE(addUTRoverlap)){
annoResult <- addUTRLabels(annoResult, txdb)
}
if(is(result, "GenomicRanges")){
annoResult <- makeGRangesFromDataFrame(annoResult,
keep.extra.columns=TRUE)
}
return(annoResult)
}
#' @describeIn potentialImpactAnnotations This method flags all introns and
#' splice sites in the given results table for which at least one splice
#' site (donor or acceptor) is located in a blacklist region. Blacklist
#' regions of the genome are determined from the provided BED file.
#' @export
flagBlacklistRegions <- function(result, blacklist_regions=NULL,
assemblyVersion=c('hg19', 'hg38'),
minoverlap=5){
# convert to data.table if not already
if(!is.data.table(result)){
annoResult <- as.data.table(result)
} else{
annoResult <- result
}
assemblyVersion <- match.arg(assemblyVersion)
if(is.null(blacklist_regions)){
blacklist_regions <-
system.file("extdata", "blacklist_regions",
paste0(assemblyVersion, "-blacklist.v2.bed.gz"),
package = "FRASER")
}
if(!file.exists(blacklist_regions)){
stop("BED file with blacklist regions does not exist: ",
blacklist_regions)
}
message("Importing blacklist regions ...")
blacklist_gr <- rtracklayer::import(blacklist_regions, format = "BED")
annoResult <- addBlacklistLabels(annoResult, blacklist_gr)
if(is(result, "GenomicRanges")){
annoResult <- makeGRangesFromDataFrame(annoResult,
keep.extra.columns=TRUE)
}
return(annoResult)
}
############# helper functions ##############################
#' blacklist annotation for aberrant splicing events
#' @noRd
addBlacklistLabels <- function(junctions_dt, blacklist_gr, minoverlap=5){
# add the blacklist information
colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
# get gr with start/end positions of each intron
gr_start_ss <- junctions_gr
end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1
start(gr_start_ss) <- start(gr_start_ss) - minoverlap
gr_end_ss <- junctions_gr
start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1
end(gr_end_ss) <- end(gr_end_ss) + minoverlap
# set to the same seqlevelsstyle
seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr)
## create overlap with blacklist and annotate extra column
message("finding blacklist overlap ...")
black_hits_start_ss <- unique(from(findOverlaps(gr_start_ss, blacklist_gr)))
black_hits_end_ss <- unique(from(findOverlaps(gr_end_ss, blacklist_gr)))
junctions_dt[, blacklist := FALSE]
junctions_dt[black_hits_start_ss, blacklist := TRUE]
junctions_dt[black_hits_end_ss, blacklist := TRUE]
colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
message("blacklist labels done")
return(junctions_dt)
}
#' adds UTR overlap annotation to results table
#' @noRd
addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){
colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
# get gr with start/end positions of each intron
gr_start_ss <- junctions_gr
end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1
start(gr_start_ss) <- start(gr_start_ss) - minoverlap
gr_end_ss <- junctions_gr
start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1
end(gr_end_ss) <- end(gr_end_ss) + minoverlap
### UTR labels based on txdb file
### add 5' 3' UTR labels
message("finding UTR overlap ...")
threes_start <- unique(from(findOverlaps(gr_start_ss,
threeUTRsByTranscript(txdb, use.names = TRUE))))
threes_end <- unique(from(findOverlaps(gr_end_ss,
threeUTRsByTranscript(txdb, use.names = TRUE))))
fives_start <- unique(from(findOverlaps(gr_start_ss,
fiveUTRsByTranscript(txdb, use.names = TRUE))))
fives_end <- unique(from(findOverlaps(gr_end_ss,
fiveUTRsByTranscript(txdb, use.names = TRUE))))
junctions_dt[, UTR_overlap := "no"]
junctions_dt[threes_start, UTR_overlap := "3'-UTR"]
junctions_dt[threes_end, UTR_overlap := "3'-UTR"]
junctions_dt[fives_start, UTR_overlap := "5'-UTR"]
junctions_dt[fives_end, UTR_overlap := "5'-UTR"]
colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
message("UTR labels done")
return(junctions_dt)
}
#' adds type of splicing to each intron in the results table
#' @noRd
addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
message("preparing ...")
psi_positions <- which(junctions_dt$type != "theta")
colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions],
keep.extra.columns = TRUE)
seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
introns_tmp <- unique(unlist(intronsByTranscript(txdb)))
exons <- exons(txdb)
# seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
fds_known <- fds[unique(to(findOverlaps(introns_tmp,
rowRanges(fds, type = "j"),
type = "equal"))),]
grIntrons <- rowRanges(fds_known, type="j")
introns <- as.data.table(grIntrons)
introns <- introns[,.(seqnames, start, end, strand)]
sampleCounts <- K(fds_known, type = "j")
introns[, "meanCount" := rowMeans(sampleCounts)]
introns[, "medianCount" := rowMedians(as.matrix(sampleCounts))]
intron_ranges <- makeGRangesFromDataFrame(introns,
keep.extra.columns = TRUE)
# prepare the results column
junctions_dt[, potentialImpact := "complex"]
junctions_dt[, causesFrameshift := "inconclusive"]
junctions_dt[annotatedJunction == "both" & deltaPsi >= 0,
potentialImpact := "annotatedIntron_increasedUsage"]
junctions_dt[annotatedJunction == "both" & deltaPsi < 0,
potentialImpact := "annotatedIntron_reducedUsage"]
junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"]
if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in%
colnames(junctions_dt))){
junctions_dt[potentialImpact == "annotatedIntron_reducedUsage" &
type == "jaccard" &
nonsplitProportion >= nonsplitProportion_99quantile + 0.05 &
nonsplitCounts >= 10,
potentialImpact := "(partial)intronRetention"]
# TODO check frameshift for intron retention
junctions_dt[potentialImpact == "(partial)intronRetention",
causesFrameshift := "inconclusive"]
}
starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start")
ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end")
nones <- which(junctions_dt[psi_positions]$annotatedJunction=="none")
message("calculating splice types ...")
# start junctions
start_results <- sapply(starts, function(i){
# find the most freq intron that overlaps again
overlap <- to(findOverlaps(junctions_gr[i], intron_ranges,
type = "start"))
expre <- sapply(overlap, function(j){
elementMetadata(intron_ranges[j])$medianCount
})
maxExpr <- which.max(expre)
return(compareEnds(junctions_gr, i, overlap[maxExpr], FALSE,
intron_ranges, exons))
})
junctions_dt[psi_positions[starts],
causesFrameshift:=start_results[2,]]
junctions_dt[psi_positions[starts],
potentialImpact := start_results[1,]]
# end junctions
end_results <- sapply(ends, function(i){
# find the most freq intron that overlaps again
overlap <- to(findOverlaps(junctions_gr[i], intron_ranges,
type = "end"))
expre <- sapply(overlap, function(j){
elementMetadata(intron_ranges[j])$medianCount
})
maxExpr <- which.max(expre)
return(compareStarts(junctions_gr, i, overlap[maxExpr], FALSE,
intron_ranges, exons))
})
junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]]
junctions_dt[psi_positions[ends], potentialImpact := end_results[1,]]
# none junctions pt1
none_results <- sapply(nones, function(i){
# find most freq intron
# check start and end
# find the most freq intron that overlaps again
overlap <- to(findOverlaps(junctions_gr[i], intron_ranges))
if(length(overlap) == 0) return(c("noOverlap", "inconclusive"))
expre <- sapply(overlap, function(j){
elementMetadata(intron_ranges[j])$medianCount
})
maxExpr <- which.max(expre)
# returns type of exon splicing, frameshift TRUE/FALSE, amount of shift
st = compareStarts(junctions_gr, i, overlap[maxExpr], TRUE,
intron_ranges, exons)
en = compareEnds(junctions_gr, i, overlap[maxExpr], TRUE,
intron_ranges, exons)
# merge, start and end results
# merge exon elongation/truncation
# if both likely/unlikely fine
# if one is likely -> return likely
# if one is notYet -> return notYet
if((st[1] == "singleExonSkipping" & !(en[1] %in%
c("singleExonSkipping", "exonSkipping"))) ||
(en[1] == "singleExonSkipping" & !(st[1] %in%
c("singleExonSkipping", "exonSkipping")))){
## only one is single exonSkipping, the other is trunc/elong
if((as.integer(st[3])+as.integer(en[3])) %% 3 != 0){
frs = "likely"
}else{ frs = "unlikely"}
return(c("singleExonSkipping", frs))
}
if(st[1] %in% c("exonSkipping", "singleExonSkipping") || en[1] %in%
c("exonSkipping", "singleExonSkipping")){
return(c("exonSkipping", "inconclusive"))
}
if((as.integer(st[3])+as.integer(en[3]))%%3 != 0){
frs = "likely"
}else{ frs = "unlikely"}
if( st[1] != en[1]){
combined = "exonTruncation&Elongation"
}else{combined = st[1]}
return(c(combined,frs))
})
junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]]
junctions_dt[psi_positions[nones], potentialImpact := none_results[1,]]
noLaps <-which(junctions_dt[psi_positions]$potentialImpact=="noOverlap")
refseq.genes<- genes(txdb)
# none junctions pt2
noLaps_results <- sapply(noLaps, function(i){
overlap <- to(findOverlaps(junctions_gr[i], exons))
# no overlap with an intron or an exon
if(length(overlap) == 0){
return(checkIntergenic(junctions_gr, i, refseq.genes))
}
# for the exons, check if splice site is contained in the exon
for(j in overlap){
exon_start = start(exons[j])
exon_end = end(exons[j])
if(exon_start <= start(junctions_gr[i]) &
exon_end >= end(junctions_gr[i])){
if((end(junctions_gr[i]) -
start(junctions_gr[i]) + 1) %% 3 != 0){
frs = "likely"
}else{ frs = "unlikely"}
return(c("exonTruncation", frs))
}
}
return(c("complex","inconclusive"))
})
junctions_dt[psi_positions[noLaps],
causesFrameshift:=noLaps_results[2,]]
junctions_dt[psi_positions[noLaps],
potentialImpact := noLaps_results[1,]]
# theta annotations
thetas <- which(junctions_dt$type == "theta")
junctions_gr <- makeGRangesFromDataFrame(junctions_dt[thetas,],
keep.extra.columns = TRUE)
# specify default type for theta results as NA
junctions_dt[thetas, potentialImpact := NA]
junctions_dt[thetas, causesFrameshift := NA]
# label all as intronic first if they have any intron overlap
intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp)))
junctions_dt[thetas[intronic], potentialImpact := "intronicRegion"]
# for exonic, check if theta is fully contained in an exon
# if one end is in an intron and the other in an exon it is a splice site
exonic <- unique(from(findOverlaps(junctions_gr, exons)))
within <- findOverlaps(junctions_gr, exons, type = "within")
all <- findOverlaps(junctions_gr, exons)
exonic_results <- sapply(exonic, function(i){
w <- unique(to(within)[which(from(within) == i)])
a <- unique(to(all)[which(from(all) == i)])
if(length(a) == length(w)) return("exonicRegion")
return("annotatedSpliceSite")
})
junctions_dt[thetas[exonic], potentialImpact := exonic_results]
# check cases that don't overlap with an exon/intron
nones <- which(is.na(junctions_dt[thetas,]$potentialImpact))
none_results <- sapply(nones, function(i){
if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA)
#return("intergenic")
if(start(refseq.genes[nearest(junctions_gr[i],
refseq.genes)]) > start(junctions_gr[i])){
ifelse(strand(junctions_gr[i]) == "+",
return("upstreamOfNearestGene"),
return("downstreamOfNearestGene"))
}else{
ifelse(strand(junctions_gr[i]) == "+",
return("downstreamOfNearestGene"),
return("upstreamOfNearestGene"))
}
})
junctions_dt[thetas[nones], potentialImpact := none_results]
# add distance to closest neighbour gene for intergenic results
# (both psi and theta)
message("adding distances to nearest gene ...")
up <- which(junctions_dt$potentialImpact == "upstreamOfNearestGene")
down <- which(junctions_dt$potentialImpact == "downstreamOfNearestGene")
# create full grange object containing psi and theta
junctions_gr <- makeGRangesFromDataFrame(junctions_dt,
keep.extra.columns = TRUE)
# Calculate distances
if(length(up) > 0){
distanceNearestGene_up <- sapply(up, function(i){
min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)})
if(length(distanceNearestGene_up > 0)){
junctions_dt[psi_positions[up],
distNearestGene := distanceNearestGene_up]
} else{
junctions_dt[psi_positions[up], distNearestGene := NA]
message("No distances found for upstream")
}
}else{message("No upstream targets")}
if(length(down) > 0){
distanceNearestGene_down <- sapply(down, function(i){
min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)})
if(length(distanceNearestGene_down > 0)){
junctions_dt[psi_positions[down],
distNearestGene := distanceNearestGene_down]
}else{
junctions_dt[psi_positions[down], distNearestGene := NA]
message("No distances found for downstream")
}
}else{message("No downstream targets")}
colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
message("done calculating splice types")
# Add the subtypes for exonSkipping and inconclusive
junctions_dt <- checkExonSkipping(junctions_dt, txdb)
junctions_dt <- checkInconclusive(junctions_dt, txdb)
return(junctions_dt)
}
#'
#' @noRd
compareStarts <- function(junctions_gr, i, max_lap, shift_needed,
intron_ranges, exons){
intron_start = start(intron_ranges[max_lap])
ss_start = start(junctions_gr[i])
# found the most freq intron with same end again
# check if intron starts before splice site -> exon elongation -> FRS
if(intron_start < ss_start){
if(((ss_start - intron_start) %% 3) != 0){
frs = "likely"
}else{ frs = "unlikely"}
ifelse(shift_needed,
return(c("exonElongation", frs,
(ss_start - intron_start))),
return(c("exonElongation", frs)))
}
# check if splice site ends in following exon -> exon truncation -> FRS
if(intron_start > ss_start){
# create dummy exon find all exons starting from that intron end
dummy_exon <- GRanges(
seqnames = toString(seqnames(intron_ranges[max_lap])),
ranges = IRanges(intron_start-2, end = intron_start -1),
strand = toString(strand(intron_ranges[max_lap]))
)
exonChoices <- to(findOverlaps(dummy_exon, exons, type = "end"))
for(j in exonChoices){
exon_start = start(exons[j])
if(exon_start < ss_start){
if((end(exons[j]) - ss_start + 1)%%3 != 0){
frs = "likely"
}else{frs = "unlikely"}
ifelse(shift_needed,
return(c("exonTruncation", frs,
(-1)*(end(exons[j]) - ss_start + 1))),
return(c("exonTruncation", frs)))
}
}
# check for single exon skipping
if(length(exonChoices) == 1){
# check if there is no other exon within the first intron:
# splice site end until exon end
dummyFirstItr <- GRanges(
seqnames = toString(seqnames(intron_ranges[max_lap])),
ranges = IRanges(end(exons[exonChoices[1]]) + 1,
end(junctions_gr[i])),
strand = toString(strand(intron_ranges[max_lap]))
)
if(length(findOverlaps(exons, dummyFirstItr,
type = "within")) > 0){
# another exon is contained within the most freq used intron
ifelse(shift_needed,
return(c("exonSkipping", "inconclusive", 0)),
return(c("exonSkipping", "inconclusive")))
}
secondItr <- GRanges(
seqnames = toString(intron_ranges[max_lap]@seqnames@values),
strand = toString(intron_ranges[max_lap]@strand@values),
ranges = IRanges(ss_start, start(exons[exonChoices[1]]) - 1)
# end of exon + 1, end of aberrant junction
)
secItrChoices <- to(findOverlaps(secondItr, intron_ranges,
type = "end"))
# only look at most used one
expre <- sapply(secItrChoices, function(j){
elementMetadata(intron_ranges[j])$medianCount
})
maxExpr <- which.max(expre)
if(length(secItrChoices) == 0){
ifelse(shift_needed,
return(c("exonSkipping", "inconclusive", 0)),
return(c("exonSkipping", "inconclusive")))
}
if(ss_start >= start(intron_ranges[secItrChoices[maxExpr]])){
# check if there is no other exon in that range
if(length(findOverlaps(exons,
intron_ranges[secItrChoices[maxExpr]],
type = "within")) == 0){
# clear exon skipping, only exon is skipped
# calculate frameshift, skipped exon plus possible exon
# elongation
shift = (-1)*(end(exons[exonChoices[1]]) -
start(exons[exonChoices[1]]) + 1) +
ss_start - start(intron_ranges[secItrChoices[maxExpr]])
frs = ifelse(shift %% 3 == 0,"unlikely","likely")
ifelse(shift_needed,
return(c("singleExonSkipping", "inconclusive",
shift)),
return(c("singleExonSkipping", frs)))
}
}
} # single exon skipping end
}
# splice site longer than one intron + exon -> not defined for now
ifelse(shift_needed,
return(c("exonSkipping", "inconclusive", 0)),
return(c("exonSkipping", "inconclusive")))
}
#'
#' @noRd
compareEnds <- function(junctions_gr, i, max_lap, shift_needed,
intron_ranges, exons){
intron_end = end(intron_ranges[max_lap])
ss_end = end(junctions_gr[i])
# found the most freq intron with same start again
# check if intron ends after splice site -> exon elongation -> FRS -> done
if(intron_end > ss_end){
if(((intron_end - ss_end) %% 3) != 0){
frs = "likely"
}else{ frs = "unlikely"}
ifelse(shift_needed,
return(c("exonElongation", frs, (intron_end - ss_end))),
return(c("exonElongation", frs)))
}
# check if splice site ends in following exon -> exon truncation -> FRS
if(intron_end < ss_end){
# create dummy exon find all exons starting from that intron end
dummy_exon <- GRanges(
seqnames = toString(intron_ranges[max_lap]@seqnames@values),
ranges = IRanges(intron_end + 1, end = intron_end + 2),
strand = toString(intron_ranges[max_lap]@strand@values)
)
exonChoices <- to(findOverlaps(dummy_exon, exons, type = "start"))
for(j in exonChoices){
exon_end = end(exons[j])
if(exon_end > ss_end){
if((ss_end - start(exons[j]) + 1)%%3 != 0){
frs = "likely"
}else{frs = "unlikely"}
ifelse(shift_needed,
return(c("exonTruncation",frs,
(-1)*(ss_end - start(exons[j]) + 1))),
return(c("exonTruncation",frs)))
}
}
# check for single exon skipping
if(length(exonChoices) == 1){
# check if there is no other exon within the first intron:
# splice site end until exon end
dummyFirstItr <- GRanges(
seqnames = toString(seqnames(intron_ranges[max_lap])),
ranges = IRanges(start(junctions_gr[i]),
start(exons[exonChoices[1]]) - 1),
strand = toString(strand(intron_ranges[max_lap]))
)
if(length(findOverlaps(exons, dummyFirstItr,
type = "within")) > 0){
# another exon is contained within the most freq used intron
ifelse(shift_needed,
return(c("exonSkipping", "inconclusive", 0)),
return(c("exonSkipping", "inconclusive")))
}
secondItr <- GRanges(
seqnames = toString(intron_ranges[max_lap]@seqnames@values),
strand = toString(intron_ranges[max_lap]@strand@values),
ranges = IRanges(end(exons[exonChoices[1]]) + 1, ss_end)
# end of exon + 1, end of aberrant junction
)
secItrChoices <- to(findOverlaps(secondItr, intron_ranges,
type = "start"))
# only look at most used one
expre <- sapply(secItrChoices, function(j){
elementMetadata(intron_ranges[j])$medianCount
})
maxExpr <- which.max(expre)
if(length(secItrChoices) == 0){
ifelse(shift_needed,
return(c("exonSkipping", "inconclusive", 0)),
return(c("exonSkipping", "inconclusive")))
}
if(ss_end <= end(intron_ranges[secItrChoices[maxExpr]])){
# check if there is no other exon in that range
if(length(findOverlaps(exons,
intron_ranges[secItrChoices[maxExpr]],
type = "within")) == 0){
# clear exon skipping, only exon is skipped
# calculate frameshift, skipped exon plus possible exon
# elongation at end
shift = (-1)*(end(exons[exonChoices[1]]) -
start(exons[exonChoices[1]]) + 1) +
end(intron_ranges[secItrChoices[maxExpr]]) - ss_end
frs = ifelse(shift%%3 == 0,"unlikely","likely")
ifelse(shift_needed,
return(c("singleExonSkipping", "inconclusive",
shift)),
return(c("singleExonSkipping", frs)))
}
}
} # single exon skipping end
}
# splice site longer than one intron + exon -> not defined for now
ifelse(shift_needed,
return(c("exonSkipping", "inconclusive", 0)),
return(c("exonSkipping", "inconclusive")))
}
#'
#' @noRd
checkIntergenic <- function(junctions_gr, i, refseq.genes){
# check if start > 1000
# start - 1000, end + 1000
start = start(junctions_gr[i])
# ifelse(start > 1000, start = start - 1000, start = 1)
# if(start > 1000){
# start = start - 1000
# }else{start = 1}
end = end(junctions_gr[i]) #+ 1000
if(start + 2 < end){
start = start + 1
end = end - 1
}
test_junction <- GRanges(
seqnames = seqnames(junctions_gr[i]),
ranges = IRanges(start, end),
strand = strand(junctions_gr[i])
)
# overlap with introns and exon
# IGNORE STRANDS? -> decided its not necessary
# check if distance to nearest is > 1000 -> intergenic
# otherwise up/downstream
dist = min(distance(test_junction, refseq.genes), na.rm = TRUE)
if(is.finite(dist) && dist > 0){
# find nearest and compare starts
if(start(refseq.genes[nearest(junctions_gr[i],
refseq.genes)]) > start){
ifelse(strand(junctions_gr[i]) == "+",
return(c("upstreamOfNearestGene", "unlikely")),
return(c("downstreamOfNearestGene", "unlikely")))
}else{
ifelse(strand(junctions_gr[i]) == "+",
return(c("downstreamOfNearestGene", "unlikely")),
return(c("upstreamOfNearestGene", "unlikely")))
}
}
return(c("complex", "inconclusive"))
}
#'
#' @noRd
checkExonSkipping <- function(junctions_dt, txdb){
psi_positions <- which(junctions_dt$type != "theta")
colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions],
keep.extra.columns = TRUE)
seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
refseq.genes<- genes(txdb)
exonSkip <- which(junctions_dt[psi_positions]$potentialImpact %in%
c("exonSkipping", "singleExonSkipping"))
message("start checking exonSkipping")
newSkip_results <- sapply(exonSkip, function(i){
start = start(junctions_gr[i])
end = end(junctions_gr[i])
# reduce the junction width so adjacent genes have a distance of 1
if(start + 2 < end){
start = start + 1
end = end - 1
}
test_start <- GRanges(
seqnames = seqnames(junctions_gr[i]),
strand = strand(junctions_gr[i]),
ranges = IRanges(start, start + 1)
)
test_end <- GRanges(
seqnames = seqnames(junctions_gr[i]),
strand = strand(junctions_gr[i]),
ranges = IRanges(end - 1, end)
)
# check for which genes distance to start is 0
start_genes <- which(distance(test_start, refseq.genes) == 0)
# start is not in a gene
if(length(start_genes) == 0) return("splicingBeyondGene")
# start is in a gene -> is end in same gene
for(to in start_genes){
# end is in same gene
if(distance(test_end, refseq.genes[to]) == 0){
return("exonSkipping")
}
}
end_genes <- which(distance(test_end, refseq.genes) == 0)
# end is not in a gene
if(length(end_genes) == 0) return("splicingBeyondGene")
# end is in a different gene
return("multigenicSplicing")
})
# checking exonSkipping done
if(length(exonSkip) > 0){
junctions_dt[psi_positions[exonSkip],
potentialImpact2 := newSkip_results]
junctions_dt[potentialImpact2 == "splicingBeyondGene",
potentialImpact := "splicingBeyondGene"]
junctions_dt[potentialImpact2 == "splicingBeyondGene",
causesFrameshift := "inconclusive"]
junctions_dt[potentialImpact2 == "multigenicSplicing",
potentialImpact := "multigenicSplicing"]
junctions_dt[potentialImpact2 == "multigenicSplicing",
causesFrameshift := "inconclusive"]
junctions_dt[, potentialImpact2 := NULL]
}
colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
return(junctions_dt)
}
#'
#' @noRd
checkInconclusive <- function(junctions_dt, txdb){
psi_positions <- which(junctions_dt$type != "theta")
colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions],
keep.extra.columns = TRUE)
seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
refseq.genes<- genes(txdb)
inconclusive <- which(junctions_dt[psi_positions
]$potentialImpact == "complex")
inconclusive_results <- sapply(inconclusive, function(i){
start = start(junctions_gr[i])
end = end(junctions_gr[i])
# reduce the junction width so adjacent genes have a distance of 1
if(start + 2 < end){
start = start + 1
end = end - 1
}
test_start <- GRanges(
seqnames = seqnames(junctions_gr[i]),
strand = strand(junctions_gr[i]),
ranges = IRanges(start, start + 1)
)
test_end <- GRanges(
seqnames = seqnames(junctions_gr[i]),
strand = strand(junctions_gr[i]),
ranges = IRanges(end - 1, end)
)
# check for which genes distance to start is 0
start_genes <- which(distance(test_start, refseq.genes) == 0)
# start is not in a gene
if(length(start_genes) == 0) return("splicingBeyondGene")
# start is in a gene -> is end in same gene
for(to in start_genes){
# end is in same gene
if(distance(test_end, refseq.genes[to]) == 0){
return("complex")
}
}
end_genes <- which(distance(test_end, refseq.genes) == 0)
# end is not in a gene
if(length(end_genes) == 0) return("splicingBeyondGene")
# end is in a different gene
return("multigenicSplicing")
})
colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
if(length(inconclusive) > 0){
junctions_dt[psi_positions[inconclusive],
potentialImpact := inconclusive_results]
}
return(junctions_dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.