R/resultAnnotations.R

Defines functions annotateIntronReferenceOverlap

Documented in annotateIntronReferenceOverlap

#' 
#' @title Additional result annotations
#' 
#' @description These functions work on the result table and add additional 
#'     annotations to the reported introns: the type of potential impact on 
#'     splicing (e.g. exon skipping, exon truncation, ...), potential occurence 
#'     of frameshift, overlap with UTR regions as well as a flag for introns 
#'     that are located in blacklist regions of the genome.
#'     
#' \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the 
#'     fds for each intron based on the overlap of the intron's location with 
#'     the reference annotation. Has to be run before the result table is 
#'     created so that the new column can be included in it (see examples).
#' 
#' \code{\link{annotatePotentialImpact}} annotates each intron in the results 
#'     table with the type of potential impact on splicing and potential 
#'     occurence of frameshift (likely, unlikely, inconclusive). Can also 
#'     calculate overlap with annotated UTR regions. Potential impact can be: 
#'     annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, 
#'     exonTruncation, exonElongation, exonTruncation&Elongation, 
#'     exonSkipping, splicingBeyondGene, 
#'     multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, 
#'     complex (everything else).     
#'     Splice sites (theta metric) annotations indicate how the splice site is 
#'     located with respect to the reference annotation. The annotated types 
#'     are: annotatedSpliceSite, exonicRegion, intronicRegion.
#' 
#' \code{\link{flagBlacklistRegions}} flags introns in the results table on 
#'     whether or not they are located in a blacklist region of the genome. By
#'     default, the blacklist regions as reported in 
#'     \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from 
#'     \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here} 
#'     are used.
#'
#' @param fds A FraserDataSet
#' @param txdb A txdb object providing the reference annotation.
#' @param result A result table as generated by FRASER, including the column 
#'     \code{annotatedJunction} as generated by the function
#'     \code{annotateIntronReferenceOverlap}.
#' @param addPotentialImpact Logical, indicating if the type of the potential 
#'     impact should be added to the results table. Defaults to \code{TRUE}.
#' @param addUTRoverlap Logical, indicating if the overlap with UTR regions
#'     should checked and added to the results table. Defaults to \code{TRUE}.
#' @param minoverlap Integer value defining the number of base pairs around the
#'     splice site that need to overlap with UTR or blacklist region, 
#'     respectivly, to be considered matching. Defaults to 5 bp. 
#' @param blacklist_regions A BED file that contains the blacklist regions. 
#'     If \code{NULL} (default), the BED files that are packaged with FRASER 
#'     are used (see Details for more information). 
#' @param assemblyVersion Indicates the genome assembly version of the intron 
#'     coordinates. Only used if blacklist_regions is NULL. For other versions, 
#'     please provide the BED file containing the blacklist regions directly.
#' @param BPPARAM For controlling parallelization behavior. Defaults to 
#'     \code{bpparam()}.
#' @return An annotated FraserDataSet or results table, respectively
#' 
#' @name potentialImpactAnnotations
#' @rdname potentialImpactAnnotations
#' 
#' @examples
#'   # get data, fit and compute p-values and z-scores
#'   fds <- createTestFraserDataSet()
#'   
#'   # load reference annotation
#'   library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#'   txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 
#'     
#'   # add basic annotations for overlap with the reference annotation
#'   # run this function before creating the results table
#'   fds <- annotateIntronReferenceOverlap(fds, txdb) 
#' 
#'   # extract results: for this small example dataset, no cutoffs used 
#'   # to get some results 
#'   res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
#'   
#'   # annotate the type of potential impact on splicing and UTR overlap
#'   res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds)
#'   
#'   # annotate overlap with blacklist regions
#'   res <- flagBlacklistRegions(result=res, assemblyVersion="hg19")
#'   
#'   # show results table containing additional annotations
#'   res
#'   
NULL

#' @describeIn potentialImpactAnnotations This method calculates basic annotations 
#'     based on overlap with the reference annotation (start, end, none, both) 
#'     for the full fds. The overlap type is added as a new column 
#'     \code{annotatedJunction} in \code{mcols(fds)}. 
#' @export
annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){
    message("loading introns ...")
    #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
    introns <- unique(unlist(intronsByTranscript(txdb)))
    # reduce the introns to only the actually expressed introns
    fds_known <- fds[unique(to(findOverlaps(introns, 
                                rowRanges(fds, type = "j"), type = "equal"))),]
    anno_introns <- as.data.table(rowRanges(fds_known, 
                                type="j"))[,.(seqnames, start, end, strand)]
    
    # calculate extra columns with mean/median intron expression count
    # add the new columns
    sampleCounts <- as.matrix(K(fds_known, type = "j"))
    anno_introns[, meanCount := rowMeans(sampleCounts)]
    anno_introns[, medianCount := rowMedians(sampleCounts)]
    # order by medianCount (highest first)
    setorderv(anno_introns, "medianCount", order=-1) 
    anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, 
                                                    keep.extra.columns = TRUE)
    
    # get all fds junctions
    fds_junctions <- rowRanges(fds, type = "j")
    
    # Do the annotation just for the intron with highest median expression
    message("start calculating basic annotations ...")    
    overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first")
    annotations <- bplapply(seq_len(length(fds_junctions)), 
            function(i, overlaps, fds_junctions, anno_introns_ranges){
        # only select first intron as already ordered by medianCount beforehand
        overlap <- overlaps[i]
        if(is.na(overlap)) return("none") #no overlap with any intron
        
        hit_equal <- from(findOverlaps(fds_junctions[i], 
                                        anno_introns_ranges[overlap], 
                                        type="equal"))
        if(length(hit_equal) > 0) return("both")
        
        hit_start <- from(findOverlaps(fds_junctions[i], 
                                        anno_introns_ranges[overlap], 
                                        type="start"))
        if(length(hit_start) > 0) return("start")
        hit_end   <- from(findOverlaps(fds_junctions[i], 
                                        anno_introns_ranges[overlap], 
                                        type="end"))
        if(length(hit_end) > 0) return("end")
        
        # overlaps but no start/end match
        return("none") 
    }, overlaps=overlaps, fds_junctions=fds_junctions, 
        anno_introns_ranges=anno_introns_ranges, BPPARAM=BPPARAM) 
    annotations <- unlist(annotations)
    
    rowRanges(fds)$annotatedJunction <- annotations
    mcols(fds, type="ss")$annotatedJunction <- "not computed"
    message("basic annotations done")
    return(fds)
}

#' @describeIn potentialImpactAnnotations This method annotates the splice event 
#'     type to junctions in the given results table.
#' @export
annotatePotentialImpact <- function(result, txdb, fds, addPotentialImpact=TRUE, 
                                    addUTRoverlap=TRUE, minoverlap=5, 
                                    BPPARAM=bpparam()){
    
    # convert to data.table if not already
    if(!is.data.table(result)){
        annoResult <- as.data.table(result)
    } else{
        annoResult <- result
    }
    
    # Create basic annotation of overlap with reference
    if(!("annotatedJunction" %in% colnames(annoResult))){
        stop("Column 'annotatedJunction' not found in the results table!\n",
            "Please run 'fds <- annotateIntronReferenceOverlap(fds, txdb)' ",
            "first and then extract \nthe results table using the ",
            "'results(fds, ...)' function before calling this function.")
    }
    
    # Calculate splice types and frameshift
    if(isTRUE(addPotentialImpact)){
        annoResult <- addPotentialImpactLabels(annoResult, fds, txdb)
        annoResult[potentialImpact == "singleExonSkipping", 
                    potentialImpact := "exonSkipping"]
    }
    
    # Add UTR labels
    if(isTRUE(addUTRoverlap)){
        annoResult <- addUTRLabels(annoResult, txdb)
    }
    
    if(is(result, "GenomicRanges")){
        annoResult <- makeGRangesFromDataFrame(annoResult, 
                                                keep.extra.columns=TRUE)
    } 
    
    return(annoResult)
}

#' @describeIn potentialImpactAnnotations This method flags all introns and 
#'     splice sites in the given results table for which at least one splice 
#'     site (donor or acceptor) is located in a blacklist region. Blacklist 
#'     regions of the genome are determined from the provided BED file.  
#' @export
flagBlacklistRegions <- function(result, blacklist_regions=NULL,
                                assemblyVersion=c('hg19', 'hg38'),
                                minoverlap=5){
    
    # convert to data.table if not already
    if(!is.data.table(result)){
        annoResult <- as.data.table(result)
    } else{
        annoResult <- result
    }
    
    assemblyVersion <- match.arg(assemblyVersion)
    if(is.null(blacklist_regions)){
        blacklist_regions <- 
            system.file("extdata", "blacklist_regions", 
                        paste0(assemblyVersion, "-blacklist.v2.bed.gz"), 
                        package = "FRASER")
    }
    if(!file.exists(blacklist_regions)){
        stop("BED file with blacklist regions does not exist: ", 
            blacklist_regions)
    }
    message("Importing blacklist regions ...")
    blacklist_gr <- rtracklayer::import(blacklist_regions, format = "BED")
    annoResult <- addBlacklistLabels(annoResult, blacklist_gr)  
    
    if(is(result, "GenomicRanges")){
        annoResult <- makeGRangesFromDataFrame(annoResult, 
                                                keep.extra.columns=TRUE)
    }
    
    return(annoResult)
}

############# helper functions ##############################

#' blacklist annotation for aberrant splicing events  
#' @noRd
addBlacklistLabels <- function(junctions_dt, blacklist_gr, minoverlap=5){
    # add the blacklist information
    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
    junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
    
    # get gr with start/end positions of each intron
    gr_start_ss <- junctions_gr
    end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1
    start(gr_start_ss) <- start(gr_start_ss) - minoverlap
    gr_end_ss <- junctions_gr
    start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1
    end(gr_end_ss) <- end(gr_end_ss) + minoverlap
    
    # set to the same seqlevelsstyle
    seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr)
    
    ## create overlap with blacklist and annotate extra column
    message("finding blacklist overlap ...")
    black_hits_start_ss <- unique(from(findOverlaps(gr_start_ss, blacklist_gr)))
    black_hits_end_ss <- unique(from(findOverlaps(gr_end_ss, blacklist_gr)))
    junctions_dt[, blacklist := FALSE]
    
    junctions_dt[black_hits_start_ss, blacklist := TRUE]
    junctions_dt[black_hits_end_ss, blacklist := TRUE]
    colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
    
    message("blacklist labels done")
    return(junctions_dt)
}

#' adds UTR overlap annotation to results table
#' @noRd
addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){
    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
    junctions_gr <- makeGRangesFromDataFrame(junctions_dt)
    seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
    
    # get gr with start/end positions of each intron
    gr_start_ss <- junctions_gr
    end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1
    start(gr_start_ss) <- start(gr_start_ss) - minoverlap
    gr_end_ss <- junctions_gr
    start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1
    end(gr_end_ss) <- end(gr_end_ss) + minoverlap
    
    ### UTR labels based on txdb file
    ### add 5' 3' UTR labels
    message("finding UTR overlap ...")
    threes_start <- unique(from(findOverlaps(gr_start_ss, 
                            threeUTRsByTranscript(txdb, use.names = TRUE))))
    threes_end <- unique(from(findOverlaps(gr_end_ss, 
                            threeUTRsByTranscript(txdb, use.names = TRUE))))
    fives_start <- unique(from(findOverlaps(gr_start_ss, 
                            fiveUTRsByTranscript(txdb, use.names = TRUE))))
    fives_end <- unique(from(findOverlaps(gr_end_ss, 
                            fiveUTRsByTranscript(txdb, use.names = TRUE))))
    junctions_dt[, UTR_overlap := "no"]
    junctions_dt[threes_start, UTR_overlap := "3'-UTR"]
    junctions_dt[threes_end, UTR_overlap := "3'-UTR"]
    junctions_dt[fives_start, UTR_overlap := "5'-UTR"]
    junctions_dt[fives_end, UTR_overlap := "5'-UTR"]
    colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
    message("UTR labels done")
    return(junctions_dt)
}



#' adds type of splicing to each intron in the results table
#' @noRd
addPotentialImpactLabels <- function(junctions_dt, fds, txdb){
    message("preparing ...")
    psi_positions <- which(junctions_dt$type != "theta")
    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
                                                keep.extra.columns = TRUE)
    seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
    
    introns_tmp <- unique(unlist(intronsByTranscript(txdb)))
    exons <- exons(txdb)
    
    # seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1]
    fds_known <- fds[unique(to(findOverlaps(introns_tmp, 
                                            rowRanges(fds, type = "j"), 
                                            type = "equal"))),]
    grIntrons <- rowRanges(fds_known, type="j")
    introns <- as.data.table(grIntrons)
    introns <- introns[,.(seqnames, start, end, strand)]
    
    sampleCounts <- K(fds_known, type = "j")
    introns[, "meanCount" := rowMeans(sampleCounts)]
    introns[, "medianCount" := rowMedians(as.matrix(sampleCounts))]
    intron_ranges <- makeGRangesFromDataFrame(introns, 
                                                keep.extra.columns = TRUE)
    
    # prepare the results column
    junctions_dt[, potentialImpact := "complex"]
    junctions_dt[, causesFrameshift := "inconclusive"]
    junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, 
                    potentialImpact := "annotatedIntron_increasedUsage"]
    junctions_dt[annotatedJunction == "both" & deltaPsi < 0, 
                    potentialImpact := "annotatedIntron_reducedUsage"]
    junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"]
    
    if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in% 
            colnames(junctions_dt))){
        junctions_dt[potentialImpact == "annotatedIntron_reducedUsage" & 
                    type == "jaccard" &
                    nonsplitProportion >= nonsplitProportion_99quantile + 0.05 &
                    nonsplitCounts >= 10,
                potentialImpact := "(partial)intronRetention"]
        
        # TODO check frameshift for intron retention
        junctions_dt[potentialImpact == "(partial)intronRetention", 
                        causesFrameshift := "inconclusive"]
    }
    
    starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start")
    ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end")
    nones <- which(junctions_dt[psi_positions]$annotatedJunction=="none")
    
    message("calculating splice types ...")
    # start junctions
    start_results <- sapply(starts, function(i){
        # find the most freq intron that overlaps again
        overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, 
                                    type = "start"))
        expre <- sapply(overlap, function(j){
            elementMetadata(intron_ranges[j])$medianCount
        })
        maxExpr <- which.max(expre)
        return(compareEnds(junctions_gr, i, overlap[maxExpr], FALSE, 
                            intron_ranges, exons))
    })
    junctions_dt[psi_positions[starts], 
                    causesFrameshift:=start_results[2,]]
    junctions_dt[psi_positions[starts], 
                    potentialImpact := start_results[1,]]
    
    # end junctions
    end_results <- sapply(ends, function(i){
        # find the most freq intron that overlaps again
        overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, 
                                    type = "end"))
        expre <- sapply(overlap, function(j){
            elementMetadata(intron_ranges[j])$medianCount
        })
        maxExpr <- which.max(expre)
        return(compareStarts(junctions_gr, i, overlap[maxExpr], FALSE, 
                                intron_ranges, exons))
        
    })
    junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]]
    junctions_dt[psi_positions[ends], potentialImpact := end_results[1,]]
    
    # none junctions pt1
    none_results <- sapply(nones, function(i){
        # find most freq intron
        # check start and end
        
        # find the most freq intron that overlaps again
        overlap <- to(findOverlaps(junctions_gr[i], intron_ranges))
        if(length(overlap) == 0) return(c("noOverlap", "inconclusive"))
        expre <- sapply(overlap, function(j){
            elementMetadata(intron_ranges[j])$medianCount
        })
        maxExpr <- which.max(expre)
        
        # returns type of exon splicing, frameshift TRUE/FALSE, amount of shift
        st = compareStarts(junctions_gr, i, overlap[maxExpr], TRUE, 
                            intron_ranges, exons)
        en = compareEnds(junctions_gr, i, overlap[maxExpr], TRUE, 
                            intron_ranges, exons)
        
        # merge, start and end results
        # merge exon elongation/truncation
        # if both likely/unlikely fine
        # if one is likely -> return likely
        # if one is notYet -> return notYet
        if((st[1] == "singleExonSkipping" & !(en[1] %in% 
                                c("singleExonSkipping", "exonSkipping"))) ||
            (en[1] == "singleExonSkipping" & !(st[1] %in% 
                                c("singleExonSkipping", "exonSkipping")))){
            ## only one is single exonSkipping, the other is trunc/elong
            if((as.integer(st[3])+as.integer(en[3])) %% 3 != 0){
                frs = "likely"
            }else{ frs = "unlikely"}
            return(c("singleExonSkipping", frs))
        } 
        
        if(st[1] %in% c("exonSkipping", "singleExonSkipping") || en[1] %in% 
                c("exonSkipping", "singleExonSkipping")){
            return(c("exonSkipping", "inconclusive")) 
        }
        
        if((as.integer(st[3])+as.integer(en[3]))%%3 != 0){
            frs = "likely"
        }else{ frs = "unlikely"}
        if( st[1] != en[1]){
            combined = "exonTruncation&Elongation"
        }else{combined = st[1]}
        return(c(combined,frs))
        
    })
    junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]]
    junctions_dt[psi_positions[nones], potentialImpact := none_results[1,]]
    
    noLaps <-which(junctions_dt[psi_positions]$potentialImpact=="noOverlap")
    refseq.genes<- genes(txdb)
    
    # none junctions pt2
    noLaps_results <- sapply(noLaps, function(i){
        overlap <- to(findOverlaps(junctions_gr[i], exons))
        # no overlap with an intron or an exon
        if(length(overlap) == 0){
            return(checkIntergenic(junctions_gr, i, refseq.genes)) 
        }
        
        # for the exons, check if splice site is contained in the exon
        for(j in overlap){
            exon_start = start(exons[j])
            exon_end = end(exons[j])
            if(exon_start <= start(junctions_gr[i]) & 
                    exon_end >= end(junctions_gr[i])){
                if((end(junctions_gr[i]) - 
                        start(junctions_gr[i]) + 1) %% 3 != 0){
                    frs = "likely"
                }else{ frs = "unlikely"}
                return(c("exonTruncation", frs))
            }
        }
        
        return(c("complex","inconclusive"))
    })
    junctions_dt[psi_positions[noLaps], 
                    causesFrameshift:=noLaps_results[2,]]
    junctions_dt[psi_positions[noLaps], 
                    potentialImpact := noLaps_results[1,]]
    
    # theta annotations
    thetas <- which(junctions_dt$type == "theta")
    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[thetas,], 
                                            keep.extra.columns = TRUE)
    
    # specify default type for theta results as NA
    junctions_dt[thetas, potentialImpact := NA]
    junctions_dt[thetas, causesFrameshift := NA]
    
    # label all as intronic first if they have any intron overlap
    intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp)))
    junctions_dt[thetas[intronic], potentialImpact := "intronicRegion"]
    
    # for exonic, check if theta is fully contained in an exon
    # if one end is in an intron and the other in an exon it is a splice site
    exonic <- unique(from(findOverlaps(junctions_gr, exons)))
    within <- findOverlaps(junctions_gr, exons, type = "within")
    all <- findOverlaps(junctions_gr, exons)
    exonic_results <- sapply(exonic, function(i){
        w <- unique(to(within)[which(from(within) == i)])
        a <- unique(to(all)[which(from(all) == i)])
        if(length(a) == length(w)) return("exonicRegion")
        return("annotatedSpliceSite")
    })
    junctions_dt[thetas[exonic], potentialImpact := exonic_results]
    
    # check cases that don't overlap with an exon/intron
    nones <- which(is.na(junctions_dt[thetas,]$potentialImpact))
    none_results <- sapply(nones, function(i){
        if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA)
        #return("intergenic")
        if(start(refseq.genes[nearest(junctions_gr[i], 
                                    refseq.genes)]) > start(junctions_gr[i])){
            ifelse(strand(junctions_gr[i]) == "+", 
                    return("upstreamOfNearestGene"), 
                    return("downstreamOfNearestGene"))
        }else{
            ifelse(strand(junctions_gr[i]) == "+", 
                    return("downstreamOfNearestGene"), 
                    return("upstreamOfNearestGene"))
        }
    })
    junctions_dt[thetas[nones], potentialImpact := none_results]
    
    # add distance to closest neighbour gene for intergenic results 
    # (both psi and theta)
    message("adding distances to nearest gene ...")
    up <- which(junctions_dt$potentialImpact == "upstreamOfNearestGene")
    down <- which(junctions_dt$potentialImpact == "downstreamOfNearestGene")
    
    # create full grange object containing psi and theta
    junctions_gr <- makeGRangesFromDataFrame(junctions_dt, 
                                                keep.extra.columns = TRUE)
    
    # Calculate distances
    if(length(up) > 0){
        distanceNearestGene_up <- sapply(up, function(i){
            min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)})
        if(length(distanceNearestGene_up > 0)){
            junctions_dt[psi_positions[up], 
                            distNearestGene := distanceNearestGene_up]
        } else{
            junctions_dt[psi_positions[up], distNearestGene := NA]
            message("No distances found for upstream")
        }
    }else{message("No upstream targets")}
    
    if(length(down) > 0){
        distanceNearestGene_down <- sapply(down, function(i){
            min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)})
        if(length(distanceNearestGene_down > 0)){
            junctions_dt[psi_positions[down], 
                            distNearestGene := distanceNearestGene_down]
        }else{
            junctions_dt[psi_positions[down], distNearestGene := NA]
            message("No distances found for downstream")
        }
    }else{message("No downstream targets")}
    
    colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"
    message("done calculating splice types")
    
    # Add the subtypes for exonSkipping and inconclusive
    junctions_dt <- checkExonSkipping(junctions_dt, txdb)
    junctions_dt <- checkInconclusive(junctions_dt, txdb)
    
    return(junctions_dt)
}

#' 
#' @noRd
compareStarts <- function(junctions_gr, i, max_lap, shift_needed, 
                            intron_ranges, exons){
    intron_start = start(intron_ranges[max_lap])
    ss_start = start(junctions_gr[i])
    
    # found the most freq intron with same end again
    # check if intron starts before splice site -> exon elongation -> FRS 
    if(intron_start < ss_start){
        if(((ss_start - intron_start) %% 3) != 0){
            frs = "likely"
        }else{ frs = "unlikely"}
        
        ifelse(shift_needed, 
                return(c("exonElongation", frs, 
                            (ss_start - intron_start))), 
                return(c("exonElongation", frs)))
    }
    
    # check if splice site ends in following exon -> exon truncation -> FRS
    if(intron_start > ss_start){
        
        # create dummy exon find all exons starting from that intron end
        dummy_exon <- GRanges(
            seqnames = toString(seqnames(intron_ranges[max_lap])),
            ranges = IRanges(intron_start-2, end = intron_start -1),
            strand = toString(strand(intron_ranges[max_lap]))
        )
        exonChoices <- to(findOverlaps(dummy_exon, exons, type = "end"))
        for(j in exonChoices){
            exon_start = start(exons[j])
            if(exon_start < ss_start){
                if((end(exons[j]) - ss_start + 1)%%3 != 0){
                    frs = "likely"
                }else{frs = "unlikely"}
                ifelse(shift_needed, 
                        return(c("exonTruncation", frs, 
                                    (-1)*(end(exons[j]) - ss_start + 1))), 
                        return(c("exonTruncation", frs)))
            }
        }
        
        # check for single exon skipping
        if(length(exonChoices) == 1){
            
            # check if there is no other exon within the first intron: 
            # splice site end until exon end
            dummyFirstItr <- GRanges(
                seqnames = toString(seqnames(intron_ranges[max_lap])),
                ranges = IRanges(end(exons[exonChoices[1]]) + 1, 
                                end(junctions_gr[i])),
                strand = toString(strand(intron_ranges[max_lap]))
            )
            
            if(length(findOverlaps(exons, dummyFirstItr, 
                                    type = "within")) > 0){
                # another exon is contained within the most freq used intron
                ifelse(shift_needed, 
                        return(c("exonSkipping", "inconclusive", 0)), 
                        return(c("exonSkipping", "inconclusive")))
            }
            
            
            secondItr <- GRanges(
                seqnames = toString(intron_ranges[max_lap]@seqnames@values),
                strand = toString(intron_ranges[max_lap]@strand@values),
                ranges = IRanges(ss_start, start(exons[exonChoices[1]]) - 1) 
                # end of exon + 1, end of aberrant junction
            )
            secItrChoices <- to(findOverlaps(secondItr, intron_ranges, 
                                            type = "end"))
            # only look at most used one
            expre <- sapply(secItrChoices, function(j){
                elementMetadata(intron_ranges[j])$medianCount
            })
            maxExpr <- which.max(expre)
            
            if(length(secItrChoices) == 0){
                ifelse(shift_needed, 
                        return(c("exonSkipping", "inconclusive", 0)), 
                        return(c("exonSkipping", "inconclusive")))
            }
            
            if(ss_start >= start(intron_ranges[secItrChoices[maxExpr]])){
                # check if there is no other exon in that range
                if(length(findOverlaps(exons, 
                                        intron_ranges[secItrChoices[maxExpr]], 
                                        type = "within")) == 0){
                    # clear exon skipping, only exon is skipped 
                    # calculate frameshift, skipped exon plus possible exon 
                    #    elongation 
                    
                    shift = (-1)*(end(exons[exonChoices[1]]) - 
                                        start(exons[exonChoices[1]]) + 1) + 
                        ss_start - start(intron_ranges[secItrChoices[maxExpr]])
                    
                    frs = ifelse(shift %% 3 == 0,"unlikely","likely")
                    ifelse(shift_needed, 
                            return(c("singleExonSkipping", "inconclusive", 
                                        shift)), 
                            return(c("singleExonSkipping", frs)))
                }
            }
        } # single exon skipping end
        
    }
    
    # splice site longer than one intron + exon -> not defined for now
    ifelse(shift_needed, 
            return(c("exonSkipping", "inconclusive", 0)), 
            return(c("exonSkipping", "inconclusive")))
}

#' 
#' @noRd
compareEnds <- function(junctions_gr, i, max_lap, shift_needed, 
                            intron_ranges, exons){
    intron_end = end(intron_ranges[max_lap])
    ss_end = end(junctions_gr[i])
    
    # found the most freq intron with same start again
    # check if intron ends after splice site -> exon elongation -> FRS -> done
    if(intron_end > ss_end){
        if(((intron_end - ss_end) %% 3) != 0){
            frs = "likely"
        }else{ frs = "unlikely"}
        
        ifelse(shift_needed, 
                return(c("exonElongation", frs, (intron_end - ss_end))), 
                return(c("exonElongation", frs)))
    }
    
    # check if splice site ends in following exon -> exon truncation -> FRS
    if(intron_end < ss_end){
        
        # create dummy exon find all exons starting from that intron end
        dummy_exon <- GRanges(
            seqnames = toString(intron_ranges[max_lap]@seqnames@values),
            ranges = IRanges(intron_end + 1, end = intron_end + 2),
            strand = toString(intron_ranges[max_lap]@strand@values)
        )
        exonChoices <- to(findOverlaps(dummy_exon, exons, type = "start"))
        for(j in exonChoices){
            exon_end = end(exons[j])
            if(exon_end > ss_end){
                if((ss_end - start(exons[j]) + 1)%%3 != 0){
                    frs = "likely"
                }else{frs = "unlikely"}
                ifelse(shift_needed, 
                        return(c("exonTruncation",frs, 
                                    (-1)*(ss_end - start(exons[j]) + 1))), 
                        return(c("exonTruncation",frs)))
            }
        }
        
        # check for single exon skipping
        if(length(exonChoices) == 1){
            
            # check if there is no other exon within the first intron: 
            #    splice site end until exon end
            dummyFirstItr <- GRanges(
                seqnames = toString(seqnames(intron_ranges[max_lap])),
                ranges = IRanges(start(junctions_gr[i]), 
                                    start(exons[exonChoices[1]]) - 1),
                strand = toString(strand(intron_ranges[max_lap]))
            )
            
            if(length(findOverlaps(exons, dummyFirstItr, 
                                    type = "within")) > 0){
                # another exon is contained within the most freq used intron
                ifelse(shift_needed, 
                        return(c("exonSkipping", "inconclusive", 0)), 
                        return(c("exonSkipping", "inconclusive")))
            }
            
            
            secondItr <- GRanges(
                seqnames = toString(intron_ranges[max_lap]@seqnames@values),
                strand = toString(intron_ranges[max_lap]@strand@values),
                ranges = IRanges(end(exons[exonChoices[1]]) + 1, ss_end) 
                # end of exon + 1, end of aberrant junction
            )
            secItrChoices <- to(findOverlaps(secondItr, intron_ranges, 
                                                type = "start"))
            # only look at most used one
            expre <- sapply(secItrChoices, function(j){
                elementMetadata(intron_ranges[j])$medianCount
            })
            maxExpr <- which.max(expre)
            
            if(length(secItrChoices) == 0){
                ifelse(shift_needed, 
                        return(c("exonSkipping", "inconclusive", 0)), 
                        return(c("exonSkipping", "inconclusive")))
            }
            
            if(ss_end <= end(intron_ranges[secItrChoices[maxExpr]])){
                # check if there is no other exon in that range
                if(length(findOverlaps(exons, 
                                        intron_ranges[secItrChoices[maxExpr]], 
                                        type = "within")) == 0){
                    # clear exon skipping, only exon is skipped 
                    # calculate frameshift, skipped exon plus possible exon 
                    #    elongation at end
                    shift = (-1)*(end(exons[exonChoices[1]]) - 
                                        start(exons[exonChoices[1]]) + 1) + 
                        end(intron_ranges[secItrChoices[maxExpr]]) - ss_end
                    frs = ifelse(shift%%3 == 0,"unlikely","likely")
                    ifelse(shift_needed, 
                            return(c("singleExonSkipping", "inconclusive", 
                                        shift)), 
                            return(c("singleExonSkipping", frs)))
                }
            }
        } # single exon skipping end
        
        
    }
    
    # splice site longer than one intron + exon -> not defined for now
    ifelse(shift_needed, 
            return(c("exonSkipping", "inconclusive", 0)), 
            return(c("exonSkipping", "inconclusive")))
}

#' 
#' @noRd
checkIntergenic <- function(junctions_gr, i, refseq.genes){
    # check if start > 1000
    # start - 1000, end + 1000
    start = start(junctions_gr[i]) 
    # ifelse(start > 1000, start = start - 1000, start = 1)
    # if(start > 1000){
    #     start = start - 1000
    # }else{start = 1}
    
    end = end(junctions_gr[i])  #+ 1000
    if(start + 2 < end){
        start = start + 1
        end = end - 1
    }
    
    test_junction <- GRanges(
        seqnames = seqnames(junctions_gr[i]),
        ranges = IRanges(start, end),
        strand = strand(junctions_gr[i])
    )
    
    # overlap with introns and exon
    # IGNORE STRANDS? -> decided its not necessary
    
    # check if distance to nearest is > 1000 -> intergenic
    # otherwise up/downstream
    dist = min(distance(test_junction, refseq.genes), na.rm = TRUE)
    if(dist > 0){
        # find nearest and compare starts
        if(start(refseq.genes[nearest(junctions_gr[i], 
                                        refseq.genes)]) > start){
            ifelse(strand(junctions_gr[i]) == "+", 
                    return(c("upstreamOfNearestGene", "unlikely")), 
                    return(c("downstreamOfNearestGene", "unlikely")))
        }else{
            ifelse(strand(junctions_gr[i]) == "+", 
                    return(c("downstreamOfNearestGene", "unlikely")), 
                    return(c("upstreamOfNearestGene", "unlikely")))
        }
    }
    return(c("complex", "inconclusive"))
}

#' 
#' @noRd
checkExonSkipping <- function(junctions_dt, txdb){
    psi_positions <- which(junctions_dt$type != "theta")
    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
                                                keep.extra.columns = TRUE)
    seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
    
    refseq.genes<- genes(txdb)
    
    exonSkip <- which(junctions_dt[psi_positions]$potentialImpact %in% 
                            c("exonSkipping", "singleExonSkipping"))
    
    message("start checking exonSkipping")
    newSkip_results <- sapply(exonSkip, function(i){
        start = start(junctions_gr[i]) 
        end = end(junctions_gr[i])
        
        # reduce the junction width so adjacent genes have a distance of 1
        if(start + 2 < end){
            start = start + 1
            end = end - 1
        }
        
        test_start <- GRanges(
            seqnames = seqnames(junctions_gr[i]),
            strand = strand(junctions_gr[i]),
            ranges = IRanges(start, start + 1)
        )
        
        test_end <- GRanges(
            seqnames = seqnames(junctions_gr[i]),
            strand = strand(junctions_gr[i]),
            ranges = IRanges(end - 1, end)
        )
        
        # check for which genes distance to start is 0
        start_genes <- which(distance(test_start, refseq.genes) == 0)
        # start is not in a gene
        if(length(start_genes) == 0) return("splicingBeyondGene")
        
        # start is in a gene -> is end in same gene
        for(to in start_genes){
            # end is in same gene
            if(distance(test_end, refseq.genes[to]) == 0){
                return("exonSkipping")
            }
        }
        
        end_genes <- which(distance(test_end, refseq.genes) == 0)
        # end is not in a gene
        if(length(end_genes) == 0) return("splicingBeyondGene")
        # end is in a different gene
        return("multigenicSplicing")
    })
    
    # checking exonSkipping done
    if(length(exonSkip) > 0){
        junctions_dt[psi_positions[exonSkip], 
                        potentialImpact2 := newSkip_results]
        junctions_dt[potentialImpact2 == "splicingBeyondGene", 
                        potentialImpact := "splicingBeyondGene"]
        junctions_dt[potentialImpact2 == "splicingBeyondGene", 
                        causesFrameshift := "inconclusive"]
        junctions_dt[potentialImpact2 == "multigenicSplicing", 
                        potentialImpact := "multigenicSplicing"]
        junctions_dt[potentialImpact2 == "multigenicSplicing", 
                        causesFrameshift := "inconclusive"]
        junctions_dt[, potentialImpact2 := NULL]
    }
    
    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
    return(junctions_dt)
}

#' 
#' @noRd
checkInconclusive <- function(junctions_dt, txdb){
    psi_positions <- which(junctions_dt$type != "theta")
    colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2"
    junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], 
                                                keep.extra.columns = TRUE)
    seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr)
    
    refseq.genes<- genes(txdb)
    
    inconclusive <- which(junctions_dt[psi_positions
                                        ]$potentialImpact == "complex")
    
    inconclusive_results <- sapply(inconclusive, function(i){
        start = start(junctions_gr[i]) 
        end = end(junctions_gr[i])
        
        # reduce the junction width so adjacent genes have a distance of 1
        if(start + 2 < end){
            start = start + 1
            end = end - 1
        }
        
        test_start <- GRanges(
            seqnames = seqnames(junctions_gr[i]),
            strand = strand(junctions_gr[i]),
            ranges = IRanges(start, start + 1)
        )
        
        test_end <- GRanges(
            seqnames = seqnames(junctions_gr[i]),
            strand = strand(junctions_gr[i]),
            ranges = IRanges(end - 1, end)
        )
        
        # check for which genes distance to start is 0
        start_genes <- which(distance(test_start, refseq.genes) == 0)
        # start is not in a gene
        if(length(start_genes) == 0) return("splicingBeyondGene")
        
        # start is in a gene -> is end in same gene
        for(to in start_genes){
            # end is in same gene
            if(distance(test_end, refseq.genes[to]) == 0){
                return("complex")
            } 
        }
        
        end_genes <- which(distance(test_end, refseq.genes) == 0)
        # end is not in a gene
        if(length(end_genes) == 0) return("splicingBeyondGene")
        # end is in a different gene
        return("multigenicSplicing")
    })
    
    colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND"

    if(length(inconclusive) > 0){
        junctions_dt[psi_positions[inconclusive], 
                        potentialImpact := inconclusive_results]
    } 
    
    return(junctions_dt)
}
c-mertes/FraseR documentation built on April 25, 2024, 9:44 p.m.