R/AltSplicingJunctionSupported.R

Defines functions removeDuplicateTranscripts replaceJunction findJunctionPairs

Documented in findJunctionPairs removeDuplicateTranscripts replaceJunction

#' Find alternative junctions for Whippet alternative splicing events
#'
#' Find junctions that pair with each end of an AA (alt. acceptor) or
#' AD (alt. donor) whippet range
#' Find junctions that pair with the upsteam/downstream exon of an
#' AF (alt. first exon) or an AL (alt. last exon)
#' @param whippetDataSet whippetDataSet generated from \code{readWhippetDataSet()}
#' @param type type of Whippet event (AA/AD/AF/AL).
#' Note only one event type should be processed at a time.
#' @return GRanges object with alternative junctions.
#' Each event should have a set of X (for which the psi measurement is reported) junctions,
#' and alternative Y junctions.
#' @export
#' @import GenomicRanges
#' @family whippet splicing isoform creation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata", "gencode.vM25.small.gtf", package = "GeneStructureTools"))
#' exons <- gtf[gtf$type == "exon"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' whippetFiles <- system.file("extdata", "whippet_small/",
#'     package = "GeneStructureTools"
#' )
#' wds <- readWhippetDataSet(whippetFiles)
#'
#' wds.altAce <- filterWhippetEvents(wds, eventTypes = "AA", psiDelta = 0.1, probability = 0.8)
#' jncPairs.altAce <- findJunctionPairs(wds.altAce, type = "AA")
#'
#' wds.altDon <- filterWhippetEvents(wds, eventTypes = "AD", psiDelta = 0.1, probability = 0.8)
#' jncPairs.altDon <- findJunctionPairs(wds.altDon, type = "AD")
#'
#' wds.altFirst <- filterWhippetEvents(wds, eventTypes = "AF", psiDelta = 0.1, probability = 0.8)
#' jncPairs.altFirst <- findJunctionPairs(wds.altFirst, type = "AF")
#'
#' wds.altLast <- filterWhippetEvents(wds, eventTypes = "AL", psiDelta = 0.1, probability = 0.8)
#' jncPairs.altLast <- findJunctionPairs(wds.altLast, type = "AL")
findJunctionPairs <- function(whippetDataSet, type = NA) {
    whippetDataSet <- filterWhippetEvents(whippetDataSet,
        probability = 0,
        psiDelta = 0,
        eventTypes = type
    )

    eventCoords <- coordinates(whippetDataSet)
    jncCoords <- junctions(whippetDataSet)

    eventCoords$type <- type
    # search for alternatives to the left or the right?
    eventCoords$search <- "right"
    eventCoords$search[(eventCoords$type == "AA" &
        as.logical(strand(eventCoords) == "+")) |
        (eventCoords$type == "AD" &
            as.logical(strand(eventCoords) == "-")) |
        (eventCoords$type == "AF" &
            as.logical(strand(eventCoords) == "-")) |
        (eventCoords$type == "AL" &
            as.logical(strand(eventCoords) == "+"))] <-
        "left"

    junctionSJA.right <- eventCoords[eventCoords$search == "right"]
    junctionSJA.left <- eventCoords[eventCoords$search == "left"]

    # right if AA&-
    # left if AA&+
    olA.from <- vector()
    if (length(junctionSJA.right) > 0) {
        if (type %in% c("AA", "AD")) {
            start(junctionSJA.right) <- start(junctionSJA.right) - 1
            end(junctionSJA.right) <- start(junctionSJA.right)
        } else {
            start(junctionSJA.right) <- end(junctionSJA.right)
        }
        olA.right <- findOverlaps(junctionSJA.right, jncCoords, type = "start")
        olA.from <- append(
            olA.from,
            as.character(junctionSJA.right$id[olA.right@from])
        )
        junctionsA <- jncCoords[olA.right@to]
    }
    if (length(junctionSJA.left) > 0) {
        end(junctionSJA.left) <- start(junctionSJA.left)
        olA.left <- findOverlaps(junctionSJA.left, jncCoords, type = "end")
        olA.from <- append(
            olA.from,
            as.character(junctionSJA.left$id[olA.left@from])
        )
        if (exists("junctionsA")) {
            junctionsA <- c(junctionsA, jncCoords[olA.left@to])
        } else {
            junctionsA <- jncCoords[olA.left@to]
        }
    }

    if (length(junctionsA) > 0) {
        junctionsA$event_id <- olA.from
        junctionsA$search <- eventCoords$search[match(
            junctionsA$event_id,
            eventCoords$id
        )]
        junctionsA$set <- "A"
    }

    if (type %in% c("AA", "AD")) {
        if (length(junctionsA) > 0) {

            # junction B only required if AA/AD
            junctionSJB.right <- eventCoords[eventCoords$search == "right"]
            junctionSJB.left <- eventCoords[eventCoords$search == "left"]

            # same for B junctions
            olB.from <- vector()
            if (length(junctionSJB.right) > 0) {
                start(junctionSJB.right) <- end(junctionSJB.right)
                olB.right <- findOverlaps(junctionSJB.right, jncCoords,
                    type = "start"
                )
                olB.from <- append(
                    olB.from,
                    as.character(
                        junctionSJB.right$id[olB.right@from]
                    )
                )
                junctionsB <- jncCoords[olB.right@to]
            }
            if (length(junctionSJB.left) > 0) {
                end(junctionSJB.left) <- end(junctionSJB.left) + 1
                start(junctionSJB.left) <- end(junctionSJB.left)
                olB.left <- findOverlaps(junctionSJB.left, jncCoords, type = "end")
                olB.from <- append(
                    olB.from,
                    as.character(junctionSJB.left$id[olB.left@from])
                )
                if (exists("junctionsB")) {
                    junctionsB <- c(junctionsB, jncCoords[olB.left@to])
                } else {
                    junctionsB <- jncCoords[olB.left@to]
                }
            }
            if (length(junctionsB) > 0 & length(junctionsA) > 0) {
                junctionsB$event_id <- olB.from
                junctionsB$search <- eventCoords$search[match(
                    junctionsB$event_id,
                    eventCoords$id
                )]
                junctionsB$set <- "B"
                junctions <- c(junctionsA, junctionsB)
            } else {
                junctions <- NULL
            }
        } else {
            junctions <- NULL
        }
    }

    if (type %in% c("AF", "AL")) {
        if (length(junctionsA) > 0) {
            junctionsA.left <- junctionsA[junctionsA$search == "left"]
            junctionsA.right <- junctionsA[junctionsA$search == "right"]

            if (length(junctionsA.left) > 0) {
                end(junctionsA.left) <- start(junctionsA.left)
                olC.left <- findOverlaps(junctionsA.left, jncCoords, type = "start")
                junctionsC.left <- jncCoords[olC.left@to]
                junctionsC.left$event_id <-
                    junctionsA.left$event_id[olC.left@from]
                junctionsC.left$search <- junctionsA.left$search[olC.left@from]
                ol <- findOverlaps(junctionsC.left, junctionsA, type = "equal")
                if (length(ol) > 0) {
                    junctionsC.left <- junctionsC.left[-ol@from]
                }
                junctionsC <- junctionsC.left
            }
            if (length(junctionsA.right) > 0) {
                start(junctionsA.right) <- end(junctionsA.right)
                olC.right <- findOverlaps(junctionsA.right, jncCoords, type = "end")
                junctionsC.right <- jncCoords[olC.right@to]
                junctionsC.right$event_id <-
                    junctionsA.right$event_id[olC.right@from]
                junctionsC.right$search <- junctionsA.right$search[olC.right@from]
                ol <- findOverlaps(junctionsC.right, junctionsA, type = "equal")
                if (length(ol) > 0) {
                    junctionsC.right <- junctionsC.right[-ol@from]
                }
                if (exists("junctionsC")) {
                    junctionsC <- c(junctionsC, junctionsC.right)
                } else {
                    junctionsC <- junctionsC.right
                }
            }

            if (length(junctionsC) > 0 & length(junctionsA) > 0) {
                junctionsC$set <- "C"
                junctions <- c(junctionsA, junctionsC)
            } else {
                junctions <- NULL
            }
        } else {
            junctions <- NULL
        }
    }

    if (!is.null(junctions)) {
        keep <- which(width(junctions) > 2)

        # replace junction codes
        if (type %in% c("AA", "AD")) {
            junctions$set[which((junctions$set == "A" &
                as.logical(strand(junctions) == "+")) |
                (junctions$set == "B" &
                    as.logical(strand(junctions) == "-")))] <- "X"
            junctions$set[which((junctions$set == "A" &
                as.logical(strand(junctions) == "-")) |
                (junctions$set == "B" &
                    as.logical(strand(junctions) == "+")))] <- "Y"
        }
        if (type %in% c("AF", "AL")) {
            junctions$set[which(junctions$set == "A")] <- "X"
            junctions$set[which(junctions$set == "C")] <- "Y"
        }

        junctions <- junctions[keep]
        return(junctions)
    } else {
        return(NULL)
    }
}

#' Find transcripts containing/overlapping junctions and replace them with alternative junctions
#'
#' @param whippetDataSet whippetDataSet generated from \code{readWhippetDataSet()}
#' @param junctionPairs GRanges object with alternative Whippet junctions.
#' Generated by findJunctionPairs()
#' @param exons GRanges object made from a GTF containing exon coordinates
#' @param type type of Whippet event (AA/AD/AF/AL).
#' Note only one event type should be processed at a time.
#' @return GRanges object with transcripts containing alternative junctions.
#' @export
#' @importFrom rtracklayer import
#' @import GenomicRanges
#' @family whippet splicing isoform creation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata", "gencode.vM25.small.gtf", package = "GeneStructureTools"))
#' exons <- gtf[gtf$type == "exon"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'
#' whippetFiles <- system.file("extdata", "whippet_small/",
#'     package = "GeneStructureTools"
#' )
#' wds <- readWhippetDataSet(whippetFiles)
#'
#' wds.altAce <- filterWhippetEvents(wds, eventTypes = "AA", psiDelta = 0.1, probability = 0.8)
#' jncPairs.altAce <- findJunctionPairs(wds.altAce, type = "AA")
#' transcripts.altAce <- replaceJunction(wds.altAce, jncPairs.altAce, exons, type = "AA")
#'
#' wds.altDon <- filterWhippetEvents(wds, eventTypes = "AD", psiDelta = 0.1, probability = 0.8)
#' jncPairs.altDon <- findJunctionPairs(wds.altDon, type = "AD")
#' transcripts.altDon <- replaceJunction(wds.altDon, jncPairs.altDon, exons, type = "AD")
#'
#' wds.altFirst <- filterWhippetEvents(wds, eventTypes = "AF", psiDelta = 0.1, probability = 0.8)
#' jncPairs.altFirst <- findJunctionPairs(wds.altFirst, type = "AF")
#' transcripts.altFirst <- replaceJunction(wds.altFirst, jncPairs.altFirst, exons, type = "AF")
#'
#' wds.altLast <- filterWhippetEvents(wds, eventTypes = "AL", psiDelta = 0.1, probability = 0.8)
#' jncPairs.altLast <- findJunctionPairs(wds.altLast, type = "AL")
#' transcripts.altLast <- replaceJunction(wds.altLast, jncPairs.altLast, exons, type = "AL")
#'
#'
replaceJunction <- function(whippetDataSet, junctionPairs, exons, type = NA) {
    junctionPairs$type <- type
    range <- junctionPairs

    eventCoords <- coordinates(whippetDataSet)
    eventCoords <- eventCoords[eventCoords$id %in% junctionPairs$event_id]


    if (type %in% c("AA", "AD")) {
        ## find exons that use/overlap the junction - at the side where it's alternative
        end(range)[which(range$search == "right")] <-
            start(range)[which(range$search == "right")]
        start(range)[which(range$search == "left")] <-
            end(range)[which(range$search == "left")]

        ol.junction <- findOverlaps(range, exons)
        ol.junction <- cbind(as.data.frame(ol.junction),
            transcript_id = exons$transcript_id[ol.junction@to]
        )

        ## table of transcripts overlapping the junction
        # tid: transcript id
        tidTable <- as.data.frame(table(
            ol.junction$queryHits,
            ol.junction$transcript_id
        ))
        tidTable <- tidTable[tidTable$Freq > 0, ]
        colnames(tidTable)[c(1, 2)] <- c("from_index", "to_transcript_id")
        tids <- unique(tidTable$to_transcript_id)

        # all combinations of transcripts + junctions
        tidTable <- data.frame(
            from_index = rep(seq_along(junctionPairs),
                each = length(tids)
            ),
            to_transcript_id = rep(
                tids,
                length(junctionPairs)
            ),
            Freq = 1
        )

        tidTable$junction_id <- range$id[tidTable$from_index]
        ## new transcript id:
        ## unique if different junctions are going to be used in same transcript base
        tidTable$new_transcript_id <- paste0(
            tidTable$to_transcript_id, "+AS",
            " ", tidTable$junction_id
        )

        ## all transcripts for structural altercations
        gtfTranscripts <- exons[exons$transcript_id %in% tids]
        mcols(gtfTranscripts) <-
            mcols(gtfTranscripts)[, c(
                "gene_id", "transcript_id",
                "transcript_type", "exon_id",
                "exon_number"
            )]
        m <- match(gtfTranscripts$transcript_id, tidTable$to_transcript_id)
        # add new transcript id
        gtfTranscripts$new_transcript_id <-
            paste0(
                gtfTranscripts$transcript_id, "+AS ",
                range$id[tidTable$from_index[m]]
            )
        gtfTranscripts$new_transcript_id_exnum <-
            paste0(
                gtfTranscripts$new_transcript_id, "_",
                as.numeric(gtfTranscripts$exon_number)
            )

        # duplicate core transcripts if needed
        needsDuplicated <- which(!(tidTable$new_transcript_id %in%
            gtfTranscripts$new_transcript_id))
        if (length(needsDuplicated) > 0) {
            gtfTranscripts.add <- gtfTranscripts[
                gtfTranscripts$transcript_id %in%
                    tidTable$to_transcript_id[needsDuplicated]
            ]
        }
        while (length(needsDuplicated) > 0) {
            gtfTranscripts.add <- gtfTranscripts.add[
                gtfTranscripts.add$transcript_id %in%
                    tidTable$to_transcript_id[needsDuplicated]
            ]
            m <- match(
                gtfTranscripts.add$transcript_id,
                tidTable$to_transcript_id[needsDuplicated]
            )
            gtfTranscripts.add$new_transcript_id <-
                paste0(
                    gtfTranscripts.add$transcript_id, "+AS ",
                    tidTable$junction_id[needsDuplicated][m]
                )
            gtfTranscripts <- c(gtfTranscripts, gtfTranscripts.add)
            needsDuplicated <- which(!(tidTable$new_transcript_id %in%
                gtfTranscripts$new_transcript_id))
        }

        gtfTranscripts$from <- unlist(lapply(str_split(
            gtfTranscripts$new_transcript_id, "AS "
        ), "[[", 2))
        gtfTranscripts <- gtfTranscripts[order(
            gtfTranscripts$transcript_id,
            start(gtfTranscripts)
        )]

        ## alter exons hitting the junctions so they all break at the same place
        # range is at the alt. points defined in eventCoords
        range <- junctionPairs
        start(range) <- min(start(junctionPairs))
        end(range) <- max(start(junctionPairs))

        ol.left <- as.data.frame(findOverlaps(range, gtfTranscripts))
        ol.left$from_id <- range$id[ol.left$queryHits]
        ol.left$to_id <- gtfTranscripts$from[ol.left$subjectHits]
        ol.left <- ol.left[ol.left$from_id == ol.left$to_id, ]

        # fix the end of the left transcript exons
        exons.left <- gtfTranscripts[ol.left$subjectHits]

        keep <- which(start(exons.left) <
            start(junctionPairs[ol.left$queryHits]))
        end(exons.left)[keep] <- start(junctionPairs[ol.left$queryHits])[keep]
        exons.left <- exons.left[keep]

        # now the right side
        range <- junctionPairs

        end(range) <- max(end(junctionPairs))
        start(range) <- min(end(junctionPairs))

        ol.right <- as.data.frame(findOverlaps(range, gtfTranscripts))
        ol.right$from_id <- range$id[ol.right$queryHits]
        ol.right$to_id <- gtfTranscripts$from[ol.right$subjectHits]
        ol.right <- ol.right[ol.right$from_id == ol.right$to_id, ]

        # fix the start of the right exons
        exons.right <- gtfTranscripts[ol.right$subjectHits]

        keep <- which(end(exons.right) > end(junctionPairs[ol.right$queryHits]))
        start(exons.right)[keep] <- end(junctionPairs[ol.right$queryHits])[keep]
        exons.right <- exons.right[keep]

        m <- match(exons.left$new_transcript_id, exons.right$new_transcript_id)
        exons.left <- exons.left[which(!is.na(m))]
        exons.right <- exons.right[m[which(!is.na(m))]]

        exons.glued <- exons.left
        end(exons.glued) <- end(exons.right)

        # replacement exon pairs
        gtfTranscripts.replacement <- c(exons.left, exons.right)

        # remove replaced exons from gtf
        gtfTranscripts.altered <-
            gtfTranscripts[gtfTranscripts$new_transcript_id %in%
                gtfTranscripts.replacement$new_transcript_id]


        ol <- as.data.frame(findOverlaps(exons.glued, gtfTranscripts.altered))
        ol$from_id <- exons.glued$new_transcript_id[ol$queryHits]
        ol$to_id <- gtfTranscripts.altered$new_transcript_id[ol$subjectHits]
        ol <- ol[ol$from_id == ol$to_id, ]
        gtfTranscripts.altered <- gtfTranscripts.altered[-unique(ol$subjectHits)]

        # add together
        gtfTranscripts.altered <- c(
            gtfTranscripts.altered,
            gtfTranscripts.replacement
        )
        gtfTranscripts.altered <- gtfTranscripts.altered[order(
            gtfTranscripts.altered$new_transcript_id,
            start(gtfTranscripts.altered)
        )]

        gtfTranscripts.altered$set <-
            range$set[match(gtfTranscripts.altered$from, range$id)]
        gtfTranscripts.altered$event_id <- junctionPairs$event_id[
            match(gtfTranscripts.altered$from, junctionPairs$id)
        ]
        gtfTranscripts.altered$transcript_id <-
            paste0(
                gtfTranscripts.altered$transcript_id,
                "+AS", type, gtfTranscripts.altered$set, " ",
                gtfTranscripts.altered$event_id
            )
        gtfTranscripts.altered$set <- paste0(
            type, "_",
            gtfTranscripts.altered$set
        )
    } else if (type %in% c("AF", "AL")) {
        end(range)[which(range$search == "right")] <-
            start(range)[which(range$search == "right")]
        start(range)[which(range$search == "left")] <-
            end(range)[which(range$search == "left")]

        olFirstLast.left <- findOverlaps(range, exons, type = "start")
        olFirstLast.right <- findOverlaps(range, exons, type = "end")
        olFirstLast.left <-
            cbind(as.data.frame(olFirstLast.left),
                transcript_id = exons$transcript_id[olFirstLast.left@to]
            )
        olFirstLast.left <- olFirstLast.left[
            which(range$search[olFirstLast.left$queryHits] == "left"),
        ]

        olFirstLast.right <-
            cbind(as.data.frame(olFirstLast.right),
                transcript_id = exons$transcript_id[olFirstLast.right@to]
            )
        olFirstLast.right <- olFirstLast.right[
            which(range$search[olFirstLast.right$queryHits] == "right"),
        ]

        olFirstLast <- rbind(olFirstLast.left, olFirstLast.right)

        olFirstLast$search <- range$search[olFirstLast$queryHits]

        exonsFirstLast <- exons[olFirstLast$subjectHits]
        exonsFirstLast$set <- range$set[olFirstLast$queryHits]
        exonsFirstLast$search <- range$search[olFirstLast$queryHits]

        exonsFirstLast$junction_id <- range$id[olFirstLast$queryHits]
        newId.left <- paste0(
            seqnames(exonsFirstLast), ":",
            start(junctionPairs)[olFirstLast$queryHits], "-",
            end(junctionPairs)[olFirstLast$queryHits], "+",
            end(exonsFirstLast)
        )
        newId.right <- paste0(
            seqnames(exonsFirstLast), ":",
            start(junctionPairs)[olFirstLast$queryHits], "-",
            end(junctionPairs)[olFirstLast$queryHits], "+",
            start(exonsFirstLast)
        )
        exonsFirstLast$new_id <- NA
        exonsFirstLast$new_id[which(exonsFirstLast$search == "left")] <-
            newId.left[which(exonsFirstLast$search == "left")]
        exonsFirstLast$new_id[which(exonsFirstLast$search == "right")] <-
            newId.right[which(exonsFirstLast$search == "right")]

        m <- match(exonsFirstLast$junction_id, junctionPairs$id)
        junctionPairs <- junctionPairs[m]
        range <- junctionPairs
        range$id <- exonsFirstLast$new_id
        end(range)[which(range$search == "left")] <-
            start(range)[which(range$search == "left")]
        start(range)[which(range$search == "right")] <-
            end(range)[which(range$search == "right")]

        ol.junction <- findOverlaps(range, exons)
        ol.junction <- cbind(as.data.frame(ol.junction),
            transcript_id = exons$transcript_id[ol.junction@to]
        )

        ## table of transcripts overlapping the junction
        # tid: transcript id
        tidTable <- as.data.frame(table(
            ol.junction$queryHits,
            ol.junction$transcript_id
        ))
        tidTable <- tidTable[tidTable$Freq > 0, ]
        colnames(tidTable)[c(1, 2)] <- c("from_index", "to_transcript_id")
        tids <- unique(tidTable$to_transcript_id)

        tidTable$junction_id <- range$id[tidTable$from_index]
        ## new transcript id --
        ## unique if different junctions are going to be used in same transcript base
        tidTable$new_transcript_id <- paste0(
            tidTable$to_transcript_id, "+AS ",
            tidTable$junction_id
        )
        ## all transcripts for structural altercations
        gtfTranscripts <- exons[exons$transcript_id %in% tids]
        mcols(gtfTranscripts) <-
            mcols(gtfTranscripts)[, c(
                "gene_id", "transcript_id",
                "transcript_type", "exon_id",
                "exon_number"
            )]
        m <- match(gtfTranscripts$transcript_id, tidTable$to_transcript_id)
        # add new transcript id
        gtfTranscripts$new_transcript_id <-
            paste0(
                gtfTranscripts$transcript_id, "+AS ",
                range$id[tidTable$from_index[m]]
            )
        gtfTranscripts$new_transcript_id_exnum <-
            paste0(
                gtfTranscripts$new_transcript_id,
                "_",
                as.numeric(gtfTranscripts$exon_number)
            )

        # duplicate core transcripts if needed
        needsDuplicated <- which(!(tidTable$new_transcript_id %in%
            gtfTranscripts$new_transcript_id))
        if (length(needsDuplicated) > 0) {
            gtfTranscripts.add <-
                gtfTranscripts[gtfTranscripts$transcript_id %in%
                    tidTable$to_transcript_id[needsDuplicated]]
        }
        while (length(needsDuplicated) > 0) {
            gtfTranscripts.add <- gtfTranscripts.add[
                gtfTranscripts.add$transcript_id %in%
                    tidTable$to_transcript_id[needsDuplicated]
            ]
            m <- match(
                gtfTranscripts.add$transcript_id,
                tidTable$to_transcript_id[needsDuplicated]
            )
            gtfTranscripts.add$new_transcript_id <-
                paste0(
                    gtfTranscripts.add$transcript_id, "+AS ",
                    tidTable$junction_id[needsDuplicated][m]
                )
            gtfTranscripts <- c(gtfTranscripts, gtfTranscripts.add)
            needsDuplicated <- which(!(tidTable$new_transcript_id %in%
                gtfTranscripts$new_transcript_id))
        }

        gtfTranscripts$from <- unlist(lapply(str_split(
            gtfTranscripts$new_transcript_id, "AS "
        ), "[[", 2))
        gtfTranscripts <- gtfTranscripts[order(
            gtfTranscripts$transcript_id,
            start(gtfTranscripts)
        )]

        gtfTranscripts <- gtfTranscripts[gtfTranscripts$transcript_id %in%
            exonsFirstLast$transcript_id]
        gtfTranscripts$new_transcript_id_exnum <-
            paste0(
                gtfTranscripts$new_transcript_id, "_",
                as.numeric(gtfTranscripts$exon_number)
            )

        range <- junctionPairs
        range$id <- exonsFirstLast$new_id

        ## find exons that use/overlap the junction - at the side where it's alternative
        end(range)[which(range$search == "left")] <-
            start(range)[which(range$search == "left")]
        start(range)[which(range$search == "right")] <-
            end(range)[which(range$search == "right")]

        ### Same used junction replacement
        ol.left <- as.data.frame(findOverlaps(range, gtfTranscripts))
        ol.left$from_id <- range$id[ol.left$queryHits]
        ol.left$to_id <- gtfTranscripts$from[ol.left$subjectHits]
        ol.left <- ol.left[ol.left$from_id == ol.left$to_id, ]
        ol.left <- ol.left[which(range$search[ol.left$queryHits] == "left"), ]

        # fix the end of the left transcript exons
        exons.left <- gtfTranscripts[ol.left$subjectHits]
        end(exons.left) <- end(range[ol.left$queryHits])

        ol.right <- as.data.frame(findOverlaps(range, gtfTranscripts))
        ol.right$from_id <- range$id[ol.right$queryHits]
        ol.right$to_id <- gtfTranscripts$from[ol.right$subjectHits]
        ol.right <- ol.right[ol.right$from_id == ol.right$to_id, ]
        ol.right <-
            ol.right[which(range$search[ol.right$queryHits] == "right"), ]

        # fix the end of the right transcript exons
        exons.right <- gtfTranscripts[ol.right$subjectHits]
        start(exons.right) <- start(range[ol.right$queryHits])

        junctionReplacementExons <- c(exons.left, exons.right)
        junctionReplacementExons$set <-
            range$set[match(junctionReplacementExons$from, range$id)]

        keep <- which(gtfTranscripts$new_transcript_id %in%
            junctionReplacementExons$new_transcript_id)
        gtfTranscripts.altered <- gtfTranscripts[keep]
        gtfTranscripts.altered$set <-
            range$set[match(gtfTranscripts.altered$from, range$id)]

        ### First/last exon replacement

        m <- match(junctionReplacementExons$from, exonsFirstLast$new_id)
        replacementExonsFirstLast <- junctionReplacementExons
        ranges(replacementExonsFirstLast) <- ranges(exonsFirstLast[m])


        # remove anything after first/last
        if (type == "AF") {
            back <- 0
            n <- which(gtfTranscripts.altered$new_transcript_id_exnum %in%
                junctionReplacementExons$new_transcript_id_exnum)
            ids <- gtfTranscripts.altered$new_transcript_id[n]
            exonNumbers <- as.numeric(gtfTranscripts.altered$exon_number[n])
            altTidExNum <- paste0(ids, "_", exonNumbers - back)
            m <- match(
                altTidExNum,
                gtfTranscripts.altered$new_transcript_id_exnum
            )
            m <- m[!is.na(m)]
            while (length(m) > 0) {
                gtfTranscripts.altered <- gtfTranscripts.altered[-m]
                back <- back + 1
                altTidExNum <- paste0(ids, "_", exonNumbers - back)
                m <- match(
                    altTidExNum,
                    gtfTranscripts.altered$new_transcript_id_exnum
                )
                m <- m[!is.na(m)]
            }
        }
        if (type == "AL") {
            fwd <- 0
            n <- which(gtfTranscripts.altered$new_transcript_id_exnum %in%
                junctionReplacementExons$new_transcript_id_exnum)
            ids <- gtfTranscripts.altered$new_transcript_id[n]
            exonNumbers <- as.numeric(gtfTranscripts.altered$exon_number[n])
            altTidExNum <- paste0(ids, "_", exonNumbers + fwd)
            m <- match(
                altTidExNum,
                gtfTranscripts.altered$new_transcript_id_exnum
            )
            m <- m[!is.na(m)]
            while (length(m) > 0) {
                gtfTranscripts.altered <- gtfTranscripts.altered[-m]
                fwd <- fwd + 1
                altTidExNum <- paste0(ids, "_", exonNumbers + fwd)
                m <- match(
                    altTidExNum,
                    gtfTranscripts.altered$new_transcript_id_exnum
                )
                m <- m[!is.na(m)]
            }
        }

        # remove overlapping exons
        longRange <- replacementExonsFirstLast
        rangeDF <- data.frame(
            start_1 = start(replacementExonsFirstLast),
            start_2 = start(junctionReplacementExons),
            end_1 = end(replacementExonsFirstLast),
            end_2 = end(junctionReplacementExons)
        )
        start(longRange) <- apply(rangeDF[, c(1, 2)], 1, min)
        end(longRange) <- apply(rangeDF[, c(3, 4)], 1, max)

        ol <- as.data.frame(findOverlaps(longRange, gtfTranscripts.altered))
        ol$from_id <- longRange$new_transcript_id[ol$queryHits]
        ol$to_id <- gtfTranscripts.altered$new_transcript_id[ol$subjectHits]
        ol <- ol[which(ol$from_id == ol$to_id), ]
        if (dim(ol)[1] > 0) {
            gtfTranscripts.altered <-
                (gtfTranscripts.altered[-unique(ol$subjectHits)])
        }
        gtfTranscripts.altered <- c(
            gtfTranscripts.altered,
            junctionReplacementExons,
            replacementExonsFirstLast
        )

        # redo exon numbering
        gtfTranscripts.altered <- gtfTranscripts.altered[
            order(
                gtfTranscripts.altered$new_transcript_id,
                start(gtfTranscripts.altered)
            )
        ]
        tab <- as.data.frame(table(gtfTranscripts.altered$new_transcript_id))
        tab$strand <- as.character(strand(gtfTranscripts.altered[
            match(tab$Var1, gtfTranscripts.altered$new_transcript_id)
        ]))
        gtfTranscripts.altered$exon_number <-
            unlist(apply(tab, 1, function(x) {
                  if (x[3] == "-") {
                      c(x[2]:1)
                  } else {
                      c(1:x[2])
                  }
              }))

        gtfTranscripts.altered <- gtfTranscripts.altered[
            order(
                gtfTranscripts.altered$new_transcript_id,
                start(gtfTranscripts.altered)
            )
        ]

        gtfTranscripts.altered$set <- range$set[
            match(gtfTranscripts.altered$from, range$id)
        ]
        mcols(gtfTranscripts.altered) <- mcols(gtfTranscripts.altered)[
            , match(
                c(
                    "gene_id", "transcript_id",
                    "transcript_type", "exon_id",
                    "exon_number", "from", "set"
                ),
                colnames(mcols(gtfTranscripts.altered))
            )
        ]
        colnames(mcols(gtfTranscripts.altered))[6] <- "new_event_id"
        gtfTranscripts.altered$event_id <- range$event_id[
            match(gtfTranscripts.altered$new_event_id, range$id)
        ]
        gtfTranscripts.altered$transcript_id <- paste0(
            gtfTranscripts.altered$transcript_id,
            "+AS", type, gtfTranscripts.altered$set, " ",
            gtfTranscripts.altered$event_id
        )
        gtfTranscripts.altered$set <- paste0(
            type, "_",
            gtfTranscripts.altered$set
        )
    }

    mcols(gtfTranscripts.altered) <-
        mcols(gtfTranscripts.altered)[, c(
            "gene_id", "transcript_id",
            "transcript_type", "exon_id",
            "exon_number",
            "set", "event_id"
        )]
    gtfTranscripts.altered <- removeDuplicateTranscripts(gtfTranscripts.altered)

    return(gtfTranscripts.altered)
}
#' Remove transcript duplicates
#'
#' Removes Structural duplicates of transcripts in a GRanges object
#' Note that duplicates must have different transcript ids.
#' @param transcripts GRanges object with transcript structures in exon form
#' @return GRanges object with unique transcript structures in exon form
#' @export
#' @import GenomicRanges
#' @importFrom rtracklayer import
#' @family gtf manipulation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata", "gencode.vM25.small.gtf", package = "GeneStructureTools"))
#' exons <- gtf[gtf$type == "exon"]
#' exons.altName <- exons
#' exons.altName$transcript_id <- paste(exons.altName$transcript_id, "duplicated", sep = "_")
#' exons.duplicated <- c(exons, exons.altName)
#' length(exons.duplicated)
#' exons.deduplicated <- removeDuplicateTranscripts(exons.duplicated)
#' length(exons.deduplicated)
removeDuplicateTranscripts <- function(transcripts) {
    transcriptDF <- as.data.frame(transcripts)

    transcriptDF$startend <- with(transcriptDF, paste0(start, "-", end))
    transcriptRangePaste <- aggregate(
        startend ~ transcript_id, transcriptDF,
        function(x) paste0(x, collapse = "+")
    )

    keep <- transcriptRangePaste$transcript_id[
        which(!duplicated(transcriptRangePaste$startend))
    ]
    transcriptsFiltered <- transcripts[transcripts$transcript_id %in% keep]
    return(transcriptsFiltered)
}
betsig/GeneStructureTools documentation built on March 31, 2021, 4:43 a.m.