R/UTR2UTR53.R

Defines functions UTR2UTR53

Documented in UTR2UTR53

#' Annotate UTRs from Gencode GTF as 5' or 3'
#'
#' Annotate UTRs from Gencode GTF as 5' or 3'
#' @param gtf GRanges object of the GTF
#' @return gtf annotation GRanges object
#' @export
#' @family gtf manipulation
#' @import GenomicRanges
#' @importFrom rtracklayer import
#' @examples
#' gtfFile <- system.file("extdata", "gencode.vM25.small.gtf",
#'     package = "GeneStructureTools"
#' )
#' gtf <- rtracklayer::import(gtfFile)
#' gtf <- UTR2UTR53(gtf)
#' table(gtf$type)
#' @author Beth Signal
UTR2UTR53 <- function(gtf) {

    # transcripts with CDS/UTR annotated
    gtf.cdsutr <- gtf[gtf@elementMetadata$type %in% c("UTR", "CDS")]
    gtf.cdsutr <- gtf.cdsutr[order(
        gtf.cdsutr@elementMetadata$transcript_id,
        GenomicRanges::start(
            GenomicRanges::ranges(gtf.cdsutr)
        )
    )]

    # UTRs
    UTRTranscripts <- which(gtf.cdsutr@elementMetadata$type == "UTR")
    id <- gtf.cdsutr@elementMetadata$transcript_id[UTRTranscripts]

    # 5'UTRs (+)
    pos5 <- which(gtf.cdsutr@elementMetadata$transcript_id[UTRTranscripts] ==
        gtf.cdsutr@elementMetadata$transcript_id[
            UTRTranscripts + 1
        ] &
        as.character(gtf.cdsutr@strand[UTRTranscripts]) == "+" &
        gtf.cdsutr@elementMetadata$type[UTRTranscripts + 1] %in%
            c("CDS", "exon"))
    # 3'UTRs (+)
    pos3 <- which(gtf.cdsutr@elementMetadata$transcript_id[UTRTranscripts[-1]] ==
        gtf.cdsutr@elementMetadata$transcript_id[
            UTRTranscripts[-1] - 1
        ] &
        as.character(gtf.cdsutr@strand[UTRTranscripts[-1]]) == "+" &
        gtf.cdsutr@elementMetadata$type[UTRTranscripts[-1] - 1] %in%
            c("CDS", "exon")) + 1

    # 3'UTRs (-)
    neg3 <- which(gtf.cdsutr@elementMetadata$transcript_id[UTRTranscripts] ==
        gtf.cdsutr@elementMetadata$transcript_id[
            UTRTranscripts + 1
        ] &
        as.character(gtf.cdsutr@strand[UTRTranscripts]) == "-" &
        gtf.cdsutr@elementMetadata$type[UTRTranscripts + 1] %in%
            c("CDS", "exon"))
    # 5'UTRs (-)
    neg5 <- which(gtf.cdsutr@elementMetadata$transcript_id[
        UTRTranscripts[-1]
    ] ==
        gtf.cdsutr@elementMetadata$transcript_id[
            UTRTranscripts[-1] - 1
        ] &
        as.character(gtf.cdsutr@strand[
            UTRTranscripts[-1]
        ]) == "-" &
        gtf.cdsutr@elementMetadata$type[
            UTRTranscripts[-1] - 1
        ] %in%
            c("CDS", "exon")) + 1

    # new type var
    gtf.cdsutr@elementMetadata$type2 <-
        as.character(gtf.cdsutr@elementMetadata$type)
    gtf.cdsutr@elementMetadata$type2[UTRTranscripts][pos5] <- "UTR5"
    gtf.cdsutr@elementMetadata$type2[UTRTranscripts][neg5] <- "UTR5"
    gtf.cdsutr@elementMetadata$type2[UTRTranscripts][pos3] <- "UTR3"
    gtf.cdsutr@elementMetadata$type2[UTRTranscripts][neg3] <- "UTR3"

    while (any(gtf.cdsutr@elementMetadata$type2[UTRTranscripts] == "UTR")) {
        UTRTranscripts <- which(gtf.cdsutr@elementMetadata$type2 == "UTR")

        # 5'UTRs (+)
        pos5 <- which(gtf.cdsutr@elementMetadata$transcript_id[
            UTRTranscripts
        ] ==
            gtf.cdsutr@elementMetadata$transcript_id[
                UTRTranscripts + 1
            ] &
            as.character(gtf.cdsutr@strand[
                UTRTranscripts
            ]) == "+" &
            gtf.cdsutr@elementMetadata$type2[
                UTRTranscripts + 1
            ] %in%
                c("CDS", "exon", "UTR5"))
        # 3'UTRs (+)
        pos3 <- which(gtf.cdsutr@elementMetadata$transcript_id[
            UTRTranscripts[-1]
        ] ==
            gtf.cdsutr@elementMetadata$transcript_id[
                UTRTranscripts[-1] - 1
            ] &
            as.character(gtf.cdsutr@strand[
                UTRTranscripts[-1]
            ]) == "+" &
            gtf.cdsutr@elementMetadata$type2[
                UTRTranscripts[-1] - 1
            ] %in%
                c("CDS", "exon", "UTR3")) + 1

        # 3'UTRs (-)
        neg3 <- which(gtf.cdsutr@elementMetadata$transcript_id[
            UTRTranscripts
        ] ==
            gtf.cdsutr@elementMetadata$transcript_id[
                UTRTranscripts + 1
            ] &
            as.character(gtf.cdsutr@strand[
                UTRTranscripts
            ]) == "-" &
            gtf.cdsutr@elementMetadata$type2[
                UTRTranscripts + 1
            ] %in%
                c("CDS", "exon", "UTR3"))
        # 5'UTRs (-)
        neg5 <- which(gtf.cdsutr@elementMetadata$transcript_id[
            UTRTranscripts[-1]
        ] ==
            gtf.cdsutr@elementMetadata$transcript_id[
                UTRTranscripts[-1] - 1
            ] &
            as.character(gtf.cdsutr@strand[
                UTRTranscripts[-1]
            ]) == "-" &
            gtf.cdsutr@elementMetadata$type2[
                UTRTranscripts[-1] - 1
            ] %in%
                c("CDS", "exon", "UTR5")) + 1

        gtf.cdsutr@elementMetadata$type2[UTRTranscripts][pos5] <- "UTR5"
        gtf.cdsutr@elementMetadata$type2[UTRTranscripts][neg5] <- "UTR5"
        gtf.cdsutr@elementMetadata$type2[UTRTranscripts][pos3] <- "UTR3"
        gtf.cdsutr@elementMetadata$type2[UTRTranscripts][neg3] <- "UTR3"

        if (all(gtf.cdsutr@elementMetadata$type2[UTRTranscripts] == "UTR")) {
            gtf.cdsutr@elementMetadata$type2[UTRTranscripts] <- "UTR_NA"
        } else {
            UTRTranscripts <- which(gtf.cdsutr@elementMetadata$type2 == "UTR")
        }
    }

    gtf.cdsutrDF <- as.data.frame(gtf.cdsutr)

    gtf.cdsutrNames <- paste(gtf.cdsutrDF$seqnames,
        gtf.cdsutrDF$start,
        gtf.cdsutrDF$end,
        gtf.cdsutrDF$transcript_id,
        sep = "_"
    )
    gtfDF <- as.data.frame(gtf)
    gtfNames <- paste(gtfDF$seqnames, gtfDF$start, gtfDF$end,
        gtfDF$transcript_id,
        sep = "_"
    )

    m5 <- which(gtfNames %in% gtf.cdsutrNames[
        gtf.cdsutr@elementMetadata$type2 == "UTR5"
    ])
    m3 <- which(gtfNames %in% gtf.cdsutrNames[
        gtf.cdsutr@elementMetadata$type2 == "UTR3"
    ])

    gtf@elementMetadata$type <- as.character(gtf@elementMetadata$type)

    gtf@elementMetadata$type[m5] <- "UTR5"
    gtf@elementMetadata$type[m3] <- "UTR3"

    return(gtf)
}
betsig/GeneStructureTools documentation built on March 31, 2021, 4:43 a.m.