R/extend_gtf.R

Defines functions extend_gtf new_internal_transcript new_right_terminal_transcript new_left_terminal_transcript create_tr_entry

Documented in create_tr_entry extend_gtf new_internal_transcript new_left_terminal_transcript new_right_terminal_transcript

#' Create a new GTF entry for a novel terminal exon
#'
#' This function creates new GTF entries for a novel terminal exon and all
#' transcripts in which it is located. As input, the function takes a list of
#' transcript IDs and the GTF annotations of all exons from the transcripts. A
#' random exon from each transcript is copied and the start and end coordinates
#' are exchanged with those of the novel exon. The ranges of each transcript are
#' adjusted to include the novel exon. The `exon_number` and `exon_version` are
#' set to NA. The new `exon_id`, `transcript_id` and `transcript_name` are the
#' original values with a suffix that identifies the novel exon by its ID: e.g.
#' `exon_id`_ID or `transcript_name`_ID.
#'
#' @param start Integer scalar. Start of the novel exon.
#' @param end Integer scalar. End of the novel exon.
#' @param pred_id Integer scalar. ID of the predicted exon.
#' @param tr_ids Character vector. Transcript IDs of all transcripts that
#'   contain the novel exon.
#' @param exon_copy GRanges object. All exons from the transcripts that contain
#'   the novel exon.
#' @param tran GRanges object with all transcript annotations from the organism.
#' 
#' @return GRanges object with the exon and transcript annotation entries of the
#'   novel exon and all exons from the transcripts in tr_ids.
#' @export
#'
create_tr_entry <- function(start, end, pred_id, tr_ids, exon_copy,
                            tran){
  ## create entry for new exon
  new_exon <- exon_copy[match(tr_ids, mcols(exon_copy)$transcript_id)]
  ranges(new_exon) <- IRanges(start, end)
  ## remove the exon number and version and update the exon_id
  mcols(new_exon)[c("exon_number", "exon_version")] <- NA
  mcols(new_exon)$"exon_id" <- paste0(mcols(new_exon)$"exon_id", "_", pred_id)

  tran <- tran[which(mcols(tran)$transcript_id %in% tr_ids)]
  ## compute the range of each transcript after adding the novel exon
  new_tr <- c(exon_copy, new_exon)
  tr_ranges <- lapply(split(new_tr, mcols(new_tr)$transcript_id), range)
  tr_ranges <- ranges(unlist(GRangesList(tr_ranges)))
  ## update the transcript ranges
  ranges(tran[match(names(tr_ranges), mcols(tran)$transcript_id)]) <- tr_ranges

  ## change the transcript_id and transcript_name of the trs and all their exons
  new_tr <- c(new_tr, tran)
  mcols(new_tr)$"transcript_id" <- paste0(mcols(new_tr)$"transcript_id",
                                          "_", pred_id)
  mcols(new_tr)$"transcript_name" <- paste0(mcols(new_tr)$"transcript_name",
                                            "_", pred_id)
  new_tr
}



#' Create new GTF annotation for a novel left terminal exon
#' 
#' Given a novel exon, the function identifies the IDs of all transcripts that
#' contain the downstream neighbouring exon (exon start is `rstart`) of the
#' novel exon. Each of the transcripts is copied and an entry for the the novel
#' exon is created. The `exon_number` and `exon_version` are set to NA. The new
#' `exon_id`, `transcript_id` and `transcript_name` are the original values with
#' a suffix that identifies the novel exon by its ID: e.g. `exon_id`_ID or
#' `transcript_name`_ID.
#'
#' @param seqn Character string or factor. seqname of the novel exon.
#' @param start Integer scalar. Start of the novel exon.
#' @param end Integer scalar. End of the novel exon.
#' @param rstart Integer scalar. Start of the downstream exon.
#' @param strand Character string or factor. Strand of the novel exon (either
#'   "+" or "-").
#' @param pred_id Integer scalar. ID of the novel exon.
#' @param exons GRanges object. Exon annotations from a GTF file.
#' @param tran GRanges object. Transcript annotations from a GTF file.
#' 
#' @return GRanges object with GTF annotations of the novel exon and all its
#'   transcripts
#'   
#' @export
#' 
new_left_terminal_transcript <- function(seqn, start, end, rstart, strand, 
                                         pred_id, exons, tran) {
  ## find all exons that start in rstart
  tr <- unique(mcols(subsetByOverlaps(exons, 
                                      GRanges(seqnames = seqn,
                                              ranges = IRanges(rstart, rstart),
                                              strand = strand),
                                      type = "start",
                                      ignore.strand = FALSE))$transcript_id)
  if (length(tr) > 0) {
    ## Are there any transcrips with exons upstream of rstart?
    exons <- exons[mcols(exons)$transcript_id %in% tr]
    olap_tr <- unique(exons[start(exons) < rstart]$transcript_id)
    if (length(olap_tr) == length(tr)) {
      exon_copy <- exons[which(mcols(exons)$transcript_id %in% tr)]
      ## remove all exons upstream of the neighbouring exon
      exon_copy <- exon_copy[!start(exon_copy) < rstart]
      create_tr_entry(start, end, pred_id, tr, exon_copy, tran)
    } else {
      tr <- tr[!tr %in% olap_tr]
      exon_copy <- exons[which(mcols(exons)$transcript_id %in% tr)]
      ## all transcripts start with the neighbouring exon
      create_tr_entry(start, end, pred_id, tr, exon_copy, tran)
    }
  }
}



#' Create new GTF annotation for a novel right terminal exon
#' 
#' Given a novel exon, the function identifies the IDs of all transcripts that
#' contain the upstream neighbouring exon (exon end is `lend`) of the
#' novel exon. Each of the transcripts is copied and an entry for the the novel
#' exon is created. The `exon_number` and `exon_version` are set to NA. The new
#' `exon_id`, `transcript_id` and `transcript_name` are the original values with
#' a suffix that identifies the novel exon by its ID: e.g. `exon_id`_ID or
#' `transcript_name`_ID.
#'
#' @param seqn Character string or factor. seqname of the novel exon.
#' @param lend Integer scalar. End of the upstream exon.
#' @param start Integer scalar. Start of the novel exon.
#' @param end Integer scalar. End of the novel exon.
#' @param strand Character string or factor. Strand of the novel exon (either
#'   "+" or "-").
#' @param pred_id Integer scalar. ID of the novel exon.
#' @param exons GRanges object. Exon annotations from a GTF file.
#' @param tran GRanges object. Transcript annotations from a GTF file.
#' 
#' @return GRanges object with GTF annotations of the novel exon and all its
#'   transcripts
#'   
#' @export
#'
new_right_terminal_transcript <- function(seqn, lend, start, end, strand, 
                                          pred_id, exons, tran) {
  ## find all exons that end in lend
  tr <- unique(mcols(subsetByOverlaps(exons, 
                                      GRanges(seqnames = seqn,
                                              ranges = IRanges(lend, lend),
                                              strand = strand),
                                      type = "end",
                                      ignore.strand = FALSE))$transcript_id)
  if (length(tr) > 0) {
    ## Are there any transcrips with exons downtream of lend?
    exons <- exons[mcols(exons)$transcript_id %in% tr]
    olap_tr <- unique(exons[start(exons) > lend]$transcript_id)
    
    if (length(olap_tr) == length(tr)) {  ## all tr have overlapping exons
      exon_copy <- exons[which(mcols(exons)$transcript_id %in% tr)]
      ## remove all downstream exons of the neighbouring exon
      exon_copy <- exon_copy[!end(exon_copy) > lend]
      create_tr_entry(start, end, pred_id, tr, exon_copy, tran)
    } else {
      tr <- tr[!tr %in% olap_tr]
      exon_copy <- exons[which(mcols(exons)$transcript_id %in% tr)]
      ## all transcripts end in the neighbouring exon
      create_tr_entry(start, end, pred_id, tr, exon_copy, tran)
    }
  }
}


#' Create new GTF annotation for a novel internal exon
#' 
#' Given a novel exon, the function identifies the IDs of all transcripts that
#' contain the upstream and downstream neighbouring exons (exon end is `lend`
#' and exon start is `rstart`) of the novel exon. Each of the transcripts is
#' copied and an entry for the the novel exon is created. The `exon_number` and
#' `exon_version` are set to NA. The new `exon_id`, `transcript_id` and
#' `transcript_name` are the original values with a suffix that identifies the
#' novel exon by its ID: e.g. `exon_id`_ID or
#' `transcript_name`_ID.
#'
#' @param seqn Character string or factor. seqname of the novel exon.
#' @param lend Integer scalar. End of the upstream exon.
#' @param start Integer scalar. Start of the novel exon.
#' @param end Integer scalar. End of the novel exon.
#' @param rstart Integer scalar. Start of the downstream exon.
#' @param strand Character string or factor. Strand of the novel exon (either
#'   "+" or "-").
#' @param pred_id Integer scalar. ID of the novel exon.
#' @param exons GRanges object. Exon annotations from a GTF file.
#' @param tran GRanges object. Transcript annotations from a GTF file.
#' 
#' @return GRanges object with GTF annotations of the novel exon and all its
#'   transcripts
#'   
#' @export
#' 
new_internal_transcript <- function(seqn, lend, start, end, rstart, strand, 
                                    pred_id, exons, tran) {
  ## ids of all transcripts with exons that end in lend and start in rstart
  id1 <- mcols(subsetByOverlaps(exons,
                                GRanges(seqnames = seqn,
                                        ranges = IRanges(lend, lend),
                                        strand = strand),
                                type = "end",
                                ignore.strand = FALSE))$transcript_id
  id2 <- mcols(subsetByOverlaps(exons,
                                GRanges(seqnames = seqn,
                                        ranges = IRanges(rstart, rstart),
                                        strand = strand),
                                type = "start",
                                ignore.strand = FALSE))$transcript_id
  tr <- intersect(id1, id2)
  if (length(tr) > 0 ) {
    ## we need a transcript that has an intron from lend to rstart
    ## Are there any transcripts with an exon in the intron?
    olap_tr <- unique(mcols(
      subsetByOverlaps(exons[mcols(exons)$transcript_id %in% tr],
                       GRanges(seqnames = seqn, 
                               ranges = IRanges(lend + 1, rstart - 1),
                               strand = strand),
                       ignore.strand = FALSE)
      )$transcript_id)
    
    if (length(olap_tr) == length(tr)) { 
      ## all transcripts have an exon in the intron
      exon_copy <- exons[which(mcols(exons)$transcript_id %in% tr)]
      ## all exons inside the intron
      olap_exons <- subsetByOverlaps(exon_copy,
                                     GRanges(seqnames = seqn,
                                             ranges = IRanges(lend + 1,
                                                              rstart - 1),
                                             strand = strand),
                                     ignore.strand = FALSE)
      ## take a random exon per transcript and replace its coordinates
      new_exon <- olap_exons[match(tr, mcols(olap_exons)$transcript_id)]
      ranges(new_exon) <- IRanges(start, end)
      ## remove the exon number and version and update the exon_id
      mcols(new_exon)[c("exon_number", "exon_version")] <- NA
      mcols(new_exon)$"exon_id" <- paste0(mcols(new_exon)$"exon_id", "_", 
                                          pred_id)
      ## create new transcript entries and change the transcript_id and
      ## transcript_name of the transcripts and all their exons
      new_tr <- c(exon_copy, new_exon, 
                  tran[which(mcols(tran)$transcript_id %in% tr)])
      mcols(new_tr)$"transcript_id" <- paste0(mcols(new_tr)$"transcript_id",
                                              "_", pred_id)
      mcols(new_tr)$"transcript_name" <- paste0(mcols(new_tr)$"transcript_name",
                                                "_", pred_id)
      new_tr 
    } else { ## all transcripts with no exon in the intron
      tr <- tr[!tr %in% olap_tr]
      exon_copy <- exons[which(mcols(exons)$transcript_id %in% tr)]
      ## take a random exon per transcript
      new_exon <- exon_copy[match(tr, mcols(exon_copy)$transcript_id)]
      ranges(new_exon) <- IRanges(start, end)
      ## remove the exon number and version and update the exon_id
      mcols(new_exon)[c("exon_number", "exon_version")] <- NA
      mcols(new_exon)$"exon_id" <- paste0(mcols(new_exon)$"exon_id", "_", 
                                          pred_id)
      ## create new transcript entries and change the transcript_id and
      ## transcript_name of the transcripts and all their exons
      new_tr <- c(exon_copy, new_exon, 
                  tran[which(mcols(tran)$transcript_id %in% tr)])
      mcols(new_tr)$"transcript_id" <- paste0(mcols(new_tr)$"transcript_id",
                                              "_", pred_id)
      mcols(new_tr)$"transcript_name" <- paste0(mcols(new_tr)$"transcript_name",
                                                "_", pred_id)
      new_tr 
    }
  }
}





#' Extend GTF file with predicted exons
#'
#' Add predicted novel exons to the correct transcripts in a GTF annotation.
#'
#' For each novel exon in `pred` (as returned by [find_novel_exons()]), the list
#' of transcripts that contain its up- and/or downstream exons is determined. A
#' random exon from each transcript is copied and the start and end coordinates
#' are exchanged with those of the novel exon. The `exon_number` and
#' `exon_version` are set to NA. The new `exon_id`, `transcript_id` and
#' `transcript_name` are the original values with a suffix that identifies the
#' novel exon by its ID: e.g. `exon_id`_ID or `transcript_name`_ID.
#'
#' @param gtf GTF annotations, either the path to the GTF file as a character
#'   string or a GRanges object.
#' @param pred data.frame with predicted novel exons as returned by
#'   [find_novel_exons()].
#' @param cores Integer scalar. Number of cores to use. Default 1.
#'
#' @return GRanges object with annotations from the GTF file and extended with
#'   the novel xons.
#' @importFrom rtracklayer import
#' @export
#'
#' @examples
#' gtf <- system.file("extdata", "selected.gtf", package = "DISCERNS",
#'                    mustWork = TRUE)
#'
#' ## Here we artificially create a data.frame with predicted exons.
#' ## In general, the novel exons will be predicted with the function
#' ## find_novel_exons()
#' novel_exons <- data.frame(seqnames = c("19", "22"),
#'                           start = c(47228064,41737092),
#'                           end = c(47228185, 41737150), strand = c("-", "+"),
#'                           lend = c(47226541, 41736141),
#'                           rstart = c(47228589, 41738533), ID = c(1, 2))
#'
#' ## add the predicted exons to the GTF file
#' new_gtf <- extend_gtf(gtf, novel_exons)
#'
#' ## save the new GTF to file with the export() function from rtracklayer
#' library(rtracklayer)
#' export(object = new_gtf, con = "extended_annotation.gtf")
extend_gtf <- function(gtf, pred, cores = 1) {
  if (is.character(gtf)) {
    gtf <- import(gtf)
  }
  exons <- gtf[mcols(gtf)$type == "exon", ]
  tran <- gtf[mcols(gtf)$type == "transcript", ]

  ## Seperate the cassette exons from the terminal exon, because we need to
  ## adjust the transcript range for terminal exons but not for internal exons.
  type <- ifelse(is.na(pred$lend), "left_terminal", 
                 ifelse(is.na(pred$rstart), "right_terminal", "internal"))
  pred_split <- split(pred, type)

  if ("left_terminal" %in% names(pred_split)) {
    p <- pred_split[["left_terminal"]]
    novel_entries <- as(mcmapply(new_left_terminal_transcript, p$seqnames, 
                                 p$start, p$end, p$rstart, p$strand, p$ID,
                                 MoreArgs = list(exons = exons, tran = tran), 
                                 mc.cores = cores, SIMPLIFY = FALSE),
                        "GRangesList")
    gtf <- c(gtf, unlist(novel_entries))
    
  }
  if ("right_terminal" %in% names(pred_split)) {
    p <- pred_split[["right_terminal"]]
    novel_entries <- as(mcmapply(new_right_terminal_transcript, p$seqnames, 
                                 p$lend, p$start, p$end, p$strand, p$ID, 
                                 MoreArgs = list(exons = exons, tran = tran),
                                 mc.cores = cores, SIMPLIFY = FALSE),
                        "GRangesList")
    gtf <- c(gtf, unlist(novel_entries))
  }
  if ("internal" %in% names(pred_split)) {
    p <- pred_split[["internal"]]
    novel_entries <- as(mcmapply(new_internal_transcript, p$seqnames, p$lend,
                                 p$start, p$end, p$rstart, p$strand, p$ID,
                                 MoreArgs = list(exons = exons, tran = tran), 
                                 mc.cores = cores, SIMPLIFY = FALSE),
                        "GRangesList")
    gtf <- c(gtf, unlist(novel_entries))
  }
  gtf
}
khembach/DISCERNS documentation built on June 23, 2020, 3:35 p.m.