R/longestORF.R

Defines functions manualNMD getUOrfs cumsumANDpad getOrfs maxLocation

Documented in cumsumANDpad getOrfs getUOrfs maxLocation

#' Find the largest distance between two vectors of numbers
#' Helper function for get_orfs
#' @param startSite vector of start sites - i.e Met amino acid positions
#' @param stopSite vector of stop sites - i.e Stop (*) amino acid positions
#' @param longest which pair to return (1 = longest pair, 2= 2nd longest pair etc.)
#' @return sequential start site and end site with the greatest difference
#' @export
#' @import plyr
#' @family ORF annotation
#' @author Beth Signal
#' @examples
#' starts <- c(1, 10, 15, 25)
#' stops <- c(4, 16, 50, 55)
#' # longest start site = 25, longest stop site = 50
#' maxLocation(starts, stops, longest = 1)
#' starts <- c(1, 10, 15, 25)
#' stops <- c(4, 14, 50, 55)
#' # longest start site = 15, longest stop site = 50
#' maxLocation(starts, stops, longest = 1)
#' # 2nd longest start site = 10, 2nd longest stop site = 14
#' maxLocation(starts, stops, longest = 2)
maxLocation <- function(startSite, stopSite, longest = 1) {
    if (length(startSite) > 0) {
        # make start / stop pairs
        stopPairIndex <-
            unlist(lapply(startSite, function(x) which.max(1 / (stopSite - x))))
        pairs <- data.frame(start = startSite, stop = stopSite[stopPairIndex])
        pairs$len <- pairs$stop - pairs$start
        pairs <- pairs[order(plyr::desc(pairs$len)), ]
        # pairs <- plyr::arrange(pairs, plyr::desc(len))
        pairs <- pairs[!duplicated(pairs$stop), ]
        return(as.numeric(pairs[longest, c(1, 2)]))
    } else {
        return(c(NA, NA))
    }
}

#' Get open reading frames for transcripts
#' @param transcripts GRanges object with ONLY exon annotations
#' (no gene, transcript, CDS etc.) with all transcripts for orf retrevial
#' @param BSgenome BSgenome object
#' @param returnLongestOnly only return longest ORF?
#' @param allFrames return longest ORF for all 3 frames?
#' @param longest return x longest ORFs (regardless of frames)
#' @param exportFasta export a .fa.gz file with nucleotide sequences for each transcript?
#' @param fastaFile file name for .fa.gz export
#' @param uORFs get uORF summaries?
#' @param selectLongest proportion of ORFs for each gene to find uORFs for. Value between 0 and 1.
#' Speeds up uORF calculations but will only return results for the longest ORFs.
#' @return data.frame with longest orf details
#' @export
#' @import GenomicRanges
#' @import Biostrings
#' @import stringr
#' @importFrom plyr arrange
#' @importFrom plyr desc
#' @importFrom stats aggregate
#' @importFrom rtracklayer import
#' @importFrom utils write.table
#' @family ORF annotation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata", "gencode.vM25.small.gtf",
#'     package = "GeneStructureTools"
#' ))
#' transcript <- gtf[gtf$type == "exon" & gtf$gene_name == "Tmem208"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#' # longest ORF for each transcripts
#' orfs <- getOrfs(transcript, BSgenome = g, returnLongestOnly = TRUE)
#' # longest ORF in all 3 frames for each transcript
#' orfs <- getOrfs(transcript, BSgenome = g, allFrames = TRUE)
#' # longest 3 ORFS in eacht transcript
#' orfs <- getOrfs(transcript, BSgenome = g, returnLongestOnly = FALSE, longest = 3)
getOrfs <- function(transcripts,
                    BSgenome = NULL,
                    returnLongestOnly = TRUE,
                    allFrames = FALSE,
                    longest = 1,
                    exportFasta = FALSE,
                    fastaFile = NULL,
                    uORFs = FALSE,
                    selectLongest = 1) {
    if (allFrames == TRUE) {
        returnLongestOnly <- FALSE
        longest <- 1
    }

    GenomeInfoDb::seqlevelsStyle(transcripts) <- GenomeInfoDb::seqlevelsStyle(BSgenome)[1]
    # check -ve dist to junction b calls
    # check exon number ORF start/ends

    transcripts$exon_number <-
        as.numeric(transcripts$exon_number)
    order <-
        order(transcripts$transcript_id, transcripts$exon_number)
    transcripts <- transcripts[order]
    transcripts$seq <-
        as.character(Biostrings::getSeq(BSgenome, transcripts))

    seqCat <-
        aggregate(seq ~ transcript_id, mcols(transcripts), function(x) {
              (paste(x, collapse = ""))
          })
    ids <- as.character(seqCat$transcript_id)

    if (exportFasta & !is.null(fastaFile)) {
        fastaFile <- ifelse(stringr::str_sub(fastaFile, -3, -1) == ".gz",
            fastaFile, paste0(fastaFile, ".gz")
        )
        fa <- seqCat
        fa$transcript_id <- paste0("> ", fa$transcript_id)

        # gzip files to save some space
        gz <- gzfile(fastaFile, "w")
        utils::write.table(fa, gz,
            col.names = FALSE,
            row.names = FALSE, quote = FALSE, sep = "\n"
        )
        close(gz)
    } else if (exportFasta & is.null(fastaFile)) {
        message("skipping writing .fa file")
        message("please specify a file name for export")
    }

    seqCat <- seqCat$seq
    rm <- which(grepl("N", seqCat))

    if (length(rm) > 0) {
        seqCat <- seqCat[-rm]
        removeId <- ids[rm]
        ids <- ids[-rm]
        transcripts <-
            transcripts[-which(transcripts$transcript_id %in% removeId)]
    }

    # 3 frames
    seqCat <-
        c(seqCat, stringr::str_sub(seqCat, 2), stringr::str_sub(seqCat, 3))
    frames <- rep(c(1, 2, 3), each = length(ids))
    ids <- c(ids, ids, ids)

    orf <-
        suppressWarnings(unlist(lapply(seqCat, function(x) {
              as.character(Biostrings::translate(Biostrings::DNAString(x)))
          })))

    orfDF <- data.frame(
        id = ids,
        aa_sequence = orf,
        frame = frames,
        stringsAsFactors = FALSE
    )

    orfDF$seq_length <- nchar(orfDF$aa_sequence)
    orfDF$seq_length_nt <- nchar(seqCat) + orfDF$frame - 1

    startSites <-
        stringr::str_locate_all(orfDF$aa_sequence, "M")
    # add first site as potential start (if no M)
    startSites <-
        lapply(startSites, function(x) {
              if (length(x) == 0) {
                  1
              } else {
                  as.numeric(x[, 2])
              }
          })

    stopSites <- stringr::str_locate_all(orfDF$aa_sequence, "[*]")
    stopSites <-
        mapply(
            function(x, y) {
                  c(as.numeric(x[, 2]), nchar(y))
              },
            stopSites,
            orfDF$aa_sequence
        )

    maxLoc <-
        mapply(function(x, y) {
              maxLocation(x, y)
          }, startSites, stopSites)

    if (longest >= 2 & returnLongestOnly == FALSE) {
        orfDF.longest <- orfDF

        for (i in 2:longest) {
            maxLoc <- cbind(
                maxLoc,
                mapply(
                    function(x, y) {
                          maxLocation(x, y, longest = i)
                      },
                    startSites,
                    stopSites
                )
            )
            orfDF.longest <- rbind(orfDF.longest, orfDF)
        }

        o <- order(maxLoc[2, ] - maxLoc[1, ], decreasing = TRUE)

        orfDF.longest$start_site <- maxLoc[1, ]
        orfDF.longest$stop_site <- maxLoc[2, ]

        orfDF.longest <- orfDF.longest[o, ]
        keep <- which(!duplicated(orfDF.longest$id))
        orfDF <- orfDF.longest[keep, ]
        orfDF.longest <- orfDF.longest[-keep, ]

        for (i in 2:longest) {
            keep <- which(!duplicated(orfDF.longest$id))
            orfDF <- rbind(orfDF, orfDF.longest[keep, ])
            orfDF.longest <- orfDF.longest[-keep, ]
        }
    } else {
        orfDF$start_site <- maxLoc[1, ]
        orfDF$stop_site <- maxLoc[2, ]
    }

    orfDF$orf_sequence <-
        stringr::str_sub(
            orfDF$aa_sequence, orfDF$start_site,
            orfDF$stop_site - 1
        )
    orfDF$orf_length <- nchar(orfDF$orf_sequence)

    if (returnLongestOnly == TRUE) {
        orfDF <- orfDF[rev(order(orfDF$orf_length)), ]
        orfDF <- orfDF[!duplicated(orfDF$id), ]
    }

    orfDF$start_site_nt <-
        (orfDF$start_site * 3) - 3 + orfDF$frame
    orfDF$stop_site_nt <- (orfDF$orf_length * 3) + orfDF$start_site_nt + 3
    orfDF$utr3_length <-
        (orfDF$seq_length_nt - orfDF$stop_site_nt) + 1

    widths <- data.frame(
        w = width(transcripts),
        id = transcripts$transcript_id
    )
    pad <- max(table(widths$id))
    if (pad > 1) {
        if (length(unique(transcripts$transcript_id)) == 1) {
            w <- cumsumANDpad(widths$w, pad)
            diffs <-
                lapply(orfDF$stop_site_nt, function(x) {
                      x - w
                  })
            diffs <-
                matrix(unlist(diffs), ncol = length(orfDF$id))
        } else {
            w2 <-
                aggregate(w ~ id, widths, function(x) {
                      cumsumANDpad(x, pad)
                  })
            m <- match(orfDF$id, w2$id)
            w2 <- w2[m, -1]
            w2 <- split(w2, seq(nrow(w2)))
            diffs <-
                mapply(function(x, y) {
                      x - y
                  }, orfDF$stop_site_nt, w2)
        }

        orfDF$min_dist_to_junction_a <-
            suppressWarnings(apply(diffs, 2, function(x) {
                  min(x[x > 0 & !is.na(x)])
              }))
        orfDF$min_dist_to_junction_a[
            which(is.infinite(orfDF$min_dist_to_junction_a))
        ] <-
            orfDF$start_site_nt[
                which(is.infinite(orfDF$min_dist_to_junction_a))
            ]
        orfDF$exon_a_from_start <-
            (apply(diffs, 2, function(x) {
                  length(x[x > 0 & !is.na(x)])
              }))

        orfDF$min_dist_to_junction_b <-
            suppressWarnings((apply(diffs, 2, function(x) {
                  max(x[x <= 0 & !is.na(x)])
              }) * -1) + 1)
        orfDF$min_dist_to_junction_b[
            which(is.infinite(orfDF$min_dist_to_junction_b))
        ] <-
            orfDF$utr3_length[which(is.infinite(orfDF$min_dist_to_junction_b))]
        orfDF$exon_b_from_final <-
            (apply(diffs, 2, function(x) {
                  length(x[x <= 0 & !is.na(x)])
              })) - 1

        exonNumber <-
            apply(diffs, 2, function(x) {
                  length(which(!is.na(x)))
              })
        orfDF$exon_a_from_start[exonNumber == 1] <- 0
        orfDF$exon_b_from_final[exonNumber == 1] <- 0
    } else {
        # all single exon transcripts -- therefore no junctions
        orfDF$min_dist_to_junction_a <- orfDF$start_site_nt
        orfDF$exon_a_from_start <- 0
        orfDF$min_dist_to_junction_b <- orfDF$utr3_length
        orfDF$exon_b_from_final <- 0
    }

    orfDF$aa_sequence <- NULL

    orfDF <- plyr::arrange(orfDF, id)
    m <- match(orfDF$id, transcripts$transcript_id)
    orfDF$gene_id <- transcripts$gene_id[m]
    orfDF <- orfDF[, c(1, ncol(orfDF), 2:(ncol(orfDF) - 1))]

    if (uORFs == TRUE) {

        # select the longest N% of orfs to speed stuff up
        maxNgenes <- as.data.frame(ceiling(table(orfDF$gene_id) * selectLongest))

        orfDF <- orfDF[rev(order(orfDF$orf_length)), ]
        longIndices <- as.numeric(unlist(apply(maxNgenes, 1, function(x) which(orfDF$gene_id == x[1])[1:x[2]])))
        orfDF <- orfDF[unique(longIndices), ]

        transcripts.uorf <- transcripts[transcripts$transcript_id %in% orfDF$id]

        upstreamORFs <- getUOrfs(
            transcripts = transcripts.uorf,
            BSgenome = BSgenome,
            orfs = orfDF,
            findExonB = TRUE
        )

        if (nrow(upstreamORFs) > 0) {
            uORFS.bytranscript <- aggregate(
                overlaps_main_ORF ~ id + frame, upstreamORFs,
                function(x) length(x)
            )
            colnames(uORFS.bytranscript)[3] <- "total_uorfs"

            uORFS.bytranscript.newVal <- aggregate(
                overlaps_main_ORF ~ id + frame,
                upstreamORFs, function(x) {
                      length(x[which(x == "upstream")])
                  }
            )
            uORFS.bytranscript$upstream_count <-
                uORFS.bytranscript.newVal[match(
                    paste0(
                        uORFS.bytranscript$id,
                        uORFS.bytranscript$frame
                    ),
                    paste0(
                        uORFS.bytranscript.newVal$id,
                        uORFS.bytranscript.newVal$frame
                    )
                ), 3]

            uORFS.bytranscript.newVal <-
                aggregate(
                    overlaps_main_ORF ~ id + frame,
                    upstreamORFs, function(x) {
                          length(x[which(x == "downstream")])
                      }
                )
            uORFS.bytranscript$downstream_count <-
                uORFS.bytranscript.newVal[match(
                    paste0(
                        uORFS.bytranscript$id,
                        uORFS.bytranscript$frame
                    ),
                    paste0(
                        uORFS.bytranscript.newVal$id,
                        uORFS.bytranscript.newVal$frame
                    )
                ), 3]

            uORFS.bytranscript.newVal <-
                aggregate(uorf_length ~ id + frame, upstreamORFs, function(x) max(x))
            uORFS.bytranscript$max_uorf <-
                uORFS.bytranscript.newVal[match(
                    paste0(
                        uORFS.bytranscript$id,
                        uORFS.bytranscript$frame
                    ),
                    paste0(
                        uORFS.bytranscript.newVal$id,
                        uORFS.bytranscript.newVal$frame
                    )
                ), 3]

            if (any(upstreamORFs$exon_b_from_final != 0)) {
                uORFS.bytranscript.newVal <-
                    aggregate(
                        min_dist_to_junction_b ~ id + frame,
                        upstreamORFs[upstreamORFs$exon_b_from_final != 0, ],
                        function(x) max(x)
                    )
                uORFS.bytranscript$uorf_maxb <-
                    uORFS.bytranscript.newVal[match(
                        paste0(
                            uORFS.bytranscript$id,
                            uORFS.bytranscript$frame
                        ),
                        paste0(
                            uORFS.bytranscript.newVal$id,
                            uORFS.bytranscript.newVal$frame
                        )
                    ), 3]
            } else {
                uORFS.bytranscript$uorf_maxb <- NA
            }
            m <- match(paste0(orfDF$id, orfDF$frame), paste0(
                uORFS.bytranscript$id,
                uORFS.bytranscript$frame
            ))
            orfDF <- cbind(orfDF, uORFS.bytranscript[m, -c(1, 2)])
        } else {
            orfDF$total_uorfs <- NA
            orfDF$upstream_count <- NA
            orfDF$downstream_count <- NA
            orfDF$max_uorf <- NA
            orfDF$uorf_maxb <- NA
        }

        # replace any non-matching uorf summaries with 0
        for (i in 17:ncol(orfDF)) {
            orfDF[which(is.na(orfDF[, i])), i] <- 0
        }
    }

    rownames(orfDF) <- NULL
    return(orfDF)
}

#' Cumulative sum of a sequence of numbers, padded with NA
#' @param x input numeric vector
#' @param padLength length to pad output to
#' @return vector with cumulative sum, padded with NA
#' @author Beth Signal
#' @keywords internal
cumsumANDpad <- function(x, padLength) {
    y <- cumsum(c(1, x))[-1]
    if (length(y) < padLength) {
        y <- c(y, rep(NA, padLength - length(y)))
    }
    return(y)
}

#' Get upstream open reading frames for transcripts with annotated main ORFs
#' @param transcripts GRanges object with ONLY exon annotations
#' (no gene, transcript, CDS etc.) with all transcripts for orf retrevial
#' @param BSgenome BSgenome object
#' @param orfs orf annotation for the transcripts object. Generated by getOrfs(transcripts, ...)
#' @param findExonB find the distance to and exon number of the downstream (B) junction?
#' @return data.frame with all upstream ORF details.
#' @export
#' @import GenomicRanges
#' @import Biostrings
#' @import stringr
#' @importFrom rtracklayer import
#' @family ORF annotation
#' @author Beth Signal
#' @examples
#' gtf <- rtracklayer::import(system.file("extdata", "gencode.vM25.small.gtf",
#'     package = "GeneStructureTools"
#' ))
#' transcript <- gtf[gtf$type == "exon" & gtf$gene_name == "Tmem208"]
#' g <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#' # longest ORF for each transcripts
#' orfs <- getOrfs(transcript, BSgenome = g, returnLongestOnly = FALSE)
#' uORFS <- getUOrfs(transcript, BSgenome = g, orfs = orfs, findExonB = TRUE)
getUOrfs <- function(transcripts,
                     BSgenome = NULL,
                     orfs, findExonB = FALSE) {
    transcripts$exon_number <-
        as.numeric(transcripts$exon_number)
    order <-
        order(transcripts$transcript_id, transcripts$exon_number)
    transcripts <- transcripts[order]
    transcripts$seq <-
        as.character(Biostrings::getSeq(BSgenome, transcripts))

    seqCat <-
        aggregate(seq ~ transcript_id, mcols(transcripts), function(x) {
              (paste(x, collapse = ""))
          })
    ids <- as.character(seqCat$transcript_id)
    seqCat <- seqCat$seq
    rm <- which(grepl("N", seqCat))

    if (length(rm) > 0) {
        seqCat <- seqCat[-rm]
        removeId <- ids[rm]
        ids <- ids[-rm]
        transcripts <-
            transcripts[-which(transcripts$transcript_id %in% removeId)]
    }

    # 3 frames
    seqCat <-
        c(seqCat, stringr::str_sub(seqCat, 2), stringr::str_sub(seqCat, 3))
    frames <- rep(c(1, 2, 3), each = length(ids))
    ids <- c(ids, ids, ids)

    orf <-
        suppressWarnings(unlist(lapply(seqCat, function(x) {
              as.character(Biostrings::translate(Biostrings::DNAString(x)))
          })))

    orfDF <- data.frame(
        id = ids,
        aa_sequence = orf,
        frame = frames,
        stringsAsFactors = FALSE
    )

    orfDF$seq_length <- nchar(orfDF$aa_sequence)
    orfDF$seq_length_nt <- nchar(seqCat) + orfDF$frame - 1

    uORF.pos <- stringr::str_locate_all(orfDF$aa_sequence, "M(\\w+?)*") # need to add one to end
    uORF.number <- unlist(lapply(uORF.pos, length)) / 2
    ids <- as.vector(unlist(mapply(function(x, y) rep(x, y), orfDF$id, uORF.number)))
    frames <- as.vector(unlist(mapply(function(x, y) rep(x, y), orfDF$frame, uORF.number)))
    uORF.pos <- do.call("rbind", uORF.pos)
    upstreamORFs <- data.frame(id = ids, frame = frames, start = uORF.pos[, 1], stop = uORF.pos[, 2] + 1)


    if (any(duplicated(orfs$id))) {
        orfs$id <- paste0(orfs$id, "_frame", orfs$frame)

        upstreamORFs <- rbind(upstreamORFs, upstreamORFs, upstreamORFs)
        upstreamORFs$id <-
            paste0(
                upstreamORFs$id, "_frame",
                rep(1:3, each = nrow(upstreamORFs) / 3)
            )
    }

    m <- match(upstreamORFs$id, orfs$id)
    upstreamORFs <- upstreamORFs[which((upstreamORFs$start - orfs$start_site[m]) < 0), ]
    m <- match(upstreamORFs$id, orfs$id)
    upstreamORFs$dist_to_start <- orfs$start_site[m] - upstreamORFs$stop
    upstreamORFs$overlaps_main_ORF <- ifelse(upstreamORFs$dist_to_start > 0,
        "upstream", "downstream"
    )
    upstreamORFs <- upstreamORFs[!is.na(upstreamORFs$start), ]
    upstreamORFs$uorf_length <- upstreamORFs$stop - upstreamORFs$start

    upstreamORFs$start_site_nt <-
        (upstreamORFs$start * 3) - 3 + upstreamORFs$frame
    upstreamORFs$stop_site_nt <- (upstreamORFs$uorf_length * 3) +
        upstreamORFs$start_site_nt + 3
    m <- match(upstreamORFs$id, orfs$id, orfs$frame)
    upstreamORFs$dist_to_start_nt <- orfs$start_site_nt[m] -
        upstreamORFs$stop_site_nt
    upstreamORFs <- upstreamORFs[which((upstreamORFs$start_site_nt -
        orfs$start_site_nt[m]) < 0), ]

    if (findExonB == TRUE & nrow(upstreamORFs) > 0) {
        upstreamORFs$utr3_length <-
            (orfs$seq_length_nt[match(upstreamORFs$id, orfs$id, )] -
                upstreamORFs$stop_site_nt) + 1

        widths <- data.frame(
            w = width(transcripts),
            id = transcripts$transcript_id
        )

        pad <- max(table(widths$id))
        if (pad > 1) {
            if (length(unique(transcripts$transcript_id)) == 1) {
                w <- cumsumANDpad(widths$w, pad)
                diffs <-
                    lapply(upstreamORFs$stop_site_nt, function(x) {
                          x - w
                      })
                diffs <-
                    matrix(unlist(diffs), ncol = length(upstreamORFs$id))
            } else {
                w2 <-
                    aggregate(w ~ id, widths, function(x) {
                          cumsumANDpad(x, pad)
                      })
                id <- gsub(
                    "_frame1", "",
                    gsub(
                        "_frame2", "",
                        gsub("_frame3", "", upstreamORFs$id)
                    )
                )
                m <- match(id, w2$id)
                w2 <- w2[m, -1]
                w2 <- split(w2, seq(nrow(w2)))
                diffs <-
                    mapply(function(x, y) {
                          x - y
                      }, upstreamORFs$stop_site_nt, w2)
            }

            upstreamORFs$min_dist_to_junction_b <-
                suppressWarnings((apply(diffs, 2, function(x) {
                      max(x[x <= 0 & !is.na(x)])
                  }) * -1) + 1)
            upstreamORFs$min_dist_to_junction_b[
                which(is.infinite(upstreamORFs$min_dist_to_junction_b))
            ] <-
                upstreamORFs$utr3_length[
                    which(is.infinite(upstreamORFs$min_dist_to_junction_b))
                ]
            upstreamORFs$exon_b_from_final <-
                (apply(diffs, 2, function(x) {
                      length(x[x <= 0 & !is.na(x)])
                  })) - 1

            exonNumber <-
                apply(diffs, 2, function(x) {
                      length(which(!is.na(x)))
                  })
            upstreamORFs$exon_b_from_final[exonNumber == 1] <- 0
        } else {
            upstreamORFs$min_dist_to_junction_b <- upstreamORFs$utr3_length
            upstreamORFs$exon_b_from_final <- 0
        }
    }

    upstreamORFs$start <- NULL
    upstreamORFs$stop <- NULL
    upstreamORFs$dist_to_start <- NULL
    upstreamORFs$utr3_length <- NULL

    m <- match(upstreamORFs$id, orfs$id)
    upstreamORFs$frame <- orfs$frame[m]
    replaceName <- which(stringr::str_sub(upstreamORFs$id, -6, -1) %in%
        c("frame1", "frame2", "frame3"))

    upstreamORFs$id[replaceName] <- gsub(
        "_frame1", "",
        gsub(
            "_frame2", "",
            gsub(
                "_frame3", "",
                upstreamORFs$id[replaceName]
            )
        )
    )

    upstreamORFs <- upstreamORFs[order(
        upstreamORFs$id,
        upstreamORFs$frame,
        upstreamORFs$start_site_nt
    ), ]

    return(upstreamORFs)
}

manualNMD <- function(orfs) {
    orfs$nmd_class_manual <- "nonsense_mediated_decay"
    orfs$nmd_class_manual[orfs$orf_length > 50 &
        (orfs$min_dist_to_junction_b < 50 |
            orfs$exon_b_from_final == 0)] <-
        "not_nmd"

    orfs$nmd_prob_manual <- 1
    orfs$nmd_prob_manual[orfs$nmd_class_manual == "not_nmd"] <- 0
    return(orfs)
}
betsig/GeneStructureTools documentation built on March 31, 2021, 4:43 a.m.