R/analyze_ORF.R

Defines functions analyzeORFforPTC myAllFindORFsinSeq extractSequence analyzeORF getCDS

Documented in analyzeORF extractSequence getCDS

########################################################################
########################### switchAnalyzeRlist #########################
######### Analyze coding potential and identify protein domains ########
########################################################################

getCDS <- function(
    selectedGenome,
    repoName
) {
    ### Test input
    if (length(repoName) > 1)
        stop("getCDS: Please supply only one repository")
    if (!any(selectedGenome %in% c("hg19", "hg38", "mm9", "mm10")))
        stop("getCDS: supported genomes are currently: hg19, hg38, mm9, mm10")
    if (!any(repoName %in% c("ensemble", "UCSC", "refseq", "GENCODE")))
        stop("getCDS: Supported repositories are currently: Ensemble (not hg38), UCSC, Refseq, GENCODE (not mm9)")

    ### Make data.frame to translate to tack names
    localRepos <- rbind(
        # hg19
        data.frame(
            genome='hg19',
            repo  =c("ensemble", "UCSC"     , "refseq"         , "GENCODE"),
            track =c("ensGene" , "knownGene", "refGene"        , "wgEncodeGencodeV19"),
            stringsAsFactors = FALSE
        ),
        # hg38
        data.frame(
            genome='hg38',
            repo  =c("ensemble", "UCSC"     , "refseq"         , "GENCODE"),
            track =c(NA        , NA         , 'refSeqComposite', "knownGene"), # gencode is stored as knownGene according to trackNames() annotation
            stringsAsFactors = FALSE
        ),
        # mm9
        data.frame(
            genome='mm9',
            repo  =c("ensemble", "UCSC"     , "refseq"         , "GENCODE"),
            track =c("ensGene" , "knownGene", "refGene"        , NA),
            stringsAsFactors = FALSE
        ),
        # mm10
        data.frame(
            genome='mm10',
            repo  =c("ensemble", "UCSC"     , "refseq"         , "GENCODE"),
            track =c(NA        , "knownGene", "refSeqComposite", "wgEncodeGencode"),
            stringsAsFactors = FALSE
        )
    )
    repoOfInterest <- localRepos[which(
        localRepos$genome == selectedGenome &
            localRepos$repo == repoName
    ),]

    if(is.na(repoOfInterest$track)) {
        stop('Combination of genome and repository is not available')
    }

    ### Make session
    message("Retrieving CDS tables for ", repoOfInterest$repo, "...", sep = "")
    session <- rtracklayer::browserSession("UCSC",url="http://genome-euro.ucsc.edu/cgi-bin/") # KVS : this solves the problem with changing geomes
    GenomeInfoDb::genome(session) <- repoOfInterest$genome
    query <- rtracklayer::ucscTableQuery(session, repoOfInterest$track)

    ### Get data
    cdsTable <- rtracklayer::getTable(query)
    if (repoOfInterest$repo == "ensemble")
        cdsTable <- cdsTable[cdsTable$cdsStartStat != "none",
                             ]
    if (repoOfInterest$repo == "UCSC")
        cdsTable <- cdsTable[cdsTable$cdsStart != cdsTable$cdsEnd,
                             ]
    if (repoOfInterest$repo == "refseq")
        cdsTable <- cdsTable[cdsTable$cdsStart != cdsTable$cdsEnd,
                             ]
    cdsTable <- cdsTable[, c("chrom", "strand", "txStart", "txEnd",
                             "cdsStart", "cdsEnd", "exonCount", "name")]
    message("Retrieved ", nrow(cdsTable), " records...", sep = "")
    utils::flush.console()
    return(new("CDSSet", cdsTable))
}

analyzeORF <- function(
    switchAnalyzeRlist,
    genomeObject = NULL,
    minORFlength = 100,
    orfMethod = 'longest',
    cds = NULL,
    PTCDistance = 50,
    startCodons = "ATG",
    stopCodons = c("TAA", "TAG", "TGA"),
    showProgress = TRUE,
    quiet = FALSE
) {
    ### check input
    if (TRUE) {
        # Input data
        if (class(switchAnalyzeRlist) != 'switchAnalyzeRlist') {
            stop(paste(
                'The object supplied to \'switchAnalyzeRlist\'',
                'must be a \'switchAnalyzeRlist\''
            ))
        }

        ntAlreadyInSwitchList <- ! is.null(switchAnalyzeRlist$ntSequence)
        if( ! ntAlreadyInSwitchList ) {
            if (class(genomeObject) != 'BSgenome') {
                stop('The genomeObject argument must be a BSgenome')
            }
        }

        # method choice
        if (length(orfMethod) != 1) {
            stop(paste(
                'The \'orfMethod\' must be one of \'mostUpstreamAnnoated\',',
                '\'mostUpstream\', \'longestAnnotated\' or \'longest\',',
                'not a vector'
            ))
        }
        if (!orfMethod %in% c('mostUpstreamAnnoated',
                              'mostUpstream',
                              'longest',
                              'longestAnnotated')) {
            stop(paste(
                'The \'orfMethod\' argument must be either of',
                '\'mostUpstreamAnnoated\',\'mostUpstream\',',
                ' \'longestAnnotated\' or \'longest\' indicating whether',
                'to use the the most upstream annoated start codon or',
                'the longest ORF respectively'
            ))
        }

        if (orfMethod %in% c('mostUpstreamAnnoated', 'longestAnnotated')) {
            if (is.null(cds)) {
                stop(paste(
                    'When using orfMethod is \'mostUpstreamAnnoated\' or',
                    '\'longestAnnotated\', a CDSSet must be supplied to',
                    'the \'cds\' argument '
                ))
            }
        }

    }

    ### Assign paramters
    if (TRUE) {
        useAnnoated <-
            orfMethod %in% c('longestAnnotated', 'mostUpstreamAnnoated')

        nrStepsToPerform <- 3 + as.numeric(useAnnoated)
        nrStepsPerformed <- 0

        if (showProgress & !quiet) {
            progressBar <- 'text'
        } else {
            progressBar <- 'none'
        }

        myCodonDf <- data.frame(
            codons = c(startCodons, stopCodons),
            meaning = c(
                replicate(n = length(startCodons), expr = 'start'),
                replicate(n = length(stopCodons), expr = 'stop')
            ),
            stringsAsFactors = FALSE
        )

    }

    ### Idenetify overlapping CDS and exons
    if (useAnnoated) {
        if (!quiet) {
            message(
                paste(
                    'Step',
                    nrStepsPerformed + 1,
                    'of',
                    nrStepsToPerform,
                    ': Identifying overlap between supplied annoated translation start sites and transcripts',
                    sep = ' '
                )
            )
        }

        ### Idenetify overlapping CDS and exons and annoate transcript position
        cds <- unique(cds[, c('chrom', 'strand', 'cdsStart', 'cdsEnd')])

        ### strand specific translation starts
        plusIndex <- which(cds$strand == '+')
        annoatedStartGRangesPlus <-
            unique(GRanges(
                cds$chrom[plusIndex],
                IRanges(
                    start = cds$cdsStart[plusIndex],
                    end = cds$cdsStart[plusIndex]
                ),
                strand = cds$strand[plusIndex]
            ))
        minusIndex <- which(cds$strand == '-')
        annoatedStartGRangesMinus <-
            unique(GRanges(
                cds$chrom[minusIndex],
                IRanges(
                    start = cds$cdsEnd[minusIndex],
                    end = cds$cdsEnd[minusIndex]),
                strand = cds$strand[minusIndex]
            ))

        annoatedStartGRanges <-
            unique(c(annoatedStartGRangesPlus, annoatedStartGRangesMinus))
        annoatedStartGRanges$id <-
            paste('cds_', 1:length(annoatedStartGRanges), sep = '')

        ### Extract exons
        localExons <-  switchAnalyzeRlist$exons[which(
            switchAnalyzeRlist$exons$isoform_id %in%
                switchAnalyzeRlist$isoformFeatures$isoform_id
        ),]
        #localExons <- localExons[which(strand(localExons) %in% c('+','-')),]
        localExons <-
            localExons[which(as.character(localExons@strand) %in% c('+', '-')),]

        localExons <-
            localExons[order(localExons$isoform_id,
                             start(localExons),
                             end(localExons)), ]
        localExons$exon_id <-
            paste('exon_', 1:length(localExons), sep = '')


        ### Find overlaps
        suppressWarnings(overlappingAnnotStart <-
                             as.data.frame(
                                 findOverlaps(
                                     query = localExons,
                                     subject = annoatedStartGRanges,
                                     ignore.strand = FALSE
                                 )
                             ))
        if (!nrow(overlappingAnnotStart)) {
            stop(
                'No overlap between CDS and transcripts were found. This is most likely due to a annoation problem around chromosome name.'
            )
        }

        ### Annoate overlap
        overlappingAnnotStart$queryHits <-
            localExons$exon_id[overlappingAnnotStart$queryHits]
        overlappingAnnotStart$subjectHits <-
            annoatedStartGRanges$id[overlappingAnnotStart$subjectHits]
        colnames(overlappingAnnotStart) <- c('exon_id', 'cds_id')
        overlappingAnnotStart$isoform_id <-
            localExons$isoform_id[match(
                overlappingAnnotStart$exon_id, localExons$exon_id
            )]

        ### annoate with genomic start site
        overlappingAnnotStart$cdsGenomicStart <-
            start(annoatedStartGRanges)[match(overlappingAnnotStart$cds_id,
                                              annoatedStartGRanges$id)]


        ## Extract exon information
        myExons <-
            as.data.frame(localExons[which(
                localExons$isoform_id %in% overlappingAnnotStart$isoform_id
            ),])
        myExons <- myExons[sort.list(myExons$isoform_id), ]
        #myExonsSplit <- split(myExons, f=myExons$isoform_id)

        myExonPlus <- myExons[which(myExons$strand == '+'), ]
        myExonPlus$cumSum <-
            unlist(sapply(
                split(myExonPlus$width, myExonPlus$isoform_id),
                function(aVec) {
                    cumsum(c(0, aVec))[1:(length(aVec))]
                }
            ))
        myExonMinus <- myExons[which(myExons$strand == '-'), ]
        myExonMinus$cumSum <-
            unlist(sapply(
                split(myExonMinus$width, myExonMinus$isoform_id),
                function(aVec) {
                    cumsum(c(0, rev(aVec)))[(length(aVec)):1] # reverse
                }
            ))

        myExons2 <- rbind(myExonPlus, myExonMinus)
        myExons2 <-
            myExons2[order(myExons2$isoform_id, myExons2$start, myExons2$end), ]
        #myExonsSplit <- split(myExons2, f=myExons2$isoform_id)

        ### Annoate with exon information
        matchIndex <-
            match(overlappingAnnotStart$exon_id, myExons2$exon_id)
        overlappingAnnotStart$strand <- myExons2$strand[matchIndex]
        overlappingAnnotStart$exon_start <-
            myExons2$start[matchIndex]
        overlappingAnnotStart$exon_end <- myExons2$end[matchIndex]
        overlappingAnnotStart$exon_cumsum <-
            myExons2$cumSum[matchIndex]

        ### Annoate with transcript coordinats
        overlappingAnnotStartPlus <-
            overlappingAnnotStart[which(overlappingAnnotStart$strand == '+'), ]
        overlappingAnnotStartPlus$transcriptStart <-
            overlappingAnnotStartPlus$exon_cumsum + (
                overlappingAnnotStartPlus$cdsGenomicStart -
                    overlappingAnnotStartPlus$exon_start
            ) + 2

        overlappingAnnotStartMinus <-
            overlappingAnnotStart[which(overlappingAnnotStart$strand == '-'), ]
        overlappingAnnotStartMinus$transcriptStart <-
            overlappingAnnotStartMinus$exon_cumsum + (
                overlappingAnnotStartMinus$exon_end -
                    overlappingAnnotStartMinus$cdsGenomicStart
            ) + 1

        overlappingAnnotStart2 <-
            rbind(overlappingAnnotStartPlus,
                  overlappingAnnotStartMinus)
        overlappingAnnotStart2 <-
            overlappingAnnotStart2[order(
                overlappingAnnotStart2$isoform_id,
                overlappingAnnotStart2$exon_start,
                overlappingAnnotStart2$exon_end
            ), ]


        # Update number of steps performed
        nrStepsPerformed <- nrStepsPerformed + 1
    }

    ### Extract nucleotide sequence of the transcripts
    if (!quiet) {
        message(
            paste(
                'Step',
                nrStepsPerformed + 1,
                'of',
                nrStepsToPerform,
                ': Extracting transcript sequences...',
                sep = ' '
            )
        )
    }
    if (TRUE) {
        if( ntAlreadyInSwitchList ) {
            transcriptSequencesDNAstring <- switchAnalyzeRlist$ntSequence

        } else {
            tmpSwitchAnalyzeRlist <- switchAnalyzeRlist

            ### Subset to those analyzed (if annoation is used)
            if (useAnnoated) {
                tmpSwitchAnalyzeRlist$isoform_feature <-
                    tmpSwitchAnalyzeRlist$isoform_feature[which(
                        tmpSwitchAnalyzeRlist$isoform_feature$isoform_id %in%
                            overlappingAnnotStart2$isoform_id
                    ),]
                tmpSwitchAnalyzeRlist$exons <-
                    tmpSwitchAnalyzeRlist$exons[which(
                        tmpSwitchAnalyzeRlist$exons$isoform_id %in%
                            overlappingAnnotStart2$isoform_id
                    ),]
            }

            transcriptSequencesDNAstring <-
                suppressMessages(
                    extractSequence(
                        switchAnalyzeRlist = tmpSwitchAnalyzeRlist,
                        genomeObject = genomeObject,
                        onlySwitchingGenes = FALSE,
                        extractNTseq = TRUE,
                        extractAAseq = FALSE,
                        removeLongAAseq = FALSE,
                        addToSwitchAnalyzeRlist = TRUE,
                        writeToFile = FALSE,
                        quiet = TRUE
                    )$ntSequence
                )

        }

        nrStepsPerformed <- nrStepsPerformed + 1
    }

    ### For each nucleotide sequence identify position of longest ORF with method selected
    if (!quiet) {
        message(
            paste(
                'Step',
                nrStepsPerformed + 1,
                'of',
                nrStepsToPerform,
                ': Locating ORFs of interest...',
                sep = ' '
            )
        )
    }
    if (TRUE) {
        if (useAnnoated) {
            overlappingAnnotStartList <-
                split(
                    overlappingAnnotStart2[, c('isoform_id', 'transcriptStart')],
                    f = overlappingAnnotStart2$isoform_id
                )
        } else {
            overlappingAnnotStartList <-
                split(
                    names(transcriptSequencesDNAstring),
                    names(transcriptSequencesDNAstring)
                )
        }

        # Make logics
        useLongest <- orfMethod == 'longestAnnotated'

        ### Find the disired ORF
        transcriptORFs <-
            plyr::llply(
                overlappingAnnotStartList,
                .progress = progressBar,
                function(
                    annoationInfo
                ) { # annoationInfo <- overlappingAnnotStartList[[1]]

                # Extract wanted ORF
                if (useAnnoated) {
                    correspondingSequence <-
                        transcriptSequencesDNAstring[[
                            annoationInfo$isoform_id[1]
                            ]]

                    localORFs <-
                        myAllFindORFsinSeq(
                            dnaSequence = correspondingSequence,
                            codonAnnotation = myCodonDf,
                            filterForPostitions = annoationInfo$transcriptStart
                        )

                    # subset by length
                    localORFs <-
                        localORFs[which(localORFs$length >= minORFlength), ]

                    if (useLongest) {
                        # Longest annoated ORF
                        myMaxORF <-
                            localORFs[which.max(localORFs$length), ]
                    } else {
                        # most upresteam annoated ORF
                        myMaxORF <-
                            localORFs[which.min(localORFs$start), ]
                    }

                } else {
                    correspondingSequence <-
                        transcriptSequencesDNAstring[[annoationInfo]]

                    # longest ORF
                    localORFs <-
                        myAllFindORFsinSeq(dnaSequence = correspondingSequence,
                                           codonAnnotation = myCodonDf)

                    # subset by length
                    localORFs <-
                        localORFs[which(localORFs$length >= minORFlength), ]

                    if (orfMethod == 'longest') {
                        # longest ORF
                        myMaxORF <-
                            localORFs[which.max(localORFs$length), ]
                    } else {
                        # most upstream ORF
                        myMaxORF <-
                            localORFs[which.min(localORFs$start), ]
                    }
                }

                # Sanity check
                if (nrow(myMaxORF) == 0) {
                    return(data.frame(
                        start = 1,
                        end = NA,
                        length = 0
                    ))
                } # by having length 0 it will be removed later

                return(myMaxORF)
            })

        myTranscriptORFdf <-
            myListToDf(transcriptORFs, addOrignAsColumn = TRUE)

        nrStepsPerformed <- nrStepsPerformed + 1
    }

    ### Use the obtained ORF coordinats to predict PTC
    if (!quiet) {
        message(
            paste(
                'Step',
                nrStepsPerformed + 1,
                'of',
                nrStepsToPerform,
                ': Scanning for PTCs...',
                sep = ' '
            )
        )
    }
    if (TRUE) {
        ### Extract exon structure for each transcript
        myExons <- as.data.frame(switchAnalyzeRlist$exons[which(
            switchAnalyzeRlist$exons$isoform_id %in%
                switchAnalyzeRlist$isoformFeatures$isoform_id
        ),])
        myExons <- myExons[which(myExons$strand %in% c('+', '-')), ]
        myExons <-
            myExons[which(myExons$isoform_id %in% names(transcriptORFs)), ]
        myExonsSplit <- split(myExons, f = myExons$isoform_id)

        # Loop over all isoforms and extract info
        allIsoforms <-
            split(names(myExonsSplit), names(myExonsSplit))
        ptcResult <-
            plyr::llply(
                allIsoforms,
                .fun = function(isoformName) {
                    # isoformName <- allIsoforms[[1]]
                    # Extract ORF info
                    orfInfo <- transcriptORFs[[isoformName]]
                    if (orfInfo$length == 0) {
                        return(NULL)
                    }

                    # Extract exon info
                    exonInfo <- myExonsSplit[[isoformName]]

                    # do PTC analysis
                    localPTCresult <-
                        analyzeORFforPTC(
                            aORF = orfInfo,
                            exonStructure = exonInfo,
                            PTCDistance = PTCDistance
                        )

                    return(localPTCresult)
                }
            )
        # remove empty once
        ptcResult <-
            ptcResult[which(sapply(ptcResult, function(x)
                ! is.null(x)))]
        if (length(ptcResult) == 0) {
            stop('No ORFs (passing the filtering) were found')
        }

        myPTCresults <-
            myListToDf(ptcResult, addOrignAsColumn = TRUE)
    }

    ### Add result to switchAnalyzeRlist
    if (TRUE) {
        # merge ORF and PTC analysis together
        myResultDf <-
            dplyr::inner_join(
                myTranscriptORFdf[,c('orign','start','end','length')],
                myPTCresults,
                by = 'orign'
            )
        colnames(myResultDf)[1] <- 'isoform_id'
        colnames(myResultDf)[2:4] <-
            paste(
                'orfTranscipt',
                startCapitalLetter(colnames(myResultDf)[2:4]),
                sep =''
            )

        ### Add NAs
        myResultDf <- merge(
            unique(switchAnalyzeRlist$isoformFeatures[, 'isoform_id', drop = FALSE]),
            myResultDf,
            all.x = TRUE
        )

        # myResultDf$orfStarExon <- NULL
        # myResultDf$orfEndExon <- NULL

        ### Add result to switch list
        switchAnalyzeRlist$orfAnalysis <- myResultDf

        switchAnalyzeRlist$isoformFeatures$PTC		         <-
            myResultDf$PTC [match(switchAnalyzeRlist$isoformFeatures$isoform_id,
                                  myResultDf$isoform_id)]

        ### Add NT sequences
        switchAnalyzeRlist$ntSequence <- transcriptSequencesDNAstring[which(
            names(transcriptSequencesDNAstring) %in%
                switchAnalyzeRlist$isoformFeatures$isoform_id
        )]

        if (!quiet) {
            message(
                sum(myResultDf$orfStartGenomic != -1, na.rm = TRUE) ,
                " putative ORFs were identified and analyzed",
                sep = ""
            )
        }
    }

    if (!quiet) {
        message('Done')
    }
    return(switchAnalyzeRlist)
}

extractSequence <- function(
    switchAnalyzeRlist,
    genomeObject = NULL,
    onlySwitchingGenes = TRUE,
    alpha = 0.05,
    dIFcutoff = 0.1,
    extractNTseq = TRUE,
    extractAAseq = TRUE,
    removeShortAAseq = TRUE,
    removeLongAAseq  = FALSE,
    alsoSplitFastaFile = FALSE,
    removeORFwithStop = TRUE,
    addToSwitchAnalyzeRlist = TRUE,
    writeToFile = TRUE,
    pathToOutput = getwd(),
    outputPrefix = 'isoformSwitchAnalyzeR_isoform',
    forceReExtraction = FALSE,
    quiet = FALSE
) {
    ### Determine sequence status
    if(TRUE) {
        ntAlreadyInSwitchList <- ! is.null(switchAnalyzeRlist$ntSequence)
        aaAlreadyInSwitchList <- ! is.null(switchAnalyzeRlist$aaSequence)

        if(forceReExtraction) {
            ntAlreadyInSwitchList <- FALSE
            aaAlreadyInSwitchList <- FALSE
        }
    }

    ### Check input
    if (TRUE) {
        # Test input data class
        if (class(switchAnalyzeRlist) != 'switchAnalyzeRlist') {
            stop(
                'The object supplied to \'switchAnalyzeRlist\' must be a \'switchAnalyzeRlist\''
            )
        }
        if( ! ntAlreadyInSwitchList ) {
            if (class(genomeObject) != 'BSgenome') {
                stop('The genomeObject argument must be a BSgenome')
            }
        }

        # Test what to extract
        if (!extractNTseq & !extractAAseq) {
            stop(
                'At least one of \'extractNTseq\' or \'extractNTseq\' must be true (else using this function have no purpose)'
            )
        }

        # How to repport result
        if (!addToSwitchAnalyzeRlist & !writeToFile) {
            stop(
                'At least one of \'addToSwitchAnalyzeRlist\' or \'writeToFile\' must be true (else this function outputs nothing)'
            )
        }

        # Are ORF annotated
        if (extractAAseq) {
            if (!'orfAnalysis' %in% names(switchAnalyzeRlist)) {
                stop('Please run the \'annotatePTC\' function to detect ORFs')
            }
        }

        # If switches are annotated
        if (onlySwitchingGenes) {
            if (all(is.na(
                switchAnalyzeRlist$isoformFeatures$isoform_switch_q_value
            )) &
            all(is.na(
                switchAnalyzeRlist$isoformFeatures$gene_switch_q_value
            ))) {
                stop(
                    'If only switching genes should be outputted please run the \'isoformSwitchTestDEXSeq\' or \'isoformSwitchTestDRIMSeq\' function first and try again'
                )
            }
        }

        if (alpha < 0 |
            alpha > 1) {
            warning('The alpha parameter should usually be between 0 and 1 ([0,1]).')
        }
        if (alpha > 0.05) {
            warning(
                'Most journals and scientists consider an alpha larger than 0.05 untrustworthy. We therefore recommend using alpha values smaller than or queal to 0.05'
            )
        }
        if (dIFcutoff < 0 | dIFcutoff > 1) {
            stop('The dIFcutoff must be in the interval [0,1].')
        }

        if( !is.logical(alsoSplitFastaFile) ){
            stop('The \'alsoSplitFastaFile\' argument must be either TRUE or FALSE')
        }
        if( alsoSplitFastaFile ) {
            if( ! removeShortAAseq ) {
                warning('Since you are using the alsoSplitFastaFile you probably also want to use the \'removeShortAAseq\' option.')
            }
            if( ! removeLongAAseq ) {
                warning('Since you are using the alsoSplitFastaFile you probably also want to use the \'removeLongAAseq\' option.')
            }
        }

        if( !is.logical(forceReExtraction)) {
            stop('\'forceReExtraction\' must be either TRUE or FALSE')
        }

        nrAnalysisToMake <- 2 + as.integer(extractAAseq)
        startOfAnalysis <- 1
    }

    ### Extract NT sequence (needed for AA extraction so always nessesary)
    if (TRUE) {
        if (!quiet) {
            message(
                paste(
                    "Step",
                    startOfAnalysis ,
                    "of",
                    nrAnalysisToMake,
                    ": Extracting transcript nucleotide sequences...",
                    sep = " "
                )
            )
        }

        # extract switching isoforms
        if (onlySwitchingGenes) {
            isoResTest <-
                any(!is.na(
                    switchAnalyzeRlist$isoformFeatures$isoform_switch_q_value
                ))
            if (isoResTest) {
                switchingGenes <-
                    unique(switchAnalyzeRlist$isoformFeatures$gene_id [which(
                        switchAnalyzeRlist$isoformFeatures$isoform_switch_q_value <
                            alpha &
                            abs(switchAnalyzeRlist$isoformFeatures$dIF) >
                            dIFcutoff
                    )])
            } else {
                switchingGenes <-
                    unique(switchAnalyzeRlist$isoformFeatures$gene_id [which(
                        switchAnalyzeRlist$isoformFeatures$gene_switch_q_value <
                            alpha     &
                            abs(switchAnalyzeRlist$isoformFeatures$dIF) >
                            dIFcutoff
                    )])
            }
            if (length(switchingGenes) == 0) {
                stop(
                    'No switching genes were found. Pleasae turn off \'onlySwitchingGenes\' and try again.'
                )
            }
            switchingIsoforms <-
                unique(switchAnalyzeRlist$isoformFeatures$isoform_id[which(
                    switchAnalyzeRlist$isoformFeatures$gene_id %in%
                        switchingGenes
                )])
        }

        # Extract nuclotide sequences
        if(   ntAlreadyInSwitchList ) {
            if (onlySwitchingGenes) {
                transcriptSequencesDNAstring <- switchAnalyzeRlist$ntSequence[which(
                    names(switchAnalyzeRlist$ntSequence) %in% switchingIsoforms
                )]
            } else {
                transcriptSequencesDNAstring <- switchAnalyzeRlist$ntSequence
            }

        }
        if( ! ntAlreadyInSwitchList ) {
            ### Extract exon GRanges
            if (onlySwitchingGenes) {
                # only extract transcript info for the switching genes
                myExonGranges <-
                    switchAnalyzeRlist$exons[which(
                        switchAnalyzeRlist$exons$isoform_id %in% switchingIsoforms
                    ),]
            } else {
                myExonGranges <- switchAnalyzeRlist$exons[which(
                    switchAnalyzeRlist$exons$isoform_id %in%
                        switchAnalyzeRlist$isoformFeatures$isoform_id
                ),]
            }

            myExonGranges <-
                myExonGranges[which(
                    as.character(myExonGranges@strand) %in% c('+', '-')
                ) , ]

            # update grange levels (migth be nessesary if it is a subset)
            seqlevels(myExonGranges) <-
                unique( as.character(seqnames(myExonGranges)@values) ) # nessesary to make sure seqlevels not pressented in the input data casuses problems - if presented with a subset for example


            ### Check whether isoform annotation and genome fits together
            genomeSeqs <- seqnames(genomeObject)
            grSeqs <- seqlevels(myExonGranges)

            # check overlap
            chrOverlap <- intersect(grSeqs, genomeSeqs)

            if (length(chrOverlap) == 0) {
                ### Try correct chr names of GRanges
                if (any(!grepl('^chr', seqlevels(myExonGranges)))) {
                    # Correct all chromosomes
                    seqlevels(myExonGranges) <-
                        unique(paste("chr", seqnames(myExonGranges), sep = ""))
                    # Correct Mitochondria
                    seqlevels(myExonGranges) <-
                        sub('chrMT', 'chrM', seqlevels(myExonGranges))

                    # check overlap again
                    grSeqs <- seqlevels(myExonGranges)
                    chrOverlap <- intersect(grSeqs, genomeSeqs)

                    # Correct chr names genome
                } else if (any(!grepl('^chr', genomeSeqs))) {
                    # Correct all chromosomes
                    seqnames(genomeObject) <-
                        paste("chr", seqnames(genomeObject), sep = "")
                    # Correct Mitochondria
                    seqnames(genomeObject) <-
                        sub('chrMT', 'chrM', seqnames(genomeObject))

                    # check overlap again
                    genomeSeqs <- seqnames(genomeObject)
                    chrOverlap <- intersect(grSeqs, genomeSeqs)

                }

                if (length(chrOverlap) == 0) {
                    stop(
                        'The genome supplied to genomeObject have no seqnames in common with the genes in the switchAnalyzeRlist'
                    )
                }
            }

            ### remove those transcripts that does not have a corresponding chromosme
            notOverlappingIndex <- which(!grSeqs %in% genomeSeqs)
            if (length(notOverlappingIndex)) {
                toRemove <-
                    which(seqnames(myExonGranges) %in% grSeqs[notOverlappingIndex])

                if (length(toRemove)) {
                    nrTranscripts <-
                        length(unique(myExonGranges$isoform_id[toRemove]))
                    warning(
                        paste(
                            nrTranscripts,
                            'transcripts was removed due to them being annotated to chromosomes not found in the refrence genome object',
                            sep = ' '
                        )
                    )

                    myExonGranges <- myExonGranges[-toRemove , ]
                }
            }

            # extract exon sequence and split into transcripts
            myExonSequences <- getSeq(genomeObject, myExonGranges)
            names(myExonSequences) <- myExonGranges$isoform_id

            ### In a strand specific manner combine exon sequenes (strand specific is nessesry because they should be reversed - else they are combined in the '+' order)
            myStrandData <-
                unique(
                    data.frame(
                        isoform_id = myExonGranges$isoform_id,
                        strand = as.character(myExonGranges@strand),
                        stringsAsFactors = FALSE
                    )
                )
            plusStrandTransripts <-
                myStrandData$isoform_id[which(myStrandData$strand == '+')]
            minusStrandTransripts <-
                myStrandData$isoform_id[which(myStrandData$strand == '-')]

            # Collaps exons of plus strand transcripts
            myPlusExonSequences <-
                myExonSequences[which(
                    names(myExonSequences) %in% plusStrandTransripts
                ), ]
            myPlusExonSequences <-
                split(myPlusExonSequences, f = names(myPlusExonSequences))
            myPlusExonSequences <-
                lapply(myPlusExonSequences, unlist) # does not work in the R package

            # Collaps exons of minus strand transcripts
            myMinusExonSequences <-
                myExonSequences[which(
                    names(myExonSequences) %in% minusStrandTransripts
                ), ]
            myMinusExonSequences <- rev(myMinusExonSequences)
            myMinusExonSequences <-
                split(myMinusExonSequences, f = names(myMinusExonSequences))
            myMinusExonSequences <- lapply(myMinusExonSequences, unlist)

            # combine strands
            transcriptSequencesDNAstring <-
                DNAStringSet(c(myPlusExonSequences, myMinusExonSequences))

        }

        startOfAnalysis <- startOfAnalysis + 1
    }

    ### Extract protein sequence of the identified ORFs
    if (extractAAseq) {
        if (!quiet) {
            message(
                paste(
                    "Step",
                    startOfAnalysis ,
                    "of",
                    nrAnalysisToMake,
                    ": Extracting ORF AA sequences...",
                    sep = " "
                )
            )
        }

        ### Extract switchAnalyzeRlist ORF annotation and filter for those I need
        if(TRUE) {
            switchORFannotation <-
                unique(data.frame(switchAnalyzeRlist$orfAnalysis[, c(
                    'isoform_id',
                    "orfTransciptStart",
                    'orfTransciptEnd',
                    "orfTransciptLength",
                    "PTC"
                )]))
            switchORFannotation <-
                switchORFannotation[which(!is.na(switchORFannotation$PTC)), ]
            switchORFannotation <-
                switchORFannotation[which(
                    switchORFannotation$isoform_id %in%
                        names(transcriptSequencesDNAstring)), ]
            switchORFannotation <-
                switchORFannotation[which(
                    switchORFannotation$orfTransciptStart != 0
                ), ]

        }

        ### Extract AA sequences
        if(   aaAlreadyInSwitchList ) {
            transcriptORFaaSeq <- switchAnalyzeRlist$aaSequence[which(
                names(switchAnalyzeRlist$aaSequence) %in% names(transcriptSequencesDNAstring)
            )]
        }
        if( ! aaAlreadyInSwitchList ) {
            ### Reorder transcript sequences
            transcriptSequencesDNAstringInData <-
                transcriptSequencesDNAstring[na.omit(match(
                    x = switchORFannotation$isoform_id,
                    table = names(transcriptSequencesDNAstring)
                )), ]
            if (!all(
                names(transcriptSequencesDNAstringInData) ==
                switchORFannotation$isoform_id
            )) {
                stop('Somthing went wrong in sequence extraction - contract developer')
            }

            ### Test whether the annotation agrees
            switchORFannotation$lengthOK <-
                switchORFannotation$orfTransciptEnd <=
                width(transcriptSequencesDNAstringInData)[match(
                    switchORFannotation$isoform_id,
                    names(transcriptSequencesDNAstringInData)
                )]
            if (any(!switchORFannotation$lengthOK)) {
                warning(
                    paste(
                        'There were',
                        sum(!switchORFannotation$lengthOK),
                        'cases where the annotated ORF were longer than the exons annoated - these cases will be ommitted'
                    )
                )

                # Subset data
                switchORFannotation <-
                    switchORFannotation[which(switchORFannotation$lengthOK), ]
                transcriptSequencesDNAstringInData <-
                    transcriptSequencesDNAstringInData[na.omit(match(
                        x = switchORFannotation$isoform_id,
                        table = names(transcriptSequencesDNAstringInData)
                    )), ]
            }

            ### Get corresponding protein sequence
            # Use the predicted ORF coordinats to extract the nt sequence of the ORF
            transcriptORFntSeq <-
                XVector::subseq(
                    transcriptSequencesDNAstringInData,
                    start = switchORFannotation$orfTransciptStart,
                    width = switchORFannotation$orfTransciptLength
                )

            # translate ORF nucleotide to aa sequence
            transcriptORFaaSeq <-
                suppressWarnings(
                    Biostrings::translate(x = transcriptORFntSeq, if.fuzzy.codon = 'solve')
                ) # supress warning is nessesary because isoformSwitchAnalyzeR allows ORFs to exceed the transcript - which are by default ignored and just gives a warning

            ### Check ORFs for stop codons
            stopData <- data.frame(
                isoform_id = names(transcriptORFaaSeq),
                stopCodon = Biostrings::vcountPattern(pattern = '*', transcriptORFaaSeq),
                stringsAsFactors = FALSE
            )
            stopDataToRemove <- stopData[which(stopData$stopCodon > 0), ]

            if (nrow(stopDataToRemove)) {
                if (removeORFwithStop) {
                    warning(
                        paste(
                            'There were',
                            nrow(stopDataToRemove),
                            'isoforms where the amino acid sequence had a stop codon before the annotated stop codon. These was be removed.',
                            sep = ' '
                        )
                    )

                    ### Remove PTC annotation
                    switchAnalyzeRlist$isoformFeatures$PTC[which(
                        switchAnalyzeRlist$isoformFeatures$isoform_id %in% stopDataToRemove$isoform_id
                    )] <- NA

                    ### Remove ORF annoation
                    switchAnalyzeRlist$orfAnalysis[which(
                        switchAnalyzeRlist$orfAnalysis$isoform_id %in% stopDataToRemove$isoform_id
                    ), 2:ncol(switchAnalyzeRlist$orfAnalysis)] <- NA

                    ### Remove sequence
                    transcriptORFaaSeq <-
                        transcriptORFaaSeq[which(!names(transcriptORFaaSeq) %in% stopDataToRemove$isoform_id)]

                    switchORFannotation <- switchORFannotation[which(
                        ! switchORFannotation$isoform_id %in% stopDataToRemove$isoform_id
                    ),]

                } else {
                    warning(
                        paste(
                            'There were',
                            nrow(stopDataToRemove),
                            'isoforms where the amino acid sequence had a stop codon before the annotated stop codon. These was NOT removed in accodance with the \'removeORFwithStop\' argument.',
                            sep = ' '
                        )
                    )
                }
            }



        }

        startOfAnalysis <- startOfAnalysis + 1

    }


    ### If enabled write fasta file(s) (after filteri g)
    seqWasTrimmed <- FALSE
    if (writeToFile) {
        if (!quiet) {
            message(
                paste(
                    "Step",
                    startOfAnalysis ,
                    "of",
                    nrAnalysisToMake,
                    ": Preparing output...",
                    sep = " "
                )
            )
        }

        ### add / if directory
        if (file.exists(pathToOutput)) {
            pathToOutput <- paste(pathToOutput, '/', sep = '')
        } else {
            stop('The path supplied to \'pathToOutput\' does not seem to exist')
        }

        # Nucleotides
        if (extractNTseq) {
            writeXStringSet(
                transcriptSequencesDNAstring,
                filepath = paste(
                    pathToOutput,
                    outputPrefix,
                    '_nt.fasta',
                    sep = ''
                ),
                format = 'fasta'
            )
        }

        # Amino Acids
        if (extractAAseq) {

            ### Filter if nessesary
            if (removeLongAAseq | removeShortAAseq) {

                switchORFannotation$toBeTrimmed <- FALSE

                ### remove to long sequences
                if(   removeLongAAseq) {
                    ### Filter lengths
                    switchORFannotation$aaLength <- width(transcriptORFaaSeq)
                    switchORFannotation$toBeTrimmed <- switchORFannotation$aaLength > 1000


                    ### Make new lengths
                    switchORFannotation$newAAlength <- switchORFannotation$aaLength
                    switchORFannotation$newAAlength[which(
                        switchORFannotation$toBeTrimmed
                    )] <- 1000   # <- EBI's current limmit

                    ### Annotate the trimming
                    if(any( switchORFannotation$toBeTrimmed )) {
                        seqWasTrimmed <- TRUE


                        trimmedCases <- switchORFannotation[which(
                            switchORFannotation$toBeTrimmed
                        ),]

                        ### Calculate genomic positions of trimmed regions
                        if(TRUE) {
                            trimmedCases$trimmedTranscriptStart <-
                                (1001  * 3 - 2) + trimmedCases$orfTransciptStart - 1
                            trimmedCases$trimmedTranscriptEnd <- trimmedCases$orfTransciptEnd

                            ### convert from transcript to genomic coordinats
                            # extract exon data
                            myExons <-
                                as.data.frame(switchAnalyzeRlist$exons[which(
                                    switchAnalyzeRlist$exons$isoform_id %in% trimmedCases$isoform_id
                                ), ])
                            myExonsSplit <- split(myExons, f = myExons$isoform_id)

                            # loop over the individual transcripts and extract the genomic coordiants of the domain and also for the active residues (takes 2 min for 17000 rows)
                            trimmedCases <-
                                plyr::ddply(
                                    trimmedCases[,c('isoform_id','newAAlength','trimmedTranscriptStart','trimmedTranscriptEnd')],
                                    .progress = 'none',
                                    .variables = 'isoform_id',
                                    .fun = function(aDF) {
                                        # aDF <- trimmedCases[1,]

                                        transcriptId <- aDF$isoform_id[1]
                                        localExons <-
                                            as.data.frame(myExonsSplit[[transcriptId]])

                                        # extract domain allignement
                                        localORFalignment <- aDF
                                        colnames(localORFalignment)[match(
                                            x = c('trimmedTranscriptStart', 'trimmedTranscriptEnd'),
                                            table = colnames(localORFalignment)
                                        )] <- c('start', 'end')

                                        # loop over domain alignment (migh be several)
                                        orfPosList <- list()
                                        for (j in 1:nrow(localORFalignment)) {
                                            domainInfo <-
                                                convertCoordinatsTranscriptToGenomic(
                                                    transcriptCoordinats =  localORFalignment[j, ],
                                                    exonStructure = localExons
                                                )

                                            orfPosList[[as.character(j)]] <- domainInfo

                                        }
                                        orfPosDf <- do.call(rbind, orfPosList)

                                        return(cbind(aDF, orfPosDf))
                                    }
                                )

                        }

                    }

                    ### Trim
                    transcriptORFaaSeq2 <- XVector::subseq(
                        transcriptORFaaSeq,
                        start = 1,
                        width = switchORFannotation$newAAlength
                    )
                }
                if( ! removeLongAAseq) {
                    transcriptORFaaSeq2 <- transcriptORFaaSeq
                }

                ### Remove to short sequences
                if( removeShortAAseq ) {
                    transcriptORFaaSeq2 <-
                        transcriptORFaaSeq2[which(
                            width(transcriptORFaaSeq2) > 10
                        )]
                }

                message(
                    paste(
                        'The \'removeLongAAseq\' and \'removeShortAAseq\' arguments:\n',
                        'Removed :',
                        length(transcriptORFaaSeq) - length(transcriptORFaaSeq2),
                        'isoforms.\n',
                        'Trimmed :',
                        sum(switchORFannotation$toBeTrimmed),
                        'isoforms (to only contain the first 1000 AA)',
                        sep=' '
                    )
                )
            } else {
                transcriptORFaaSeq2 <- transcriptORFaaSeq
            }

            ### Write file(s)
            if(   alsoSplitFastaFile ) {
                ### Make index
                l <- length(transcriptORFaaSeq2)
                indexVec <- unique( c( seq(
                    from = 1,
                    to = l,
                    by = 500 -1 # Max in PFAM Jan 2019
                ), l))

                indexDf <- data.frame(
                    start = indexVec[-length(indexVec)],
                    end = indexVec[-1]
                )
                n <- nrow(indexDf)
                indexDf$file <- paste0('_subset_', 1:n,'_of_',n)

                ### loop over index and make files
                tmp <- plyr::ddply(indexDf, .variables = 'file', .fun = function(aDF) { # aDF <- indexDf[1,]
                    writeXStringSet(
                        transcriptORFaaSeq2[ aDF$start:aDF$end ],
                        filepath = paste(
                            pathToOutput,
                            outputPrefix,
                            '_AA',
                            aDF$file,
                            '.fasta',
                            sep = ''
                        ),
                        format = 'fasta'
                    )
                })
                message(paste(
                    'The \'alsoSplitFastaFile\' caused',
                    nrow(indexDf),
                    'fasta files, each with a subset of the data, to be created (each named X of Y).'
                ))

                ### ALso write full
                writeXStringSet(
                    transcriptORFaaSeq2,
                    filepath = paste(
                        pathToOutput,
                        outputPrefix,
                        '_AA_complete.fasta',
                        sep = ''
                    ),
                    format = 'fasta'
                )
            }
            if( ! alsoSplitFastaFile ) {
                ### Write full
                writeXStringSet(
                    transcriptORFaaSeq2,
                    filepath = paste(
                        pathToOutput,
                        outputPrefix,
                        '_AA.fasta',
                        sep = ''
                    ),
                    format = 'fasta'
                )
            }

        }
    }

    if (!quiet) {
        message('Done')
    }

    ### Add sequences to switchAnalyzeRlist
    if (addToSwitchAnalyzeRlist) {
        if (extractNTseq) {
            switchAnalyzeRlist$ntSequence <- transcriptSequencesDNAstring
        }
        if (extractAAseq) {
            switchAnalyzeRlist$aaSequence <- transcriptORFaaSeq
        }

    }

    ### Add run info
    if( TRUE ) {
        if( is.null(switchAnalyzeRlist$runInfo) ) {
            switchAnalyzeRlist$runInfo <- list()
        }

        if(seqWasTrimmed) {
            switchAnalyzeRlist$orfAnalysis$wasTrimmed <- switchORFannotation$toBeTrimmed[match(
                switchAnalyzeRlist$orfAnalysis$isoform_id, switchORFannotation$isoform_id
            )]

            switchAnalyzeRlist$orfAnalysis$trimmedStartGenomic <- trimmedCases$pfamStartGenomic[match(
                switchAnalyzeRlist$orfAnalysis$isoform_id, trimmedCases$isoform_id
            )]
        }

        switchAnalyzeRlist$runInfo$extractSequence <- list(
            removeShortAAseq = removeShortAAseq,
            removeLongAAseq = seqWasTrimmed
        )

    }
    return(switchAnalyzeRlist)
}


### Helper functions
myAllFindORFsinSeq <- function(
    dnaSequence, # A DNAString object containing the DNA nucleotide sequence of to analyze for open reading frames
    codonAnnotation = data.frame(
        codons = c("ATG", "TAA", "TAG", "TGA"),
        meaning = c('start', 'stop', 'stop', 'stop'),
        stringsAsFactors = FALSE
    ), # a data.frame with two collums: 1) A collumn called \'codons\' containing a vector of capitalized three-letter strings with codons to analyze. 2) A collumn called \'meaning\' containing the corresponding meaning of the codons. These must be either \'start\' or \'stop\'. See defult data.frame for example. Default are canonical \'ATG\' as start start codons and \'TAA\', \'TAG\', \'TGA\' as stop codons.
    filterForPostitions = NULL # A vector of which transcipt start site potitions to extract

) {
    ### To do
    # might be made more efficeint using matchPDict()
    # Find all codons of interes in the supplied dnaSequence
    myFindPotentialStartsAndStops <- function(dnaSequence, codons) {
        # use matchPattern() to find the codons
        myCodonPositions <-
            sapply(codons, function(x)
                matchPattern(x, dnaSequence))

        # extract the position of the codons
        myCodonPositions <-
            lapply(myCodonPositions, function(x)
                data.frame(position = start(x@ranges)))
        myCodonPositions <-
            myListToDf(
                myCodonPositions,
                ignoreColNames = TRUE,
                addOrignAsRowNames = FALSE,
                addOrignAsColumn = TRUE,
                addOrgRownames = FALSE
            )

        # Massage the data.frame
        colnames(myCodonPositions)[2] <- 'codon'
        myCodonPositions <-
            myCodonPositions[sort.list(
                myCodonPositions$position,
                decreasing = FALSE
            ), ]
        rownames(myCodonPositions) <- NULL

        return(myCodonPositions)
    }
    codonsOfInterest <-
        myFindPotentialStartsAndStops(
            dnaSequence = dnaSequence,
            codons = codonAnnotation$codons
        )

    # Add stop codon at the end to make sure ORFs are allowed to continue over the edge of the transcript (by simmulating the last codons in each reading frame is a stop codon)
    codonsOfInterest <-
        rbind(
            codonsOfInterest,
            data.frame(
                position = nchar(dnaSequence) - 2 - 2:0,
                codon = codonAnnotation$codons[which(
                    codonAnnotation$meaning == 'stop'
                )][1],
                stringsAsFactors = FALSE
            )
        )

    ### Annotate with meaning of condon
    codonsOfInterest$meaning <-
        codonAnnotation$meaning[match(
            codonsOfInterest$codon,
            codonAnnotation$codons
        )]

    # Filter for annotated start sites
    if (!is.null(filterForPostitions)) {
        codonsOfInterest <-
            codonsOfInterest[which(
                codonsOfInterest$position %in% filterForPostitions |
                    codonsOfInterest$meaning != 'start'
            ), ]
        if (!nrow(codonsOfInterest)) {
            return(data.frame(NULL))
        }
    }


    # Loop over the 3 possible reading frames
    myORFs <- list()
    for (i in 0:2) {
        # Reduce the data.frame with codons to only contain those within that reading frame
        localCodonsOfInterest <-
            codonsOfInterest[which(codonsOfInterest$position %% 3 == i), ]

        # if there are any look for ORFs
        if (nrow(localCodonsOfInterest) == 0) {
            next
        }

        ### Create rle objet to allow analysis of order for start/stop codons
        myRle <- rle(localCodonsOfInterest$meaning)

        ### Trim codons
        # Remove rows so the first codon is a start codon (not a stop codon)
        redoRLE <- FALSE
        if (myRle$values[1] == 'stop') {
            localCodonsOfInterest <-
                localCodonsOfInterest[
                    (myRle$lengths[1] + 1):(nrow(localCodonsOfInterest))
                , ]

            #redo rle nessesary
            redoRLE <- TRUE
        }
        # Remove rows so the last codon is a stop codon (not a start codon)
        if (tail(myRle$values, 1) == 'start') {
            localCodonsOfInterest <-
                localCodonsOfInterest[
                    1:(nrow(localCodonsOfInterest) - tail(myRle$lengths, 1))
                , ]

            #redo rle nessesary
            redoRLE <- TRUE
        }
        # redo RLE if anything was removed
        if (redoRLE) {
            myRle <- rle(localCodonsOfInterest$meaning)
        }

        # make sure that there are both start and stop (left)
        if (!all(c('start', 'stop') %in% localCodonsOfInterest$meaning)) {
            next
        }

        # Remove duplicates to make sure that I only take the first start codon (if multiple are pressent) and the first stop codon if multiple are pressent
        localCodonsOfInterest <-
            localCodonsOfInterest[
                c(1, cumsum(myRle$lengths) + 1)[1:length(myRle$lengths)]
            , ]

        ### Loop over the resulting table and concattenate the ORFs
        localCodonsOfInterest$myOrf <-
            as.vector(sapply(1:(nrow(
                localCodonsOfInterest
            ) / 2), function(x)
                rep(x, 2)))
        localCodonsOfInterestSplit <- split(
                localCodonsOfInterest$position,
                f = localCodonsOfInterest$myOrf
        )
        myORFs[[i + 1]] <-
            list(myListToDf(
                lapply(localCodonsOfInterestSplit, function(x)
                    data.frame(start = x[1], end = x[2]))
            ))
    }
    if (length(myORFs) == 0) {
        return(data.frame(NULL))
    }

    # Massage from list to data.frame
    myORFs <-
        myORFs[which(!sapply(myORFs, is.null))] # remove potential empty ones
    myORFs <-
        myListToDf(lapply(myORFs, function(x)
            x[[1]])) # x[[1]] nessary because of the extra list i had to introduce to avoid classes due to single entry lists

    ### Make final calculatons
    # Subtract 1 since the positions I here have worked with are the start of the stop codon positions (and I do not want the stop codon to be included)
    myORFs$end <- myORFs$end - 1
    # add lengths
    myORFs$length <-
        myORFs$end - myORFs$start + 1 # the +1 is because both are included

    # sort
    myORFs <- myORFs[sort.list(myORFs$start, decreasing = FALSE), ]
    rownames(myORFs) <- NULL

    return(myORFs)
}

analyzeORFforPTC <- function(
    aORF,           # A data.frame containing the transcript coordinats and length of the main ORF
    exonStructure,  # A data.frame with the exon structure of the transcript
    PTCDistance = 50  # A numeric giving the premature termination codon-distance: The minimum distance from a STOP to the final exon-exon junction, for a transcript to be marked as NMD-sensitive
){
    # Have to be done diffetly for each strand due to the reversed exon structure
    if (exonStructure$strand[1] == '+') {
        # Calculate exon cumSums (because they "translate" the genomic coordinats to transcript coordinats )
        exonCumsum      <- cumsum(c(0,      exonStructure$width))

        # Calculate wich exon the start and stop codons are in
        cdsStartExonIndex   <- max(which(aORF$start >  exonCumsum))
        cdsEndExonIndex     <- max(which(aORF$end   >  exonCumsum))
        # Calcualte genomic position of the ORF
        cdsStartGenomic <-
            exonStructure$start[cdsStartExonIndex]  +
            (aORF$start - exonCumsum[cdsStartExonIndex]  - 1) # -1 because both intervals are inclusive [a;b] (meaning the start postition will be counted twice)
        cdsEndGenomic   <-
            exonStructure$start[cdsEndExonIndex]    +
            (aORF$end   - exonCumsum[cdsEndExonIndex]  - 1) # -1 because both intervals are inclusive [a;b] (meaning the start postition will be counted twice)

        # Calculate length from stop codon to last splice junction
        #               transcript pos for last exon:exon junction  - transcript pos for stop codon
        stopDistance <-
            exonCumsum[length(exonCumsum) - 1]  -
            aORF$end # (positive numbers means the stop position upstream of the last exon exon junction )

        # Calculate which exon the stop codon are in compared to the last exon exon junction
        # stop in exon      total nr exon
        junctionIndex <- cdsEndExonIndex - nrow(exonStructure)
    }
    if (exonStructure$strand[1] == '-') {
        # Calculate exon cumSums (because they "translate" the genomic coordinats to transcript coordinats )
        exonRevCumsum   <- cumsum(c(0, rev(exonStructure$width)))

        # Calculate wich exon the start and stop codons are in
        cdsStartExonIndex   <-
            max(which(aORF$start >  exonRevCumsum))
        cdsEndExonIndex     <-
            max(which(aORF$end   >  exonRevCumsum))

        # create a vector to translate indexes to reverse (needed when exon coordinats are extracted)
        reversIndexes <- nrow(exonStructure):1

        # Calcualte genomic position of the ORF (end and start are switched in order to return them so start < end (default of all formating))
        cdsStartGenomic <-
            exonStructure$end[reversIndexes[cdsStartExonIndex]]  -
            (aORF$start - exonRevCumsum[cdsStartExonIndex] - 1) # -1 because both intervals are inclusive [a;b] (meaning the start postition will be counted twice)
        cdsEndGenomic   <-
            exonStructure$end[reversIndexes[cdsEndExonIndex]]  -
            (aORF$end   - exonRevCumsum[cdsEndExonIndex] - 1) # -1 because both intervals are inclusive [a;b] (meaning the start postition will be counted twice)

        # Calculate length from stop codon to last splice junction
        #               transcript pos for last exon:exon junction  - transcript pos for stop codon
        stopDistance <-
            exonRevCumsum[length(exonRevCumsum) - 1]    - aORF$end # (positive numbers means the stop position upstream of the last exon exon junction )

        # Calculate which exon the stop codon are in compared to the last exon exon junction
        # stop in exon      total nr exon
        junctionIndex <- cdsEndExonIndex - nrow(exonStructure)
    }

    return(
        data.frame(
            orfStarExon = cdsStartExonIndex,
            orfEndExon = cdsEndExonIndex,
            orfStartGenomic = cdsStartGenomic,
            orfEndGenomic = cdsEndGenomic,
            stopDistanceToLastJunction = stopDistance,
            stopIndex = junctionIndex,
            PTC = (stopDistance >= PTCDistance) & junctionIndex != 0
        )
    )

}

Try the IsoformSwitchAnalyzeR package in your browser

Any scripts or data that you put into this service are public.

IsoformSwitchAnalyzeR documentation built on Nov. 8, 2020, 5:36 p.m.