R/import_data.R

Defines functions preFilter importSalmonData prepareSalmonFileDataFrame importRdata importIsoformExpression importGTF importCufflinksFiles

Documented in importCufflinksFiles importGTF importIsoformExpression importRdata importSalmonData preFilter prepareSalmonFileDataFrame

### Acutal import functions
importCufflinksFiles <- function(
    ### Core arguments
    pathToGTF,
    pathToGeneDEanalysis,
    pathToIsoformDEanalysis,
    pathToGeneFPKMtracking,
    pathToIsoformFPKMtracking,
    pathToIsoformReadGroupTracking,
    pathToSplicingAnalysis = NULL,
    pathToReadGroups,
    pathToRunInfo,
    isoformNtFasta = NULL,

    ### Advanced arguments
    fixCufflinksAnnotationProblem = TRUE,
    addIFmatrix = TRUE,
    estimateDifferentialGeneRange = TRUE,
    quiet = FALSE
) {
    ### Test that files exist
    if (TRUE) {
        if( pathToGTF == '' ) {
            stop(
                paste(
                    'The \'pathToGTF\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                    '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                    'to import your own data? The system.file() should only be used',
                    'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                    'To access your own data simply provide the string to the directory with the data as:',
                    '"path/to/quantification/".',
                    sep=' '
                )
            )
        }
        if( pathToGeneDEanalysis == '' ) {
            stop(
                paste(
                    'The \'pathToGeneDEanalysis\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                    '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                    'to import your own data? The system.file() should only be used',
                    'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                    'To access your own data simply provide the string to the directory with the data as:',
                    '"path/to/quantification/".',
                    sep=' '
                )
            )
        }

        # pathToGTF
        if (!file.exists(pathToGTF)) {
            stop('The \'pathToGTF\' argument does not point to an acutal file')
        }
        if( !is.null(isoformNtFasta)) {
            if( !is.character( isoformNtFasta)) {
                stop('The \'isoformNtFasta\' argument must be a charachter string.')
            }

            if( any(isoformNtFasta == '') ) {
                stop(
                    paste(
                        'The \'isoformNtFasta\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the directory with the data as:',
                        '"path/to/quantification/".',
                        sep=' '
                    )
                )
            }

            if( any( ! sapply(isoformNtFasta, file.exists) ) ) {
                stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to exist.')
            }

            if( any(! grepl('\\.fa|\\.fasta|\\.fa.gz|\\.fasta.gz', isoformNtFasta)) ) {
                stop('The file pointed to via the \'isoformNtFasta\' argument does not seem to be a fasta file...')
            }
        }

        # DE
        if (!file.exists(pathToGeneDEanalysis))    {
            stop('The \'pathToGeneDEanalysis\' argument does not point to an acutal file')
        }
        if (!file.exists(pathToIsoformDEanalysis)) {
            stop('The \'pathToIsoformDEanalysis\' argument does not point to an acutal file')
        }
        # Tracking
        if (!file.exists(pathToGeneFPKMtracking))    {
            stop('The \'pathToGeneFPKMtracking\' argument does not point to an acutal file')
        }
        if (!file.exists(pathToIsoformFPKMtracking)) {
            stop(
                'The \'pathToIsoformFPKMtracking\' argument does not point to an acutal file'
            )
        }
        if (!file.exists(pathToIsoformReadGroupTracking)) {
            stop(
                'The \'pathToIsoformReadGroupTracking\' argument does not point to an acutal file'
            )
        }
        # splicing
        if (!is.null(pathToSplicingAnalysis)) {
            if (!file.exists(pathToSplicingAnalysis)) {
                stop(
                    'The \'pathToSplicingAnalysis\' argument does not point to an acutal file'
                )
            }
        }
        # info
        if (!file.exists(pathToReadGroups)) {
            stop('The \'pathToReadGroups\' argument does not point to an acutal file')
        }
        if (!file.exists(pathToRunInfo))    {
            stop('The \'pathToRunInfo\' argument does not point to an acutal file')
        }
    }

    ### Import the supplied files (not gtf)
    if (TRUE) {
        if (!quiet) { message('Step 1 of 5: Importing data...')}
        suppressMessages(
            geneDiffanalysis     <-
                readr::read_tsv(
                    file = pathToGeneDEanalysis,
                    col_names = TRUE
                )
        )
        suppressMessages(
            isoformDiffanalysis  <-
                readr::read_tsv(
                    file = pathToIsoformDEanalysis,
                    col_names = TRUE
                )
        )
        suppressMessages(
            geneAnnotation       <-
                readr::read_tsv(
                    file = pathToGeneFPKMtracking,
                    col_names = TRUE
                )
        )
        suppressMessages(
            isoformAnnotation    <-
                readr::read_tsv(
                    file = pathToIsoformFPKMtracking,
                    col_names = TRUE
                )
        )
        suppressMessages(
            isoRepExp        <-
                read.table(
                    file = pathToIsoformReadGroupTracking,
                    header = TRUE,
                    sep='\t',
                    stringsAsFactors = FALSE
                )
        )
        if( !is.null(pathToSplicingAnalysis) ) {
            suppressMessages(
                cuffSplicing         <-
                    readr::read_tsv(
                        file = pathToSplicingAnalysis,
                        col_names = TRUE
                    )
            )
        }

        suppressMessages(
            readGroup <-
                read.table(
                    file = pathToReadGroups,
                    sep='\t',
                    header = TRUE
                )
        )
        suppressMessages(
            runInfo   <-
                readr::read_tsv(
                    file = pathToRunInfo,
                    col_names = TRUE
                )
        )
    }

    ### "Test" that the data.files are what they are supposed to be
    if (TRUE) {
        ### gene diff analysis
        q1 <-
            !all(
                colnames(geneDiffanalysis) %in% c(
                    "test_id",
                    "gene_id",
                    "gene",
                    "locus",
                    "sample_1",
                    "sample_2",
                    "status",
                    "value_1",
                    "value_2",
                    "log2(fold_change)",
                    "test_stat",
                    "p_value",
                    "q_value",
                    "significant"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to pathToGeneDEanalysis does not appear',
                'to be the result of the CuffDiff gene expression analysis.'
            ))
        }
        ### transcript diff analysis
        q1 <-
            !all(
                colnames(isoformDiffanalysis) %in% c(
                    "test_id",
                    "gene_id",
                    "gene",
                    "locus",
                    "sample_1",
                    "sample_2",
                    "status",
                    "value_1",
                    "value_2",
                    "log2(fold_change)",
                    "test_stat",
                    "p_value",
                    "q_value",
                    "significant"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to isoformDiffanalysis does not appear to',
                'be the result of the CuffDiff transcript expression analysis.'
            ))
        }

        q2 <-
            sum(grepl(
                'TCONS', isoformDiffanalysis$test_id
            )) != nrow(isoformDiffanalysis)
        if (q2) {
            warning(paste(
                'It looks like you have NOT been doing transcript\n',
                'reconstruction/assembly with Cufflinks/Cuffdiff.\n',
                'If you have not reconstructed transcripts we receomend to use Kallisto or Salmon\n',
                'to do the quantification instead - they are more accurate and have better biase correction methods.'
            ))
        }



        ### gene annoation
        q1 <-
            !all(
                colnames(geneAnnotation)[1:8] %in% c(
                    "tracking_id",
                    "class_code",
                    "nearest_ref_id",
                    "gene_id",
                    "gene_short_name",
                    "tss_id",
                    "locus",
                    "length"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to geneAnnotation does not appear to be the',
                'gene FPKM traccking of the CuffDiff gene FPKM trascking analysis.'
            ))
        }
        ### transcript annoation
        q1 <-
            !all(
                colnames(isoformAnnotation)[1:8] %in% c(
                    "tracking_id",
                    "class_code",
                    "nearest_ref_id",
                    "gene_id",
                    "gene_short_name",
                    "tss_id",
                    "locus",
                    "length"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to isoformAnnotation does not appear to be',
                'the isoform FPKM tracking of the CuffDiff transcript analysis.'
            ))
        }

        ### rep expression
        q1 <-
            !all(
                colnames(isoRepExp)[1:4] %in% c(
                    "tracking_id", "condition", "replicate", "raw_frags"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to pathToIsoformCountTracking does not',
                'appear to be the isoform count tracking of the CuffDiff',
                'transcript analysis.'
            ))
        }

        ### splicing analysis
        if( !is.null(pathToSplicingAnalysis) ) {
            q1 <-
                !all(
                    colnames(cuffSplicing) %in% c(
                        "test_id",
                        "gene_id",
                        "gene",
                        "locus",
                        "sample_1",
                        "sample_2",
                        "status",
                        "value_1",
                        "value_2",
                        "sqrt(JS)",
                        "test_stat",
                        "p_value",
                        "q_value",
                        "significant"
                    )
                )
            if (q1) {
                stop(
                    'The file supplied to cuffSplicing does not appear to be the',
                    'result of the CuffDiff differential analysis of alternative splicing.'
                )
            }

        }

        ### Read grous
        q1 <-
            !all(
                colnames(readGroup) %in% c(
                    "file",
                    "condition",
                    "replicate_num",
                    "total_mass",
                    "norm_mass",
                    "internal_scale",
                    "external_scale"
                )
            )
        q2 <-
            !all(readGroup$condition %in% unique(
                unlist(geneDiffanalysis[, c('sample_1', 'sample_2')]))
            )
        if (q1 | q2) {
            stop(paste(
                'The file supplied to readGroup does not appear to be the',
                'pathToReadGroups of the CuffDiff transcript analysis.'
            ))
        }

        ### Run info
        q1 <- !all(colnames(runInfo) %in% c("param", 'value'))
        q2 <-
            !all(
                runInfo$param %in% c(
                    "cmd_line",
                    "version",
                    "SVN_revision",
                    "boost_version"
                )
            )
        if (q1 | q2) {
            stop(paste(
                'The file supplied to runInfo does not appear to be',
                'the runInfo of the CuffDiff transcript analysis.'
            ))
        }

    }

    ### Massage and merge gene and isoform annoation and DE analysis
    if (TRUE) {
        if (!quiet) { message('Step 2 of 5: Merging gene and isoform expression...')}
        ### Design matrix
        readGroup$sample_name <-
            stringr::str_c(readGroup$condition, '_', readGroup$replicate_num)
        designMatrix <- readGroup[, c('sample_name', 'condition')]
        colnames(designMatrix) <- c('sampleID', 'condition')

        ### Massage data frames
        if (TRUE) {
            # gene
            geneDiffanalysis <-
                geneDiffanalysis[, -which(
                    colnames(geneDiffanalysis) %in%
                        c('test_id', 'gene', 'locus', 'test_stat')
                )]
            colnames(geneDiffanalysis)[4:ncol(geneDiffanalysis)] <-
                paste(
                    "gene_",
                    colnames(geneDiffanalysis)[4:ncol(geneDiffanalysis)] ,
                    sep = "") # add gene to the colnames so they can be destinquished from the gene diff data
            colnames(geneDiffanalysis) <- gsub(
                    'gene_log2.fold_change.',
                    'gene_log2_fold_change',
                    colnames(geneDiffanalysis)
                )

            # info
            colnames(isoformAnnotation)[1] <- 'isoform_id'
            isoformAnnotation2 <-
                isoformAnnotation[, na.omit(match(
                    c(
                        'isoform_id',
                        'gene_id',
                        'gene_short_name',
                        'nearest_ref_id',
                        'class_code',
                        'tss_id',
                        'CDS_id',
                        'length',
                        'locus'
                    ),
                    colnames(isoformAnnotation)
                ))]

            colnames(isoformAnnotation2)[which(
                colnames(isoformAnnotation2) == 'gene_short_name'
            )] <- 'gene_name'

            # iso
            isoformDiffanalysis <- isoformDiffanalysis[, which(
                !colnames(isoformDiffanalysis) %in%
                    c('gene_id', 'gene', 'test_stat')
            )]
            colnames(isoformDiffanalysis)[5:ncol(isoformDiffanalysis)] 	<-
                paste(
                    "iso_",
                    colnames(isoformDiffanalysis)[5:ncol(isoformDiffanalysis)],
                    sep = "") # add gene to the colnames so they can be destinquished from the gene diff data
            colnames(isoformDiffanalysis)[1] <- 'isoform_id'
            colnames(isoformDiffanalysis) <-
                gsub(
                    'iso_log2.fold_change.',
                    'iso_log2_fold_change',
                    colnames(isoformDiffanalysis)
                )

            # rep expression
            isoRepExp2 <-
                isoRepExp[, c("tracking_id",
                              "condition",
                              "replicate",
                              "raw_frags")]
            colnames(isoRepExp2)[1] <- 'isoform_id'
            isoRepExp2$rep <-
                paste(isoRepExp2$condition, isoRepExp2$replicate, sep = '_')
            isoRepExp2 <-
                reshape2::dcast(data = isoRepExp2,
                                isoform_id ~ rep,
                                value.var = 'raw_frags')


            ### rep fpkm
            # iso
            isoRepFpkm <- isoRepExp[, c(
                "tracking_id",
                "condition",
                "replicate",
                "FPKM"
            )]
            colnames(isoRepFpkm)[1] <- 'isoform_id'
            isoRepFpkm$rep <-
                paste(isoRepFpkm$condition, isoRepFpkm$replicate, sep = '_')
            isoRepFpkm <-
                reshape2::dcast(data = isoRepFpkm,
                                isoform_id ~ rep,
                                value.var = 'FPKM')

            ### Gene
            isoRepFpkm2 <- isoRepFpkm
            isoRepFpkm2$gene_id <- isoformAnnotation2$gene_id[match(
                isoRepFpkm2$isoform_id, isoformAnnotation2$isoform_id
            )]
            isoRepFpkm2$isoform_id <- NULL

            geneRepFpkm <- isoformToGeneExp(isoRepFpkm2, quiet = TRUE)

            ### Calculate means
            rownames(isoRepFpkm) <- isoRepFpkm$isoform_id
            isoRepFpkm$isoform_id <- NULL
            isoMean <- rowMeans(isoRepFpkm)

            rownames(geneRepFpkm) <- geneRepFpkm$gene_id
            geneRepFpkm$gene_id <- NULL
            geneMean <- rowMeans(geneRepFpkm)

            ### add means
            geneDiffanalysis$gene_overall_mean <- geneMean[match(
                geneDiffanalysis$gene_id, names(geneMean)
            )]

            isoformDiffanalysis$iso_overall_mean <- isoMean[match(
                isoformDiffanalysis$isoform_id, names(isoMean)
            )]

        }

        ### Extract standard error
        if (TRUE) {
            ### Tjek if the Isoform CI collums are switches
            ciLowColumn <-
                which(grepl('_conf_lo', colnames(isoformAnnotation)))[1]
            ciHighColumn <-
                which(grepl('_conf_hi', colnames(isoformAnnotation)))[1]

            if (all(isoformAnnotation[, ciHighColumn] >= isoformAnnotation[, ciLowColumn])) {
                highString <- '_conf_hi'
            } else {
                highString <- '_conf_lo'
            }

            ### extract isoform sddev from CI
            # fpkm
            isoformFPKM <-
                isoformAnnotation[, which(grepl(
                    'isoform_id|_FPKM', colnames(isoformAnnotation)
                ))]
            isoformFPKM <- reshape2::melt(isoformFPKM, id.vars = 'isoform_id')
            isoformFPKM$variable <-
                gsub('_FPKM$', '', isoformFPKM$variable)
            colnames(isoformFPKM)[3] <- 'expression'
            # ci high
            isoformFPKMciHi <-
                isoformAnnotation[, which(grepl(
                    paste('isoform_id|', highString, sep = ''),
                    colnames(isoformAnnotation)
                ))]
            isoformFPKMciHi <-
                reshape2::melt(isoformFPKMciHi, id.vars = 'isoform_id')
            isoformFPKMciHi$variable <-
                gsub(highString, '', isoformFPKMciHi$variable)
            colnames(isoformFPKMciHi)[3] <- 'ci_hi'
            # stderr
            isoformFPKMcombined <-
                dplyr::inner_join(isoformFPKM,
                      isoformFPKMciHi,
                      by = c('isoform_id', 'variable'))
            isoformFPKMcombined$iso_stderr <-
                (isoformFPKMcombined$ci_hi - isoformFPKMcombined$expression) / 2 # How it's done in cufflinks source code
            isoformFPKMcombined$expression <- NULL
            isoformFPKMcombined$ci_hi <- NULL
            colnames(isoformFPKMcombined) <-
                c('isoform_id', 'sample_name', 'iso_stderr')

            ### Tjek if the gene CI collums are switches
            ciLowColumn <-
                which(grepl('_conf_lo', colnames(geneAnnotation)))[1]
            ciHighColumn <-
                which(grepl('_conf_hi', colnames(geneAnnotation)))[1]

            if (all(
                geneAnnotation[, ciHighColumn] >= geneAnnotation[, ciLowColumn])
            ) {
                highString <- '_conf_hi'
            } else {
                highString <- '_conf_lo'
            }

            ### extract gene sddev from CI
            # fpkm
            geneFPKM <- geneAnnotation[, which(grepl(
                    'tracking_id|_FPKM', colnames(geneAnnotation)
                ))]
            geneFPKM <- reshape2::melt(geneFPKM, id.vars = 'tracking_id')
            geneFPKM$variable <-
                gsub('_FPKM$', '', geneFPKM$variable)
            colnames(geneFPKM)[3] <- 'expression'
            # ci high
            geneFPKMciHi <- geneAnnotation[, which(grepl(
                    paste('tracking_id|', highString, sep = ''),
                    colnames(geneAnnotation)
                ))]
            geneFPKMciHi <- reshape2::melt(geneFPKMciHi, id.vars = 'tracking_id')
            geneFPKMciHi$variable <- gsub(highString, '', geneFPKMciHi$variable)
            colnames(geneFPKMciHi)[3] <- 'ci_hi'
            # stderr
            geneFPKMcombined <-
                dplyr::inner_join(geneFPKM,
                      geneFPKMciHi,
                      by = c('tracking_id', 'variable'))
            geneFPKMcombined$iso_stderr <-
                (geneFPKMcombined$ci_hi - geneFPKMcombined$expression) / 2 # how it's done in cufflinks sourece code
            geneFPKMcombined$expression <- NULL
            geneFPKMcombined$ci_hi <- NULL
            colnames(geneFPKMcombined) <-
                c('gene_id', 'sample_name', 'gene_stderr')


            ## Merge stderr with DE analysis
            #isoformDiffanalysis <-
            #    merge(
            #        isoformDiffanalysis,
            #        isoformFPKMcombined,
            #        by.x = c('isoform_id', 'sample_2'),
            #        by.y = c('isoform_id', 'sample_name')
            #    )
            isoformDiffanalysis <- suppressWarnings( dplyr::inner_join(
                isoformDiffanalysis,
                isoformFPKMcombined,
                by=c("sample_2" = "sample_name", "isoform_id" = "isoform_id")
            ) )
            colnames(isoformDiffanalysis)[which( grepl(
                'iso_stderr', colnames(isoformDiffanalysis))
            )] <- 'iso_stderr_2'

            #isoformDiffanalysis <- merge(
            #        isoformDiffanalysis,
            #        isoformFPKMcombined,
            #        by.x = c('isoform_id', 'sample_1'),
            #        by.y = c('isoform_id', 'sample_name')
            #    )
            isoformDiffanalysis <- suppressWarnings( dplyr::inner_join(
                isoformDiffanalysis,
                isoformFPKMcombined,
                by=c("sample_2" = "sample_name", "isoform_id" = "isoform_id")
            ) )
            colnames(isoformDiffanalysis)[which(grepl(
                'iso_stderr$',
                colnames(isoformDiffanalysis),
                perl = TRUE
            ))] <- c('iso_stderr_1')

            isoformDiffanalysis <-
                isoformDiffanalysis[, c(
                    'isoform_id',
                    'sample_1',
                    'sample_2',
                    'iso_status',
                    'iso_overall_mean',
                    'iso_value_1',
                    'iso_value_2',
                    'iso_stderr_1',
                    'iso_stderr_2',
                    'iso_log2_fold_change',
                    'iso_p_value',
                    'iso_q_value',
                    'iso_significant'
                )]

            ### Extract and add gene stderr
            #geneDiffanalysis <-
            #    merge(
            #        geneDiffanalysis,
            #        geneFPKMcombined,
            #        by.x = c('gene_id', 'sample_2'),
            #        by.y = c('gene_id', 'sample_name')
            #    )
            geneDiffanalysis <- suppressWarnings( dplyr::inner_join(
                geneDiffanalysis,
                geneFPKMcombined,
                by=c("sample_2" = "sample_name", "gene_id" = "gene_id")
            ) )
            colnames(geneDiffanalysis)[ which(grepl(
                'gene_stderr', colnames(geneDiffanalysis)
            ))] <- 'gene_stderr_2'

            #geneDiffanalysis <- merge(
            #        geneDiffanalysis,
            #        geneFPKMcombined,
            #        by.x = c('gene_id', 'sample_1'),
            #        by.y = c('gene_id', 'sample_name')
            #    )
            geneDiffanalysis <- suppressWarnings( dplyr::inner_join(
                geneDiffanalysis,
                geneFPKMcombined,
                by=c("sample_1" = "sample_name", "gene_id" = "gene_id")
            ) )
            colnames(geneDiffanalysis)[which(grepl(
                'gene_stderr$',
                colnames(geneDiffanalysis),
                perl = TRUE
            ))] <- c('gene_stderr_1')

            geneDiffanalysis <-
                geneDiffanalysis[, c(
                    'gene_id',
                    'sample_1',
                    'sample_2',
                    'gene_status',
                    'gene_overall_mean',
                    'gene_value_1',
                    'gene_value_2',
                    'gene_stderr_1',
                    'gene_stderr_2',
                    'gene_log2_fold_change',
                    'gene_p_value',
                    'gene_q_value',
                    'gene_significant'
                )]

        }


        ### Merge data
        if (TRUE) {
            ### Meger gene DE and annotation data
            isoformData <-
                dplyr::inner_join(isoformAnnotation2, geneDiffanalysis, by = 'gene_id')

            ### Merge with iso DE
            isoformData <-
                dplyr::inner_join(
                    isoformData,
                    isoformDiffanalysis,
                    by = c('isoform_id', 'sample_1', 'sample_2')
                )

            ### Massage again
            colnames(isoformData)[which(
                colnames(isoformData) == 'tss_id'
            )] <- 'TSS_group_id'

        }

    }

    ### Obtain transcript structure information
    if (TRUE) {
        if (!quiet) { message('Step 3 of 5: Obtaining annotation...')}

        ### Import file
        tmp <- capture.output(
            suppressWarnings(
                suppressMessages(
                    exonFeatures <-  rtracklayer::import(pathToGTF, format = 'gtf')
                )
            )
        )
        if (length(exonFeatures) == 0)
            stop("No exon information extracted from GTF")

        ### Filter for what is needed
        exonFeatures <- exonFeatures[
            which(tolower(exonFeatures$type) == 'exon'),
            c('gene_id', 'transcript_id')
            ]

        ### rename
        colnames(exonFeatures@elementMetadata) <- gsub(
            'transcript_id', 'isoform_id',
            colnames(exonFeatures@elementMetadata)
        )
    }

    ### import nulceotide fasta file
    if(TRUE) {
        addIsoformNt <- FALSE

        if( !is.null(isoformNtFasta) ) {
            isoformNtSeq <- do.call(
                c,
                lapply(isoformNtFasta, function(aFile) {
                    Biostrings::readDNAStringSet(
                        filepath = isoformNtFasta, format = 'fasta'
                    )
                })
            )

            if(!is(isoformNtSeq, "DNAStringSet")) {
                stop('The fasta file supplied to \'isoformNtFasta\' does not contain the nucleotide (DNA) sequence...')
            }

            ### Remove preceeding ref|
            if(
                sum(grepl('^ref\\|', names(isoformNtSeq))) == length( isoformNtSeq )
            ) {
                names(isoformNtSeq) <- gsub('^ref\\|', '', names(isoformNtSeq))
            }

            ### Remove potential name duplication
            isoformNtSeq <- isoformNtSeq[which(
                ! duplicated(names(isoformNtSeq))
            )]

            if( ! all(isoformData$isoform_id %in% names(isoformNtSeq)) ) {
                warning(
                    paste(
                        'The fasta file supplied to \'isoformNtFasta\' does not contain the',
                        'nucleotide (DNA) sequence for all isoforms annotated and will not be added!',
                        '\nSpecifically:\n',
                        length(unique(isoformData$isoform_id)), 'isoforms were annotated in the GTF\n',
                        length(unique(names(isoformNtSeq))), 'isoforms have a sequence.\n',
                        'Only', length(intersect(names(isoformNtSeq), isoformData$isoform_id)), 'overlap.\n',
                        length(setdiff(unique(isoformData$isoform_id), names(isoformNtSeq))), 'annoated isoforms isoforms had no corresponding nucleotide sequence\n',

                        '\nIf there is no overlap (as in zero or close) there are two options:\n',
                        '1) The files do not fit together (different databases, versions etc)',
                        '(no fix except using propperly paired files).\n',
                        '2) It is somthing to do with how the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
                        '    3 Examples from GTF are :',
                        paste0( sample(unique(isoformData$isoform_id), min(3, nrow(isoformData)) ), collapse = ', '),'\n',
                        '    3 Examples of isoform sequence are  :',
                        paste0( sample(names(isoformNtSeq), min(3, length(isoformNtSeq)) ), collapse = ', '),'\n',


                        '\nIf there is a large overlap but still far from complete there are 3 possibilites:\n',
                        '1) The files do not fit together (different databases versions)',
                        '(no fix except using propperly paired files).\n',
                        '2) The isoforms quantified have their nucleotide sequence stored in multiple fasta files (common for Ensembl).',
                        'Just supply a vector with the path to each of them to the \'isoformNtFasta\' argument.\n',
                        '3) One file could contain non-chanonical chromosomes while the other do not',
                        '(might be solved using the \'removeNonConvensionalChr\' argument.)\n',
                        '4) It is somthing to do with how a subset of the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n\n',
                        sep = ' '
                    )
                )
            } else {
                addIsoformNt <- TRUE
            }

            ### Subset to annotated isoforms
            isoformNtSeq <- isoformNtSeq[which(
                names(isoformNtSeq) %in% isoformData$isoform_id
            )]
            #if( !length(isoformNtSeq) ) {
            #    addIsoformNt <- FALSE
            #}
        }
    }

    ### Check it is the same transcripts in transcript structure and expression info
    if (TRUE) {
        myUnion     <-
            unique(c(
                isoformData$isoform_id,
                exonFeatures$isoform_id,
                isoRepExp2$isoform_id
            ))
        myIntersect <- intersect(
            intersect(isoformData$isoform_id, exonFeatures$isoform_id),
            isoRepExp2$isoform_id
        )

        # If there are descripencies
        if(length(myIntersect) == 0) {
            stop(
                paste(
                    'No overlap between isoform annotation',
                    'and isoform expression data was found',
                    sep=' '
                )
            )
        }


        if (length(myUnion) != length(myIntersect)) {
            isoformData <- isoformData[which(
                isoformData$isoform_id     %in% myIntersect), ]
            exonFeatures <- exonFeatures[which(
                    exonFeatures$isoform_id    %in% myIntersect), ]
            isoRepExp2 <- isoRepExp2[which(
                isoRepExp2$isoform_id       %in% myIntersect), ]

            if (!quiet) {
                message(
                    paste(
                        'There were discrepencies between the GTF and the',
                        'expression analysis files. To solve this',
                        abs(length(myUnion) - length(myIntersect)) ,
                        'transcripts were removed.',
                        sep = ' '
                    )
                )
            }
        }
    }

    ### Fix to correct for Cufflinks annotation problem where cufflinks assignes
    # transcripts from several annotated genes to 1 cuffgene
    if (   fixCufflinksAnnotationProblem ) {
        if (!quiet) { message('Step 4 of 3: Fixing cufflinks annotation problem...')}

        geneName <- unique(isoformData[, c('gene_id', 'gene_name')])
        geneNameSplit <-
            split(geneName$gene_name , f = geneName$gene_id)
        # remove all unique
        geneNameSplit <-
            geneNameSplit[which(sapply(geneNameSplit, function(x)
                length(unique(x))) > 1)]

        if (length(geneNameSplit) > 0) {
            # if there are any problems
            #get indexes of those affected
            geneNameIndexesData     <-
                which(isoformData$gene_id %in% names(geneNameSplit))
            geneNameIndexesFeatures <-
                which(exonFeatures$spliceR.gene_id %in% names(geneNameSplit))

            # combine names of cuffgenes and gene short name
            isoformData$gene_id[geneNameIndexesData]              <-
                paste(isoformData$gene_id[geneNameIndexesData]        ,
                      isoformData$gene_name[geneNameIndexesData],
                      sep = ':')
            exonFeatures$spliceR.gene_id[geneNameIndexesFeatures] <-
                paste(exonFeatures$spliceR.gene_id[geneNameIndexesFeatures],
                      exonFeatures$spliceR.gene_name[geneNameIndexesFeatures],
                      sep = ':')

            ## Correct gene expression levels and differntial analysis
            problematicGenes <-
                isoformData[geneNameIndexesData, c(
                    'isoform_id',
                    'gene_id',
                    'sample_1',
                    'sample_2',
                    'gene_overall_mean',
                    'gene_value_1',
                    'gene_value_2',
                    'gene_stderr_1',
                    'gene_stderr_2',
                    'gene_log2_fold_change',
                    'gene_p_value',
                    'gene_q_value',
                    'gene_significant',
                    'iso_status',
                    'iso_overall_mean',
                    'iso_value_1',
                    'iso_value_2'
                )]
            problematicGenesSplit <-
                split(problematicGenes, f = problematicGenes[
                    ,c('gene_id', 'sample_1', 'sample_2')], drop =TRUE)

            correctedGenes <-
                plyr::ldply(
                    problematicGenesSplit,
                    .fun = function(df) {
                        # df <- problematicGenesSplit[[1]]
                        df$gene_overall_mean <- sum(df$iso_overall_mean)
                        df$gene_value_1 <- sum(df$iso_value_1)
                        df$gene_value_2 <- sum(df$iso_value_2)
                        df$gene_stderr_1 <- NA
                        df$gene_stderr_2 <- NA
                        df$gene_log2_fold_change <- log2(
                            (df$gene_value_2[2] + 0.0001) /
                                (df$gene_value_1[1] + 0.0001)
                        )
                        df$gene_p_value <- 1
                        df$gene_q_value <- 1
                        df$gene_significant <- 'no'
                        df$iso_status <- 'NOTEST'
                        return(df)
                    }
                )

            # sort so genes end up being in correct order for overwriting
            correctedGenes <-
                correctedGenes[order(
                    correctedGenes$isoform_id,
                    correctedGenes$gene_id,
                    correctedGenes$sample_1,
                    correctedGenes$sample_2
                ), -1] # -1 removes the index created by ldply
            # overwrite problematic genes
            isoformData[geneNameIndexesData, c(
                'gene_id',
                'sample_1',
                'sample_2',
                'gene_overall_mean',
                'gene_value_1',
                'gene_value_2',
                'gene_stderr_1',
                'gene_stderr_2',
                'gene_log2_fold_change',
                'gene_p_value',
                'gene_q_value',
                'gene_significant',
                'iso_status',
                'iso_overall_mean',
                'iso_value_1',
                'iso_value_2'
            )] <- correctedGenes[, -1] # -1 removes the isoform id


            ### Add to exons
            exonFeatures$gene_id <- isoformData$gene_id[match(
                exonFeatures$isoform_id, isoformData$isoform_id
            )]

            if (!quiet) {
                message(
                    paste(
                        "    Cufflinks annotation problem was fixed for",
                        length(geneNameSplit),
                        "Cuff_genes",
                        sep = ' '
                    )
                )
            }
        } else {
            if (!quiet) {
                message(paste(
                    'No instances of a Cufflinks annotation',
                    'problem found - no changes were made'
                ))
            }
        }
    } # end of fix cufflinks annotatopn problem
    if ( ! fixCufflinksAnnotationProblem ) {
        if (!quiet) { message('Step 4 of 5: Skipped fixing of cufflinks annotation problem (due to fixCufflinksAnnotationProblem argument)...')}
    }

    if (!quiet) { message('Step 5 of 5: Creating switchAanalyzeRlist...')}

    ### Calculate IF values
    localAnnot <- unique(isoformData[,c('gene_id','isoform_id')])
    ifMatrix <- isoformToIsoformFraction(
        isoformRepExpression = isoRepFpkm,
        isoformGeneAnnotation = localAnnot,
        quiet = TRUE
    )

    ### Summarize IF
    myMeanIF <- rowMeans(ifMatrix[,designMatrix$sampleID,drop=FALSE], na.rm = TRUE)
    ifMeanDf <- plyr::ddply(
        .data = designMatrix,
        .variables = 'condition',
        .fun = function(aDF) { # aDF <- switchAnalyzeRlist$designMatrix[1:2,]
            tmp <- rowMeans(ifMatrix[,aDF$sampleID,drop=FALSE], na.rm = TRUE)
            data.frame(
                isoform_id=names(tmp),
                mean=tmp,
                stringsAsFactors = FALSE
            )
        }
    )

    isoformData$IF_overall <- myMeanIF[match(
        isoformData$isoform_id, names(myMeanIF)
    )]
    isoformData$IF1 <- ifMeanDf$mean[match(
        stringr::str_c(isoformData$isoform_id,isoformData$sample_1),
        stringr::str_c(ifMeanDf$isoform_id, ifMeanDf$condition)
    )]
    isoformData$IF2 <- ifMeanDf$mean[match(
        stringr::str_c(isoformData$isoform_id,isoformData$sample_2),
        stringr::str_c(ifMeanDf$isoform_id, ifMeanDf$condition)
    )]
    isoformData$dIF <- isoformData$IF2 - isoformData$IF1

    ### Add q-values
    if (!is.null(pathToSplicingAnalysis)) {
        ### Add cufflinks analysis isoform switch analysis results
        isoformData$isoform_switch_q_value <- NA
        isoformData$gene_switch_q_value <-
            cuffSplicing$q_value[match(
                paste(
                    isoformData$gene_id,
                    isoformData$sample_1,
                    isoformData$sample_2,
                    sep = '_'
                ),
                paste(
                    cuffSplicing$gene_id,
                    cuffSplicing$sample_1,
                    cuffSplicing$sample_2,
                    sep = '_'
                )
            )]
    } else {
        ### Add collumns for isoform switch analysis results
        isoformData$isoform_switch_q_value <- NA
        isoformData$gene_switch_q_value <- NA
    }

    ### Reorder a bit
    ofInterest <- c('isoform_id','gene_id','gene_name','sample_1','sample_2')
    isoformData <- isoformData[, c(
        match( ofInterest, colnames(isoformData)),
        which( ! colnames(isoformData) %in% ofInterest)
    )]
    colnames(isoformData)[4:5] <- c('condition_1', 'condition_2')
    isoformData <- as.data.frame(isoformData)

    ### Extract run info
    # cufflinks version
    cuffVersion <- runInfo$value[2]

    # replicate numbers
    nrRep <- table(readGroup$condition)
    nrRep <-
        data.frame(
            condition = names(nrRep),
            nrReplicates = as.vector(nrRep),
            row.names = NULL,
            stringsAsFactors = FALSE
        )


    isoRepFpkm$isoform_id <- rownames(isoRepFpkm)
    rownames(isoRepFpkm) <- NULL
    isoRepFpkm <- isoRepFpkm[,c(
        which(colnames(isoRepFpkm) == 'isoform_id'),
        which(colnames(isoRepFpkm) != 'isoform_id')
    )]

    # Return SpliceRList
    switchAnalyzeRlist <- createSwitchAnalyzeRlist(
        isoformFeatures = isoformData,
        exons = exonFeatures,
        designMatrix = designMatrix,
        isoformCountMatrix = isoRepExp2,
        isoformRepExpression = isoRepFpkm,
        sourceId = paste("cufflinks", cuffVersion , sep = '_')
    )

    if (!is.null(pathToSplicingAnalysis)) {
        if( nrow(cuffSplicing) ) {
            switchAnalyzeRlist$isoformSwitchAnalysis <- as.data.frame(cuffSplicing)
        }
    }

    if( addIFmatrix ) {
        ifMatrix$isoform_id <- rownames(ifMatrix)
        rownames(ifMatrix) <- NULL
        ifMatrix <- ifMatrix[,c(
            which(colnames(ifMatrix) == 'isoform_id'),
            which(colnames(ifMatrix) != 'isoform_id')
        )]

        switchAnalyzeRlist$isoformRepIF <- ifMatrix
    }

    if(addIsoformNt) {
        switchAnalyzeRlist$ntSequence <- isoformNtSeq[which(
            names(isoformNtSeq) %in% switchAnalyzeRlist$isoformFeatures$isoform_id
        )]
    }

    ### Estimate DTU
    if(estimateDifferentialGeneRange & !quiet) {
        localEstimate <- estimateDifferentialRange(switchAnalyzeRlist)

        message('The GUESSTIMATED number of genes with differential isoform usage are:')
        print(localEstimate)
    }

    if (!quiet) {
        message("Done")
    }

    return(switchAnalyzeRlist)
}

importGTF <- function(
    pathToGTF,
    isoformNtFasta = NULL,
    extractAaSeq = FALSE,
    addAnnotatedORFs = TRUE,
    onlyConsiderFullORF = FALSE,
    removeNonConvensionalChr = FALSE,
    ignoreAfterBar = TRUE,
    ignoreAfterSpace = TRUE,
    ignoreAfterPeriod = FALSE,
    removeTECgenes = TRUE,
    PTCDistance = 50,
    removeFusionTranscripts = TRUE,
    quiet = FALSE
) {
    ### Test files
    if(TRUE) {
        ### Test existance of files
        if(TRUE) {
            if( pathToGTF == '' ) {
                stop(
                    paste(
                        'The \'pathToGTF\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the directory with the data as:',
                        '"path/to/quantification/".',
                        sep=' '
                    )
                )
            }
            if( ! (file.exists(pathToGTF) | RCurl::url.exists(pathToGTF)) ) {
                stop(
                    paste(
                        'The file pointed to with the \'pathToGTF\' argument does not exists.',
                        '\nDid you accidentially make a spelling mistake or added a unwanted "/" infront of the text string?',
                        sep=' '
                    )
                )
            }

            if( !is.null(isoformNtFasta)) {
                if( !is.character( isoformNtFasta)) {
                    stop('The \'isoformNtFasta\' argument must be a charachter string.')
                }

                if( any(isoformNtFasta == '') ) {
                    stop(
                        paste(
                            'The \'isoformNtFasta\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                            '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                            'to import your own data? The system.file() should only be used',
                            'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                            'To access your own data simply provide the string to the directory with the data as:',
                            '"path/to/quantification/".',
                            sep=' '
                        )
                    )
                }

                if( any( ! sapply(isoformNtFasta, file.exists) ) ) {
                    stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to exist.')
                }

                if( any(! grepl('\\.fa|\\.fasta|\\.fa.gz|\\.fasta.gz', isoformNtFasta)) ) {
                    stop('The file pointed to via the \'isoformNtFasta\' argument does not seem to be a fasta file...')
                }
            }


        }


        if( ! grepl('\\.gtf$|\\.gtf\\.gz$|\\.gff$|\\.gff\\.gz$|\\.gff3$|\\.gff3\\.gz$', pathToGTF, ignore.case = TRUE) ) {
            warning('The file pointed to by the "pathToGTF" argument appearts not to be a GTF/GFF file as it does have the right suffix - are you sure it is the rigth file?')
        }

        isGTF <- grepl('\\.gtf$|\\.gtf\\.gz$', pathToGTF, ignore.case = TRUE)

    }

    # Read in from GTF/GFF file
    if(TRUE) {
        if(   isGTF ) {
            if (!quiet) {
                message('Importing GTF (this may take a while)...')
            }
            tmp <- capture.output(
                suppressWarnings(
                    suppressMessages(
                        mfGTF <- rtracklayer::import(pathToGTF, format='gtf')
                    )
                )
            )
            ### Check GTF
            if (!all(c('transcript_id', 'gene_id') %in% colnames(mfGTF@elementMetadata))) {
                collumnsMissing <- paste(
                    c('transcript_id', 'gene_id')[which(
                        !c('transcript_id', 'gene_id') %in%
                            colnames(mfGTF@elementMetadata)
                    )], collapse = ', ')
                stop(
                    paste(
                        'The GTF file must contain the folliwing collumns',
                        '\'transcript_id\' and \'gene_id\'.',
                        collumnsMissing,
                        'is missing.',
                        sep = ' '
                    )
                )
            }
        }
        if( ! isGTF ){
            if (!quiet) {
                message('importing GFF (this may take a while)')
            }
            tmp <- capture.output(
                suppressWarnings(
                    suppressMessages(
                        mfGTF <- rtracklayer::import(pathToGTF, format='gff')
                    )
                )
            )

            ### Check GTF
            geneIdPressent <- 'gene_id' %in% colnames(mfGTF@elementMetadata)

            if( ! geneIdPressent ) {
                ### Check for RefSeq "gene" (aka gene_id)
                if (!all(c('transcript_id', 'gene') %in% colnames(mfGTF@elementMetadata) )) {
                    collumnsMissing <- paste(
                        c('transcript_id', 'gene')[which(
                            !c('transcript_id', 'gene') %in%
                                colnames(mfGTF@elementMetadata)
                        )], collapse = ', ')
                    stop(
                        paste(
                            'The GFF file must contain the folliwing collumns',
                            '\'transcript_id\' and \'gene\'.',
                            collumnsMissing,
                            'is missing.',
                            sep = ' '
                        )
                    )
                }

                ### Rename RefSeq gene to gene_id
                if( ! 'gene_id' %in% colnames(mcols(mfGTF)) ) {
                    if( 'gene' %in% colnames(mcols(mfGTF)) ) {
                        colnames(mcols(mfGTF))[which(
                            colnames(mcols(mfGTF)) == 'gene'
                        )] <- 'gene_id'
                    } else {
                        stop('Could not locate the gene id in the gff file.')
                    }
                }
            }
            if(   geneIdPressent ) {
                stop(
                    paste0(
                        'This is not a RefSeq GFF file (from ftp://ftp.ncbi.nlm.nih.gov/genomes/).',
                        '\nIsoformSwitchAnalyzeR only handles RefSeq GFF files so please supply GTF file instead.',
                        '\n(for more info see FAQ about annotate databases in vignette).'
                    )
                )
            }


        }
    }

    ### Reduce if nessesary
    if (removeNonConvensionalChr) {
        mfGTF <- mfGTF[which( ! grepl('_'  , as.character(mfGTF@seqnames))), ]
        mfGTF <- mfGTF[which( ! grepl('\\.', as.character(mfGTF@seqnames))), ]

        if (length(mfGTF) == 0) {
            stop('No exons were left after filtering',
                 'with \'removeNonConvensionalChr\'.')
        }

        seqlevels(mfGTF) <- as.character(mfGTF@seqnames@values)
    }

    if( removeTECgenes ) {
        if(isGTF) {
            ### Ensembl
            if( 'gene_biotype' %in% colnames(mcols(mfGTF)) ) {
                toExclude <-  mfGTF$gene_biotype == 'TEC'
                if( any( toExclude ) ) {
                    mfGTF <- mfGTF[-which( toExclude ),]
                }
            }
            ### Gencode
            if( 'gene_type' %in% colnames(mcols(mfGTF)) ) {
                toExclude <-  mfGTF$gene_type == 'TEC'
                if( any( toExclude ) ) {
                    mfGTF <- mfGTF[-which( toExclude ),]
                }

            }
        }
    }

    ### Potentially add version numbering
    if( TRUE ) {
        if(any( colnames(mcols(mfGTF)) == 'gene_version' )) {
            mfGTF$gene_id <- stringr::str_c(
                mfGTF$gene_id,
                '.',
                mfGTF$gene_version
            )
        }
        if(any( colnames(mcols(mfGTF)) == 'transcript_version' )) {
            mfGTF$transcript_id <- stringr::str_c(
                mfGTF$transcript_id,
                '.',
                mfGTF$transcript_version
            )
        }
    }

    ### Fix names
    if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {

        mfGTF$transcript_id <- fixNames(
            nameVec = mfGTF$transcript_id,
            ignoreAfterBar = ignoreAfterBar,
            ignoreAfterSpace = ignoreAfterSpace,
            ignoreAfterPeriod = ignoreAfterPeriod
        )

    }

    ### Make annoation
    if(TRUE) {
        if(   isGTF ) {
            if (!quiet) {
                message('Massaging annoation...')
            }
            exonAnoationIndex <- which(mfGTF$type == 'exon')

            colsToExtract <- c(
                'transcript_id', 'gene_id', 'gene_name',
                'gene_type','gene_biotype',     # respectively gencode and ensembl gene type col
                'transcript_biotype','transcript_type'
            )
            myIso <-
                as.data.frame(unique(mfGTF@elementMetadata[
                    exonAnoationIndex,
                    na.omit(match(colsToExtract, colnames(mfGTF@elementMetadata)))]
                ))

            ### Handle columns not extracted
            if (is.null(myIso$gene_name)) {
                myIso$gene_name <- NA
            }

            ### Handle columns with multiple options
            geneTypeCol <- which(colnames(myIso) %in% c('gene_type','gene_biotype'))
            if( length(geneTypeCol) == 0 ) {
                myIso$geneType <- NA
            } else {
                myIso$geneType <- myIso[,geneTypeCol[1]]
            }

            isoTypeCol <- which(colnames(myIso) %in% c('transcript_biotype','transcript_type'))
            if( length(isoTypeCol) == 0 ) {
                myIso$isoType <- NA
            } else {
                myIso$isoType <- myIso[,isoTypeCol]
            }

        }
        if( ! isGTF ) {
            if (!quiet) {
                message('converting GFF to switchAnalyzeRlist')
            }
            exonAnoationIndex <- which(mfGTF$type == 'exon')
            colsToExtract <- c(
                'Parent',
                'gene_id',
                'transcript_id'
            )

            # extract exon annot
            myIso <-
                as.data.frame(unique(mfGTF@elementMetadata[
                    exonAnoationIndex,
                    na.omit(match(colsToExtract, colnames(mfGTF@elementMetadata)))]
                ))

            if(any(is.na(myIso$transcript_id))) {
                nNa <- sum(is.na(myIso$transcript_id))

                warning(
                    paste(
                        'There were', nNa, 'annotated features without isoform_ids.',
                        'These were removed.'
                    )
                )

                myIso <- myIso[which(
                    ! is.na(myIso$transcript_id)
                ),]
            }

            # extract gene biotype
            if( 'gene_biotype' %in% colnames(mcols(mfGTF)) ) {
                gffGeneAnnot <- mfGTF[which( mfGTF$type == 'gene' ),c(
                    'ID',
                    'gene_id',
                    'gene_biotype'
                )]

                myIso$gene_biotype    <- gffGeneAnnot$gene_biotype[match(myIso$gene_id, gffGeneAnnot$gene_id)]
            }


            ### Handle columns not extracted
            if (is.null(myIso$gene_name)) {
                myIso$gene_name <- NA
            }

            ### Handle columns with multiple options
            geneTypeCol <- which(colnames(myIso) %in% c('gene_type','gene_biotype'))
            if( length(geneTypeCol) == 0 ) {
                myIso$geneType <- NA
            } else {
                myIso$geneType <- myIso[,geneTypeCol[1]]
            }

            isoTypeCol <- which(colnames(myIso) %in% c('transcript_biotype','transcript_type'))
            if( length(isoTypeCol) == 0 ) {
                myIso$isoType <- NA
            } else {
                myIso$isoType <- myIso[,isoTypeCol]
            }
        }


        ### Make annotation
        myIsoAnot <- data.frame(
            isoform_id = myIso$transcript_id,
            gene_id = myIso$gene_id,
            condition_1 = "plaseholder1",
            condition_2 = "plaseholder2",
            gene_name = myIso$gene_name,
            gene_biotype = myIso$geneType,
            iso_biotype = myIso$isoType,
            class_code = '=',
            gene_overall_mean = 0,
            gene_value_1 = 0,
            gene_value_2 = 0,
            gene_stderr_1 = NA,
            gene_stderr_2 = NA,
            gene_log2_fold_change = 0,
            gene_p_value = 1,
            gene_q_value = 1,
            iso_overall_mean = 0,
            iso_value_1 = 0,
            iso_value_2 = 0,
            iso_stderr_1 = NA,
            iso_stderr_2 = NA,
            iso_log2_fold_change = 0,
            iso_p_value = 1,
            iso_q_value = 1,
            IF_overall = NA,
            IF1 = NA,
            IF2 = NA,
            dIF = NA,
            isoform_switch_q_value = NA,
            gene_switch_q_value = NA,
            stringsAsFactors = FALSE
        )

    }

    ### Test for annoation problems
    if(TRUE) {
        geneSummary <-
            myIsoAnot %>%
            select(gene_id, gene_name) %>%
            dplyr::distinct() %>%
            group_by(gene_id) %>%
            dplyr::summarise(
                n_gene_names = length(na.omit(gene_name)),
                have_missing_gene_name = any(is.na(gene_name))
            )

        missingGeneProblem <- any(
            geneSummary$n_gene_names > 0 & geneSummary$have_missing_gene_name
        )
        mergedGeneProblem <- any(
            geneSummary$n_gene_names > 1
        )

        if( missingGeneProblem | mergedGeneProblem ) {
            warning(
                paste0(
                    '\nThe annotaion seems to have probelems that commonly occure',
                    '\n  when transcript assembly is done (gene merging and unassigned novel isoforms).',
                    '\n  These can be fixed and/or rescued by using the importRdata() function instead.',
                    '\n'
                )
            )
        }
    }

    ### Add CDS annoation from GTF file inc convertion to transcript coordinats
    if (addAnnotatedORFs) {

        if(   isGTF ) {
            # test whether any CDS are found
            if (any(mfGTF$type == 'CDS')) {
                if (!quiet) {
                    message('Massaging annotated CDSs...')
                }

                ### Prepare CDS data
                myCDS <-
                    mfGTF[which(mfGTF$type == 'CDS'), 'transcript_id']
                colnames(mcols(myCDS)) <- 'isoform_id'

                ### Prepare exon data
                localExons <- mfGTF[which(mfGTF$type == 'exon'), 'transcript_id']
                colnames(mcols(localExons)) <- 'isoform_id'
                localExons <-
                    localExons[which(
                        as.character(localExons@strand) %in% c('+', '-')), ]
                localExons <-
                    localExons[which(
                        localExons$isoform_id %in% myCDS$isoform_id
                ), ]

                ### Analyze CDS
                orfInfo <- analyseCds(
                    myCDS = myCDS,
                    localExons = localExons,
                    onlyConsiderFullORF = onlyConsiderFullORF,
                    mfGTF = mfGTF,
                    PTCDistance = PTCDistance
                )

                # make sure all ORFs are annotated (with NAs)
                orfInfo <-
                    dplyr::full_join(
                        orfInfo,
                        unique(myIsoAnot[, 'isoform_id', drop = FALSE]),
                        by = 'isoform_id',
                        all = TRUE
                    )

            } else {
                # if no CDS were found
                warning(paste(
                    'No CDS was found in the GTF file. Please make sure the GTF',
                    'file have the CDS "feature" annotation. Adding NAs instead'
                ))

                orfInfo <- data.frame(
                    isoform_id = unique(myIsoAnot$isoform_id),
                    orfTransciptStart = NA,
                    orfTransciptEnd = NA,
                    orfTransciptLength = NA,
                    orfStarExon = NA,
                    orfEndExon = NA,
                    orfStartGenomic = NA,
                    orfEndGenomic = NA,
                    stopDistanceToLastJunction = NA,
                    stopIndex = NA,
                    PTC = NA,
                    stringsAsFactors = FALSE
                )
            }

            ### add to iso annotation
            myIsoAnot$PTC <-
                orfInfo$PTC[match(myIsoAnot$isoform_id, orfInfo$isoform_id)]


        }
        if( ! isGTF ) {
            # test whether any CDS are found
            if (any(mfGTF$type == 'CDS')) {
                if (!quiet) {
                    message('converting annotated CDSs')
                }

                ### Extract CDS Ids
                colsToExtract <- c(
                    'ID',
                    'gene_id',
                    'transcript_id'
                )
                myIso2 <-
                    as.data.frame(unique(mfGTF@elementMetadata[
                        which(mfGTF$type == 'mRNA'),
                        na.omit(match(colsToExtract, colnames(mfGTF@elementMetadata)))]
                    ))

                ### Extract CDS
                myCDS <- mfGTF[which(mfGTF$type == 'CDS'),c('ID','transcript_id','Parent')]
                myCDS$Parent <- sapply(myCDS$Parent, function(x) x[1])

                ### Transfer ids
                myCDS <- myCDS[which(
                    myCDS$Parent %in% myIso2$ID
                ),]
                myCDS$transcript_id <- myIso2$transcript_id[match(
                    myCDS$Parent, myIso2$ID
                )]
                myCDS <- myCDS[which(
                    myCDS$transcript_id %in% myIsoAnot$isoform_id
                ),]

                ### Get it
                myCDS <-
                    sort(myCDS[,'transcript_id'])
                myCDSedges <-
                    suppressMessages(unlist(range(
                        split(myCDS[, 0], f = myCDS$transcript_id)
                    )))  # Extract EDGEs
                myCDSedges$id <- names(myCDSedges)
                names(myCDSedges) <- NULL

                ### Extract Exons
                localExons <- mfGTF[exonAnoationIndex, 'transcript_id']
                localExons <-
                    localExons[which(
                        as.character(localExons@strand) %in% c('+', '-')), ]
                localExons <-
                    localExons[which(localExons$transcript_id %in% myCDSedges$id), ]

                localExons <-
                    localExons[order(localExons$transcript_id,
                                     start(localExons),
                                     end(localExons)), ]
                localExons$exon_id <-
                    paste('exon_', 1:length(localExons), sep = '')

                ### This is where I can remove the stop codon

                ### Extract strand specific ORF info
                cds <- as.data.frame(myCDSedges)
                # start
                plusIndex <- which(cds$strand == '+')
                annoatedStartGRangesPlus <-
                    GRanges(
                        cds$seqnames[plusIndex],
                        IRanges(
                            start = cds$start[plusIndex],
                            end = cds$start[plusIndex] #-3 # -3 since stop codon is included in GFF according to https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
                        ),
                        strand = cds$strand[plusIndex],
                        id = cds$id[plusIndex]
                    )
                minusIndex <- which(cds$strand == '-')
                annoatedStartGRangesMinus <-
                    GRanges(
                        cds$seqnames[minusIndex],
                        IRanges(
                            start = cds$end[minusIndex],
                            end = cds$end[minusIndex] #+3 # +3 since stop codon is included in GFF according to https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
                        ),
                        strand = cds$strand[minusIndex],
                        id = cds$id[minusIndex]
                    )

                annoatedStartGRanges <-
                    c(annoatedStartGRangesPlus,
                      annoatedStartGRangesMinus)
                annoatedStartGRanges$orf_id <-
                    paste('cds_', 1:length(annoatedStartGRanges), sep = '')

                # end
                annoatedEndGRangesPlus  <-
                    GRanges(
                        cds$seqnames[plusIndex],
                        IRanges(
                            start = cds$end[plusIndex],
                            end = cds$end[plusIndex]),
                        strand = cds$strand[plusIndex],
                        id = cds$id[plusIndex]
                    )
                annoatedEndGRangesMinus <-
                    GRanges(
                        cds$seqnames[minusIndex],
                        IRanges(
                            start = cds$start[minusIndex],
                            end = cds$start[minusIndex]),
                        strand = cds$strand[minusIndex],
                        id = cds$id[minusIndex]
                    )

                annoatedEndGRanges <-
                    c(annoatedEndGRangesPlus, annoatedEndGRangesMinus)
                annoatedEndGRanges$orf_id <-
                    paste('stop_', 1:length(annoatedEndGRanges), sep = '')

                # combine
                annotatedORFGR <-
                    c(annoatedStartGRanges, annoatedEndGRanges)


                ### Idenetify overlapping CDS and exons as well as the annoate transcript id
                suppressWarnings(overlappingAnnotStart <-
                                     as.data.frame(
                                         findOverlaps(
                                             query = localExons,
                                             subject = annotatedORFGR,
                                             ignore.strand = FALSE
                                         )
                                     ))
                if (!nrow(overlappingAnnotStart)) {
                    stop(
                        'No overlap between CDS and transcripts were found. This is most likely due to a annoation problem around chromosome name.'
                    )
                }

                # Annoate overlap ids
                overlappingAnnotStart$transcript_id <-
                    localExons$transcript_id[overlappingAnnotStart$queryHits]
                overlappingAnnotStart$exon_id <- localExons$exon_id[
                    overlappingAnnotStart$queryHits
                    ]

                overlappingAnnotStart$cdsTranscriptID <- annotatedORFGR$id[
                    overlappingAnnotStart$subjectHits
                    ]
                overlappingAnnotStart$orf_id <- annotatedORFGR$orf_id[
                    overlappingAnnotStart$subjectHits
                    ]

                # subset to annoateted overlap
                overlappingAnnotStart <-
                    overlappingAnnotStart[which(
                        overlappingAnnotStart$transcript_id ==
                            overlappingAnnotStart$cdsTranscriptID
                    ), c('transcript_id',
                         'exon_id',
                         'cdsTranscriptID',
                         'orf_id')]

                # annoate with genomic site
                overlappingAnnotStart$orfGenomic <-
                    start(annotatedORFGR)[match(
                        overlappingAnnotStart$orf_id, annotatedORFGR$orf_id
                    )]


                ### Enrich exon information
                myExons <-
                    as.data.frame(localExons[which(
                        localExons$transcript_id %in%
                            overlappingAnnotStart$transcript_id),])

                # Strand
                myExonPlus <- myExons[which(myExons$strand == '+'), ]
                myExonMinus <- myExons[which(myExons$strand == '-'), ]

                plusSplit <-
                    split(myExonPlus$width, myExonPlus$transcript_id)
                minusSplit <-
                    split(myExonMinus$width, myExonMinus$transcript_id)

                # cumsum
                myExonPlus$cumSum <-
                    unlist(sapply(plusSplit , function(aVec) {
                        cumsum(c(0, aVec))[1:(length(aVec))]
                    }))
                myExonMinus$cumSum <-
                    unlist(sapply(minusSplit, function(aVec) {
                        cumsum(c(0, rev(aVec)))[(length(aVec)):1] # reverse
                    }))

                # exon number
                myExonPlus$nrExon <-
                    unlist(sapply(plusSplit, function(aVec) {
                        1:length(aVec)
                    }))
                myExonMinus$nrExon <-
                    unlist(sapply(minusSplit, function(aVec) {
                        1:length(aVec)
                    }))

                # total nr exons
                myExonPlus$lastExonIndex <-
                    unlist(sapply(plusSplit, function(aVec) {
                        rep(length(aVec), length(aVec))
                    }))
                myExonMinus$lastExonIndex <-
                    unlist(sapply(minusSplit, function(aVec) {
                        rep(1, length(aVec))
                    }))

                # final exon exon junction trancipt position
                myExonPlus$finalJunctionPos <-
                    unlist(sapply(plusSplit , function(aVec) {
                        rep(cumsum(c(0, aVec))[length(aVec)], times = length(aVec))
                    }))
                myExonMinus$finalJunctionPos <-
                    unlist(sapply(minusSplit, function(aVec) {
                        rep(cumsum(c(0, rev(
                            aVec
                        )))[length(aVec)], times = length(aVec))
                    }))

                myExons2 <- rbind(myExonPlus, myExonMinus)

                ### Annoate with exon information
                matchIndex <-
                    match(overlappingAnnotStart$exon_id, myExons2$exon_id)
                overlappingAnnotStart$strand <- myExons2$strand[matchIndex]
                overlappingAnnotStart$exon_start <- myExons2$start[matchIndex]
                overlappingAnnotStart$exon_end <- myExons2$end[matchIndex]
                overlappingAnnotStart$exon_cumsum <- myExons2$cumSum[matchIndex]
                overlappingAnnotStart$exon_nr <- myExons2$nrExon[matchIndex]
                overlappingAnnotStart$lastExonIndex <-
                    myExons2$lastExonIndex[matchIndex]
                overlappingAnnotStart$finalJunctionPos <-
                    myExons2$finalJunctionPos[matchIndex]

                ### Annoate with transcript coordinats
                overlappingAnnotStartPlus <-
                    overlappingAnnotStart[which(
                        overlappingAnnotStart$strand == '+'), ]
                overlappingAnnotStartPlus$orfTranscript <-
                    overlappingAnnotStartPlus$exon_cumsum + (
                        overlappingAnnotStartPlus$orfGenomic -
                            overlappingAnnotStartPlus$exon_start
                    ) + 1
                overlappingAnnotStartPlus$junctionDistance <-
                    overlappingAnnotStartPlus$finalJunctionPos -
                    overlappingAnnotStartPlus$orfTranscript + 3 # +3 because the ORF does not include the stop codon - but it should in this calculation

                overlappingAnnotStartMinus <-
                    overlappingAnnotStart[which(
                        overlappingAnnotStart$strand == '-'), ]
                overlappingAnnotStartMinus$orfTranscript <-
                    overlappingAnnotStartMinus$exon_cumsum + (
                        overlappingAnnotStartMinus$exon_end -
                            overlappingAnnotStartMinus$orfGenomic
                    ) + 1
                overlappingAnnotStartMinus$junctionDistance <-
                    overlappingAnnotStartMinus$finalJunctionPos -
                    overlappingAnnotStartMinus$orfTranscript + 3 # +3 because the ORF does not include the stop codon - but it should in this calculation

                overlappingAnnotStart2 <-
                    rbind(overlappingAnnotStartPlus,
                          overlappingAnnotStartMinus)
                overlappingAnnotStart2 <-
                    overlappingAnnotStart2[order(
                        overlappingAnnotStart2$transcript_id,
                        overlappingAnnotStart2$exon_start,
                        overlappingAnnotStart2$exon_end
                    ), ]

                ### devide into start and stop
                starInfo <-
                    overlappingAnnotStart2[which(
                        grepl('^cds', overlappingAnnotStart2$orf_id)), ]
                stopInfo <-
                    overlappingAnnotStart2[which(
                        grepl('^stop', overlappingAnnotStart2$orf_id)), ]

                ### predict PTC
                stopInfo$PTC <-
                    stopInfo$exon_nr != stopInfo$lastExonIndex &
                    stopInfo$junctionDistance > PTCDistance

                ### Merge the data
                starInfo2 <-
                    unique(starInfo[, c('transcript_id',
                                        'orfGenomic',
                                        'exon_nr',
                                        'orfTranscript')])
                colnames(starInfo2) <-
                    c('isoform_id',
                      'orfStartGenomic',
                      'orfStarExon',
                      'orfTransciptStart')

                stopInfo2 <-
                    unique(stopInfo[, c(
                        'transcript_id',
                        'orfGenomic',
                        'exon_nr',
                        'orfTranscript',
                        'junctionDistance',
                        'lastExonIndex',
                        'PTC'
                    )])
                colnames(stopInfo2) <-
                    c(
                        'isoform_id',
                        'orfEndGenomic',
                        'orfEndExon',
                        'orfTransciptEnd',
                        'stopDistanceToLastJunction',
                        'stopIndex',
                        'PTC'
                    )

                orfInfo <- dplyr::inner_join(starInfo2, stopInfo2, by = 'isoform_id')
                orfInfo$orfTransciptLength  <-
                    orfInfo$orfTransciptEnd - orfInfo$orfTransciptStart + 1

                # reorder
                orfInfo <-
                    orfInfo[, c(
                        'isoform_id',
                        'orfTransciptStart',
                        'orfTransciptEnd',
                        'orfTransciptLength',
                        'orfStarExon',
                        'orfEndExon',
                        'orfStartGenomic',
                        'orfEndGenomic',
                        'stopDistanceToLastJunction',
                        'stopIndex',
                        'PTC'
                    )]

                # make sure all ORFs are annotated (with NAs)
                orfInfo <-
                    dplyr::full_join(orfInfo,
                                     unique(myIsoAnot[, 'isoform_id', drop = FALSE]),
                                     by = 'isoform_id',
                                     all = TRUE)

            } else {
                # if no CDS were found
                warning(paste(
                    'No CDS was found in the GTF file. Please make sure the GTF',
                    'file have the CDS "feature" annotation. Adding NAs instead'
                ))

                orfInfo <- data.frame(
                    isoform_id = unique(myIsoAnot$isoform_id),
                    orfTransciptStart = NA,
                    orfTransciptEnd = NA,
                    orfTransciptLength = NA,
                    orfStarExon = NA,
                    orfEndExon = NA,
                    orfStartGenomic = NA,
                    orfEndGenomic = NA,
                    stopDistanceToLastJunction = NA,
                    stopIndex = NA,
                    PTC = NA,
                    stringsAsFactors = FALSE
                )
            }

            ### add to iso annotation
            myIsoAnot$PTC <-
                orfInfo$PTC[match(myIsoAnot$isoform_id, orfInfo$isoform_id)]

        }
    }

    ### Handle sequence input
    if(TRUE) {
        addIsoformNt <- FALSE

        if( !is.null(isoformNtFasta) ) {
            isoformNtSeq <- do.call(
                c,
                lapply(isoformNtFasta, function(aFile) {
                    Biostrings::readDNAStringSet(
                        filepath = isoformNtFasta, format = 'fasta'
                    )
                })
            )

            if(!is(isoformNtSeq, "DNAStringSet")) {
                stop('The fasta file supplied to \'isoformNtFasta\' does not contain the nucleotide (DNA) sequence...')
            }

            ### Remove preceeding ref|
            if(
                sum(grepl('^ref\\|', names(isoformNtSeq))) == length( isoformNtSeq )
            ) {
                names(isoformNtSeq) <- gsub('^ref\\|', '', names(isoformNtSeq))
            }

            ### Remove potential name duplication
            isoformNtSeq <- isoformNtSeq[which(
                ! duplicated(names(isoformNtSeq))
            )]

            ### Fix names
            if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {

                names(isoformNtSeq) <- fixNames(
                    nameVec = names(isoformNtSeq),
                    ignoreAfterBar = ignoreAfterBar,
                    ignoreAfterSpace = ignoreAfterSpace,
                    ignoreAfterPeriod = ignoreAfterPeriod
                )
            }

            ### Subset to those in GTF file
            isoformNtSeq <- isoformNtSeq[which(
                names(isoformNtSeq) %in% myIsoAnot$isoform_id
            )]

            if( ! all(myIsoAnot$isoform_id %in% names(isoformNtSeq)) ) {
                warning(
                    paste(
                        'The fasta file supplied to \'isoformNtFasta\' does not contain the',
                        'nucleotide (DNA) sequence for all isoforms annotated and will not be added!',
                        '\nSpecifically:\n',
                        length(unique(myIsoAnot$isoform_id)), 'isoforms were annotated in the GTF\n',
                        length(unique(names(isoformNtSeq))), 'isoforms have a sequence.\n',
                        'Only', length(intersect(names(isoformNtSeq), myIsoAnot$isoform_id)), 'overlap.\n',
                        length(setdiff(unique(myIsoAnot$isoform_id), names(isoformNtSeq))), 'annoated isoforms isoforms had no corresponding nucleotide sequence\n',

                        '\nIf there is no overlap (as in zero or close) there are two options:\n',
                        '1) The files do not fit together (different databases, versions etc)',
                        '(no fix except using propperly paired files).\n',
                        '2) It is somthing to do with how the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
                        '    3 Examples from GTF are :',
                        paste0( sample(unique(myIsoAnot$isoform_id), min(3, nrow(myIsoAnot)) ), collapse = ', '),'\n',
                        '    3 Examples of isoform sequence are  :',
                        paste0( sample(names(isoformNtSeq), min(3, length(isoformNtSeq)) ), collapse = ', '),'\n',


                        '\nIf there is a large overlap but still far from complete there are 3 possibilites:\n',
                        '1) The files do not fit together (different databases versions)',
                        '(no fix except using propperly paired files).\n',
                        '2) The isoforms quantified have their nucleotide sequence stored in multiple fasta files (common for Ensembl).',
                        'Just supply a vector with the path to each of them to the \'isoformNtFasta\' argument.\n',
                        '3) One file could contain non-chanonical chromosomes while the other do not',
                        '(might be solved using the \'removeNonConvensionalChr\' argument.)\n',
                        '4) It is somthing to do with how a subset of the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n\n',
                        sep = ' '
                    )
                )
            } else {
                addIsoformNt <- TRUE
            }

            ### Subset to annotated isoforms
            isoformNtSeq <- isoformNtSeq[which(
                names(isoformNtSeq) %in% myIsoAnot$isoform_id
            )]
            #if( length(isoformNtSeq) ) {
            #    addIsoformNt <- FALSE
            #}
        }
    }

    ### Create exon_features grange
    myExons <-
        sort(mfGTF[exonAnoationIndex , c('transcript_id', 'gene_id')])
    colnames(myExons@elementMetadata) <- c('isoform_id', 'gene_id')
    myExons <- myExons[which(
        myExons$isoform_id %in% myIsoAnot$isoform_id
    ),]

    # Collaps ajecent exons (without any intron between)
    if(TRUE) {
        ### Reduce ajecent exons
        tmp <- unlist(
            GenomicRanges::reduce(
                split(
                    myExons,
                    myExons$isoform_id
                )
            )
        )
        ### Add isoform id
        tmp$isoform_id <- tmp@ranges@NAMES
        tmp@ranges@NAMES <- NULL

        ### add gene id
        tmp$gene_id <-myExons$gene_id[match(
            tmp$isoform_id, myExons$isoform_id
        )]

        ### sort
        tmp <- tmp[sort.list(tmp$isoform_id),]

        ### Overwrite
        myExons <- tmp
    }

    # create replicates
    nrRep <-
        data.frame(
            condition = c('plaseholder1', 'plaseholder2'),
            nrReplicates = c(NA, NA),
            row.names = NULL,
            stringsAsFactors = FALSE
        )

    # create dummy feature
    repExp <- data.frame(
        isoform_id = myIsoAnot$isoform_id,
        plaseholder1 = NA,
        plaseholder2 = NA,
        stringsAsFactors = FALSE
    )

    designMatrix <-
        data.frame(
            sampleID = c('plaseholder1', 'plaseholder2'),
            condition = c('plaseholder1', 'plaseholder2'),
            stringsAsFactors = FALSE
        )

    ### Create switchList
    if (!quiet) {
        message('Creating switchAnalyzeRlist...')
    }
    localSwichList <- createSwitchAnalyzeRlist(
        isoformFeatures = myIsoAnot,
        exons = myExons,
        designMatrix = designMatrix,
        isoformCountMatrix = repExp,
        removeFusionTranscripts = removeFusionTranscripts,
        sourceId = 'gtf'
    )

    if (addAnnotatedORFs) {
        # subset to those in list
        orfInfo <-
            orfInfo[which(orfInfo$isoform_id %in%
                              localSwichList$isoformFeatures$isoform_id), ]

        # check for negative ORF lengths
        isoformsToRemove <-
            orfInfo$isoform_id[which(orfInfo$orfTransciptLength < 0)]
        if (length(isoformsToRemove)) {
            genesToRemove <-
                localSwichList$isoformFeatures$gene_id[which(
                    localSwichList$isoformFeatures$isoform_id %in%
                        isoformsToRemove)]
            localSwichList <-
                subsetSwitchAnalyzeRlist(
                    localSwichList,
                    !localSwichList$isoformFeatures$gene_id %in% genesToRemove
                )

            warning(
                paste(
                    length(genesToRemove),
                    'genes where removed due to negative ORF lengths. This',
                    'typically occures because gene_id are not unique',
                    '(meaning are found multiple places accorss the genome).',
                    'Please note there might still be duplicated gene_id',
                    'located on the same chromosome.',
                    sep = ' '
                )
            )
        }
        localSwichList$orfAnalysis <- orfInfo[which(
            orfInfo$isoform_id %in% localSwichList$isoformFeatures$isoform_id)
            ,]
    }

    ### Add nucleotide sequence
    if(addIsoformNt) {
        localSwichList$ntSequence <- isoformNtSeq[which(
            names(isoformNtSeq) %in% localSwichList$isoformFeatures$isoform_id
        )]

        if(addAnnotatedORFs & extractAaSeq) {
            localSwichList <- extractSequence(
                switchAnalyzeRlist = localSwichList,
                onlySwitchingGenes = FALSE,
                writeToFile = FALSE,
                extractNTseq = TRUE,
                extractAAseq = TRUE,
                addToSwitchAnalyzeRlist = TRUE
            )
        }
    }


    # Return switchAnalyzeRlist
    message('Done.')
    return(localSwichList)
}

importIsoformExpression <- function(
    ### Core arguments
    parentDir = NULL,
    sampleVector = NULL,

    ### Advanced arguments
    calculateCountsFromAbundance=TRUE,
    addIsofomIdAsColumn=TRUE,
    interLibNormTxPM=TRUE,
    normalizationMethod='TMM',
    pattern='',
    invertPattern=FALSE,
    ignore.case=FALSE,
    ignoreAfterBar = TRUE,
    ignoreAfterSpace = TRUE,
    ignoreAfterPeriod = FALSE,
    readLength = NULL,
    showProgress = TRUE,
    quiet = FALSE
) {
    ### To do
    # Could get a "summarize to gene level" option

    ### Test
    if(TRUE) {
        if( all(c( is.null(parentDir), is.null(sampleVector) )) ) {
            stop('Either the \'parentDir\' or the \'sampleVector\' argument must be used.')
        }
        if( !is.null(parentDir) & !is.null(sampleVector) ) {
            stop('Only one of the the \'parentDir\' and \'sampleVector\' argument can be used.')
        }

        inputIsDir <- ! is.null(parentDir)

        if(   inputIsDir ) {
            if( !is.character(parentDir) ) {
                stop('The user should supply a sting to the \'parentDir\' argument.')
            }
            if( parentDir == '' ) {
                stop(
                    paste(
                        'The \'parentDir\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the directory with the data as:',
                        '"path/to/quantification/".',
                        sep=' '
                    )
                )
            }
            if( ! dir.exists(parentDir) ) {
                if( file_test("-f", parentDir) ) {
                    stop(
                        paste(
                            'The file pointed to with the \'parentDir\' argument seems to be a file (not a directory).',
                            'Did you mean to use the \'sampleVector\' argument?',
                            '\nType "?importIsoformExpression" for more information.',
                            sep=' '
                        )
                    )
                }
                stop(
                    paste(
                        'The directory pointed to with the \'parentDir\' argument does not exists.',
                        '\nDid you accidentially make a spelling mistake or added a unwanted "/" infront of the text string?',
                        sep=' '
                    )
                )
            }

        }
        if( ! inputIsDir ) {
            if( !is.character(sampleVector) ) {
                stop('The user should supply a sting to the \'sampleVector\' argument.')
            }
            if( '' %in% sampleVector ) {
                stop(
                    paste(
                        'The \'sampleVector\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/quant.file", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the files with the data as:',
                        '"path/to/quantification/quantification.file".',
                        sep=' '
                    )
                )
            }
            if( ! all(file.exists(sampleVector))  ) {
                stop(
                    paste(
                        'One or more of the files pointed to with the \'sampleVector\' argument does not exists.',
                        '\nDid you accidentially make a spelling mistake or added a unwanted "/" infront of the text string?',
                        sep=' '
                    )
                )
            }
            if( ! all(file_test("-f", sampleVector)) ) {
                stop(
                    paste(
                        'One or more of the files pointed to with the \'sampleVector\' argument seems to be a directory.',
                        'Did you mean to use the \'parentDir\' argument?',
                        '\nType "?importIsoformExpression" for more information.',
                        sep=' '
                    )
                )
            }
        }

    }

    ### Initialize
    if(TRUE) {
        if( !normalizationMethod %in% c("TMM", "RLE", "upperquartile") ){
            stop('Metod supplied to "normalizationMethod" must be one of "TMM", "RLE" or "upperquartile". See documentation of edgeR::calcNormFactors for more info')
        }

        analysisCount <- 2 +
            as.integer(interLibNormTxPM)

        if (showProgress &  !quiet) {
            progressBar <- 'text'
            progressBarLogic <- TRUE
        } else {
            progressBar <- 'none'
            progressBarLogic <- FALSE
        }

        ### data.frame with nesseary info
        supportedTypes <- data.frame(
            orign          = c('Kallisto'       , 'Salmon'         , 'RSEM'            , 'StringTie'  ),
            fileName       = c('abundance.tsv'  , 'quant.sf'       , 'isoforms.results', 't_data.ctab'),
            eLengthCol     = c('eff_length'     , 'EffectiveLength', 'effective_length', ''),
            stringsAsFactors = FALSE
        )
        ### Add support for detection of compressed files
        supportedTypes2 <- supportedTypes
        supportedTypes2$fileName <- paste0(supportedTypes2$fileName, '.gz')
        supportedTypes <- rbind(
            supportedTypes,
            supportedTypes2
        )

        headerTypes <- list(
            Kallisto = c('target_id','length','eff_length','est_counts','tpm'),
            Salmon = c('Name','Length','EffectiveLength','TPM','NumReads'),
            RSEM = c('transcript_id','gene_id','length','effective_length','expected_count','TPM','FPKM','IsoPct'),
            StringTie = c('t_id','chr','strand','start','end','t_name','num_exons','length','gene_id','gene_name','cov','FPKM')
        )
    }

    ### Handle directory input
    if(inputIsDir) {
        ### Identify directories of interest
        if (TRUE) {
            if (!quiet) {
                message('Step 1 of ', analysisCount, ': Identifying which algorithm was used...')
            }
            dirList <- split(
                list.dirs(
                    path = parentDir,
                    full.names = FALSE,
                    recursive = FALSE
                ),
                list.dirs(
                    path = parentDir,
                    full.names = FALSE,
                    recursive = FALSE
                )
            )
            dirList <- dirList[which(sapply(dirList, nchar) > 0)]


            if(length(dirList) == 0) {
                stop('No subdirecories were found in the supplied folder. Please check and try again.')
            }

            ### Extract those where there are files of interest
            dirList <-
                dirList[sapply(
                    dirList,
                    FUN = function(aDir) {
                        # aDir <- dirList[[6]]
                        localFiles <-
                            list.files(
                                paste0(parentDir, '/', aDir),
                                recursive = FALSE
                            )

                        if(length( localFiles )) {
                            fileOfInterest <- any(
                                sapply(
                                    paste(supportedTypes$fileName, '$', sep = ''),
                                    function(aFileName) {
                                        grepl(pattern = aFileName, x =  localFiles)
                                    })
                                )
                        } else{
                            fileOfInterest <- FALSE
                        }

                        return(fileOfInterest)
                    }
                )]

            ### Remove hidden directories
            if( any( grepl('^\\.', names(dirList)) )  ) {
                nHidden <- sum( grepl('^\\.', names(dirList)) )
                nTotal <- length(dirList)
                warning(
                    paste(
                        'The importIsoformExpression() function identified',
                        nHidden,
                        'hidden sub-directories',
                        paste0('(of a total ',nTotal,' sub-directories of interest)'),
                        '\nThese were identified as having the prefix "." and will be ignored.',
                        '\nIf you want to keep them you will have to re-name the sub-directories omitting the starting ".".',
                        sep=' '
                    )
                )

                dirList <- dirList[which(
                    ! grepl('^\\.', names(dirList))
                )]
            }

            if (length(dirList) == 0) {
                stop(
                    paste(
                        'There were no directories containing the file names/suffixes',
                        'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                        'Have you renamed the quantification files?',
                        '(if so you should probably use the "sampleVector" argument instead).',
                        sep=' '
                    )
                )
            }

        }

        ### Identify input type
        if(TRUE) {
            dataAnalyed <- supportedTypes[which(
                sapply(
                    paste0(supportedTypes$fileName,'$'),
                    function(aFileName) {
                        any(grepl(
                            pattern = aFileName,
                            x = list.files(paste0( parentDir, '/', dirList[[1]] ))
                        ))
                    })
            ), ]

            if (nrow(dataAnalyed) == 0) {
                stop(
                    paste(
                        'Could not identify any files with the names/suffixes',
                        'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                        'Have you renamed the quantification files?',
                        '(if so you should use the "sampleVector" argument instead).',
                        sep=' '
                    )
                )
            }
            if (nrow(dataAnalyed) > 1) {
                stop(
                    paste(
                        'Could not uniquely identify file type.',
                        'Does the subdirectory contain results from multiple different tools?',
                        'If so you should use the "sampleVector" argument instead.',
                        'If not please contact developer.'
                    )
                )
            }
            if (!quiet) {
                message(paste('    The quantification algorithm used was:', dataAnalyed$orign, sep = ' '))
            }

            if( dataAnalyed$orign == 'StringTie' & is.null(readLength)) {
                stop(paste(
                    'When importing StringTie results the \'readLength\' argument',
                    'must be specified.\n',
                    'This argument must be set to the number of base pairs sequenced',
                    '(e.g. if the \n quantified data is 75 bp paired ends \'readLength\' should be set to 75.'
                ))
            }

        }

        ### Make paths for tximport
        if(TRUE) {
            ### make vector with paths
            localFiles <- sapply(
                dirList,
                function(aDir) {
                    list.files(
                        path = paste0( parentDir, '/', aDir, '/' ),
                        pattern = paste0(dataAnalyed$fileName, '$'),
                        full.names = TRUE
                    )

                }
            )
            names(localFiles) <- names(dirList)

            ### Subset to those of interest
            if( invertPattern ) {
                localFiles <- localFiles[which(
                    ! grepl(
                        pattern = pattern,
                        x = localFiles,
                        ignore.case=ignore.case
                    )
                )]
            } else {
                localFiles <- localFiles[which(
                    grepl(
                        pattern = pattern,
                        x = localFiles,
                        ignore.case=ignore.case
                    )
                )]
            }

            if( length(localFiles) == 0 ) {
                stop('No files were left after filtering via the \'pattern\' argument')
            }

            if (!quiet) {
                message(
                    paste0(
                        '    Found ',
                        length(localFiles),
                        ' quantification file(s) of interest'
                    )
                )
            }

            ### Test existence
            if(TRUE) {
                fileTest <- file.exists(localFiles)

                if( !all(fileTest)) {
                    stop(
                        paste0(
                            '\nSomething went wrong with the file-path creation. Please contact developer with reproducible example.',
                            '\n One file which did not work out was:\n ',
                            localFiles[which( ! fileTest) [1]],
                            sep=''
                        )
                    )
                }
            }
        }
    }

    ### Handle file input
    if( ! inputIsDir ) {
        if (!quiet) {
            message('Step 1 of ', analysisCount, ': Identifying which algorithm was used...')
        }

        ### Identify input type
        if(TRUE) {
            dataAnalyedList <- plyr::llply(sampleVector, function(aFilePath) {
                suppressMessages(
                    sampleFile <- readr::read_tsv(
                        aFilePath, col_names = TRUE, n_max = 2
                    )
                )

                localRes <- data.frame(
                    orign = names(headerTypes)[which(
                        sapply(headerTypes, function(x) {
                            all(x %in% colnames(sampleFile))
                        })
                    )],
                    stringsAsFactors = FALSE
                )

                return(localRes)
            })

            if( any(sapply(dataAnalyedList, nrow) != 1) ) {
                stop(
                    paste(
                        'Some of the files pointed to are not quantification',
                        'files from Kallisto/Salmon/RSEM/StringTie.',
                        'They did no contain the column names',
                        'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                        'Are you sure it is the right files?',
                        sep=' '
                    )
                )
            }

            dataAnalyed <- unique(
                do.call(
                    rbind,
                    dataAnalyedList
                )
            )

            if (nrow(dataAnalyed) == 0) {
                stop(
                    paste(
                        'None of the files had the column names',
                        'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                        'Are you sure it is the right files?',
                        sep=' '
                    )
                )
            }
            if (nrow(dataAnalyed) > 1) {
                stop(
                    paste(
                        'Could not uniquely identify file type.',
                        'Does the files pointed to come from a mixture of multiple different tools?',
                        'That is neither recommended nor supported. Please use only one tool'
                    )
                )
            }
            if (!quiet) {
                message(paste('    The quantification algorithm used was:', dataAnalyed$orign, sep = ' '))
            }

            if( dataAnalyed$orign == 'StringTie' & is.null(readLength)) {
                stop(paste(
                    'When importing StringTie results the \'readLength\' argument',
                    'must be specified.\n',
                    'This argument must be set to the number of base pairs sequenced',
                    '(e.g. if the \n quantified data is 75 bp paired ends \'readLength\' should be set to 75.'
                ))
            }

        }

        ### Add names
        if(TRUE) {
            localFiles <- sampleVector
            fileNames <- names(localFiles)

            ### Add file names
            if( is.null( fileNames ) | any(is.na( fileNames )) ) {
                if(any(is.na( fileNames ))) {
                    if (!quiet) {
                        message('    NAs was found in the name vector IsoformSwitchAnalyzeR will try and create new once.')
                    }
                }

                fileNames <- sapply(strsplit(localFiles, '/'), function(x) {
                    tmp <- tail(x, 1)
                    tmp <- gsub('\\.tsv$|\\.sf$|\\.isoforms.results|\\.ctab$|\\.csv|\\.txt', '', tmp)
                    return(tmp)
                })
            }

            if(any(duplicated(fileNames)) ) {
                # fileNames <- c('t','t','b')
                nameCount <- as.data.frame(table(fileNames))

                fileNames <- plyr::ddply(nameCount, 'fileNames', function(aDF) {
                    if( aDF$Freq == 1) {
                        return(
                            data.frame(
                                newId = aDF$fileNames,
                                stringsAsFactors = FALSE
                            )
                        )
                    } else {
                        return(
                            data.frame(
                                newId = paste0(aDF$fileNames, '_', 1:aDF$Freq),
                                stringsAsFactors = FALSE
                            )
                        )
                    }
                })$newId
            }

            if( any(duplicated(fileNames)) ){
                stop(
                    paste(
                        'IsoformSwitchAnalyzeR could not fix the missing name problem.',
                        'Please assign names to the vector provided to',
                        'the \'sampleVector\' argument using the names() function.'
                    )
                )
            }

            names(localFiles) <- fileNames
        }
    }

    ### Import files with txtimport
    if(TRUE) {
        if (!quiet) {
            message('Step 2 of ', analysisCount, ': Reading data...')
        }

        ### Use Txtimporter to import data
        if (!quiet) {
            localDataList <- tximport::tximport(
                files = localFiles,
                type = tolower(dataAnalyed$orign),
                txOut = TRUE, # to get isoform expression
                countsFromAbundance = ifelse(
                    test = calculateCountsFromAbundance,
                    yes= 'scaledTPM',
                    no='no'
                ),
                ignoreAfterBar = ignoreAfterBar,
                readLength=readLength
            )
        } else {
            suppressMessages(
                localDataList <- tximport::tximport(
                    files = localFiles,
                    type = tolower(dataAnalyed$orign),
                    txOut = TRUE, # to get isoform expression
                    countsFromAbundance = ifelse(
                        test = calculateCountsFromAbundance,
                        yes= 'scaledTPM',
                        no='no'
                    ),
                    ignoreAfterBar = ignoreAfterBar,
                    readLength=readLength
                )
            )
        }
    }

    ### test for failed libraies
    if(TRUE) {
        allZero <- apply(localDataList$abundance, 2, function(x) sum(x) == 0)
        if(any(allZero)) {
            toRemove <- names(allZero)[which(allZero)]
            warning(
                paste(
                    'Some quantifications appared to not have worked (zero reads mapped).',
                    '\nThe following libraries were therefore removed:',
                    paste(toRemove, collapse = ', ')
                )
            )

            localDataList$abundance <- localDataList$abundance[,which(!allZero)]
            localDataList$counts <- localDataList$counts[,which(!allZero)]
            localDataList$length <- localDataList$length[,which(!allZero)]

            if( ncol(localDataList$abundance) == 0 ) {
                stop('No libraries left after failed quantifications were removed.')
            }
        }
    }

    ### Noralize TxPM values based on effective counts
    if(interLibNormTxPM) {
        if( ncol(localDataList$abundance) >= 2) {
            if (!quiet) {
                message('Step 3 of 3: Normalizing abundance values (not counts) via edgeR...')
            }

            okIso <- rownames(localDataList$abundance)[which(
                rowSums( localDataList$abundance > 1 ) > 1
            )]
            abundMat <- localDataList$abundance[which( rownames(localDataList$abundance) %in% okIso),]

            ### Calculate normalization factors
            localDGE <- suppressMessages( suppressWarnings( edgeR::DGEList(abundMat, remove.zeros = TRUE) ) )
            localDGE <- suppressMessages( suppressWarnings( edgeR::calcNormFactors(localDGE, method = normalizationMethod) ) )

            ### Apply normalization factors
            localDataList$abundance <- t(t(localDataList$abundance) / localDGE$samples$norm.factors)
        } else {
            if (!quiet) {
                message('Step 3 of 3: Normalizing skipped due to only 1 sample...')
            }

        }
    }

    ### Massage data
    if(TRUE) {
        ### massage
        localDataList$abundance <- as.data.frame(localDataList$abundance)
        localDataList$counts <- as.data.frame(localDataList$counts)
        localDataList$length <- as.data.frame(localDataList$length)
        localDataList$countsFromAbundance <- NULL # remove message of how it was imported

        ### Fix names
        if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {
            ### Test for duplication
            rownames(localDataList$abundance) <- fixNames(
                nameVec = rownames(localDataList$abundance),
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )

            rownames(localDataList$counts) <- fixNames(
                nameVec = rownames(localDataList$counts),
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )

            rownames(localDataList$length) <- fixNames(
                nameVec = rownames(localDataList$length),
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod

            )
        }

        ### Add isoform id as col
        if(addIsofomIdAsColumn) {
            localDataList <- lapply(localDataList, function(x) {
                x$isoform_id <- rownames(x)
                rownames(x) <- NULL
                return(x)
            })

            ### Reorder
            reorderCols <- function(x) {
                x[,c( ncol(x), 1:(ncol(x)-1) )]
            }

            localDataList$abundance <- reorderCols( localDataList$abundance)
            localDataList$counts    <- reorderCols( localDataList$counts   )
            localDataList$length    <- reorderCols( localDataList$length   )
        }

        ### Add options
        localDataList$importOptions <- list(
            'calculateCountsFromAbundance'= calculateCountsFromAbundance,
            'interLibNormTxPM'= interLibNormTxPM,
            'normalizationMethod'= normalizationMethod
        )

        if (!quiet) {
            message('Done\n')
        }
    }

    return(localDataList)
}


importRdata <- function(
    ### Core arguments
    isoformCountMatrix,
    isoformRepExpression = NULL,
    designMatrix,
    isoformExonAnnoation,
    isoformNtFasta = NULL,
    comparisonsToMake = NULL,

    ### Advanced arguments
    addAnnotatedORFs = TRUE,
    onlyConsiderFullORF = FALSE,
    removeNonConvensionalChr = FALSE,
    ignoreAfterBar = TRUE,
    ignoreAfterSpace = TRUE,
    ignoreAfterPeriod = FALSE,
    removeTECgenes = TRUE,
    PTCDistance = 50,
    foldChangePseudoCount = 0.01,
    addIFmatrix = TRUE,
    fixStringTieAnnotationProblem = TRUE,
    fixStringTieViaOverlapInMultiGenes = TRUE,
    fixStringTieMinOverlapSize = 50,
    fixStringTieMinOverlapFrac = 0.2,
    fixStringTieMinOverlapLog2RatioToContender = 0.65,
    estimateDifferentialGeneRange = TRUE,
    showProgress = TRUE,
    quiet = FALSE
) {
    ### Set up progress
    if (showProgress &  !quiet) {
        progressBar <- 'text'
        progressBarLogic <- TRUE
    } else {
        progressBar <- 'none'
        progressBarLogic <- FALSE
    }

    ### Test existance of files
    if(TRUE) {
        if( !is.null(isoformNtFasta)) {
            if( ! is(isoformNtFasta, 'character') ) {
                stop('The \'isoformNtFasta\' argument must be a string (or vector of strings) pointing to the fasta file on the disk.')
            }

            if( any( isoformNtFasta == '') ) {
                stop(
                    paste(
                        'The \'isoformNtFasta\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the directory with the data as:',
                        '"path/to/quantification/".',
                        sep=' '
                    )
                )
            }
            if( any( ! sapply(isoformNtFasta, file.exists) ) ) {
                stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to exist.')
            }
            if( any(! grepl('\\.fa|\\.fasta|\\.fa.gz|\\.fasta.gz', isoformNtFasta)) ) {
                stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to be a fasta file...')
            }
        }

        if( class(isoformExonAnnoation) == 'character' ) {
            if( isoformExonAnnoation == '' ) {
                stop(
                    paste(
                        'The \'isoformExonAnnoation\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the directory with the data as:',
                        '"path/to/quantification/".',
                        sep=' '
                    )
                )
            }
            if( ! (file.exists(isoformExonAnnoation) | RCurl::url.exists(isoformExonAnnoation)) ) {
                stop(
                    paste(
                        'The file pointed to with the \'isoformExonAnnoation\' argument does not exists.',
                        '\nDid you accidentially make a spelling mistake or added a unwanted "/" infront of the text string?',
                        sep=' '
                    )
                )
            }
        }
    }

    ### Test whether imput data fits together
    if (!quiet) { message('Step 1 of 7: Checking data...')}
    if (TRUE) {
        ### Test supplied expression
        if(TRUE) {
            countsSuppled <- !is.null(isoformCountMatrix)
            abundSuppled <- !is.null(isoformRepExpression)

            if( !( countsSuppled | abundSuppled) ) {
                stop('At least one of \'isoformCountMatrix\' or \'isoformRepExpression\' arguments must be used.')
            }

            if( abundSuppled ) {
                isoformRepExpression <- as.data.frame(isoformRepExpression)

                if( any( apply(isoformRepExpression[,which(colnames(isoformRepExpression) != 'isoform_id')],2, class) %in% c('character', 'factor') )) {
                    stop('The isoformCountMatrix contains character/factor column(s) (other than the isoform_id column)')
                }

                extremeValues <- range( isoformRepExpression[,which( colnames(isoformRepExpression) != 'isoform_id')], na.rm = TRUE )
                if( max(extremeValues) < 30 ) {
                    warning('The expression data supplied to \'isoformRepExpression\' seems very small - please double-check that it is NOT log-transformed')
                }
                if( min(extremeValues) < 0 ) {
                    stop('The expression data supplied to \'isoformRepExpression\' contains negative values - please double-check that it is NOT log-transformed')
                }

            }

            if( countsSuppled ) {
                isoformCountMatrix <- as.data.frame(isoformCountMatrix)

                if( any( apply(isoformCountMatrix[,which(colnames(isoformCountMatrix) != 'isoform_id')],2, class) %in% c('character', 'factor') )) {
                    stop('The isoformCountMatrix contains character/factor column(s) (other than the isoform_id column)')
                }

                extremeValues <- range( isoformCountMatrix[,which( colnames(isoformCountMatrix) != 'isoform_id')], na.rm = TRUE )
                if( max(extremeValues) < 30 ) {
                    warning('The count data supplied to \'isoformCountMatrix\' seems very small - please double-check that it is NOT log-transformed')
                }
                if( min(extremeValues) < 0 ) {
                    stop('The count data supplied to \'isoformCountMatrix\' contains negative values - please double-check that it is NOT log-transformed')
                }
            }
        }

        ### Contains the collums they should
        if (TRUE) {
            ### Colnames
            if( countsSuppled ) {
                if (!any(colnames(isoformCountMatrix) == 'isoform_id')) {
                    #stop(paste(
                    #    'The data.frame passed to the \'isoformCountMatrix\'',
                    #    'argument must contain a \'isoform_id\' column'
                    #))
                    message(paste(
                        '    Using row.names as \'isoform_id\' for \'isoformCountMatrix\'. If not suitable you must add them manually.'
                    ))
                    isoformCountMatrix$isoform_id <- rownames(isoformCountMatrix)

                }

                if(any(duplicated( isoformCountMatrix$isoform_id) )) {
                    stop('The \'isoform_id\' of the count matrix must have unique ids.')
                }
            }
            if ( abundSuppled ) {
                if (!any(colnames(isoformRepExpression) == 'isoform_id')) {
                    #stop(paste(
                    #    'The data.frame passed to the \'isoformRepExpression\'',
                    #    'argument must contain a \'isoform_id\' column'
                    #))
                    message(paste(
                        '    Using row.names as \'isoform_id\' for \'isoformRepExpression\'. If not suitable you must add them manually.'
                    ))
                    isoformRepExpression$isoform_id <- rownames(isoformRepExpression)
                }
                if(any(duplicated( isoformRepExpression$isoform_id) )) {
                    stop('The \'isoform_id\' of the expression matrix must have unique ids.')
                }
            }

            ### Potentially convert from tibble
            if( class(designMatrix)[1] == 'tbl_df') {
                designMatrix <- as.data.frame(designMatrix)
            }
            if (!all(c('sampleID', 'condition') %in% colnames(designMatrix))) {
                stop(paste(
                    'The data.frame passed to the \'designMatrix\'',
                    'argument must contain both a \'sampleID\' and a',
                    '\'condition\' column'
                ))
            }
            if (length(unique(designMatrix$condition)) < 2) {
                stop('The supplied \'designMatrix\' only contains 1 condition')
            }
            # test information content in design matrix
            if( ncol(designMatrix) > 2 ) {
                otherDesign <- designMatrix[,which(
                    ! colnames(designMatrix) %in% c('sampleID', 'condition')
                ),drop=FALSE]

                nonInformaticColms <- which(
                    apply(otherDesign, 2, function(x) {
                        length(unique(x)) == 1
                    })
                )

                if(length(nonInformaticColms)) {
                    stop(
                        paste(
                            'In the designMatrix the following column(s): ',
                            paste(names(nonInformaticColms), collapse = ', '),
                            '\n Contain constant information. Columns apart from \'sampleID\' and \'condition\'\n',
                            'must describe cofounding effects not if interest. See ?importRdata and\n',
                            'vignette ("How to handle cofounding effects (including batches)" section) for more information.',
                            sep=' '
                        )
                    )
                }
            }

            # test comparisonsToMake
            if (!is.null(comparisonsToMake)) {
                if (!all(c('condition_1', 'condition_2') %in%
                         colnames(comparisonsToMake))) {
                    stop(paste(
                        'The data.frame passed to the \'comparisonsToMake\'',
                        'argument must contain both a \'condition_1\' and a',
                        '\'condition_2\' column indicating',
                        'the comparisons to make'
                    ))
                }
            }
        }

        ### Convert potential factors
        if (TRUE) {
            orgCond <- designMatrix$condition

            designMatrix$sampleID  <- as.character(designMatrix$sampleID)
            designMatrix$condition <- as.character(designMatrix$condition)

            if (!is.null(comparisonsToMake)) {
                comparisonsToMake$condition_1 <-
                    as.character(comparisonsToMake$condition_1)
                comparisonsToMake$condition_2 <-
                    as.character(comparisonsToMake$condition_2)
            }

            if (!is.null(isoformRepExpression)) {
                isoformRepExpression$isoform_id <-
                    as.character(isoformRepExpression$isoform_id)
            }
            if (!is.null(isoformCountMatrix)) {
                isoformCountMatrix$isoform_id <-
                    as.character(isoformCountMatrix$isoform_id)
            }
        }

        ### Check supplied data fits togehter
        if (TRUE) {
            if(countsSuppled) {
                if (!all(designMatrix$sampleID %in% colnames(isoformCountMatrix))) {
                    stop(paste(
                        'Each sample stored in \'designMatrix$sampleID\' must have',
                        'a corresponding expression column in \'isoformCountMatrix\''
                    ))
                }
            }
            if ( abundSuppled ) {
                if (!all(designMatrix$sampleID %in%
                         colnames(isoformRepExpression))) {
                    stop(paste(
                        'Each sample stored in \'designMatrix$sampleID\' must',
                        'have a corresponding expression column',
                        'in \'isoformRepExpression\''
                    ))
                }
            }
            if( abundSuppled & countsSuppled ) {
                if( !  identical( colnames(isoformCountMatrix) , colnames(isoformRepExpression)) ) {
                    stop('The column name and order of \'isoformCountMatrix\' and \'isoformRepExpression\' must be identical')
                }

                if( !  identical( isoformCountMatrix$isoform_id , isoformCountMatrix$isoform_id ) ) {
                    stop('The ids and order of the \'isoform_id\' column in \'isoformCountMatrix\' and \'isoformRepExpression\' must be identical')
                }
            }

            if (!is.null(comparisonsToMake)) {
                if (!all(
                    c(
                        comparisonsToMake$condition_1,
                        comparisonsToMake$condition_2
                    ) %in% designMatrix$condition
                )) {
                    stop(paste(
                        'The conditions supplied in comparisonsToMake and',
                        'designMatrix does not match'
                    ))
                }
            } else {
                # create it
                comparisonsToMake <-
                    allPairwiseFeatures(orgCond)
                colnames(comparisonsToMake) <-
                    c('condition_1', 'condition_2')
            }
        }

        ### Test complexity of setup
        if(TRUE) {
            nCond <- length(unique(designMatrix$condition))
            n <- nrow(designMatrix)
            if(  nCond/n > 2/3  ) {
                warning(paste(
                    'The experimental design seems to be of very low complexity - very few samples per replicate.',
                    'Please check the supplied design matrixt to make sure no mistakes were made.'
                ))
            }

            nComp <- nrow(comparisonsToMake)
            if( nComp > 6 ) {
                warning(paste0(
                    'The number of comparisons (n=', nComp,') is unusually high.',
                    '\n - If this intended please note that with a large number of comparisons IsoformSwitchAnalyzeR might use quite a lot of memmory (aka running on a small computer might be problematic).',
                    '\n - If this was not intended please check the supplied design matrixt to make sure no mistakes were made.'
                ))
            }

            ### Test conditions with n=1
            cndCnt <- table(designMatrix$condition)
            if( any(cndCnt == 1) ) {
                warning(
                    paste0(
                        '\n!!! NB !!! NB !!! NB !!!NB !!! NB !!!',
                        '\nIsoformSwitchAnalyzeR is not made to work with conditions without indepdendet biological replicates and results will not be trustworthy!',
                        '\nAt best data without replicates should be analyzed as a pilot study before investing in more replicates.',
                        '\nPlase consult the "Analysing experiments without replicates" and "What constitute an independent biological replicate?" sections of the vignette.',
                        '\n!!! NB !!! NB !!! NB !!!NB !!! NB !!!\n'
                    )
                )
            }

            ### Test for full rank
            isFullRank <- testFullRank( designMatrix )

            if( ! isFullRank ) {
                stop(
                    paste(
                        'The supplied design matrix will result in a model matrix that is not full rank',
                        '\nPlease make sure there are no co-linearities in the design'
                    )
                )
            }

        }

        ### Test NT input
        if(TRUE) {
            if( !is.null( isoformNtFasta )) {
                if( !is.character( isoformNtFasta)) {
                    stop('The \'isoformNtFasta\' argument must be a charachter string.')
                }

                if( any( ! sapply(isoformNtFasta, file.exists) ) ) {
                    stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to exist.')
                }
                if( any(! grepl('\\.fa|\\.fasta|\\.fa.gz|\\.fasta.gz', isoformNtFasta)) ) {
                    stop('The file pointed to via the \'isoformNtFasta\' argument does not seem to be a fasta file...')
                }
            }
        }
    }

    ### Giver proper R names
    if(TRUE) {
        ### Double check order
        designMatrix <- designMatrix[,c(
            match( c('sampleID','condition'), colnames(designMatrix) ),
            which( ! colnames(designMatrix) %in% c('sampleID','condition') )
        )]

        tmp <- designMatrix

        for( i in 2:ncol(designMatrix) ) { # i <- 2
            if( class(designMatrix[,i]) %in% c('character','factor') ) {
                designMatrix[,i] <- makeProperNames( designMatrix[,i] )
            }
        }

        if( ! identical(tmp, designMatrix) ) {
            message('Please note that some condition names were changed due to names not suited for modeling in R.')
        }

        if( !is.null(comparisonsToMake) ) {
            comparisonsToMake$condition_1 <- makeProperNames(
                comparisonsToMake$condition_1
            )
            comparisonsToMake$condition_2 <- makeProperNames(
                comparisonsToMake$condition_2
            )
        }
    }

    ### Fix names (done before input is handled and compared)
    if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {
        if( countsSuppled ) {
            isoformCountMatrix$isoform_id <- fixNames(
                nameVec = isoformCountMatrix$isoform_id,
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )
        }
        if ( abundSuppled ) {
            isoformRepExpression$isoform_id <- fixNames(
                nameVec = isoformRepExpression$isoform_id,
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )
        }

    }

    ### Handle annotation input
    if (!quiet) { message('Step 2 of 7: Obtaining annotation...')}
    if (TRUE) {
        ### Massage annoation input
        if(TRUE) {
            ### Import GTF is nessesary
            if( class(isoformExonAnnoation) == 'character' ) {
                gtfImported <- TRUE

                ### Test input
                if(TRUE) {
                    if (length(isoformExonAnnoation) != 1) {
                        stop(paste(
                            'You can only supply 1 file to isoformExonAnnoation'
                        ))
                    }
                    if( ! grepl('\\.gtf$|\\.gtf\\.gz$', isoformExonAnnoation, ignore.case = TRUE) ) {
                        warning('The file appearts not to be a GTF file as it does not end with \'.gtf\' or \'.gtf.gz\' - are you sure it is the rigth file?')
                    }

                    if (!quiet) {
                        message('    importing GTF (this may take a while)')
                    }

                }

                ### Note: Isoform names are fixed by importGTF
                suppressWarnings(
                    gtfSwichList <- importGTF(
                        pathToGTF = isoformExonAnnoation,
                        addAnnotatedORFs = addAnnotatedORFs,
                        onlyConsiderFullORF = onlyConsiderFullORF,
                        removeNonConvensionalChr = FALSE,
                        ignoreAfterBar = ignoreAfterBar,
                        ignoreAfterSpace = ignoreAfterSpace,
                        ignoreAfterPeriod = ignoreAfterPeriod,
                        removeTECgenes = FALSE,
                        PTCDistance = PTCDistance,
                        removeFusionTranscripts = FALSE,
                        quiet = TRUE
                    )
                )

                ### Extract isoforms which are quantified
                if(TRUE) {
                    ### Get genes with iso quantified
                    if( countsSuppled ) {
                        genesToKeep <- gtfSwichList$isoformFeatures$gene_id[which(
                            gtfSwichList$isoformFeatures$isoform_id %in% isoformCountMatrix$isoform_id
                        )]

                        ### Ensure all isoforms quantified are kept
                        isoToKeep <- union(
                            gtfSwichList$isoformFeatures$isoform_id[which(
                                gtfSwichList$isoformFeatures$gene_id %in% genesToKeep
                            )],
                            isoformCountMatrix$isoform_id
                        )
                    } else {
                        genesToKeep <- gtfSwichList$isoformFeatures$gene_id[which(
                            gtfSwichList$isoformFeatures$isoform_id %in% isoformRepExpression$isoform_id
                        )]

                        ### Ensure all isoforms quantified are kept
                        isoToKeep <- union(
                            gtfSwichList$isoformFeatures$isoform_id[which(
                                gtfSwichList$isoformFeatures$gene_id %in% genesToKeep
                            )],
                            isoformRepExpression$isoform_id
                        )
                    }
                }

                ### Extract isoforms to remove due to non chanonical nature
                if(TRUE) {
                    ### Identify isoforms to remove
                    isoformsToRemove <- character()
                    if( removeTECgenes & any(!is.na( gtfSwichList$isoformFeatures$gene_biotype)) ) {
                        isoformsToRemove <- c(
                            isoformsToRemove,
                            unique(gtfSwichList$isoformFeatures$isoform_id[which(
                                gtfSwichList$isoformFeatures$gene_biotype == 'TEC'
                            )])
                        )
                    }
                    if( removeNonConvensionalChr ) {
                        nonChanonicalChrsIso <- unique(
                            gtfSwichList$exons$isoform_id[which(
                                grepl('_|\\.'  , as.character(gtfSwichList$exons@seqnames))
                            )]
                        )

                        isoformsToRemove <- unique(c(
                            isoformsToRemove,
                            nonChanonicalChrsIso
                        ))
                    }

                    ### Note:
                    # No need to extend to genes since they per definition are all genes

                    ### Remove non chanonical isoforms
                    if(length(isoformsToRemove)) {
                        isoToKeep <- setdiff(
                            isoToKeep,
                            isoformsToRemove
                        )
                    }
                }

                ### Subset to used data
                if(TRUE) {
                    if( countsSuppled ) {
                        isoformCountMatrix <- isoformCountMatrix[which(
                            isoformCountMatrix$isoform_id %in% isoToKeep
                        ),]
                    }
                    if( abundSuppled ) {
                        isoformRepExpression <- isoformRepExpression[which(
                            isoformRepExpression$isoform_id %in% isoToKeep
                        ),]
                    }

                    if(any(isoToKeep %in% gtfSwichList$isoformFeatures$isoform_id)) {
                        gtfSwichList <- subsetSwitchAnalyzeRlist(
                            gtfSwichList,
                            gtfSwichList$isoformFeatures$isoform_id %in% isoToKeep
                        )
                    }
                }

                ### Extract wanted annotation files form the GTF switchAnalyzeR object
                if(TRUE) {
                    isoformExonStructure <-
                        gtfSwichList$exons[, c('isoform_id', 'gene_id')]
                    isoformExonStructure <- sort(isoformExonStructure)

                    colsToExtract <- c(
                        'isoform_id', 'gene_id', 'gene_name',
                        'gene_biotype','iso_biotype'
                    )
                    isoformAnnotation <-
                        unique(gtfSwichList$isoformFeatures[,na.omit(
                            match(colsToExtract , colnames(gtfSwichList$isoformFeatures))
                        )])
                }
                # where isoformAnnotation and isoformExonStructure is made

                if (addAnnotatedORFs & gtfImported) {
                    isoORF <- gtfSwichList$orfAnalysis

                    if( all( is.na(isoORF$PTC)) ) {
                        warning(
                            paste(
                                '   No CDS annotation was found in the GTF files meaning ORFs could not be annotated.\n',
                                '    (But ORFs can still be predicted with the analyzeORF() function)'
                            )
                        )

                        addAnnotatedORFs <- FALSE
                    }
                }

            }

            if( class(isoformExonAnnoation) != 'character' ) {
                gtfImported <- FALSE

                ### Test input
                if(TRUE) {
                    if( ! is(object = isoformExonAnnoation, 'GRanges') ) {
                        stop('When not using a GTF file (by supplying a text string with the path to the file) the "isoformExonAnnoation" argument must be a GRange.')
                    }

                    if( length(isoformExonAnnoation) == 0 ) {
                        stop('The GRange supplied to the "isoformExonAnnoation" argument have zero enteries (rows).')
                    }

                    ### Test
                    if( !all( c('isoform_id', 'gene_id') %in% colnames(isoformExonAnnoation@elementMetadata) )) {
                        stop('The supplied annotation must contain to meta data collumns: \'isoform_id\' and \'gene_id\'')
                    }

                    ### Test for other than exons by annotation
                    if(any(  colnames(isoformExonAnnoation@elementMetadata) == 'type' )) {
                        stop(
                            paste(
                                'The \'type\' column of the data supplied to \'isoformExonAnnoation\'',
                                'indicate there are multiple levels of data.',
                                'Please fix this (providing only exon-level) or simply',
                                '\nprovide a string with the path to the GTF file to the \'isoformExonAnnoation\' - ',
                                'then IsoformSwitchAnalyzeR will import and massage the GTF file for you.'
                            )
                        )
                    }

                    ### Test for other than exons by overlap of transcript features
                    localExonList <- split(isoformExonAnnoation@ranges, isoformExonAnnoation$isoform_id)
                    localExonListReduced <- GenomicRanges::reduce(localExonList)
                    if(
                        any( sapply( width(localExonList), sum) != sapply( width(localExonListReduced), sum) )
                    ) {
                        stop(
                            paste(
                                'The data supplied to \'isoformExonAnnoation\' appears to be multi-leveled',
                                '(Fx both containing exon and CDS information for transcripts - which a GTF file does).',
                                'If your annotation data originate from a GTF file please supply a string',
                                'indicating the path to the GTF file to the \'isoformExonAnnoation\' argument',
                                'instead - then IsoformSwitchAnalyzeR will handle the multi-levels.'
                            )
                        )
                    }

                }

                ### Fix names
                if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {
                    isoformExonAnnoation$isoform_id <- fixNames(
                        nameVec = isoformExonAnnoation$isoform_id,
                        ignoreAfterBar = ignoreAfterBar,
                        ignoreAfterSpace = ignoreAfterSpace,
                        ignoreAfterPeriod = ignoreAfterPeriod
                    )
                }

                ### Collaps ajecent exons (aka thouse without any intron between)
                if(TRUE) {
                    ### Reduce ajecent exons
                    tmp <- unlist(
                        GenomicRanges::reduce(
                            split(
                                isoformExonAnnoation,
                                isoformExonAnnoation$isoform_id
                            )
                        )
                    )
                    ### Add isoform id
                    tmp$isoform_id <- tmp@ranges@NAMES
                    tmp@ranges@NAMES <- NULL

                    ### add gene id
                    tmp$gene_id <-isoformExonAnnoation$gene_id[match(
                        tmp$isoform_id, isoformExonAnnoation$isoform_id
                    )]

                    ### Add gene names if used
                    if('gene_name' %in% colnames(isoformExonAnnoation@elementMetadata)) {
                        tmp$gene_name <-isoformExonAnnoation$gene_name[match(
                            tmp$isoform_id, isoformExonAnnoation$isoform_id
                        )]
                    }

                    ### sort
                    tmp <- tmp[sort.list(tmp$isoform_id),]

                    ### Overwrite
                    isoformExonAnnoation <- tmp
                }

                ### Extract isoforms which are quantified
                if(TRUE) {
                    ### Get genes with iso quantified
                    if( countsSuppled ) {
                        genesToKeep <- isoformExonAnnoation$gene_id[which(
                            isoformExonAnnoation$isoform_id %in% isoformCountMatrix$isoform_id
                        )]

                        ### Ensure all isoforms quantified are kept
                        isoToKeep <- union(
                            isoformExonAnnoation$isoform_id[which(
                                isoformExonAnnoation$gene_id %in% genesToKeep
                            )],
                            isoformCountMatrix$isoform_id
                        )
                    } else {
                        genesToKeep <- isoformExonAnnoation$gene_id[which(
                            isoformExonAnnoation$isoform_id %in% isoformRepExpression$isoform_id
                        )]

                        ### Ensure all isoforms quantified are kept
                        isoToKeep <- union(
                            isoformExonAnnoation$isoform_id[which(
                                isoformExonAnnoation$gene_id %in% genesToKeep
                            )],
                            isoformCountMatrix$isoform_id
                        )
                    }
                }

                ### Subset to used data
                if(TRUE) {
                    if( countsSuppled ) {
                        isoformCountMatrix <- isoformCountMatrix[which(
                            isoformCountMatrix$isoform_id %in% isoToKeep
                        ),]
                    }
                    if( abundSuppled ) {
                        isoformRepExpression <- isoformRepExpression[which(
                            isoformRepExpression$isoform_id %in% isoToKeep
                        ),]
                    }

                    if(any(isoToKeep %in% isoformExonAnnoation$isoform_id)) {
                        isoformExonAnnoation <- isoformExonAnnoation[which(
                            isoformExonAnnoation$isoform_id %in% isoToKeep
                        ),]
                    }
                }

                ### Devide the data
                colsToUse <-  c(
                    'isoform_id',
                    'gene_id',
                    'gene_name'
                )

                isoformExonStructure <-
                    isoformExonAnnoation[,na.omit(match(
                        colsToUse, colnames(isoformExonAnnoation@elementMetadata)
                    ))]

                isoformAnnotation <-
                    unique(as.data.frame(isoformExonAnnoation@elementMetadata))
                if (!'gene_name' %in% colnames(isoformAnnotation)) {
                    isoformAnnotation$gene_name <- NA
                }

                isoformAnnotation <- isoformAnnotation[order(
                    isoformAnnotation$gene_id,
                    isoformAnnotation$gene_name,
                    isoformAnnotation$isoform_id
                ),]
            }

        }

        ### Test the columns of obtained annoation
        if(TRUE) {
            if (!all(c('isoform_id', 'gene_id', 'gene_name') %in%
                     colnames(isoformAnnotation))) {
                stop(paste(
                    'The data.frame passed to the \'isoformAnnotation\' argument',
                    'must contain the following columns \'isoform_id\',',
                    '\'gene_id\' and \'gene_name\''
                ))
            }
            if (any(is.na(isoformAnnotation[, c('isoform_id', 'gene_id')]))) {
                stop(paste(
                    'The \'isoform_id\' and \'gene_id\' columns in the data.frame',
                    'passed to the \'isoformAnnotation\' argument are not allowed',
                    'to contain NAs'
                ))
            }
            if (!'isoform_id' %in% colnames(isoformExonStructure@elementMetadata)) {
                stop(paste(
                    'The GenomicRanges (GRanges) object passed to the',
                    '\'isoformExonStructure\' argument must contain both a',
                    '\'isoform_id\' and \'gene_id\' metadata column'
                ))
            }
        }

        ### Test overlap with expression data
        if(TRUE) {
            if( countsSuppled ) {
                j1 <- jaccardSimilarity(
                    isoformCountMatrix$isoform_id,
                    isoformAnnotation$isoform_id
                )

                expIso <- isoformCountMatrix$isoform_id
            } else {
                j1 <- jaccardSimilarity(
                    isoformRepExpression$isoform_id,
                    isoformAnnotation$isoform_id
                )

                expIso <- isoformRepExpression$isoform_id
            }

            jcCutoff <- 0.925

            onlyInExp <- setdiff(expIso, isoformAnnotation$isoform_id)

            if (j1 != 1 ) {
                if( j1 < jcCutoff | length(onlyInExp) ) {
                    options(warning.length = 2000L)
                    stop(
                        paste(
                            'The annotation and quantification (count/abundance matrix and isoform annotation)',
                            'seems to be different (Jaccard similarity < 0.925).',
                            '\nEither isforoms found in the annotation are',
                            'not quantifed or vise versa.',
                            '\nSpecifically:\n',
                            length(unique(expIso)), 'isoforms were quantified.\n',
                            length(unique(isoformAnnotation$isoform_id)), 'isoforms are annotated.\n',
                            'Only', length(intersect(expIso, isoformAnnotation$isoform_id)), 'overlap.\n',
                            length(setdiff(unique(expIso), isoformAnnotation$isoform_id)), 'isoforms quantifed had no corresponding annoation\n',
                            '\nThis combination cannot be analyzed since it will',
                            'cause discrepencies between quantification and annotation thereby skewing all analysis.\n',

                            '\nIf there is no overlap (as in zero or close) there are two options:\n',
                            '1) The files do not fit together (e.g. different databases, versions, etc)',
                                '(no fix except using propperly paired files).\n',
                            '2) It is somthing to do with how the isoform ids are stored in the different files.',
                            'This problem might be solvable using some of the',
                            '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
                            '    Examples from expression matrix are :',
                            paste0( sample(expIso, min(c(3, length(expIso)))), collapse = ', '),'\n',
                            '    Examples of annoation are :',
                            paste0( sample(isoformAnnotation$isoform_id, min(c(3, length(isoformAnnotation$isoform_id)))), collapse = ', '),'\n',
                            '    Examples of isoforms which were only found im the quantification are  :',
                            paste0( sample(onlyInExp, min(c(3, length(onlyInExp)))), collapse = ', '),'\n',

                            '\nIf there is a large overlap but still far from complete there are 3 possibilites:\n',
                            '1) The files do not fit together (e.g different databases versions etc.)',
                            '(no fix except using propperly paired files).\n',
                            '2) If you are using Ensembl data you have supplied the GTF without phaplotyps. You need to supply the',
                            '<Ensembl_version>.chr_patch_hapl_scaff.gtf file - NOT the <Ensembl_version>.chr.gtf\n',
                            '3) One file could contain non-chanonical chromosomes while the other do not',
                            '(might be solved using the \'removeNonConvensionalChr\' argument.)\n',
                            '4) It is somthing to do with how a subset of the isoform ids are stored in the different files.',
                            'This problem might be solvable using some of the',
                            '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n\n',

                            '\nFor more info see the FAQ in the vignette.\n',
                            sep=' '
                        )
                    )
                }
                if( j1 >= jcCutoff ) {
                    warning(
                        paste(
                            'The annotation and quantification (count/abundance matrix and isoform annotation)',
                            'Seem to be slightly different.',

                            '\nSpecifically:\n',
                            length(setdiff(isoformAnnotation$isoform_id, unique(expIso))), 'isoforms were only found in the annotation\n',

                            '\nPlease make sure this is on purpouse since differences',
                            'will cause inaccurate quantification and thereby skew all analysis.\n',

                            'If you have quantified with Salmon this could be normal since it as default only keep one copy of identical sequnces (can be prevented using the --keepDuplicates option)\n',
                            'We strongly encurage you to go back and figure out why this is the case.\n\n',
                            sep=' '
                        )
                    )

                    ### Reduce to those found in all
                    if( countsSuppled ) {
                        isoformsUsed <- intersect(
                            isoformCountMatrix$isoform_id,
                            isoformAnnotation$isoform_id
                        )
                    } else {
                        isoformsUsed <- intersect(
                            isoformRepExpression$isoform_id,
                            isoformAnnotation$isoform_id
                        )
                    }

                    isoformExonStructure <- isoformExonStructure[which(
                        isoformExonStructure$isoform_id %in% isoformsUsed
                    ), ]
                    isoformAnnotation <-isoformAnnotation[which(
                        isoformAnnotation$isoform_id %in% isoformsUsed
                    ), ]

                    if( countsSuppled ) {
                        isoformCountMatrix <-isoformCountMatrix[which(
                            isoformCountMatrix$isoform_id %in% isoformsUsed
                        ), ]
                    }
                    if( abundSuppled ) {
                        isoformRepExpression <-isoformRepExpression[which(
                            isoformRepExpression$isoform_id %in% isoformsUsed
                        ), ]
                    }

                }
            }

        }
    }

    ### Subset to libraries used
    if (TRUE) {
        designMatrix <-
            designMatrix[which(
                designMatrix$condition %in% c(
                    comparisonsToMake$condition_1,
                    comparisonsToMake$condition_2
                )
            ), ]

        if( countsSuppled ) {
            isoformCountMatrix <-
                isoformCountMatrix[, which(
                    colnames(isoformCountMatrix) %in%
                        c('isoform_id', designMatrix$sampleID))]
            isoformCountMatrix <-
                isoformCountMatrix[,c(
                    which(colnames(isoformCountMatrix) == 'isoform_id'),
                    which(colnames(isoformCountMatrix) != 'isoform_id')
                )]
            rownames(isoformCountMatrix) <- NULL
        }

        if ( abundSuppled ) {
            isoformRepExpression <-
                isoformRepExpression[, which(
                    colnames(isoformRepExpression) %in%
                        c('isoform_id', designMatrix$sampleID)
                )]

            isoformRepExpression <-
                isoformRepExpression[,c(
                    which(colnames(isoformRepExpression) == 'isoform_id'),
                    which(colnames(isoformRepExpression) != 'isoform_id')
                )]
            rownames(isoformRepExpression) <- NULL
        }
    }

    ### Remove isoforms not expressed
    if (TRUE) {
        if( countsSuppled ) {
            okIsoforms <- isoformCountMatrix$isoform_id[which(
                rowSums(isoformCountMatrix[,which( colnames(isoformCountMatrix) != 'isoform_id')]) > 0
            )]
            nTot <- nrow(isoformCountMatrix)
        } else {
            okIsoforms <-isoformRepExpression$isoform_id[which(
                rowSums(isoformRepExpression[,which( colnames(isoformRepExpression) != 'isoform_id')]) > 0
            )]
            nTot <- nrow(isoformRepExpression)
        }

        nOk <- length(okIsoforms)
        if( nOk != nTot ) {
            if (!quiet) {
                ### Message
                message(
                    paste(
                        '   ',
                        nTot - nOk,
                        paste0( '( ', round( (nTot - nOk) / nTot *100, digits = 2),'%)'),
                        'isoforms were removed since they were not expressed in any samples.'
                    )
                )
            }

            ### Subset expression
            if(abundSuppled) {
                isoformRepExpression <- isoformRepExpression[which(
                    isoformRepExpression$isoform_id %in% okIsoforms
                ),]
            }
            if(countsSuppled) {
                isoformCountMatrix <- isoformCountMatrix[which(
                    isoformCountMatrix$isoform_id %in% okIsoforms
                ),]
            }

            ### Annotation
            isoformExonStructure <- isoformExonStructure[which( isoformExonStructure$isoform_id %in% okIsoforms),]
            isoformAnnotation    <- isoformAnnotation[which( isoformAnnotation$isoform_id %in% okIsoforms),]

            if (addAnnotatedORFs & gtfImported) {
                isoORF <- isoORF[which( isoORF$isoform_id %in% okIsoforms),]
            }

        }
    }

    ### Fix StringTie gene merge problem
    if(TRUE) {
        if(   fixStringTieAnnotationProblem ) {
            if (!quiet) { message('Step 3 of 7: Fixing StringTie gene annoation problems...')}

            ### variables for messages
            anyFixed1 <- FALSE
            anyFixed2 <- FALSE
            anyFixed3 <- FALSE
            anyFixed4 <- FALSE

            ### Fix missing gene_names
            if( any(is.na(isoformAnnotation$gene_name)) ) {
                ### Fix simple missing gene_names (within single gene_name gene_id)
                if( TRUE ) {
                    ### Make list with gene_names (same order)
                    geneNameList <- split(isoformAnnotation$gene_name, isoformAnnotation$gene_id)
                    geneNameList <- geneNameList[unique(isoformAnnotation$gene_id)]

                    ### Count problems
                    nIsoWihoutNames <- sum(
                        is.na(isoformAnnotation$gene_name)
                    )

                    ### Add gene_names to novel StringTie transcripts when possible (only 1 gene_name candidate)
                    isoformAnnotation$gene_name <- unlist(
                        lapply(
                            geneNameList,
                            function(geneNameVec) {
                                localGeneNames <- unique(na.omit( geneNameVec ))

                                if( length( localGeneNames ) == 1 ) {
                                    return(
                                        rep(localGeneNames, times = length(geneNameVec))
                                    )
                                } else {
                                    return(geneNameVec)
                                }
                            }
                        )
                    )

                    ### Re-count problems
                    nIsoWihoutNames2 <- sum(
                        is.na(isoformAnnotation$gene_name)
                    )

                    anyFixed1 <- nIsoWihoutNames - nIsoWihoutNames2 > 0

                    if( anyFixed1 ) {
                        if (!quiet) {
                            message(
                                paste(
                                    '   ',
                                    nIsoWihoutNames - nIsoWihoutNames2,
                                    'isoforms were assigned the gene_name of their associated gene_id.',
                                    '\n        This was only done when the parent gene_id were associated with a single gene_name.',
                                    #'\n',
                                    sep = ' '
                                )
                            )
                        }
                    }

                }

                ### Fix non-simple missing gene_names (within multi gene_names gene_id)
                if(TRUE) {
                    ### Identify gene_ids with problems
                    if(TRUE) {
                        ### Summazie gene properties
                        multiGeneDf <-
                            isoformAnnotation %>%
                            dplyr::select(isoform_id, gene_id, gene_name) %>%
                            dplyr::distinct()

                        geneNameSummary <-
                            multiGeneDf %>%
                            group_by(gene_id) %>%
                            dplyr::summarise(
                                n_gene_names = n_distinct(na.omit(gene_name)),
                                n_iso_na = sum(is.na(gene_name))
                            )

                        ### Identify genes with both missing and multiple gene names
                        multiGenesWithNa <-
                            geneNameSummary %>%
                            dplyr::filter(
                                n_gene_names >= 2 & # at least two genes
                                    n_iso_na > 0           # no novel once
                            )
                    }

                    ### Rescue via genomic overlap of known isoforms (aka with gene_name annotation)
                    if(nrow(multiGenesWithNa) & fixStringTieViaOverlapInMultiGenes) {
                        ### Extract all isoforms with problems
                        genesWithProblems <-
                            multiGeneDf %>%
                            dplyr::filter(gene_id %in% multiGenesWithNa$gene_id)

                        ### Extract exons of interest
                        exonsOi <- isoformExonStructure[which(
                            isoformExonStructure$isoform_id %in% genesWithProblems$isoform_id
                        ),]

                        ### Modify seqnames to ensure only searching within same gene_id?
                        exonsOi <- GRanges(
                            seqnames = exonsOi$gene_id,
                            ranges = IRanges(
                                start = BiocGenerics::start(exonsOi),
                                end   = BiocGenerics::end(exonsOi)
                            ),
                            strand = exonsOi@strand,
                            isoform_id = exonsOi$isoform_id,
                            gene_id = exonsOi$gene_id
                        )

                        ### Convert to list
                        exonsOiList <- split(exonsOi, exonsOi$isoform_id)

                        ### Devide into novel and known
                        knownIsoforms <- genesWithProblems$isoform_id[which(
                            ! is.na(genesWithProblems$gene_name)
                        )]
                        novelIsoforms <- genesWithProblems$isoform_id[which(
                            is.na(genesWithProblems$gene_name)
                        )]
                        knownList <- exonsOiList[ knownIsoforms ]
                        novelList <- exonsOiList[ novelIsoforms ]
                        novelLength <- sapply(width(novelList), sum)

                        ### Identify overlapping isoforms
                        novelIsoOverlap <- findOverlaps(query = novelList, subject = knownList)
                        novelIsoOverlapDf <- as.data.frame(novelIsoOverlap)
                        novelIsoOverlapDf$novel_iso <- names(novelList)[novelIsoOverlapDf$queryHits]
                        novelIsoOverlapDf$known_iso <- names(knownList)[novelIsoOverlapDf$subjectHits]
                        novelIsoOverlapDf <- novelIsoOverlapDf[,c('novel_iso','known_iso')]
                        novelIsoOverlapDf$known_gene_name <- genesWithProblems$gene_name[match(
                            novelIsoOverlapDf$known_iso, genesWithProblems$isoform_id
                        )]

                        ### Calculate overlap
                        novelOverlap <- intersect(novelList[queryHits(novelIsoOverlap)], knownList[subjectHits(novelIsoOverlap)])
                        novelIsoOverlapDf$nt_overlap <- sapply(width(novelOverlap), sum)
                        novelIsoOverlapDf$novel_length <- novelLength[match(
                            novelIsoOverlapDf$novel_iso, names(novelLength)
                        )]
                        novelIsoOverlapDf$frac_overlap <- novelIsoOverlapDf$nt_overlap / novelIsoOverlapDf$novel_length

                        ### For each novel isoform assign gene_name via cutoffs
                        novelAssigned <-
                            novelIsoOverlapDf %>%
                            as_tibble() %>%
                            group_by(novel_iso, known_gene_name) %>%
                            ### For each known_gene extract top contender
                            dplyr::arrange(dplyr::desc(nt_overlap), .by_group = TRUE)  %>%
                            dplyr::slice(1L) %>%
                            ### For each isoform calculate ratios betwen top genes
                            group_by(novel_iso) %>%
                            dplyr::arrange(dplyr::desc(nt_overlap), .by_group = TRUE)  %>%
                            mutate(
                                log2_overlap_ratio = c(
                                    log2( nt_overlap[-length(nt_overlap)] / nt_overlap[-1] ),
                                    Inf # assign Inf if only 1 gene is overlapping
                                )
                            ) %>%
                            ### For each isoform Filter
                            dplyr::filter(
                                nt_overlap         >= fixStringTieMinOverlapSize,
                                frac_overlap       >= fixStringTieMinOverlapFrac,
                                log2_overlap_ratio >= fixStringTieMinOverlapLog2RatioToContender
                            ) %>%
                            ### For each isoform : Extract top contender
                            dplyr::slice(1L)

                        ### Modify annoation
                        toModify <- which(isoformAnnotation$isoform_id %in% novelAssigned$novel_iso)
                        isoformAnnotation$gene_name[toModify] <- novelAssigned$known_gene_name[match(
                            isoformAnnotation$isoform_id[toModify], novelAssigned$novel_iso
                        )]

                        ### Redo problem calculations
                        nIsoWihoutNames3 <- sum(
                            is.na(isoformAnnotation$gene_name)
                        )

                        anyFixed2 <- nIsoWihoutNames2 - nIsoWihoutNames3

                        if( anyFixed2) {
                            if (!quiet) {
                                message(
                                    paste(
                                        '   ',
                                        nIsoWihoutNames2 - nIsoWihoutNames3,
                                        'isoforms were assigned the gene_name of the most similar',
                                        '\n        annotated isoform (defined via overlap in genomic exon coordinates).',
                                        '\n        This was only done if the overlap met the requriements',
                                        '\n        indicated by the three fixStringTieViaOverlap* arguments.',
                                        #'\n',
                                        sep = ' '
                                    )
                                )
                            }
                        }
                    }
                }
            }

            ### Remove non-assigned isoforms within known genes
            if(TRUE) {
                genesWithProblems <-
                    isoformAnnotation %>%
                    dplyr::select(gene_id, gene_name) %>%
                    dplyr::distinct() %>%
                    group_by(gene_id) %>%
                    dplyr::summarise(
                        has_gene_name = any(! is.na(gene_name)),
                        has_novel_iso = any(  is.na(gene_name))
                    ) %>%
                    dplyr::filter(
                        has_gene_name,
                        has_novel_iso
                    )

                ### Extract isoforms to remove
                isoToRemove <- isoformAnnotation$isoform_id[which(
                    isoformAnnotation$gene_id %in% genesWithProblems$gene_id &
                        is.na(isoformAnnotation$gene_name)
                )]

                anyFixed3 <- length(isoToRemove) > 0

                if(length(isoToRemove)) {
                    ### Remove
                    isoformAnnotation <- isoformAnnotation[which(
                        ! isoformAnnotation$isoform_id %in% isoToRemove
                    ),]

                    isoformExonStructure <- isoformExonStructure[which(
                        ! isoformExonStructure$isoform_id %in% isoToRemove
                    ),]

                    if(! is.null(isoformCountMatrix)) {
                        isoformCountMatrix <- isoformCountMatrix[which(
                            ! isoformCountMatrix$isoform_id %in% isoToRemove
                        ),]
                    }

                    if(! is.null(isoformRepExpression)) {
                        isoformRepExpression <- isoformRepExpression[which(
                            ! isoformRepExpression$isoform_id %in% isoToRemove
                        ),]
                    }

                    ### Write message
                    if (!quiet) {
                        message(
                            paste(
                                '   We were unable to assign', length(isoToRemove),
                                'isoform_id\'s (all located within annotated genes) with a gene_name.',
                                '\n        These were removed to enable analysis of the rest of the isoform from within the gene.',
                                #'\n'
                            )
                        )
                    }
                }
            }

            ### Split gene_ids of gene_id with mutiple gene_names
            if(TRUE) {
                ### Summarize problem
                if(TRUE) {
                    multiGeneDf <-
                        isoformAnnotation %>%
                        dplyr::select(isoform_id, gene_id, gene_name) %>%
                        dplyr::distinct()

                    multiGenes <-
                        multiGeneDf %>%
                        group_by(gene_id) %>%
                        dplyr::summarise(
                            n_gene_names = n_distinct(na.omit(gene_name)),
                            n_iso_na = sum(is.na(gene_name))
                        ) %>%
                        dplyr::filter(
                            n_gene_names >= 2 & # at least two genes
                                n_iso_na == 0           # no novel once
                        )

                    nProblems <- nrow(multiGenes)
                }

                ### Split gene_ids
                if(nrow(multiGenes)) {
                    ### Extract corresponding iso data
                    multiGeneDf <-
                        multiGeneDf %>%
                        dplyr::filter(gene_id %in% multiGenes$gene_id)

                    ### Create new gene_ids (by merging with gene_name)
                    multiGeneDf$new_gene_id <- stringr::str_c(
                        multiGeneDf$gene_id,
                        ':',
                        multiGeneDf$gene_name
                    )

                    ### Overwrite in annotation
                    indexToModify <- which(
                        isoformAnnotation$gene_id %in% multiGeneDf$gene_id
                    )
                    isoformAnnotation$gene_id[indexToModify] <-
                        multiGeneDf$new_gene_id[match(
                            isoformAnnotation$isoform_id[indexToModify], multiGeneDf$isoform_id
                        )]

                    ### Overwrite gene_name and gene_ids in exon annotation
                    isoformExonStructure$gene_id <- isoformAnnotation$gene_id[match(
                        isoformExonStructure$isoform_id, isoformAnnotation$isoform_id
                    )]
                    isoformExonStructure$gene_name <- isoformAnnotation$gene_name[match(
                        isoformExonStructure$isoform_id, isoformAnnotation$isoform_id
                    )]
                }

                ### Message summary
                if(TRUE) {
                    ### Redo problem calculations
                    multiGenes <-
                        isoformAnnotation %>%
                        dplyr::select(gene_id, gene_name) %>%
                        dplyr::distinct() %>%
                        group_by(gene_id) %>%
                        dplyr::summarise(
                            n_gene_names = n_distinct(na.omit(gene_name)),
                            n_iso_na = sum(is.na(gene_name))
                        ) %>%
                        dplyr::filter(
                            n_gene_names >= 2 & # at least two genes
                                n_iso_na == 0           # no novel once
                        )

                    nProblems2 <- nrow(multiGenes)

                    anyFixed4 <- nProblems - nProblems2 > 0

                    if( anyFixed4 ) {
                        if (!quiet) {
                            message(
                                paste(
                                    '   ',
                                    nProblems - nProblems2 ,
                                    'gene_ids which were associated with multiple gene_names',
                                    '\n        were split into mutliple genes via their gene_names.',
                                    #'\n',
                                    sep = ' '
                                )
                            )
                        }
                    }
                }

            }

            if(
                ! anyFixed1 &
                ! anyFixed2 &
                ! anyFixed3 &
                ! anyFixed4
            ) {
                message(
                    paste(
                        '    There were no need to fix any annotation',
                        sep = ' '
                    )
                )
            }
        }
    }

    ### If nessesary calculate RPKM values
    if (!quiet) { message('Step 4 of 7: Calculating gene expression and isoform fractions...') }
    if ( ! abundSuppled ) {
        ### Extract isoform lengths
        isoformLengths <- sapply(
            X = split(
                isoformExonStructure@ranges@width,
                f = isoformExonStructure$isoform_id
            ),
            FUN = sum
        )

        ### Calulate CPM
        # convert to matrix
        localCM <- isoformCountMatrix
        rownames(localCM) <- localCM$isoform_id
        localCM$isoform_id <- NULL
        localCM <- as.matrix(localCM)

        myCPM <- t(t(localCM) / colSums(localCM)) * 1e6

        ### Calculate RPKM
        isoformLengths <-
            isoformLengths[match(rownames(myCPM), names(isoformLengths))]

        isoformRepExpression <-
            as.data.frame(myCPM / (isoformLengths / 1e3))

        ### Massage
        isoformRepExpression$isoform_id <-
            rownames(isoformRepExpression)
        isoformRepExpression <-
            isoformRepExpression[, c(
                which(colnames(isoformRepExpression) == 'isoform_id'),
                which(colnames(isoformRepExpression) != 'isoform_id')
            )]
        rownames(isoformRepExpression) <- NULL
    }

    ### Handle sequence input
    if(TRUE) {
        addIsoformNt <- FALSE

        if(!is.null(isoformNtFasta)) {
            isoformNtSeq <- do.call(
                c,
                lapply(isoformNtFasta, function(aFile) {
                    Biostrings::readDNAStringSet(
                        filepath = isoformNtFasta, format = 'fasta'
                    )
                })
            )

            if(!is(isoformNtSeq, "DNAStringSet")) {
                stop('The fasta file supplied to \'isoformNtFasta\' does not contain the nucleotide (DNA) sequence...')
            }

            ### Fix names
            if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {

                names(isoformNtSeq) <- fixNames(
                    nameVec = names(isoformNtSeq),
                    ignoreAfterBar = ignoreAfterBar,
                    ignoreAfterSpace = ignoreAfterSpace,
                    ignoreAfterPeriod = ignoreAfterPeriod
                )
            }

            ### Subset to used
            isoSeqNames <- names(isoformNtSeq)
            isoformNtSeq <- isoformNtSeq[which(
                names(isoformNtSeq) %in% isoformRepExpression$isoform_id
            )]

            ### Remove potential duplication
            isoformNtSeq <- isoformNtSeq[which(
                ! duplicated(names(isoformNtSeq))
            )]

            if(length(isoformNtSeq) == 0) {
                stop(
                    paste(
                        'No sequences in the fasta files had IDs matching the expression data.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
                        '    3 Examples from expression matrix are :',
                        paste0( sample(unique(isoformRepExpression$isoform_id), min(c(3, length(isoformRepExpression$isoform_id))) ), collapse = ', '),'\n',
                        '    3 Examples of sequence annotation are :',
                        paste0( sample(isoSeqNames, min(c(3, length( isoSeqNames ))) ), collapse = ', '),'\n',
                        sep = ' '
                    )
                )

            }

            if( ! all( isoformRepExpression$isoform_id %in% names(isoformNtSeq) ) ) {
                options(warning.length = 2000L)
                warning(
                    paste(
                        'The fasta file supplied to \'isoformNtFasta\' does not contain the',
                        'nucleotide (DNA) sequence for all isoforms quantified and will not be added!',
                        '\nSpecifically:\n',
                        length(unique(isoformRepExpression$isoform_id)), 'isoforms were quantified.\n',
                        length(unique(names(isoformNtSeq))), 'isoforms have a sequence.\n',
                        'Only', length(intersect(names(isoformNtSeq), isoformRepExpression$isoform_id)), 'overlap.\n',
                        length(setdiff(unique(isoformRepExpression$isoform_id), names(isoformNtSeq))), 'isoforms quantifed isoforms had no corresponding nucleotide sequence\n',

                        '\nIf there is no overlap (as in zero or close) there are two options:\n',
                        '1) The files do not fit together (different databases, versions etc)',
                            '(no fix except using propperly paired files).\n',
                        '2) It is somthing to do with how the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
                        '    3 Examples from expression matrix are :',
                        paste0( sample(unique(isoformRepExpression$isoform_id), min(c(3, length(isoformRepExpression$isoform_id))) ), collapse = ', '),'\n',
                        '    3 Examples of sequence annotation are :',
                        paste0( sample(names(isoformNtSeq), min(c(3, length( isoformNtSeq ))) ), collapse = ', '),'\n',

                        '\nIf there is a large overlap but still far from complete there are 3 possibilites:\n',
                        '1) The files do not fit together (different databases versions)',
                            '(no fix except using propperly paired files).\n',
                        '2) The isoforms quantified have their nucleotide sequence stored in multiple fasta files (common for Ensembl).',
                        'Just supply a vector with the path to each of them to the \'isoformNtFasta\' argument.\n',
                        '3) One file could contain non-chanonical chromosomes while the other do not',
                        '(might be solved using the \'removeNonConvensionalChr\' argument.)\n',
                        '4) It is somthing to do with how a subset of the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n\n',
                        sep = ' '
                    )
                )
            } else {
                addIsoformNt <- TRUE
            }
        }
    }

    ### Sum to gene level gene expression - updated
    if(TRUE) {
        ### add gene_id
        isoformRepExpression$gene_id <-
            isoformAnnotation$gene_id[match(isoformRepExpression$isoform_id,
                                            isoformAnnotation$isoform_id)]

        ### Sum to gene level
        geneRepExpression <- isoformToGeneExp(
            isoformRepExpression,
            quiet = TRUE
        )

        ### Remove gene id
        isoformRepExpression$gene_id <- NULL
    }

    ### Calculate IF rep matrix
    if(TRUE) {
        isoformRepIF <- isoformToIsoformFraction(
            isoformRepExpression=isoformRepExpression,
            geneRepExpression=geneRepExpression,
            isoformGeneAnnotation=isoformAnnotation,
            quiet = TRUE
        )

    }

    ### in each condition analyzed get mean and standard error of gene and isoforms
    if (!quiet) {
        message('Step 5 of 7: Merging gene and isoform expression...')
    }
    if (TRUE) {
        conditionList <-
            split(designMatrix$sampleID, f = designMatrix$condition)
        conditionSummary <-
            plyr::llply(
                .data = conditionList,
                .progress = progressBar,
                .fun = function(sampleVec) {
                    # sampleVec <- conditionList[[1]]
                    ### Isoform and IF
                    isoIndex <-
                        which(colnames(isoformRepExpression) %in% sampleVec)

                    isoSummary <- data.frame(
                        isoform_id       = isoformRepExpression$isoform_id,
                        iso_overall_mean = rowMeans(isoformRepExpression[,designMatrix$sampleID, drop=FALSE]),
                        iso_value        = rowMeans(isoformRepExpression[, isoIndex, drop=FALSE]),
                        iso_std          = apply(   isoformRepExpression[, isoIndex, drop=FALSE], 1, sd),
                        IF_overall       = rowMeans(isoformRepIF[,designMatrix$sampleID, drop=FALSE], na.rm = TRUE),
                        IF               = rowMeans(isoformRepIF[, isoIndex, drop=FALSE], na.rm = TRUE),
                        stringsAsFactors = FALSE
                    )
                    isoSummary$iso_stderr <-
                        isoSummary$iso_std / sqrt(length(sampleVec))
                    isoSummary$iso_std <- NULL

                    ### Gene
                    geneIndex <-
                        which(colnames(geneRepExpression) %in% sampleVec)

                    geneSummary <- data.frame(
                        gene_id = geneRepExpression$gene_id,
                        gene_overall_mean = rowMeans(geneRepExpression[,designMatrix$sampleID, drop=FALSE]),
                        gene_value = rowMeans(geneRepExpression[, geneIndex, drop=FALSE]),
                        gene_std = apply(geneRepExpression[, geneIndex, drop=FALSE], 1, sd),
                        stringsAsFactors = FALSE
                    )
                    geneSummary$gene_stderr <-
                        geneSummary$gene_std / sqrt(length(sampleVec))
                    geneSummary$gene_std <- NULL

                    ### Combine
                    combinedData <-
                        dplyr::inner_join(isoformAnnotation, geneSummary, by = 'gene_id')
                    combinedData <-
                        dplyr::inner_join(combinedData, isoSummary, by = 'isoform_id')
                    ### return result
                    return(combinedData)
                }
            )
    }

    ### Use comparisonsToMake to create the isoform comparisons
    if (!quiet) {
        message('Step 6 of 7: Making comparisons...')
    }
    if (TRUE) {
        isoAnnot <-
            plyr::ddply(
                .data = comparisonsToMake,
                .variables = c('condition_1', 'condition_2'),
                .drop = TRUE,
                .progress = progressBar,
                .fun = function(aDF) { # aDF <- comparisonsToMake[1,]
                    ### Extract data
                    cond1data <- conditionSummary[[aDF$condition_1]]
                    cond2data <- conditionSummary[[aDF$condition_2]]

                    ### modify colnames in condition 1
                    matchIndex <-
                        match(
                            c(
                                'gene_value',
                                'gene_stderr',
                                'iso_value',
                                'iso_stderr'
                            ),
                            colnames(cond1data)
                        )
                    colnames(cond1data)[matchIndex] <-
                        paste(colnames(cond1data)[matchIndex], '_1', sep = '')
                    colnames(cond1data)[which( colnames(cond1data) == 'IF')] <- 'IF1'

                    ### modify colnames in condition 2
                    matchIndex <-
                        match(
                            c(
                                'gene_value',
                                'gene_stderr',
                                'iso_value',
                                'iso_stderr'
                            ),
                            colnames(cond2data)
                        )
                    colnames(cond2data)[matchIndex] <-
                        paste(colnames(cond2data)[matchIndex], '_2', sep = '')
                    colnames(cond2data)[which( colnames(cond2data) == 'IF')] <- 'IF2'

                    combinedIso <- dplyr::inner_join(
                        cond1data,
                        cond2data[, c(
                            'isoform_id',
                            'gene_value_2',
                            'gene_stderr_2',
                            'iso_value_2',
                            'iso_stderr_2',
                            'IF2'
                        )],
                        by = 'isoform_id'
                    )

                    ### Add comparison data
                    combinedIso$condition_1 <- aDF$condition_1
                    combinedIso$condition_2 <- aDF$condition_2
                    return(combinedIso)
                }
            )

        ### Add comparison data
        # Log2FC
        ps <- foldChangePseudoCount

        isoAnnot$gene_log2_fold_change <-
            log2((isoAnnot$gene_value_2 + ps) / (isoAnnot$gene_value_1 + ps))
        isoAnnot$iso_log2_fold_change  <-
            log2((isoAnnot$iso_value_2  + ps) / (isoAnnot$iso_value_1  + ps))

        # qValues
        isoAnnot$gene_q_value <- NA
        isoAnnot$iso_q_value  <- NA

        # Isoform fraction values
        isoAnnot$dIF <- isoAnnot$IF2 - isoAnnot$IF1

        # Swich values
        isoAnnot$isoform_switch_q_value <- NA
        isoAnnot$gene_switch_q_value    <- NA

        ### Sort
        matchVector <-
            c(
                'isoform_id',
                'gene_id',
                'condition_1',
                'condition_2',
                'gene_name',
                'class_code',
                'gene_biotype',
                'iso_biotype',
                'gene_overall_mean',
                'gene_value_1',
                'gene_value_2',
                'gene_stderr_1',
                'gene_stderr_2',
                'gene_log2_fold_change',
                'gene_q_value',
                'iso_overall_mean',
                'iso_value_1',
                'iso_value_2',
                'iso_stderr_1',
                'iso_stderr_2',
                'iso_log2_fold_change',
                'iso_q_value',
                'IF_overall',
                'IF1',
                'IF2',
                'dIF',
                'isoform_switch_q_value',
                'gene_switch_q_value'
            )
        matchVector <-
            na.omit(match(matchVector, colnames(isoAnnot)))

        isoAnnot <- isoAnnot[, matchVector]
    }

    ### Create the swichList
    if (!quiet) {
        message('Step 7 of 7: Making switchAnalyzeRlist object...')
    }
    if (TRUE) {
        if( countsSuppled ) {
            ### Create switchList
            dfSwichList <- createSwitchAnalyzeRlist(
                isoformFeatures = isoAnnot,
                exons = isoformExonStructure,
                designMatrix = designMatrix,
                isoformCountMatrix = isoformCountMatrix,     # nessesary for drimseq
                isoformRepExpression = isoformRepExpression, # nessesary for limma
                sourceId = 'data.frames'
            )
        } else {
            ### Create switchList
            dfSwichList <- createSwitchAnalyzeRlist(
                isoformFeatures = isoAnnot,
                exons = isoformExonStructure,
                designMatrix = designMatrix,
                isoformRepExpression = isoformRepExpression, # nessesary for limma
                sourceId = 'data.frames'
            )
        }

        ### Add orf if extracted
        if (addAnnotatedORFs & gtfImported) {
            dfSwichList$isoformFeatures$PTC <-
                isoORF$PTC[match(dfSwichList$isoformFeatures$isoform_id,
                                 isoORF$isoform_id)]

            isoORF <-
                isoORF[which(isoORF$isoform_id %in%
                                 isoformRepExpression$isoform_id), ]

            dfSwichList$orfAnalysis <- isoORF
        }

        ### Add IF matrix if needed
        if( addIFmatrix ) {
            dfSwichList$isoformRepIF <- isoformRepIF[,c('isoform_id',designMatrix$sampleID)]
        }

        ### Add nucleotide sequence
        if(addIsoformNt) {
            dfSwichList$ntSequence <- isoformNtSeq[which(
                names(isoformNtSeq) %in% dfSwichList$isoformFeatures$isoform_id
            )]

        }

    }

    ### Estimate DTU
    if(estimateDifferentialGeneRange & !quiet) {
        localEstimate <- estimateDifferentialRange(dfSwichList)

        message('The GUESSTIMATED number of genes with differential isoform usage are:')
        print(localEstimate)
    }


    ### Return switchList
    if (!quiet) {
        message('Done\n')
    }
    return(dfSwichList)
}

### Supporting tximeta
prepareSalmonFileDataFrame <- function(
    ### Core arguments
    parentDir,

    ### Advanced arguments
    pattern='',
    invertPattern=FALSE,
    ignore.case=FALSE,
    quiet = FALSE
) {
    ### Initialize
    if(TRUE) {
        ### data.frame with nesseary info
        supportedTypes <- data.frame(
            orign          = c('Salmon'         ),
            fileName       = c('quant.sf'       ),
            eLengthCol     = c('EffectiveLength'),
            stringsAsFactors = FALSE
        )
        ### Add support for detection of compressed files
        supportedTypes2 <- supportedTypes
        supportedTypes2$fileName <- paste0(supportedTypes2$fileName, '.gz')
        supportedTypes <- rbind(
            supportedTypes,
            supportedTypes2
        )

        headerTypes <- list(
            Salmon = c('Name','Length','EffectiveLength','TPM','NumReads')
        )
    }

    ### Identify directories of interest
    if (TRUE) {
        dirList <- split(
            list.dirs(
                path = parentDir,
                full.names = FALSE,
                recursive = FALSE
            ),
            list.dirs(
                path = parentDir,
                full.names = FALSE,
                recursive = FALSE
            )
        )
        dirList <- dirList[which(sapply(dirList, nchar) > 0)]


        if(length(dirList) == 0) {
            stop('No subdirecories were found in the supplied folder. Please check and try again.')
        }

        ### Extract those where there are files of interest
        dirList <-
            dirList[sapply(
                dirList,
                FUN = function(aDir) {
                    # aDir <- dirList[[6]]
                    localFiles <-
                        list.files(
                            paste0(parentDir, '/', aDir),
                            recursive = FALSE
                        )

                    if(length( localFiles )) {
                        fileOfInterest <- any(
                            sapply(
                                paste(supportedTypes$fileName, '$', sep = ''),
                                function(aFileName) {
                                    grepl(pattern = aFileName, x =  localFiles)
                                })
                        )
                    } else{
                        fileOfInterest <- FALSE
                    }

                    return(fileOfInterest)
                }
            )]

        ### Remove hidden directories
        if( any( grepl('^\\.', names(dirList)) )  ) {
            nHidden <- sum( grepl('^\\.', names(dirList)) )
            nTotal <- length(dirList)
            warning(
                paste(
                    'The importIsoformExpression() function identified',
                    nHidden,
                    'hidden sub-directories',
                    paste0('(of a total ',nTotal,' sub-directories of interest)'),
                    '\nThese were identified as having the prefix "." and will be ignored.',
                    '\nIf you want to keep them you will have to re-name the sub-directories omitting the starting ".".',
                    sep=' '
                )
            )

            dirList <- dirList[which(
                ! grepl('^\\.', names(dirList))
            )]
        }

        if (length(dirList) == 0) {
            stop(
                paste(
                    'There were no directories containing the file names/suffixes',
                    'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                    'Have you renamed the quantification files?',
                    '(if so you should probably use the "sampleVector" argument instead).',
                    sep=' '
                )
            )
        }

    }

    ### Identify input type
    if(TRUE) {
        dataAnalyed <- supportedTypes[which(
            sapply(
                paste0(supportedTypes$fileName,'$'),
                function(aFileName) {
                    any(grepl(
                        pattern = aFileName,
                        x = list.files(paste0( parentDir, '/', dirList[[1]] ))
                    ))
                })
        ), ]

        if (nrow(dataAnalyed) == 0) {
            stop(
                paste(
                    'Could not identify any Salmon files.',
                    sep=' '
                )
            )
        }
        if (nrow(dataAnalyed) > 1) {
            stop(
                paste(
                    'Could not identify any Salmon files.',
                    sep=' '
                )
            )
        }
    }

    ### Make paths for tximport
    if(TRUE) {
        ### make vector with paths
        localFiles <- sapply(
            dirList,
            function(aDir) {
                list.files(
                    path = paste0( parentDir, '/', aDir, '/' ),
                    pattern = paste0(dataAnalyed$fileName, '$'),
                    full.names = TRUE
                )

            }
        )
        names(localFiles) <- names(dirList)

        ### Subset to those of interest
        if( invertPattern ) {
            localFiles <- localFiles[which(
                ! grepl(
                    pattern = pattern,
                    x = localFiles,
                    ignore.case=ignore.case
                )
            )]
        } else {
            localFiles <- localFiles[which(
                grepl(
                    pattern = pattern,
                    x = localFiles,
                    ignore.case=ignore.case
                )
            )]
        }

        if( length(localFiles) == 0 ) {
            stop('No files were left after filtering via the \'pattern\' argument')
        }

        if (!quiet) {
            message(
                paste0(
                    'Found ',
                    length(localFiles),
                    ' Salmon quantifications of interest'
                )
            )
        }

        ### Test existence
        if(TRUE) {
            fileTest <- file.exists(localFiles)

            if( !all(fileTest)) {
                stop(
                    paste0(
                        '\nSomething went wrong with the file-path creation. Please contact developer with reproducible example.',
                        '\n One file which did not work out was:\n ',
                        localFiles[which( ! fileTest) [1]],
                        sep=''
                    )
                )
            }
        }
    }

    ### Make data.frame
    if(TRUE) {
        dfData <- data.frame(
            files = localFiles,
            names = names(localFiles),
            condition = NA,
            row.names = NULL,
            stringsAsFactors = FALSE
        )
    }

    ### Final message
    if (!quiet) {
        message(
            'Adding NAs as conditions. Please modify these manually.'
        )
    }

    return(dfData)
}

importSalmonData <- function(
    ### Core arguments
    salmonFileDataFrame,

    ### Advanced arguments
    comparisonsToMake=NULL,
    ignoreAfterBar = TRUE,
    ignoreAfterSpace = TRUE,
    ignoreAfterPeriod = FALSE,
    showProgress = TRUE,
    quiet = FALSE,
    ...
) {
    ### Test input data.frame
    if(TRUE) {
        colsToHave <- c(
            "files" ,"names","condition"
        )
        if( ! all(colsToHave %in% colnames(salmonFileDataFrame)) ) {
            stop(
                paste(
                    'The \'salmonFileDataFrame\' needs to also contain these column(s)',
                    setdiff(colsToHave, colnames(salmonFileDataFrame))

                )
            )
        }

        if( any(is.na(salmonFileDataFrame$condition)) ) {
            stop('The \'condition\' column cannot contain NAs')
        }

        if( length(unique(salmonFileDataFrame$condition)) == 1) {
            stop('There appear to be only 1 condition annoatated in the \'condition\' column.')
        }
    }

    ### Use tximeta to import data and meta data
    if(TRUE) {
        ### Message
        if (!quiet) { message('Importing quantification data...') }
        suppressMessages(
            suppressWarnings(
                localSe <- tximeta(
                    coldata = salmonFileDataFrame,
                    countsFromAbundance = 'scaledTPM'
                )
            )
        )

        gtfPath <- metadata(localSe)$txomeInfo$gtf

        if (!quiet) { message('Importing annoation data...') }
        #suppressMessages(
        #    suppressWarnings(
        #        localSe <- addExons(localSe)
        #    )
        #)
        #suppressMessages(
        #    suppressWarnings(
        #        localSe <- addCDS(localSe)
        #    )
        #)
        suppressMessages(
            suppressWarnings(
                localNtSeq <- retrieveCDNA(localSe)
            )
        )
    }

    ### Massage imported data
    if(TRUE) {
        if (!quiet) { message('Massaging data...') }

        ### Count data
        if(TRUE) {
            localCm <-
                assay(localSe, "counts") %>%
                as.data.frame() %>%
                rownames_to_column('isoform_id')
        }

        ### NT sequence
        if(TRUE) {
            names(localNtSeq) <- fixNames(
                names(localNtSeq),
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )
        }

        ### Exon data
        if(FALSE) {
            localGr <- unlist(rowRanges(localSe))
            localGr$isoform_id <- names(localGr)
            localGr$exon_id <- NULL
            localGr$exon_rank <- NULL
            localGr$gene_id <- rowData(localSe)$gene_id[match(
                localGr$isoform_id, rowData(localSe)$tx_id
            )]
            localGr$gene_name <- rowData(localSe)$gene_name[match(
                localGr$isoform_id, rowData(localSe)$tx_id
            )]
            names(localGr) <- NULL
            localGr$isoform_id <- fixNames(
                localGr$isoform_id,
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )
        }

        ### Coding regions
        if(FALSE) {
            localCds <- unlist(rowData(localSe)$cds[which(rowData(localSe)$coding)])
            localCds$isoform_id <- names(localCds)
            localCds$exon_id <- NULL
            localCds$exon_rank <- NULL
            localCds$isoform_id <- fixNames(
                localCds$isoform_id,
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )

            ### Analyze CDS
            orfInfo <- analyseCds(
                myCDS = localCds,
                localExons = localGr,
                onlyConsiderFullORF = FALSE,
                PTCDistance = 50
            )

            # make sure all ORFs are annotated (with NAs)
            orfInfo <-
                dplyr::full_join(
                    orfInfo,
                    localCm[, 'isoform_id', drop = FALSE],
                    by = 'isoform_id',
                    all = TRUE
                )
        }

        ### Design
        if(TRUE) {
            localDesign <- data.frame(
                sampleID = salmonFileDataFrame$names,
                condition = salmonFileDataFrame$condition,
                stringsAsFactors = FALSE
            )
        }
    }

    ### Make switchAnalyzeRlist
    if(TRUE) {
        if (!quiet) { message('Making switchAnalyzeRlist...') }

        localSL <- importRdata(
            isoformCountMatrix = localCm,
            designMatrix = localDesign,
            #isoformExonAnnoation = localGr,
            isoformExonAnnoation = gtfPath,
            isoformNtFasta = NULL,
            comparisonsToMake=comparisonsToMake,
            addAnnotatedORFs=FALSE,
            ignoreAfterBar = ignoreAfterBar,
            ignoreAfterSpace = ignoreAfterSpace,
            ignoreAfterPeriod = ignoreAfterPeriod,
            showProgress=showProgress,
            quiet=quiet,
            ...
        )

    }

    ### Add extra annotation data
    if(TRUE) {
        localNtSeq <- localNtSeq[which(
            names(localNtSeq) %in% localSL$isoformFeatures$isoform_id
        )]
        if( ! all(localSL$isoformFeatures$isoform_id %in% names(localNtSeq)) ) {
            stop('Something went wrong with obtaining the nucleotide sequence. Please make sure you link fasta files as well.')
        }

        localSL$ntSequence <- localNtSeq

        #localSL$orfAnalysis <- orfInfo
    }

    ### Subset to ensure everything is alligned
    if(TRUE) {
        localSL <- subsetSwitchAnalyzeRlist(
            localSL,
            TRUE
        )
    }

    ### Return
    return(localSL)
}

### Prefilter
preFilter <- function(
    switchAnalyzeRlist,
    geneExpressionCutoff = 1,
    isoformExpressionCutoff = 0,
    IFcutoff = 0.01,
    acceptedGeneBiotype = NULL,
    acceptedIsoformClassCode = NULL,
    removeSingleIsoformGenes = TRUE,
    reduceToSwitchingGenes = FALSE,
    reduceFurtherToGenesWithConsequencePotential = FALSE,
    onlySigIsoforms = FALSE,
    keepIsoformInAllConditions = FALSE,
    alpha = 0.05,
    dIFcutoff = 0.1,
    quiet = FALSE
) {
    ### Test input
    if (TRUE) {
        if (class(switchAnalyzeRlist) != 'switchAnalyzeRlist') {
            stop(paste(
                'The object supplied to \'switchAnalyzeRlist\'',
                'must be a \'switchAnalyzeRlist\''
            ))
        }

        if( switchAnalyzeRlist$sourceId == 'gtf') {
            warning(
                paste(
                    'The switchAnalyzeRlist seems to be created from a gtf file',
                    'wereby expression is probably not annotated.',
                    'Running preFilter() might not be what you want.',
                    '\nIf expression info was manually added afterwards',
                    'please ignore this warning.',
                    sep=' '
                )
            )
        }

        if( switchAnalyzeRlist$sourceId == 'preDefinedSwitches') {
            if( all(is.na(switchAnalyzeRlist$isoformFeatures$IF_overall)) ) {
                stop('The switchAnalyzeRlist was created without expression data whereby it cannot be filtered')
            }
        }


        if (!is.null(isoformExpressionCutoff)) {
            if (!is.numeric(isoformExpressionCutoff)) {
                stop('The isoformExpressionCutoff argument must be a numeric')
            }
        }
        if (!is.null(geneExpressionCutoff)) {
            if (!is.numeric(geneExpressionCutoff)) {
                stop('The geneExpressionCutoff argument must be a numeric')
            }
        }
        if (!is.null(IFcutoff)) {
            if (!is.numeric(IFcutoff)) {
                stop('The IFcutoff argument must be a numeric')
            }
        }

        if (!is.null(acceptedIsoformClassCode)) {
            if (!'class_code' %in%
                colnames(switchAnalyzeRlist$isoformFeatures)) {
                stop(paste(
                    'The filter on class codes can only be used if',
                    'the switchAnalyzeRlist was generated from cufflinks data'
                ))
            }
        }

        if( !is.null(acceptedGeneBiotype) & 'gene_biotype' %in% colnames(switchAnalyzeRlist$isoformFeatures) ) {
            okBiotypes <- unique(switchAnalyzeRlist$isoformFeatures$gene_biotype)

            if( !all(acceptedGeneBiotype %in% okBiotypes) ) {
                notAnnot <- setdff(acceptedGeneBiotype, okBiotypes)

                warning(
                    paste(
                        'Some of the supplied biotypes are not found in the isoforms supplied and will be ignored\n',
                        'These are:', paste(notAnnot, collapse = ', ')
                    )
                )
            }
        }

        if (!is.logical(removeSingleIsoformGenes)) {
            stop('The removeSingleIsoformGenes must be either TRUE or FALSE')
        }

        if (reduceToSwitchingGenes) {
            hasTest <- any(!is.na(
                c(
                    switchAnalyzeRlist$isoformFeatures$isoform_switch_q_value,
                    switchAnalyzeRlist$isoformFeatures$gene_switch_q_value
                )
            ))
            if( ! hasTest) {
                stop(
                    paste(
                        'The switchAnalyzeRlist does not contain the result',
                        'of a switch analysis.\nPlease turn off','
                        the "reduceToSwitchingGenes" argument try again.',
                        sep=' '
                    )
                )
            }


            if (alpha < 0 |
                alpha > 1) {
                stop('The alpha parameter must be between 0 and 1 ([0,1]).')
            }
            if (alpha > 0.05) {
                warning(paste(
                    'Most journals and scientists consider an alpha larger',
                    'than 0.05 untrustworthy. We therefore recommend using',
                    'alpha values smaller than or queal to 0.05'
                ))
            }
        }
    }

    ### Find which isoforms to keep
    if (TRUE) {
        ### Extract data to filter on
        if(TRUE) {
            columnsToExtraxt <-
                c(
                    'iso_ref',
                    'gene_ref',
                    'isoform_id',
                    'gene_id',
                    'gene_biotype',
                    'class_code',
                    'gene_value_1',
                    'gene_value_2',
                    'iso_overall_mean',
                    'IF_overall',
                    'dIF',
                    'gene_switch_q_value',
                    'isoform_switch_q_value'
                )
            columnsToExtraxt <-
                na.omit(match(
                    columnsToExtraxt,
                    colnames(switchAnalyzeRlist$isoformFeature)
                ))
            #localData <- unique( switchAnalyzeRlist$isoformFeature[, columnsToExtraxt ] ) # no need
            localData <-
                switchAnalyzeRlist$isoformFeature[, columnsToExtraxt]

            # Count features
            transcriptCount <- length(unique(localData$isoform_id))
        }

        ### Reduce to genes with switches
        if (reduceToSwitchingGenes ) {
            if( reduceFurtherToGenesWithConsequencePotential ) {
                tmp <- extractSwitchPairs(
                    switchAnalyzeRlist,
                    alpha = alpha,
                    dIFcutoff = dIFcutoff,
                    onlySigIsoforms = onlySigIsoforms
                )
                deGenes <- unique(tmp$gene_ref)

            } else {
                isoResTest <-
                    any(!is.na(
                        localData$isoform_switch_q_value
                    ))
                if (isoResTest) {
                    deGenes <- localData$gene_ref[which(
                        localData$isoform_switch_q_value < alpha &
                            abs(localData$dIF) > dIFcutoff
                    )]
                } else {
                    deGenes <- localData$gene_ref[which(
                        localData$gene_switch_q_value < alpha &
                            abs(localData$dIF) > dIFcutoff
                    )]
                }
            }

            localData <- localData[which(
                localData$gene_ref %in% deGenes
            ),]

            if (!nrow(localData)) {
                stop('No genes were left after filtering for switching genes')
            }


        }



        if (!is.null(geneExpressionCutoff)) {
            localData <- localData[which(
                localData$gene_value_1 > geneExpressionCutoff &
                localData$gene_value_2 > geneExpressionCutoff
            ), ]
            if (!nrow(localData)) {
                stop('No genes were left after filtering for gene expression')
            }
        }

        if (!is.null(isoformExpressionCutoff)) {
            localData <- localData[which(
                localData$iso_overall_mean > isoformExpressionCutoff
            ), ]
            if (!nrow(localData)) {
                stop('No genes were left after filtering for isoform expression')
            }
        }

        if (!is.null(IFcutoff)) {
            localData <- localData[which(
                localData$IF_overall > IFcutoff
            ), ]
            if (!nrow(localData)) {
                stop('No genes were left after filtering for isoform fraction (IF) values')
            }
        }

        if (!is.null(acceptedGeneBiotype)) {
            if( 'gene_biotype' %in% colnames(localData) ) {
                localData <- localData[which(localData$gene_biotype %in% acceptedGeneBiotype), ]

                if (!nrow(localData)) {
                    stop('No genes were left after filtering for acceptedGeneBiotype.')
                }
            } else {
                warning('Gene biotypes were not annotated so the \'acceptedGeneBiotype\' argument was ignored.')
            }

        }

        if (!is.null(acceptedIsoformClassCode)) {
            localData <- localData[which(localData$class_code %in% acceptedIsoformClassCode), ]

            if (!nrow(localData)) {
                stop('No genes were left after filtering for isoform class codes')
            }
        }

        if (removeSingleIsoformGenes) {
            transcriptsPrGene <-
                split(localData$iso_ref, f = localData$gene_ref)
            transcriptsPrGene <- lapply(transcriptsPrGene, unique)

            genesToKeep <-
                names(transcriptsPrGene)[which(sapply(transcriptsPrGene, function(x)
                    length(x) > 1))]

            if (!length(genesToKeep)) {
                stop('No genes were left after filtering for mutlipe transcrip genes')
            }

            localData <-
                localData[which(localData$gene_ref %in% genesToKeep),]
        }

    }

    ### Do filtering
    if (keepIsoformInAllConditions) {
        switchAnalyzeRlist <- subsetSwitchAnalyzeRlist(
            switchAnalyzeRlist,
            switchAnalyzeRlist$isoformFeatures$isoform_id %in%
                localData$isoform_id
        )
    } else {
        switchAnalyzeRlist <- subsetSwitchAnalyzeRlist(
            switchAnalyzeRlist,
            switchAnalyzeRlist$isoformFeatures$iso_ref %in% localData$iso_ref
        )
    }

    ### Repport filtering
    transcriptsLeft <- length(unique(localData$isoform_id))
    transcriptsRemoved <- transcriptCount - transcriptsLeft
    percentRemoved <-
        round(transcriptsRemoved / transcriptCount, digits = 4) * 100

    if (!quiet) {
        message(
            paste(
                'The filtering removed ',
                transcriptsRemoved,
                ' ( ',
                percentRemoved,
                '% of ) transcripts. There is now ',
                transcriptsLeft,
                ' isoforms left',
                sep = ''
            )
        )
    }

    ### return result
    return(switchAnalyzeRlist)
}

Try the IsoformSwitchAnalyzeR package in your browser

Any scripts or data that you put into this service are public.

IsoformSwitchAnalyzeR documentation built on Nov. 8, 2020, 5:36 p.m.