IsoformSwitchAnalyzeR: Identify, Annotate and Visualize Isoform Switches with Functional Consequences from both short- and long-read RNA-seq data

Documented in importCufflinksFiles importGTF importIsoformExpression importRdata importSalmonData preFilter prepareSalmonFileDataFrame

### Acutal import functions
importCufflinksFiles <- function(
    ### Core arguments
    pathToGTF,
    pathToGeneDEanalysis,
    pathToIsoformDEanalysis,
    pathToGeneFPKMtracking,
    pathToIsoformFPKMtracking,
    pathToIsoformReadGroupTracking,
    pathToSplicingAnalysis = NULL,
    pathToReadGroups,
    pathToRunInfo,
    isoformNtFasta = NULL,

    ### Advanced arguments
    fixCufflinksAnnotationProblem = TRUE,
    addIFmatrix = TRUE,
    estimateDifferentialGeneRange = TRUE,
    quiet = FALSE
) {
    ### Test that files exist
    if (TRUE) {
        if( pathToGTF == '' ) {
            stop(
                paste(
                    'The \'pathToGTF\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                    '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                    'to import your own data? The system.file() should only be used',
                    'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                    'To access your own data simply provide the string to the directory with the data as:',
                    '"path/to/quantification/".',
                    sep=' '
                )
            )
        }
        if( pathToGeneDEanalysis == '' ) {
            stop(
                paste(
                    'The \'pathToGeneDEanalysis\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                    '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                    'to import your own data? The system.file() should only be used',
                    'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                    'To access your own data simply provide the string to the directory with the data as:',
                    '"path/to/quantification/".',
                    sep=' '
                )
            )
        }

        # pathToGTF
        if (!file.exists(pathToGTF)) {
            stop('The \'pathToGTF\' argument does not point to an acutal file')
        }
        if( !is.null(isoformNtFasta)) {
            if( !is.character( isoformNtFasta)) {
                stop('The \'isoformNtFasta\' argument must be a charachter string.')
            }

            if( any(isoformNtFasta == '') ) {
                stop(
                    paste(
                        'The \'isoformNtFasta\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the directory with the data as:',
                        '"path/to/quantification/".',
                        sep=' '
                    )
                )
            }

            if( any( ! sapply(isoformNtFasta, file.exists) ) ) {
                stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to exist.')
            }

            if( any(! grepl('\\.fa|\\.fasta|\\.fa.gz|\\.fasta.gz', isoformNtFasta)) ) {
                stop('The file pointed to via the \'isoformNtFasta\' argument does not seem to be a fasta file...')
            }
        }

        # DE
        if (!file.exists(pathToGeneDEanalysis))    {
            stop('The \'pathToGeneDEanalysis\' argument does not point to an acutal file')
        }
        if (!file.exists(pathToIsoformDEanalysis)) {
            stop('The \'pathToIsoformDEanalysis\' argument does not point to an acutal file')
        }
        # Tracking
        if (!file.exists(pathToGeneFPKMtracking))    {
            stop('The \'pathToGeneFPKMtracking\' argument does not point to an acutal file')
        }
        if (!file.exists(pathToIsoformFPKMtracking)) {
            stop(
                'The \'pathToIsoformFPKMtracking\' argument does not point to an acutal file'
            )
        }
        if (!file.exists(pathToIsoformReadGroupTracking)) {
            stop(
                'The \'pathToIsoformReadGroupTracking\' argument does not point to an acutal file'
            )
        }
        # splicing
        if (!is.null(pathToSplicingAnalysis)) {
            if (!file.exists(pathToSplicingAnalysis)) {
                stop(
                    'The \'pathToSplicingAnalysis\' argument does not point to an acutal file'
                )
            }
        }
        # info
        if (!file.exists(pathToReadGroups)) {
            stop('The \'pathToReadGroups\' argument does not point to an acutal file')
        }
        if (!file.exists(pathToRunInfo))    {
            stop('The \'pathToRunInfo\' argument does not point to an acutal file')
        }
    }

    ### Import the supplied files (not gtf)
    if (TRUE) {
        if (!quiet) { message('Step 1 of 5: Importing data...')}
        suppressMessages(
            geneDiffanalysis     <-
                readr::read_tsv(
                    file = pathToGeneDEanalysis,
                    col_names = TRUE
                )
        )
        suppressMessages(
            isoformDiffanalysis  <-
                readr::read_tsv(
                    file = pathToIsoformDEanalysis,
                    col_names = TRUE
                )
        )
        suppressMessages(
            geneAnnotation       <-
                readr::read_tsv(
                    file = pathToGeneFPKMtracking,
                    col_names = TRUE
                )
        )
        suppressMessages(
            isoformAnnotation    <-
                readr::read_tsv(
                    file = pathToIsoformFPKMtracking,
                    col_names = TRUE
                )
        )
        suppressMessages(
            isoRepExp        <-
                read.table(
                    file = pathToIsoformReadGroupTracking,
                    header = TRUE,
                    sep='\t',
                    stringsAsFactors = FALSE
                )
        )
        if( !is.null(pathToSplicingAnalysis) ) {
            suppressMessages(
                cuffSplicing         <-
                    readr::read_tsv(
                        file = pathToSplicingAnalysis,
                        col_names = TRUE
                    )
            )
        }

        suppressMessages(
            readGroup <-
                read.table(
                    file = pathToReadGroups,
                    sep='\t',
                    header = TRUE
                )
        )
        suppressMessages(
            runInfo   <-
                readr::read_tsv(
                    file = pathToRunInfo,
                    col_names = TRUE
                )
        )
    }

    ### "Test" that the data.files are what they are supposed to be
    if (TRUE) {
        ### gene diff analysis
        q1 <-
            !all(
                colnames(geneDiffanalysis) %in% c(
                    "test_id",
                    "gene_id",
                    "gene",
                    "locus",
                    "sample_1",
                    "sample_2",
                    "status",
                    "value_1",
                    "value_2",
                    "log2(fold_change)",
                    "test_stat",
                    "p_value",
                    "q_value",
                    "significant"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to pathToGeneDEanalysis does not appear',
                'to be the result of the CuffDiff gene expression analysis.'
            ))
        }
        ### transcript diff analysis
        q1 <-
            !all(
                colnames(isoformDiffanalysis) %in% c(
                    "test_id",
                    "gene_id",
                    "gene",
                    "locus",
                    "sample_1",
                    "sample_2",
                    "status",
                    "value_1",
                    "value_2",
                    "log2(fold_change)",
                    "test_stat",
                    "p_value",
                    "q_value",
                    "significant"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to isoformDiffanalysis does not appear to',
                'be the result of the CuffDiff transcript expression analysis.'
            ))
        }

        q2 <-
            sum(grepl(
                'TCONS', isoformDiffanalysis$test_id
            )) != nrow(isoformDiffanalysis)
        if (q2) {
            warning(paste(
                'It looks like you have NOT been doing transcript\n',
                'reconstruction/assembly with Cufflinks/Cuffdiff.\n',
                'If you have not reconstructed transcripts we receomend to use Kallisto or Salmon\n',
                'to do the quantification instead - they are more accurate and have better biase correction methods.'
            ))
        }



        ### gene annoation
        q1 <-
            !all(
                colnames(geneAnnotation)[1:8] %in% c(
                    "tracking_id",
                    "class_code",
                    "nearest_ref_id",
                    "gene_id",
                    "gene_short_name",
                    "tss_id",
                    "locus",
                    "length"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to geneAnnotation does not appear to be the',
                'gene FPKM traccking of the CuffDiff gene FPKM trascking analysis.'
            ))
        }
        ### transcript annoation
        q1 <-
            !all(
                colnames(isoformAnnotation)[1:8] %in% c(
                    "tracking_id",
                    "class_code",
                    "nearest_ref_id",
                    "gene_id",
                    "gene_short_name",
                    "tss_id",
                    "locus",
                    "length"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to isoformAnnotation does not appear to be',
                'the isoform FPKM tracking of the CuffDiff transcript analysis.'
            ))
        }

        ### rep expression
        q1 <-
            !all(
                colnames(isoRepExp)[1:4] %in% c(
                    "tracking_id", "condition", "replicate", "raw_frags"
                )
            )
        if (q1) {
            stop(paste(
                'The file supplied to pathToIsoformCountTracking does not',
                'appear to be the isoform count tracking of the CuffDiff',
                'transcript analysis.'
            ))
        }

        ### splicing analysis
        if( !is.null(pathToSplicingAnalysis) ) {
            q1 <-
                !all(
                    colnames(cuffSplicing) %in% c(
                        "test_id",
                        "gene_id",
                        "gene",
                        "locus",
                        "sample_1",
                        "sample_2",
                        "status",
                        "value_1",
                        "value_2",
                        "sqrt(JS)",
                        "test_stat",
                        "p_value",
                        "q_value",
                        "significant"
                    )
                )
            if (q1) {
                stop(
                    'The file supplied to cuffSplicing does not appear to be the',
                    'result of the CuffDiff differential analysis of alternative splicing.'
                )
            }

        }

        ### Read grous
        q1 <-
            !all(
                colnames(readGroup) %in% c(
                    "file",
                    "condition",
                    "replicate_num",
                    "total_mass",
                    "norm_mass",
                    "internal_scale",
                    "external_scale"
                )
            )
        q2 <-
            !all(readGroup$condition %in% unique(
                unlist(geneDiffanalysis[, c('sample_1', 'sample_2')]))
            )
        if (q1 | q2) {
            stop(paste(
                'The file supplied to readGroup does not appear to be the',
                'pathToReadGroups of the CuffDiff transcript analysis.'
            ))
        }

        ### Run info
        q1 <- !all(colnames(runInfo) %in% c("param", 'value'))
        q2 <-
            !all(
                runInfo$param %in% c(
                    "cmd_line",
                    "version",
                    "SVN_revision",
                    "boost_version"
                )
            )
        if (q1 | q2) {
            stop(paste(
                'The file supplied to runInfo does not appear to be',
                'the runInfo of the CuffDiff transcript analysis.'
            ))
        }

    }

    ### Massage and merge gene and isoform annoation and DE analysis
    if (TRUE) {
        if (!quiet) { message('Step 2 of 5: Merging gene and isoform expression...')}
        ### Design matrix
        readGroup$sample_name <-
            stringr::str_c(readGroup$condition, '_', readGroup$replicate_num)
        designMatrix <- readGroup[, c('sample_name', 'condition')]
        colnames(designMatrix) <- c('sampleID', 'condition')

        ### Massage data frames
        if (TRUE) {
            # gene
            geneDiffanalysis <-
                geneDiffanalysis[, -which(
                    colnames(geneDiffanalysis) %in%
                        c('test_id', 'gene', 'locus', 'test_stat')
                )]
            colnames(geneDiffanalysis)[4:ncol(geneDiffanalysis)] <-
                paste(
                    "gene_",
                    colnames(geneDiffanalysis)[4:ncol(geneDiffanalysis)] ,
                    sep = "") # add gene to the colnames so they can be destinquished from the gene diff data
            colnames(geneDiffanalysis) <- gsub(
                    'gene_log2.fold_change.',
                    'gene_log2_fold_change',
                    colnames(geneDiffanalysis)
                )

            # info
            colnames(isoformAnnotation)[1] <- 'isoform_id'
            isoformAnnotation2 <-
                isoformAnnotation[, na.omit(match(
                    c(
                        'isoform_id',
                        'gene_id',
                        'gene_short_name',
                        'nearest_ref_id',
                        'class_code',
                        'tss_id',
                        'CDS_id',
                        'length',
                        'locus'
                    ),
                    colnames(isoformAnnotation)
                ))]

            colnames(isoformAnnotation2)[which(
                colnames(isoformAnnotation2) == 'gene_short_name'
            )] <- 'gene_name'

            # iso
            isoformDiffanalysis <- isoformDiffanalysis[, which(
                !colnames(isoformDiffanalysis) %in%
                    c('gene_id', 'gene', 'test_stat')
            )]
            colnames(isoformDiffanalysis)[5:ncol(isoformDiffanalysis)] 	<-
                paste(
                    "iso_",
                    colnames(isoformDiffanalysis)[5:ncol(isoformDiffanalysis)],
                    sep = "") # add gene to the colnames so they can be destinquished from the gene diff data
            colnames(isoformDiffanalysis)[1] <- 'isoform_id'
            colnames(isoformDiffanalysis) <-
                gsub(
                    'iso_log2.fold_change.',
                    'iso_log2_fold_change',
                    colnames(isoformDiffanalysis)
                )

            # rep expression
            isoRepExp2 <-
                isoRepExp[, c("tracking_id",
                              "condition",
                              "replicate",
                              "raw_frags")]
            colnames(isoRepExp2)[1] <- 'isoform_id'
            isoRepExp2$rep <-
                paste(isoRepExp2$condition, isoRepExp2$replicate, sep = '_')
            isoRepExp2 <-
                reshape2::dcast(data = isoRepExp2,
                                isoform_id ~ rep,
                                value.var = 'raw_frags')


            ### rep fpkm
            # iso
            isoRepFpkm <- isoRepExp[, c(
                "tracking_id",
                "condition",
                "replicate",
                "FPKM"
            )]
            colnames(isoRepFpkm)[1] <- 'isoform_id'
            isoRepFpkm$rep <-
                paste(isoRepFpkm$condition, isoRepFpkm$replicate, sep = '_')
            isoRepFpkm <-
                reshape2::dcast(data = isoRepFpkm,
                                isoform_id ~ rep,
                                value.var = 'FPKM')

            ### Gene
            isoRepFpkm2 <- isoRepFpkm
            isoRepFpkm2$gene_id <- isoformAnnotation2$gene_id[match(
                isoRepFpkm2$isoform_id, isoformAnnotation2$isoform_id
            )]
            isoRepFpkm2$isoform_id <- NULL

            geneRepFpkm <- isoformToGeneExp(isoRepFpkm2, quiet = TRUE)

            ### Calculate means
            rownames(isoRepFpkm) <- isoRepFpkm$isoform_id
            isoRepFpkm$isoform_id <- NULL
            isoMean <- rowMeans(isoRepFpkm)

            rownames(geneRepFpkm) <- geneRepFpkm$gene_id
            geneRepFpkm$gene_id <- NULL
            geneMean <- rowMeans(geneRepFpkm)

            ### add means
            geneDiffanalysis$gene_overall_mean <- geneMean[match(
                geneDiffanalysis$gene_id, names(geneMean)
            )]

            isoformDiffanalysis$iso_overall_mean <- isoMean[match(
                isoformDiffanalysis$isoform_id, names(isoMean)
            )]

        }

        ### Extract standard error
        if (TRUE) {
            ### Tjek if the Isoform CI collums are switches
            ciLowColumn <-
                which(grepl('_conf_lo', colnames(isoformAnnotation)))[1]
            ciHighColumn <-
                which(grepl('_conf_hi', colnames(isoformAnnotation)))[1]

            if (all(isoformAnnotation[, ciHighColumn] >= isoformAnnotation[, ciLowColumn])) {
                highString <- '_conf_hi'
            } else {
                highString <- '_conf_lo'
            }

            ### extract isoform sddev from CI
            # fpkm
            isoformFPKM <-
                isoformAnnotation[, which(grepl(
                    'isoform_id|_FPKM', colnames(isoformAnnotation)
                ))]
            isoformFPKM <- reshape2::melt(isoformFPKM, id.vars = 'isoform_id')
            isoformFPKM$variable <-
                gsub('_FPKM$', '', isoformFPKM$variable)
            colnames(isoformFPKM)[3] <- 'expression'
            # ci high
            isoformFPKMciHi <-
                isoformAnnotation[, which(grepl(
                    paste('isoform_id|', highString, sep = ''),
                    colnames(isoformAnnotation)
                ))]
            isoformFPKMciHi <-
                reshape2::melt(isoformFPKMciHi, id.vars = 'isoform_id')
            isoformFPKMciHi$variable <-
                gsub(highString, '', isoformFPKMciHi$variable)
            colnames(isoformFPKMciHi)[3] <- 'ci_hi'
            # stderr
            isoformFPKMcombined <-
                dplyr::inner_join(isoformFPKM,
                      isoformFPKMciHi,
                      by = c('isoform_id', 'variable'))
            isoformFPKMcombined$iso_stderr <-
                (isoformFPKMcombined$ci_hi - isoformFPKMcombined$expression) / 2 # How it's done in cufflinks source code
            isoformFPKMcombined$expression <- NULL
            isoformFPKMcombined$ci_hi <- NULL
            colnames(isoformFPKMcombined) <-
                c('isoform_id', 'sample_name', 'iso_stderr')

            ### Tjek if the gene CI collums are switches
            ciLowColumn <-
                which(grepl('_conf_lo', colnames(geneAnnotation)))[1]
            ciHighColumn <-
                which(grepl('_conf_hi', colnames(geneAnnotation)))[1]

            if (all(
                geneAnnotation[, ciHighColumn] >= geneAnnotation[, ciLowColumn])
            ) {
                highString <- '_conf_hi'
            } else {
                highString <- '_conf_lo'
            }

            ### extract gene sddev from CI
            # fpkm
            geneFPKM <- geneAnnotation[, which(grepl(
                    'tracking_id|_FPKM', colnames(geneAnnotation)
                ))]
            geneFPKM <- reshape2::melt(geneFPKM, id.vars = 'tracking_id')
            geneFPKM$variable <-
                gsub('_FPKM$', '', geneFPKM$variable)
            colnames(geneFPKM)[3] <- 'expression'
            # ci high
            geneFPKMciHi <- geneAnnotation[, which(grepl(
                    paste('tracking_id|', highString, sep = ''),
                    colnames(geneAnnotation)
                ))]
            geneFPKMciHi <- reshape2::melt(geneFPKMciHi, id.vars = 'tracking_id')
            geneFPKMciHi$variable <- gsub(highString, '', geneFPKMciHi$variable)
            colnames(geneFPKMciHi)[3] <- 'ci_hi'
            # stderr
            geneFPKMcombined <-
                dplyr::inner_join(geneFPKM,
                      geneFPKMciHi,
                      by = c('tracking_id', 'variable'))
            geneFPKMcombined$iso_stderr <-
                (geneFPKMcombined$ci_hi - geneFPKMcombined$expression) / 2 # how it's done in cufflinks sourece code
            geneFPKMcombined$expression <- NULL
            geneFPKMcombined$ci_hi <- NULL
            colnames(geneFPKMcombined) <-
                c('gene_id', 'sample_name', 'gene_stderr')


            ## Merge stderr with DE analysis
            isoformDiffanalysis <- suppressWarnings( dplyr::inner_join(
                isoformDiffanalysis,
                isoformFPKMcombined,
                by=c("sample_2" = "sample_name", "isoform_id" = "isoform_id")
            ) )
            colnames(isoformDiffanalysis)[which( grepl(
                'iso_stderr', colnames(isoformDiffanalysis))
            )] <- 'iso_stderr_2'

            isoformDiffanalysis <- suppressWarnings( dplyr::inner_join(
                isoformDiffanalysis,
                isoformFPKMcombined,
                by=c("sample_2" = "sample_name", "isoform_id" = "isoform_id")
            ) )
            colnames(isoformDiffanalysis)[which(grepl(
                'iso_stderr$',
                colnames(isoformDiffanalysis),
                perl = TRUE
            ))] <- c('iso_stderr_1')

            isoformDiffanalysis <-
                isoformDiffanalysis[, c(
                    'isoform_id',
                    'sample_1',
                    'sample_2',
                    'iso_status',
                    'iso_overall_mean',
                    'iso_value_1',
                    'iso_value_2',
                    'iso_stderr_1',
                    'iso_stderr_2',
                    'iso_log2_fold_change',
                    'iso_p_value',
                    'iso_q_value',
                    'iso_significant'
                )]

            ### Extract and add gene stderr
            geneDiffanalysis <- suppressWarnings( dplyr::inner_join(
                geneDiffanalysis,
                geneFPKMcombined,
                by=c("sample_2" = "sample_name", "gene_id" = "gene_id")
            ) )
            colnames(geneDiffanalysis)[ which(grepl(
                'gene_stderr', colnames(geneDiffanalysis)
            ))] <- 'gene_stderr_2'

            geneDiffanalysis <- suppressWarnings( dplyr::inner_join(
                geneDiffanalysis,
                geneFPKMcombined,
                by=c("sample_1" = "sample_name", "gene_id" = "gene_id")
            ) )
            colnames(geneDiffanalysis)[which(grepl(
                'gene_stderr$',
                colnames(geneDiffanalysis),
                perl = TRUE
            ))] <- c('gene_stderr_1')

            geneDiffanalysis <-
                geneDiffanalysis[, c(
                    'gene_id',
                    'sample_1',
                    'sample_2',
                    'gene_status',
                    'gene_overall_mean',
                    'gene_value_1',
                    'gene_value_2',
                    'gene_stderr_1',
                    'gene_stderr_2',
                    'gene_log2_fold_change',
                    'gene_p_value',
                    'gene_q_value',
                    'gene_significant'
                )]

        }


        ### Merge data
        if (TRUE) {
            ### Meger gene DE and annotation data
            isoformData <-
                dplyr::inner_join(isoformAnnotation2, geneDiffanalysis, by = 'gene_id')

            ### Merge with iso DE
            isoformData <-
                dplyr::inner_join(
                    isoformData,
                    isoformDiffanalysis,
                    by = c('isoform_id', 'sample_1', 'sample_2')
                )

            ### Massage again
            colnames(isoformData)[which(
                colnames(isoformData) == 'tss_id'
            )] <- 'TSS_group_id'

        }

    }

    ### Obtain transcript structure information
    if (TRUE) {
        if (!quiet) { message('Step 3 of 5: Obtaining annotation...')}

        ### Import file
        tmp <- capture.output(
            suppressWarnings(
                suppressMessages(
                    exonFeatures <-  rtracklayer::import(pathToGTF, format = 'gtf')
                )
            )
        )
        if (length(exonFeatures) == 0)
            stop("No exon information extracted from GTF")

        ### Filter for what is needed
        exonFeatures <- exonFeatures[
            which(tolower(exonFeatures$type) == 'exon'),
            c('gene_id', 'transcript_id')
            ]

        ### rename
        colnames(exonFeatures@elementMetadata) <- gsub(
            'transcript_id', 'isoform_id',
            colnames(exonFeatures@elementMetadata)
        )
    }

    ### import nulceotide fasta file
    if(TRUE) {
        addIsoformNt <- FALSE

        if( !is.null(isoformNtFasta) ) {
            isoformNtSeq <- do.call(
                c,
                lapply(isoformNtFasta, function(aFile) {
                    Biostrings::readDNAStringSet(
                        filepath = isoformNtFasta, format = 'fasta'
                    )
                })
            )

            if(!is(isoformNtSeq, "DNAStringSet")) {
                stop('The fasta file supplied to \'isoformNtFasta\' does not contain the nucleotide (DNA) sequence...')
            }

            ### Remove preceeding ref|
            if(
                sum(grepl('^ref\\|', names(isoformNtSeq))) == length( isoformNtSeq )
            ) {
                names(isoformNtSeq) <- gsub('^ref\\|', '', names(isoformNtSeq))
            }

            ### Remove potential name duplication
            isoformNtSeq <- isoformNtSeq[which(
                ! duplicated(names(isoformNtSeq))
            )]

            if( ! all(isoformData$isoform_id %in% names(isoformNtSeq)) ) {
                warning(
                    paste(
                        'The fasta file supplied to \'isoformNtFasta\' does not contain the',
                        'nucleotide (DNA) sequence for all isoforms annotated and will not be added!',
                        '\nSpecifically:\n',
                        length(unique(isoformData$isoform_id)), 'isoforms were annotated in the GTF\n',
                        length(unique(names(isoformNtSeq))), 'isoforms have a sequence.\n',
                        'Only', length(intersect(names(isoformNtSeq), isoformData$isoform_id)), 'overlap.\n',
                        length(setdiff(unique(isoformData$isoform_id), names(isoformNtSeq))), 'annoated isoforms isoforms had no corresponding nucleotide sequence\n',

                        '\nIf there is no overlap (as in zero or close) there are two options:\n',
                        '1) The files do not fit together (different databases, versions etc)',
                        '(no fix except using propperly paired files).\n',
                        '2) It is somthing to do with how the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
                        '    3 Examples from GTF are :',
                        paste0( sample(unique(isoformData$isoform_id), min(3, nrow(isoformData)) ), collapse = ', '),'\n',
                        '    3 Examples of isoform sequence are  :',
                        paste0( sample(names(isoformNtSeq), min(3, length(isoformNtSeq)) ), collapse = ', '),'\n',


                        '\nIf there is a large overlap but still far from complete there are 3 possibilites:\n',
                        '1) The files do not fit together (different databases versions)',
                        '(no fix except using propperly paired files).\n',
                        '2) The isoforms quantified have their nucleotide sequence stored in multiple fasta files (common for Ensembl).',
                        'Just supply a vector with the path to each of them to the \'isoformNtFasta\' argument.\n',
                        '3) One file could contain non-chanonical chromosomes while the other do not',
                        '(might be solved using the \'removeNonConvensionalChr\' argument.)\n',
                        '4) It is somthing to do with how a subset of the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n\n',
                        sep = ' '
                    )
                )
            } else {
                addIsoformNt <- TRUE
            }

            ### Subset to annotated isoforms
            isoformNtSeq <- isoformNtSeq[which(
                names(isoformNtSeq) %in% isoformData$isoform_id
            )]
            #if( !length(isoformNtSeq) ) {
            #    addIsoformNt <- FALSE
            #}
        }
    }

    ### Check it is the same transcripts in transcript structure and expression info
    if (TRUE) {
        myUnion     <-
            unique(c(
                isoformData$isoform_id,
                exonFeatures$isoform_id,
                isoRepExp2$isoform_id
            ))
        myIntersect <- intersect(
            intersect(isoformData$isoform_id, exonFeatures$isoform_id),
            isoRepExp2$isoform_id
        )

        # If there are descripencies
        if(length(myIntersect) == 0) {
            stop(
                paste(
                    'No overlap between isoform annotation',
                    'and isoform expression data was found',
                    sep=' '
                )
            )
        }


        if (length(myUnion) != length(myIntersect)) {
            isoformData <- isoformData[which(
                isoformData$isoform_id     %in% myIntersect), ]
            exonFeatures <- exonFeatures[which(
                    exonFeatures$isoform_id    %in% myIntersect), ]
            isoRepExp2 <- isoRepExp2[which(
                isoRepExp2$isoform_id       %in% myIntersect), ]

            if (!quiet) {
                message(
                    paste(
                        'There were discrepencies between the GTF and the',
                        'expression analysis files. To solve this',
                        abs(length(myUnion) - length(myIntersect)) ,
                        'transcripts were removed.',
                        sep = ' '
                    )
                )
            }
        }
    }

    ### Fix to correct for Cufflinks annotation problem where cufflinks assignes
    # transcripts from several annotated genes to 1 cuffgene
    if (   fixCufflinksAnnotationProblem ) {
        if (!quiet) { message('Step 4 of 3: Fixing cufflinks annotation problem...')}

        geneName <- unique(isoformData[, c('gene_id', 'gene_name')])
        geneNameSplit <-
            split(geneName$gene_name , f = geneName$gene_id)
        # remove all unique
        geneNameSplit <-
            geneNameSplit[which(sapply(geneNameSplit, function(x)
                length(unique(x))) > 1)]

        if (length(geneNameSplit) > 0) {
            # if there are any problems
            #get indexes of those affected
            geneNameIndexesData     <-
                which(isoformData$gene_id %in% names(geneNameSplit))
            geneNameIndexesFeatures <-
                which(exonFeatures$spliceR.gene_id %in% names(geneNameSplit))

            # combine names of cuffgenes and gene short name
            isoformData$gene_id[geneNameIndexesData]              <-
                paste(isoformData$gene_id[geneNameIndexesData]        ,
                      isoformData$gene_name[geneNameIndexesData],
                      sep = ':')
            exonFeatures$spliceR.gene_id[geneNameIndexesFeatures] <-
                paste(exonFeatures$spliceR.gene_id[geneNameIndexesFeatures],
                      exonFeatures$spliceR.gene_name[geneNameIndexesFeatures],
                      sep = ':')

            ## Correct gene expression levels and differntial analysis
            problematicGenes <-
                isoformData[geneNameIndexesData, c(
                    'isoform_id',
                    'gene_id',
                    'sample_1',
                    'sample_2',
                    'gene_overall_mean',
                    'gene_value_1',
                    'gene_value_2',
                    'gene_stderr_1',
                    'gene_stderr_2',
                    'gene_log2_fold_change',
                    'gene_p_value',
                    'gene_q_value',
                    'gene_significant',
                    'iso_status',
                    'iso_overall_mean',
                    'iso_value_1',
                    'iso_value_2'
                )]
            problematicGenesSplit <-
                split(problematicGenes, f = problematicGenes[
                    ,c('gene_id', 'sample_1', 'sample_2')], drop =TRUE)

            correctedGenes <-
                plyr::ldply(
                    problematicGenesSplit,
                    .fun = function(df) {
                        # df <- problematicGenesSplit[[1]]
                        df$gene_overall_mean <- sum(df$iso_overall_mean)
                        df$gene_value_1 <- sum(df$iso_value_1)
                        df$gene_value_2 <- sum(df$iso_value_2)
                        df$gene_stderr_1 <- NA
                        df$gene_stderr_2 <- NA
                        df$gene_log2_fold_change <- log2(
                            (df$gene_value_2[2] + 0.0001) /
                                (df$gene_value_1[1] + 0.0001)
                        )
                        df$gene_p_value <- 1
                        df$gene_q_value <- 1
                        df$gene_significant <- 'no'
                        df$iso_status <- 'NOTEST'
                        return(df)
                    }
                )

            # sort so genes end up being in correct order for overwriting
            correctedGenes <-
                correctedGenes[order(
                    correctedGenes$isoform_id,
                    correctedGenes$gene_id,
                    correctedGenes$sample_1,
                    correctedGenes$sample_2
                ), -1] # -1 removes the index created by ldply
            # overwrite problematic genes
            isoformData[geneNameIndexesData, c(
                'gene_id',
                'sample_1',
                'sample_2',
                'gene_overall_mean',
                'gene_value_1',
                'gene_value_2',
                'gene_stderr_1',
                'gene_stderr_2',
                'gene_log2_fold_change',
                'gene_p_value',
                'gene_q_value',
                'gene_significant',
                'iso_status',
                'iso_overall_mean',
                'iso_value_1',
                'iso_value_2'
            )] <- correctedGenes[, -1] # -1 removes the isoform id


            ### Add to exons
            exonFeatures$gene_id <- isoformData$gene_id[match(
                exonFeatures$isoform_id, isoformData$isoform_id
            )]

            if (!quiet) {
                message(
                    paste(
                        "    Cufflinks annotation problem was fixed for",
                        length(geneNameSplit),
                        "Cuff_genes",
                        sep = ' '
                    )
                )
            }
        } else {
            if (!quiet) {
                message(paste(
                    'No instances of a Cufflinks annotation',
                    'problem found - no changes were made'
                ))
            }
        }
    } # end of fix cufflinks annotatopn problem
    if ( ! fixCufflinksAnnotationProblem ) {
        if (!quiet) { message('Step 4 of 5: Skipped fixing of cufflinks annotation problem (due to fixCufflinksAnnotationProblem argument)...')}
    }

    if (!quiet) { message('Step 5 of 5: Creating switchAanalyzeRlist...')}

    ### Calculate IF values
    localAnnot <- unique(isoformData[,c('gene_id','isoform_id')])
    ifMatrix <- isoformToIsoformFraction(
        isoformRepExpression = isoRepFpkm,
        isoformGeneAnnotation = localAnnot,
        quiet = TRUE
    )

    ### Summarize IF
    myMeanIF <- rowMeans(ifMatrix[,designMatrix$sampleID,drop=FALSE], na.rm = TRUE)
    ifMeanDf <- plyr::ddply(
        .data = designMatrix,
        .variables = 'condition',
        .fun = function(aDF) { # aDF <- switchAnalyzeRlist$designMatrix[1:2,]
            tmp <- rowMeans(ifMatrix[,aDF$sampleID,drop=FALSE], na.rm = TRUE)
            data.frame(
                isoform_id=names(tmp),
                mean=tmp,
                stringsAsFactors = FALSE
            )
        }
    )

    isoformData$IF_overall <- myMeanIF[match(
        isoformData$isoform_id, names(myMeanIF)
    )]
    isoformData$IF1 <- ifMeanDf$mean[match(
        stringr::str_c(isoformData$isoform_id,isoformData$sample_1),
        stringr::str_c(ifMeanDf$isoform_id, ifMeanDf$condition)
    )]
    isoformData$IF2 <- ifMeanDf$mean[match(
        stringr::str_c(isoformData$isoform_id,isoformData$sample_2),
        stringr::str_c(ifMeanDf$isoform_id, ifMeanDf$condition)
    )]
    isoformData$dIF <- isoformData$IF2 - isoformData$IF1

    ### Add q-values
    if (!is.null(pathToSplicingAnalysis)) {
        ### Add cufflinks analysis isoform switch analysis results
        isoformData$isoform_switch_q_value <- NA
        isoformData$gene_switch_q_value <-
            cuffSplicing$q_value[match(
                paste(
                    isoformData$gene_id,
                    isoformData$sample_1,
                    isoformData$sample_2,
                    sep = '_'
                ),
                paste(
                    cuffSplicing$gene_id,
                    cuffSplicing$sample_1,
                    cuffSplicing$sample_2,
                    sep = '_'
                )
            )]
    } else {
        ### Add collumns for isoform switch analysis results
        isoformData$isoform_switch_q_value <- NA
        isoformData$gene_switch_q_value <- NA
    }

    ### Reorder a bit
    ofInterest <- c('isoform_id','gene_id','gene_name','sample_1','sample_2')
    isoformData <- isoformData[, c(
        match( ofInterest, colnames(isoformData)),
        which( ! colnames(isoformData) %in% ofInterest)
    )]
    colnames(isoformData)[4:5] <- c('condition_1', 'condition_2')
    isoformData <- as.data.frame(isoformData)

    ### Extract run info
    # cufflinks version
    cuffVersion <- runInfo$value[2]

    # replicate numbers
    nrRep <- table(readGroup$condition)
    nrRep <-
        data.frame(
            condition = names(nrRep),
            nrReplicates = as.vector(nrRep),
            row.names = NULL,
            stringsAsFactors = FALSE
        )


    isoRepFpkm$isoform_id <- rownames(isoRepFpkm)
    rownames(isoRepFpkm) <- NULL
    isoRepFpkm <- isoRepFpkm[,c(
        which(colnames(isoRepFpkm) == 'isoform_id'),
        which(colnames(isoRepFpkm) != 'isoform_id')
    )]

    # Return SpliceRList
    switchAnalyzeRlist <- createSwitchAnalyzeRlist(
        isoformFeatures = isoformData,
        exons = exonFeatures,
        designMatrix = designMatrix,
        isoformCountMatrix = isoRepExp2,
        isoformRepExpression = isoRepFpkm,
        sourceId = paste("cufflinks", cuffVersion , sep = '_')
    )

    if (!is.null(pathToSplicingAnalysis)) {
        if( nrow(cuffSplicing) ) {
            switchAnalyzeRlist$isoformSwitchAnalysis <- as.data.frame(cuffSplicing)
        }
    }

    if( addIFmatrix ) {
        ifMatrix$isoform_id <- rownames(ifMatrix)
        rownames(ifMatrix) <- NULL
        ifMatrix <- ifMatrix[,c(
            which(colnames(ifMatrix) == 'isoform_id'),
            which(colnames(ifMatrix) != 'isoform_id')
        )]

        switchAnalyzeRlist$isoformRepIF <- ifMatrix
    }

    if(addIsoformNt) {
        switchAnalyzeRlist$ntSequence <- isoformNtSeq[which(
            names(isoformNtSeq) %in% switchAnalyzeRlist$isoformFeatures$isoform_id
        )]
    }

    ### Estimate DTU
    if(estimateDifferentialGeneRange & !quiet) {
        localEstimate <- estimateDifferentialRange(switchAnalyzeRlist)

        message('The GUESSTIMATED number of genes with differential isoform usage are:')
        print(localEstimate)
    }

    if (!quiet) {
        message("Done")
    }

    return(switchAnalyzeRlist)
}

importGTF <- function(
    ### Core arguments
    pathToGTF,
    isoformNtFasta = NULL,

    ### Advanced arguments
    extractAaSeq = FALSE,
    addAnnotatedORFs = TRUE,
    onlyConsiderFullORF = FALSE,
    removeNonConvensionalChr = FALSE,
    ignoreAfterBar = TRUE,
    ignoreAfterSpace = TRUE,
    ignoreAfterPeriod = FALSE,
    removeTECgenes = TRUE,
    PTCDistance = 50,
    removeFusionTranscripts = TRUE,
    removeUnstrandedTranscripts = TRUE,
    quiet = FALSE
) {
    ### Test input
    if(TRUE) {
        ### Test existance of files
        if(TRUE) {
            if( pathToGTF == '' ) {
                stop(
                    paste(
                        'The \'pathToGTF\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the directory with the data as:',
                        '"path/to/quantification/".',
                        sep=' '
                    )
                )
            }
            if( ! (file.exists(pathToGTF) | RCurl::url.exists(pathToGTF)) ) {
                stop(
                    paste(
                        'The file pointed to with the \'pathToGTF\' argument does not exists.',
                        '\nDid you accidentially make a spelling mistake or added a unwanted "/" infront of the text string?',
                        sep=' '
                    )
                )
            }

            if( !is.null(isoformNtFasta)) {
                if( !is.character( isoformNtFasta)) {
                    stop('The \'isoformNtFasta\' argument must be a charachter string.')
                }

                if( any(isoformNtFasta == '') ) {
                    stop(
                        paste(
                            'The \'isoformNtFasta\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                            '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                            'to import your own data? The system.file() should only be used',
                            'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                            'To access your own data simply provide the string to the directory with the data as:',
                            '"path/to/quantification/".',
                            sep=' '
                        )
                    )
                }

                if( any( ! sapply(isoformNtFasta, file.exists) ) ) {
                    stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to exist.')
                }

                if( any(! grepl('\\.fa|\\.fasta|\\.fa.gz|\\.fasta.gz', isoformNtFasta)) ) {
                    stop('The file pointed to via the \'isoformNtFasta\' argument does not seem to be a fasta file...')
                }
            }


        }


        if( ! grepl('\\.gtf$|\\.gtf\\.gz$|\\.gff$|\\.gff\\.gz$|\\.gff3$|\\.gff3\\.gz$', pathToGTF, ignore.case = TRUE) ) {
            warning('The file pointed to by the "pathToGTF" argument appearts not to be a GTF/GFF file as it does have the right suffix - are you sure it is the rigth file?')
        }

        isGTF <- grepl('\\.gtf$|\\.gtf\\.gz$', pathToGTF, ignore.case = TRUE)
    }

    # Read in from GTF/GFF file
    if(TRUE) {
        if(   isGTF ) {
            if (!quiet) {
                message('Importing GTF (this may take a while)...')
            }
            tmp <- capture.output(
                suppressWarnings(
                    suppressMessages(
                      if(onlyConsiderFullORF){
                        mfGTF <- rtracklayer::import(pathToGTF, format='gtf', feature.type = c('CDS','exon','start_codon','stop_codon'))
                      } else {
                        mfGTF <- rtracklayer::import(pathToGTF, format='gtf', feature.type = c('CDS','exon'))
                      }
                    )
                )
            )
            ### Check GTF
            if (!all(c('transcript_id', 'gene_id') %in% colnames(mfGTF@elementMetadata))) {
                collumnsMissing <- paste(
                    c('transcript_id', 'gene_id')[which(
                        !c('transcript_id', 'gene_id') %in%
                            colnames(mfGTF@elementMetadata)
                    )], collapse = ', ')
                stop(
                    paste(
                        'The GTF file must contain the folliwing collumns',
                        '\'transcript_id\' and \'gene_id\'.',
                        collumnsMissing,
                        'is missing.',
                        sep = ' '
                    )
                )
            }
        }
        if( ! isGTF ){
            if (!quiet) {
                message('importing GFF (this may take a while)...')
            }
            tmp <- capture.output(
                suppressWarnings(
                    suppressMessages(
                        mfGTF <- rtracklayer::import(pathToGTF, format='gff')
                    )
                )
            )

            ### Check GTF
            geneIdPressent <- 'gene_id' %in% colnames(mfGTF@elementMetadata)

            if( ! geneIdPressent ) {
                ### Check for RefSeq "gene" (aka gene_id)
                if (!all(c('transcript_id', 'gene') %in% colnames(mfGTF@elementMetadata) )) {
                    collumnsMissing <- paste(
                        c('transcript_id', 'gene')[which(
                            !c('transcript_id', 'gene') %in%
                                colnames(mfGTF@elementMetadata)
                        )], collapse = ', ')
                    stop(
                        paste(
                            'The GFF file must contain the folliwing collumns',
                            '\'transcript_id\' and \'gene\'.',
                            collumnsMissing,
                            'is missing.',
                            sep = ' '
                        )
                    )
                }

                ### Rename RefSeq gene to gene_id
                if( ! 'gene_id' %in% colnames(mcols(mfGTF)) ) {
                    if( 'gene' %in% colnames(mcols(mfGTF)) ) {
                        colnames(mcols(mfGTF))[which(
                            colnames(mcols(mfGTF)) == 'gene'
                        )] <- 'gene_id'
                    } else {
                        stop('Could not locate the gene id in the gff file.')
                    }
                }
            }
            if(   geneIdPressent ) {
                stop(
                    paste0(
                        'This is not a RefSeq GFF file (from ftp://ftp.ncbi.nlm.nih.gov/genomes/).',
                        '\nIsoformSwitchAnalyzeR only handles RefSeq GFF files so please supply GTF file instead.',
                        '\n(for more info see FAQ about annotate databases in vignette).'
                    )
                )
            }


        }
    }

    ### Reduce if nessesary
    if (removeNonConvensionalChr) {
        mfGTF <- mfGTF[which( ! grepl('_'  , as.character(mfGTF@seqnames))), ]
        mfGTF <- mfGTF[which( ! grepl('\\.', as.character(mfGTF@seqnames))), ]

        if (length(mfGTF) == 0) {
            stop('No exons were left after filtering',
                 'with \'removeNonConvensionalChr\'.')
        }

        seqlevels(mfGTF) <- as.character(mfGTF@seqnames@values)
    }

    if (removeUnstrandedTranscripts) {
        mfGTF <- mfGTF[which( ! grepl('\\*' , as.character(mfGTF@strand))), ]

        if (length(mfGTF) == 0) {
            stop('No exons were left after filtering',
                 'with \'removeUnstrandedTranscripts\'.')
        }
    }

    if( removeTECgenes ) {
        if(isGTF) {
            ### Ensembl
            if( 'gene_biotype' %in% colnames(mcols(mfGTF)) ) {
                toExclude <-  mfGTF$gene_biotype == 'TEC'
                if( any( toExclude ) ) {
                    mfGTF <- mfGTF[-which( toExclude ),]
                }
            }
            ### Gencode
            if( 'gene_type' %in% colnames(mcols(mfGTF)) ) {
                toExclude <-  mfGTF$gene_type == 'TEC'
                if( any( toExclude ) ) {
                    mfGTF <- mfGTF[-which( toExclude ),]
                }

            }
        }
    }

    ### Ensure seqlevels are ok are removal
    seqlevels(mfGTF) <- unique(as.character(mfGTF@seqnames@values))

    ### Potentially add version numbering
    if( TRUE ) {
        if(any( colnames(mcols(mfGTF)) == 'gene_version' )) {
            mfGTF$gene_id <- stringr::str_c(
                mfGTF$gene_id,
                '.',
                mfGTF$gene_version
            )
        }
        if(any( colnames(mcols(mfGTF)) == 'transcript_version' )) {
            mfGTF$transcript_id <- stringr::str_c(
                mfGTF$transcript_id,
                '.',
                mfGTF$transcript_version
            )
        }
    }

    ### Fix names
    if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {

        mfGTF$transcript_id <- fixNames(
            nameVec = mfGTF$transcript_id,
            ignoreAfterBar = ignoreAfterBar,
            ignoreAfterSpace = ignoreAfterSpace,
            ignoreAfterPeriod = ignoreAfterPeriod
        )
        
        mfGTF$gene_id <- fixNames(
          nameVec = mfGTF$gene_id,
          ignoreAfterBar = ignoreAfterBar,
          ignoreAfterSpace = ignoreAfterSpace,
          ignoreAfterPeriod = ignoreAfterPeriod
        )

    }

    ### Make annoation
    if(TRUE) {
        if(   isGTF ) {
            if (!quiet) {
                message('Massaging annoation...')
            }
            exonAnoationIndex <- which(mfGTF$type == 'exon')

            colsToExtract <- c(
                'transcript_id', 'gene_id', 'gene_name',
                'gene_type','gene_biotype',     # respectively gencode and ensembl gene type col
                'transcript_biotype','transcript_type',
                'ref_gene_id' # for StringTie data
            )
            myIso <-
                as.data.frame(unique(mfGTF@elementMetadata[
                    exonAnoationIndex,
                    na.omit(match(colsToExtract, colnames(mfGTF@elementMetadata)))]
                ))

            ### Handle columns not extracted
            if (is.null(myIso$gene_name)) {
                myIso$gene_name <- NA
            }
            if (is.null(myIso$ref_gene_id)) {
                myIso$ref_gene_id <- myIso$gene_name
            }

            ### Handle columns with multiple options
            geneTypeCol <- which(colnames(myIso) %in% c('gene_type','gene_biotype'))
            if( length(geneTypeCol) == 0 ) {
                myIso$geneType <- NA
            } else {
                myIso$geneType <- myIso[,geneTypeCol[1]]
            }

            isoTypeCol <- which(colnames(myIso) %in% c('transcript_biotype','transcript_type'))
            if( length(isoTypeCol) == 0 ) {
                myIso$isoType <- NA
            } else {
                myIso$isoType <- myIso[,isoTypeCol]
            }

        }
        if( ! isGTF ) {
            if (!quiet) {
                message('converting GFF to switchAnalyzeRlist')
            }
            exonAnoationIndex <- which(mfGTF$type == 'exon')
            colsToExtract <- c(
                'Parent',
                'gene_id',
                'transcript_id'
            )

            # extract exon annot
            myIso <-
                as.data.frame(unique(mfGTF@elementMetadata[
                    exonAnoationIndex,
                    na.omit(match(colsToExtract, colnames(mfGTF@elementMetadata)))]
                ))

            if(any(is.na(myIso$transcript_id))) {
                nNa <- sum(is.na(myIso$transcript_id))

                warning(
                    paste(
                        'There were', nNa, 'annotated features without isoform_ids.',
                        'These were removed.'
                    )
                )

                myIso <- myIso[which(
                    ! is.na(myIso$transcript_id)
                ),]
            }

            # extract gene biotype
            if( 'gene_biotype' %in% colnames(mcols(mfGTF)) ) {
                gffGeneAnnot <- mfGTF[which( mfGTF$type == 'gene' ),c(
                    'ID',
                    'gene_id',
                    'gene_biotype'
                )]

                myIso$gene_biotype    <- gffGeneAnnot$gene_biotype[match(myIso$gene_id, gffGeneAnnot$gene_id)]
            }


            ### Handle columns not extracted
            if (is.null(myIso$gene_name)) {
                myIso$gene_name <- NA
            }

            ### Handle columns with multiple options
            geneTypeCol <- which(colnames(myIso) %in% c('gene_type','gene_biotype'))
            if( length(geneTypeCol) == 0 ) {
                myIso$geneType <- NA
            } else {
                myIso$geneType <- myIso[,geneTypeCol[1]]
            }

            isoTypeCol <- which(colnames(myIso) %in% c('transcript_biotype','transcript_type'))
            if( length(isoTypeCol) == 0 ) {
                myIso$isoType <- NA
            } else {
                myIso$isoType <- myIso[,isoTypeCol]
            }
        }


        ### Make annotation
        myIsoAnot <- data.frame(
            isoform_id = myIso$transcript_id,
            gene_id = myIso$gene_id,
            condition_1 = "plaseholder1",
            condition_2 = "plaseholder2",
            gene_name = myIso$gene_name,
            ref_gene_id = myIso$ref_gene_id,
            gene_biotype = myIso$geneType,
            iso_biotype = myIso$isoType,
            class_code = '=',
            gene_overall_mean = 0,
            gene_value_1 = 0,
            gene_value_2 = 0,
            gene_stderr_1 = NA,
            gene_stderr_2 = NA,
            gene_log2_fold_change = 0,
            gene_p_value = 1,
            gene_q_value = 1,
            iso_overall_mean = 0,
            iso_value_1 = 0,
            iso_value_2 = 0,
            iso_stderr_1 = NA,
            iso_stderr_2 = NA,
            iso_log2_fold_change = 0,
            iso_p_value = 1,
            iso_q_value = 1,
            IF_overall = NA,
            IF1 = NA,
            IF2 = NA,
            dIF = NA,
            isoform_switch_q_value = NA,
            gene_switch_q_value = NA,
            stringsAsFactors = FALSE
        )

    }

    ### Test for annoation problems
    if(TRUE) {
        geneSummary <-
            myIsoAnot %>%
            dplyr::select(gene_id, gene_name) %>%
            dplyr::distinct() %>%
            group_by(gene_id) %>%
            dplyr::summarise(
                n_gene_names = length(na.omit(gene_name)),
                have_missing_gene_name = any(is.na(gene_name))
            )

        missingGeneProblem <- any(
            geneSummary$n_gene_names > 0 & geneSummary$have_missing_gene_name
        )
        mergedGeneProblem <- any(
            geneSummary$n_gene_names > 1
        )

        if( missingGeneProblem | mergedGeneProblem ) {
            warning(
                paste0(
                    '\nThe annotaion seems to have probelems that commonly occure',
                    '\n  when transcript assembly is done (gene merging and unassigned novel isoforms).',
                    '\n  These can be fixed and/or rescued by using the importRdata() function instead.',
                    '\n'
                )
            )
        }
    }

    ### Add CDS annoation from GTF file inc convertion to transcript coordinats
    if (addAnnotatedORFs) {

        if(   isGTF ) {
            # test whether any CDS are found
            if (any(mfGTF$type == 'CDS')) {
                if (!quiet) {
                    message('Massaging annotated CDSs...')
                }

                ### Prepare CDS data
                myCDS <-
                    mfGTF[which(mfGTF$type == 'CDS'), 'transcript_id']
                colnames(mcols(myCDS)) <- 'isoform_id'

                ### Prepare exon data
                localExons <- mfGTF[which(mfGTF$type == 'exon'), 'transcript_id']
                colnames(mcols(localExons)) <- 'isoform_id'
                localExons <-
                    localExons[which(
                        as.character(localExons@strand) %in% c('+', '-')), ]
                localExons <-
                    localExons[which(
                        localExons$isoform_id %in% myCDS$isoform_id
                ), ]

                ### Analyze CDS
                orfInfo <- analyseCds(
                    myCDS = myCDS,
                    localExons = localExons,
                    onlyConsiderFullORF = onlyConsiderFullORF,
                    mfGTF = mfGTF,
                    PTCDistance = PTCDistance
                )

                # make sure all ORFs are annotated (with NAs)
                orfInfo <-
                    dplyr::full_join(
                        orfInfo,
                        unique(myIsoAnot[, 'isoform_id', drop = FALSE]),
                        by = 'isoform_id'
                    )

            } else {
                # if no CDS were found
                warning(paste(
                    'No CDS was found in the GTF file. Please make sure the GTF',
                    'file have the CDS "feature" annotation. Adding NAs instead'
                ))

                orfInfo <- data.frame(
                    isoform_id = unique(myIsoAnot$isoform_id),
                    orfTransciptStart = NA,
                    orfTransciptEnd = NA,
                    orfTransciptLength = NA,
                    orfStarExon = NA,
                    orfEndExon = NA,
                    orfStartGenomic = NA,
                    orfEndGenomic = NA,
                    stopDistanceToLastJunction = NA,
                    stopIndex = NA,
                    PTC = NA,
                    stringsAsFactors = FALSE
                )
            }

            ### add to iso annotation
            myIsoAnot$PTC <-
                orfInfo$PTC[match(myIsoAnot$isoform_id, orfInfo$isoform_id)]


        }
        if( ! isGTF ) {
            # test whether any CDS are found
            if (any(mfGTF$type == 'CDS')) {
                if (!quiet) {
                    message('converting annotated CDSs')
                }

                ### Extract CDS Ids
                colsToExtract <- c(
                    'ID',
                    'gene_id',
                    'transcript_id'
                )
                myIso2 <-
                    as.data.frame(unique(mfGTF@elementMetadata[
                        which(mfGTF$type == 'mRNA'),
                        na.omit(match(colsToExtract, colnames(mfGTF@elementMetadata)))]
                    ))

                ### Extract CDS
                myCDS <- mfGTF[which(mfGTF$type == 'CDS'),c('ID','transcript_id','Parent')]
                myCDS$Parent <- sapply(myCDS$Parent, function(x) x[1])

                ### Transfer ids
                myCDS <- myCDS[which(
                    myCDS$Parent %in% myIso2$ID
                ),]
                myCDS$transcript_id <- myIso2$transcript_id[match(
                    myCDS$Parent, myIso2$ID
                )]
                myCDS <- myCDS[which(
                    myCDS$transcript_id %in% myIsoAnot$isoform_id
                ),]

                ### Get it
                myCDS <-
                    sort(myCDS[,'transcript_id'])
                myCDSedges <-
                    suppressMessages(unlist(range(
                        split(myCDS[, 0], f = myCDS$transcript_id)
                    )))  # Extract EDGEs
                myCDSedges$id <- names(myCDSedges)
                names(myCDSedges) <- NULL

                ### Extract Exons
                localExons <- mfGTF[exonAnoationIndex, 'transcript_id']
                localExons <-
                    localExons[which(
                        as.character(localExons@strand) %in% c('+', '-')), ]
                localExons <-
                    localExons[which(localExons$transcript_id %in% myCDSedges$id), ]

                localExons <-
                    localExons[order(localExons$transcript_id,
                                     start(localExons),
                                     end(localExons)), ]
                localExons$exon_id <-
                    paste('exon_', 1:length(localExons), sep = '')

                ### This is where I can remove the stop codon

                ### Extract strand specific ORF info
                cds <- as.data.frame(myCDSedges)
                # start
                plusIndex <- which(cds$strand == '+')
                annoatedStartGRangesPlus <-
                    GRanges(
                        cds$seqnames[plusIndex],
                        IRanges(
                            start = cds$start[plusIndex],
                            end = cds$start[plusIndex] #-3 # -3 since stop codon is included in GFF according to https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
                        ),
                        strand = cds$strand[plusIndex],
                        id = cds$id[plusIndex]
                    )
                minusIndex <- which(cds$strand == '-')
                annoatedStartGRangesMinus <-
                    GRanges(
                        cds$seqnames[minusIndex],
                        IRanges(
                            start = cds$end[minusIndex],
                            end = cds$end[minusIndex] #+3 # +3 since stop codon is included in GFF according to https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
                        ),
                        strand = cds$strand[minusIndex],
                        id = cds$id[minusIndex]
                    )

                annoatedStartGRanges <-
                    c(annoatedStartGRangesPlus,
                      annoatedStartGRangesMinus)
                annoatedStartGRanges$orf_id <-
                    paste('cds_', 1:length(annoatedStartGRanges), sep = '')

                # end
                annoatedEndGRangesPlus  <-
                    GRanges(
                        cds$seqnames[plusIndex],
                        IRanges(
                            start = cds$end[plusIndex],
                            end = cds$end[plusIndex]),
                        strand = cds$strand[plusIndex],
                        id = cds$id[plusIndex]
                    )
                annoatedEndGRangesMinus <-
                    GRanges(
                        cds$seqnames[minusIndex],
                        IRanges(
                            start = cds$start[minusIndex],
                            end = cds$start[minusIndex]),
                        strand = cds$strand[minusIndex],
                        id = cds$id[minusIndex]
                    )

                annoatedEndGRanges <-
                    c(annoatedEndGRangesPlus, annoatedEndGRangesMinus)
                annoatedEndGRanges$orf_id <-
                    paste('stop_', 1:length(annoatedEndGRanges), sep = '')

                # combine
                annotatedORFGR <-
                    c(annoatedStartGRanges, annoatedEndGRanges)


                ### Idenetify overlapping CDS and exons as well as the annoate transcript id
                suppressWarnings(overlappingAnnotStart <-
                                     as.data.frame(
                                         findOverlaps(
                                             query = localExons,
                                             subject = annotatedORFGR,
                                             ignore.strand = FALSE
                                         )
                                     ))
                if (!nrow(overlappingAnnotStart)) {
                    stop(
                        'No overlap between CDS and transcripts were found. This is most likely due to a annoation problem around chromosome name.'
                    )
                }

                # Annoate overlap ids
                overlappingAnnotStart$transcript_id <-
                    localExons$transcript_id[overlappingAnnotStart$queryHits]
                overlappingAnnotStart$exon_id <- localExons$exon_id[
                    overlappingAnnotStart$queryHits
                    ]

                overlappingAnnotStart$cdsTranscriptID <- annotatedORFGR$id[
                    overlappingAnnotStart$subjectHits
                    ]
                overlappingAnnotStart$orf_id <- annotatedORFGR$orf_id[
                    overlappingAnnotStart$subjectHits
                    ]

                # subset to annoateted overlap
                overlappingAnnotStart <-
                    overlappingAnnotStart[which(
                        overlappingAnnotStart$transcript_id ==
                            overlappingAnnotStart$cdsTranscriptID
                    ), c('transcript_id',
                         'exon_id',
                         'cdsTranscriptID',
                         'orf_id')]

                # annoate with genomic site
                overlappingAnnotStart$orfGenomic <-
                    start(annotatedORFGR)[match(
                        overlappingAnnotStart$orf_id, annotatedORFGR$orf_id
                    )]


                ### Enrich exon information
                myExons <-
                    as.data.frame(localExons[which(
                        localExons$transcript_id %in%
                            overlappingAnnotStart$transcript_id),])

                # Strand
                myExonPlus <- myExons[which(myExons$strand == '+'), ]
                myExonMinus <- myExons[which(myExons$strand == '-'), ]

                plusSplit <-
                    split(myExonPlus$width, myExonPlus$transcript_id)
                minusSplit <-
                    split(myExonMinus$width, myExonMinus$transcript_id)

                # cumsum
                myExonPlus$cumSum <-
                    unlist(sapply(plusSplit , function(aVec) {
                        cumsum(c(0, aVec))[1:(length(aVec))]
                    }))
                myExonMinus$cumSum <-
                    unlist(sapply(minusSplit, function(aVec) {
                        cumsum(c(0, rev(aVec)))[(length(aVec)):1] # reverse
                    }))

                # exon number
                myExonPlus$nrExon <-
                    unlist(sapply(plusSplit, function(aVec) {
                        1:length(aVec)
                    }))
                myExonMinus$nrExon <-
                    unlist(sapply(minusSplit, function(aVec) {
                        1:length(aVec)
                    }))

                # total nr exons
                myExonPlus$lastExonIndex <-
                    unlist(sapply(plusSplit, function(aVec) {
                        rep(length(aVec), length(aVec))
                    }))
                myExonMinus$lastExonIndex <-
                    unlist(sapply(minusSplit, function(aVec) {
                        rep(1, length(aVec))
                    }))

                # final exon exon junction trancipt position
                myExonPlus$finalJunctionPos <-
                    unlist(sapply(plusSplit , function(aVec) {
                        rep(cumsum(c(0, aVec))[length(aVec)], times = length(aVec))
                    }))
                myExonMinus$finalJunctionPos <-
                    unlist(sapply(minusSplit, function(aVec) {
                        rep(cumsum(c(0, rev(
                            aVec
                        )))[length(aVec)], times = length(aVec))
                    }))

                myExons2 <- rbind(myExonPlus, myExonMinus)

                ### Annoate with exon information
                matchIndex <-
                    match(overlappingAnnotStart$exon_id, myExons2$exon_id)
                overlappingAnnotStart$strand <- myExons2$strand[matchIndex]
                overlappingAnnotStart$exon_start <- myExons2$start[matchIndex]
                overlappingAnnotStart$exon_end <- myExons2$end[matchIndex]
                overlappingAnnotStart$exon_cumsum <- myExons2$cumSum[matchIndex]
                overlappingAnnotStart$exon_nr <- myExons2$nrExon[matchIndex]
                overlappingAnnotStart$lastExonIndex <-
                    myExons2$lastExonIndex[matchIndex]
                overlappingAnnotStart$finalJunctionPos <-
                    myExons2$finalJunctionPos[matchIndex]

                ### Annoate with transcript coordinats
                overlappingAnnotStartPlus <-
                    overlappingAnnotStart[which(
                        overlappingAnnotStart$strand == '+'), ]
                overlappingAnnotStartPlus$orfTranscript <-
                    overlappingAnnotStartPlus$exon_cumsum + (
                        overlappingAnnotStartPlus$orfGenomic -
                            overlappingAnnotStartPlus$exon_start
                    ) + 1
                overlappingAnnotStartPlus$junctionDistance <-
                    overlappingAnnotStartPlus$finalJunctionPos -
                    overlappingAnnotStartPlus$orfTranscript + 3 # +3 because the ORF does not include the stop codon - but it should in this calculation

                overlappingAnnotStartMinus <-
                    overlappingAnnotStart[which(
                        overlappingAnnotStart$strand == '-'), ]
                overlappingAnnotStartMinus$orfTranscript <-
                    overlappingAnnotStartMinus$exon_cumsum + (
                        overlappingAnnotStartMinus$exon_end -
                            overlappingAnnotStartMinus$orfGenomic
                    ) + 1
                overlappingAnnotStartMinus$junctionDistance <-
                    overlappingAnnotStartMinus$finalJunctionPos -
                    overlappingAnnotStartMinus$orfTranscript + 3 # +3 because the ORF does not include the stop codon - but it should in this calculation

                overlappingAnnotStart2 <-
                    rbind(overlappingAnnotStartPlus,
                          overlappingAnnotStartMinus)
                overlappingAnnotStart2 <-
                    overlappingAnnotStart2[order(
                        overlappingAnnotStart2$transcript_id,
                        overlappingAnnotStart2$exon_start,
                        overlappingAnnotStart2$exon_end
                    ), ]

                ### devide into start and stop
                starInfo <-
                    overlappingAnnotStart2[which(
                        grepl('^cds', overlappingAnnotStart2$orf_id)), ]
                stopInfo <-
                    overlappingAnnotStart2[which(
                        grepl('^stop', overlappingAnnotStart2$orf_id)), ]

                ### predict PTC
                stopInfo$PTC <-
                    stopInfo$exon_nr != stopInfo$lastExonIndex &
                    stopInfo$junctionDistance > PTCDistance

                ### Merge the data
                starInfo2 <-
                    unique(starInfo[, c('transcript_id',
                                        'orfGenomic',
                                        'exon_nr',
                                        'orfTranscript')])
                colnames(starInfo2) <-
                    c('isoform_id',
                      'orfStartGenomic',
                      'orfStarExon',
                      'orfTransciptStart')

                stopInfo2 <-
                    unique(stopInfo[, c(
                        'transcript_id',
                        'orfGenomic',
                        'exon_nr',
                        'orfTranscript',
                        'junctionDistance',
                        'lastExonIndex',
                        'PTC'
                    )])
                colnames(stopInfo2) <-
                    c(
                        'isoform_id',
                        'orfEndGenomic',
                        'orfEndExon',
                        'orfTransciptEnd',
                        'stopDistanceToLastJunction',
                        'stopIndex',
                        'PTC'
                    )

                orfInfo <- dplyr::inner_join(starInfo2, stopInfo2, by = 'isoform_id')
                orfInfo$orfTransciptLength  <-
                    orfInfo$orfTransciptEnd - orfInfo$orfTransciptStart + 1

                # reorder
                orfInfo <-
                    orfInfo[, c(
                        'isoform_id',
                        'orfTransciptStart',
                        'orfTransciptEnd',
                        'orfTransciptLength',
                        'orfStarExon',
                        'orfEndExon',
                        'orfStartGenomic',
                        'orfEndGenomic',
                        'stopDistanceToLastJunction',
                        'stopIndex',
                        'PTC'
                    )]

                # make sure all ORFs are annotated (with NAs)
                orfInfo <-
                    dplyr::full_join(orfInfo,
                                     unique(myIsoAnot[, 'isoform_id', drop = FALSE]),
                                     by = 'isoform_id')

            } else {
                # if no CDS were found
                warning(paste(
                    'No CDS was found in the GTF file. Please make sure the GTF',
                    'file have the CDS "feature" annotation. Adding NAs instead'
                ))

                orfInfo <- data.frame(
                    isoform_id = unique(myIsoAnot$isoform_id),
                    orfTransciptStart = NA,
                    orfTransciptEnd = NA,
                    orfTransciptLength = NA,
                    orfStarExon = NA,
                    orfEndExon = NA,
                    orfStartGenomic = NA,
                    orfEndGenomic = NA,
                    stopDistanceToLastJunction = NA,
                    stopIndex = NA,
                    PTC = NA,
                    stringsAsFactors = FALSE
                )
            }

            ### add to iso annotation
            myIsoAnot$PTC <-
                orfInfo$PTC[match(myIsoAnot$isoform_id, orfInfo$isoform_id)]

        }
    }

    ### Handle sequence input
    if(TRUE) {
        addIsoformNt <- FALSE

        if( !is.null(isoformNtFasta) ) {
            isoformNtSeq <- do.call(
                c,
                lapply(isoformNtFasta, function(aFile) {
                    Biostrings::readDNAStringSet(
                        filepath = isoformNtFasta, format = 'fasta'
                    )
                })
            )

            if(!is(isoformNtSeq, "DNAStringSet")) {
                stop('The fasta file supplied to \'isoformNtFasta\' does not contain the nucleotide (DNA) sequence...')
            }

            ### Remove preceeding ref|
            if(
                sum(grepl('^ref\\|', names(isoformNtSeq))) == length( isoformNtSeq )
            ) {
                names(isoformNtSeq) <- gsub('^ref\\|', '', names(isoformNtSeq))
            }

            ### Remove potential name duplication
            isoformNtSeq <- isoformNtSeq[which(
                ! duplicated(names(isoformNtSeq))
            )]

            ### Fix names
            if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {

                names(isoformNtSeq) <- fixNames(
                    nameVec = names(isoformNtSeq),
                    ignoreAfterBar = ignoreAfterBar,
                    ignoreAfterSpace = ignoreAfterSpace,
                    ignoreAfterPeriod = ignoreAfterPeriod
                )
            }

            ### Subset to those in GTF file
            isoformNtSeq <- isoformNtSeq[which(
                names(isoformNtSeq) %in% myIsoAnot$isoform_id
            )]

            if( ! all(myIsoAnot$isoform_id %in% names(isoformNtSeq)) ) {
                warning(
                    paste(
                        'The fasta file supplied to \'isoformNtFasta\' does not contain the',
                        'nucleotide (DNA) sequence for all isoforms annotated and will not be added!',
                        '\nSpecifically:\n',
                        length(unique(myIsoAnot$isoform_id)), 'isoforms were annotated in the GTF\n',
                        length(unique(names(isoformNtSeq))), 'isoforms have a sequence.\n',
                        'Only', length(intersect(names(isoformNtSeq), myIsoAnot$isoform_id)), 'overlap.\n',
                        length(setdiff(unique(myIsoAnot$isoform_id), names(isoformNtSeq))), 'annoated isoforms isoforms had no corresponding nucleotide sequence\n',

                        '\nIf there is no overlap (as in zero or close) there are two options:\n',
                        '1) The files do not fit together (different databases, versions etc)',
                        '(no fix except using propperly paired files).\n',
                        '2) It is somthing to do with how the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
                        '    3 Examples from GTF are :',
                        paste0( sample(unique(myIsoAnot$isoform_id), min(3, nrow(myIsoAnot)) ), collapse = ', '),'\n',
                        '    3 Examples of isoform sequence are  :',
                        paste0( sample(names(isoformNtSeq), min(3, length(isoformNtSeq)) ), collapse = ', '),'\n',


                        '\nIf there is a large overlap but still far from complete there are 3 possibilites:\n',
                        '1) The files do not fit together (different databases versions)',
                        '(no fix except using propperly paired files).\n',
                        '2) The isoforms quantified have their nucleotide sequence stored in multiple fasta files (common for Ensembl).',
                        'Just supply a vector with the path to each of them to the \'isoformNtFasta\' argument.\n',
                        '3) One file could contain non-chanonical chromosomes while the other do not',
                        '(might be solved using the \'removeNonConvensionalChr\' argument.)\n',
                        '4) It is somthing to do with how a subset of the isoform ids are stored in the different files.',
                        'This problem might be solvable using some of the',
                        '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n\n',
                        sep = ' '
                    )
                )
            } else {
                addIsoformNt <- TRUE
            }

            ### Subset to annotated isoforms
            isoformNtSeq <- isoformNtSeq[which(
                names(isoformNtSeq) %in% myIsoAnot$isoform_id
            )]
            #if( length(isoformNtSeq) ) {
            #    addIsoformNt <- FALSE
            #}
        }
    }

    ### Create exon_features grange
    myExons <-
        sort(mfGTF[exonAnoationIndex , c('transcript_id', 'gene_id')])
    colnames(myExons@elementMetadata) <- c('isoform_id', 'gene_id')
    myExons <- myExons[which(
        myExons$isoform_id %in% myIsoAnot$isoform_id
    ),]

    # Collaps ajecent exons (without any intron between)
    if(TRUE) {
        ### Reduce ajecent exons
        tmp <- unlist(
            GenomicRanges::reduce(
                split(
                    myExons,
                    myExons$isoform_id
                )
            )
        )
        ### Add isoform id
        tmp$isoform_id <- tmp@ranges@NAMES
        tmp@ranges@NAMES <- NULL

        ### add gene id
        tmp$gene_id <-myExons$gene_id[match(
            tmp$isoform_id, myExons$isoform_id
        )]

        ### sort
        tmp <- tmp[sort.list(tmp$isoform_id),]

        ### Overwrite
        myExons <- tmp
    }

    # create replicates
    nrRep <-
        data.frame(
            condition = c('plaseholder1', 'plaseholder2'),
            nrReplicates = c(NA, NA),
            row.names = NULL,
            stringsAsFactors = FALSE
        )

    # create dummy feature
    repExp <- data.frame(
        isoform_id = myIsoAnot$isoform_id,
        plaseholder1 = NA,
        plaseholder2 = NA,
        stringsAsFactors = FALSE
    )

    designMatrix <-
        data.frame(
            sampleID = c('plaseholder1', 'plaseholder2'),
            condition = c('plaseholder1', 'plaseholder2'),
            stringsAsFactors = FALSE
        )

    ### Create switchList
    if (!quiet) {
        message('Creating switchAnalyzeRlist...')
    }
    localSwichList <- createSwitchAnalyzeRlist(
        isoformFeatures = myIsoAnot,
        exons = myExons,
        designMatrix = designMatrix,
        isoformCountMatrix = repExp,
        removeFusionTranscripts = removeFusionTranscripts,
        sourceId = 'gtf'
    )

    if(addAnnotatedORFs) {
        # subset to those in list
        orfInfo <-
            orfInfo[which(orfInfo$isoform_id %in%
                              localSwichList$isoformFeatures$isoform_id), ]

        # Annotate ORF origin
        orfInfo$orf_origin <- 'Annotation'

        # check for negative ORF lengths
        isoformsToRemove <-
            orfInfo$isoform_id[which(orfInfo$orfTransciptLength < 0)]
        if (length(isoformsToRemove)) {
            genesToRemove <-
                localSwichList$isoformFeatures$gene_id[which(
                    localSwichList$isoformFeatures$isoform_id %in%
                        isoformsToRemove)]
            localSwichList <-
                subsetSwitchAnalyzeRlist(
                    localSwichList,
                    !localSwichList$isoformFeatures$gene_id %in% genesToRemove
                )

            warning(
                paste(
                    length(genesToRemove),
                    'genes where removed due to negative ORF lengths. This',
                    'typically occures because gene_id are not unique',
                    '(meaning are found multiple places accorss the genome).',
                    'Please note there might still be duplicated gene_id',
                    'located on the same chromosome.',
                    sep = ' '
                )
            )
        }
        localSwichList$orfAnalysis <- orfInfo[which(
            orfInfo$isoform_id %in% localSwichList$isoformFeatures$isoform_id)
            ,]

    }

    ### Add nucleotide sequence
    if(addIsoformNt) {
        localSwichList$ntSequence <- isoformNtSeq[which(
            names(isoformNtSeq) %in% localSwichList$isoformFeatures$isoform_id
        )]

        if(addAnnotatedORFs & extractAaSeq) {
            localSwichList <- extractSequence(
                switchAnalyzeRlist = localSwichList,
                onlySwitchingGenes = FALSE,
                writeToFile = FALSE,
                extractNTseq = TRUE,
                extractAAseq = TRUE,
                addToSwitchAnalyzeRlist = TRUE
            )
        }
    }


    # Return switchAnalyzeRlist
    if (!quiet) {
        message('Done.')
    }
    return(localSwichList)
}

importIsoformExpression <- function(
    ### Core arguments
    parentDir = NULL,
    sampleVector = NULL,

    ### Advanced arguments
    calculateCountsFromAbundance=TRUE,
    addIsofomIdAsColumn=TRUE,
    interLibNormTxPM=TRUE,
    normalizationMethod='TMM',
    pattern='',
    invertPattern=FALSE,
    ignore.case=FALSE,
    ignoreAfterBar = TRUE,
    ignoreAfterSpace = TRUE,
    ignoreAfterPeriod = FALSE,
    readLength = NULL,
    showProgress = TRUE,
    quiet = FALSE
) {
    ### To do
    # Could get a "summarize to gene level" option

    ### Test
    if(TRUE) {
        if( all(c( is.null(parentDir), is.null(sampleVector) )) ) {
            stop('Either the \'parentDir\' or the \'sampleVector\' argument must be used.')
        }
        if( !is.null(parentDir) & !is.null(sampleVector) ) {
            stop('Only one of the the \'parentDir\' and \'sampleVector\' argument can be used.')
        }

        inputIsDir <- ! is.null(parentDir)

        if(   inputIsDir ) {
            if( !is.character(parentDir) ) {
                stop('The user should supply a sting to the \'parentDir\' argument.')
            }
            if( parentDir == '' ) {
                stop(
                    paste(
                        'The \'parentDir\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the directory with the data as:',
                        '"path/to/quantification/".',
                        sep=' '
                    )
                )
            }
            if( ! dir.exists(parentDir) ) {
                if( file_test("-f", parentDir) ) {
                    stop(
                        paste(
                            'The file pointed to with the \'parentDir\' argument seems to be a file (not a directory).',
                            'Did you mean to use the \'sampleVector\' argument?',
                            '\nType "?importIsoformExpression" for more information.',
                            sep=' '
                        )
                    )
                }
                stop(
                    paste(
                        'The directory pointed to with the \'parentDir\' argument does not exists.',
                        '\nDid you accidentially make a spelling mistake or added a unwanted "/" infront of the text string?',
                        sep=' '
                    )
                )
            }

        }
        if( ! inputIsDir ) {
            if( !is.character(sampleVector) ) {
                stop('The user should supply a sting to the \'sampleVector\' argument.')
            }
            if( '' %in% sampleVector ) {
                stop(
                    paste(
                        'The \'sampleVector\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
                        '\nDid you try to use the system.file("your/quant/dir/quant.file", package="IsoformSwitchAnalyzeR")',
                        'to import your own data? The system.file() should only be used',
                        'to access the example data stored in the IsoformSwitchAnalyzeR package.',
                        'To access your own data simply provide the string to the files with the data as:',
                        '"path/to/quantification/quantification.file".',
                        sep=' '
                    )
                )
            }
            if( ! all(file.exists(sampleVector))  ) {
                stop(
                    paste(
                        'One or more of the files pointed to with the \'sampleVector\' argument does not exists.',
                        '\nDid you accidentially make a spelling mistake or added a unwanted "/" infront of the text string?',
                        sep=' '
                    )
                )
            }
            if( ! all(file_test("-f", sampleVector)) ) {
                stop(
                    paste(
                        'One or more of the files pointed to with the \'sampleVector\' argument seems to be a directory.',
                        'Did you mean to use the \'parentDir\' argument?',
                        '\nType "?importIsoformExpression" for more information.',
                        sep=' '
                    )
                )
            }
        }

    }

    ### Initialize
    if(TRUE) {
        if( !normalizationMethod %in% c("TMM", "RLE", "upperquartile") ){
            stop('Metod supplied to "normalizationMethod" must be one of "TMM", "RLE" or "upperquartile". See documentation of edgeR::calcNormFactors for more info')
        }

        analysisCount <- 2 +
            as.integer(interLibNormTxPM)

        if (showProgress &  !quiet) {
            progressBar <- 'text'
            progressBarLogic <- TRUE
        } else {
            progressBar <- 'none'
            progressBarLogic <- FALSE
        }

        ### data.frame with nesseary info
        supportedTypes <- data.frame(
            orign          = c('Kallisto'       , 'Salmon'         , 'RSEM'            , 'StringTie'  ),
            fileName       = c('abundance.tsv'  , 'quant.sf'       , 'isoforms.results', 't_data.ctab'),
            eLengthCol     = c('eff_length'     , 'EffectiveLength', 'effective_length', ''),
            stringsAsFactors = FALSE
        )
        ### Add support for detection of compressed files
        supportedTypes2 <- supportedTypes
        supportedTypes2$fileName <- paste0(supportedTypes2$fileName, '.gz')
        supportedTypes <- rbind(
            supportedTypes,
            supportedTypes2
        )

        headerTypes <- list(
            Kallisto = c('target_id','length','eff_length','est_counts','tpm'),
            Salmon = c('Name','Length','EffectiveLength','TPM','NumReads'),
            RSEM = c('transcript_id','gene_id','length','effective_length','expected_count','TPM','FPKM','IsoPct'),
            StringTie = c('t_id','chr','strand','start','end','t_name','num_exons','length','gene_id','gene_name','cov','FPKM')
        )
    }

    ### Handle directory input
    if(inputIsDir) {
        ### Identify directories of interest
        if (TRUE) {
            if (!quiet) {
                message('Step 1 of ', analysisCount, ': Identifying which algorithm was used...')
            }
            dirList <- split(
                list.dirs(
                    path = parentDir,
                    full.names = FALSE,
                    recursive = FALSE
                ),
                list.dirs(
                    path = parentDir,
                    full.names = FALSE,
                    recursive = FALSE
                )
            )
            dirList <- dirList[which(sapply(dirList, nchar) > 0)]


            if(length(dirList) == 0) {
                stop('No subdirecories were found in the supplied folder. Please check and try again.')
            }

            ### Extract those where there are files of interest
            dirList <-
                dirList[sapply(
                    dirList,
                    FUN = function(aDir) {
                        # aDir <- dirList[[6]]
                        localFiles <-
                            list.files(
                                paste0(parentDir, '/', aDir),
                                recursive = FALSE
                            )

                        if(length( localFiles )) {
                            fileOfInterest <- any(
                                sapply(
                                    paste(supportedTypes$fileName, '$', sep = ''),
                                    function(aFileName) {
                                        grepl(pattern = aFileName, x =  localFiles)
                                    })
                                )
                        } else{
                            fileOfInterest <- FALSE
                        }

                        return(fileOfInterest)
                    }
                )]

            ### Remove hidden directories
            if( any( grepl('^\\.', names(dirList)) )  ) {
                nHidden <- sum( grepl('^\\.', names(dirList)) )
                nTotal <- length(dirList)
                warning(
                    paste(
                        'The importIsoformExpression() function identified',
                        nHidden,
                        'hidden sub-directories',
                        paste0('(of a total ',nTotal,' sub-directories of interest)'),
                        '\nThese were identified as having the prefix "." and will be ignored.',
                        '\nIf you want to keep them you will have to re-name the sub-directories omitting the starting ".".',
                        sep=' '
                    )
                )

                dirList <- dirList[which(
                    ! grepl('^\\.', names(dirList))
                )]
            }

            if (length(dirList) == 0) {
                stop(
                    paste(
                        'There were no directories containing the file names/suffixes',
                        'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                        'Have you renamed the quantification files?',
                        '(if so you should probably use the "sampleVector" argument instead).',
                        sep=' '
                    )
                )
            }

        }

        ### Identify input type
        if(TRUE) {
            dataAnalyed <- supportedTypes[which(
                sapply(
                    paste0(supportedTypes$fileName,'$'),
                    function(aFileName) {
                        any(grepl(
                            pattern = aFileName,
                            x = list.files(paste0( parentDir, '/', dirList[[1]] ))
                        ))
                    })
            ), ]

            if (nrow(dataAnalyed) == 0) {
                stop(
                    paste(
                        'Could not identify any files with the names/suffixes',
                        'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                        'Have you renamed the quantification files?',
                        '(if so you should use the "sampleVector" argument instead).',
                        sep=' '
                    )
                )
            }
            if (nrow(dataAnalyed) > 1) {
                stop(
                    paste(
                        'Could not uniquely identify file type.',
                        'Does the subdirectory contain results from multiple different tools?',
                        'If so you should use the "sampleVector" argument instead.',
                        'If not please contact developer.'
                    )
                )
            }
            if (!quiet) {
                message(paste('    The quantification algorithm used was:', dataAnalyed$orign, sep = ' '))
            }

            if( dataAnalyed$orign == 'StringTie' & is.null(readLength)) {
                stop(paste(
                    'When importing StringTie results the \'readLength\' argument',
                    'must be specified.\n',
                    'This argument must be set to the number of base pairs sequenced',
                    '(e.g. if the \n quantified data is 75 bp paired ends \'readLength\' should be set to 75.'
                ))
            }

        }

        ### Make paths for tximport
        if(TRUE) {
            ### make vector with paths
            localFiles <- sapply(
                dirList,
                function(aDir) {
                    list.files(
                        path = paste0( parentDir, '/', aDir, '/' ),
                        pattern = paste0(dataAnalyed$fileName, '$'),
                        full.names = TRUE
                    )

                }
            )
            names(localFiles) <- names(dirList)

            ### Subset to those of interest
            if( invertPattern ) {
                localFiles <- localFiles[which(
                    ! grepl(
                        pattern = pattern,
                        x = localFiles,
                        ignore.case=ignore.case
                    )
                )]
            } else {
                localFiles <- localFiles[which(
                    grepl(
                        pattern = pattern,
                        x = localFiles,
                        ignore.case=ignore.case
                    )
                )]
            }

            if( length(localFiles) == 0 ) {
                stop('No files were left after filtering via the \'pattern\' argument')
            }

            if (!quiet) {
                message(
                    paste0(
                        '    Found ',
                        length(localFiles),
                        ' quantification file(s) of interest'
                    )
                )
            }

            ### Test existence
            if(TRUE) {
                fileTest <- file.exists(localFiles)

                if( !all(fileTest)) {
                    stop(
                        paste0(
                            '\nSomething went wrong with the file-path creation. Please contact developer with reproducible example.',
                            '\n One file which did not work out was:\n ',
                            localFiles[which( ! fileTest) [1]],
                            sep=''
                        )
                    )
                }
            }
        }
    }

    ### Handle file input
    if( ! inputIsDir ) {
        if (!quiet) {
            message('Step 1 of ', analysisCount, ': Identifying which algorithm was used...')
        }

        ### Identify input type
        if(TRUE) {
            dataAnalyedList <- plyr::llply(sampleVector, function(aFilePath) {
                suppressMessages(
                    sampleFile <- readr::read_tsv(
                        aFilePath, col_names = TRUE, n_max = 2
                    )
                )

                localRes <- data.frame(
                    orign = names(headerTypes)[which(
                        sapply(headerTypes, function(x) {
                            all(x %in% colnames(sampleFile))
                        })
                    )],
                    stringsAsFactors = FALSE
                )

                return(localRes)
            })

            if( any(sapply(dataAnalyedList, nrow) != 1) ) {
                stop(
                    paste(
                        'Some of the files pointed to are not quantification',
                        'files from Kallisto/Salmon/RSEM/StringTie.',
                        'They did no contain the column names',
                        'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                        'Are you sure it is the right files?',
                        sep=' '
                    )
                )
            }

            dataAnalyed <- unique(
                do.call(
                    rbind,
                    dataAnalyedList
                )
            )

            if (nrow(dataAnalyed) == 0) {
                stop(
                    paste(
                        'None of the files had the column names',
                        'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                        'Are you sure it is the right files?',
                        sep=' '
                    )
                )
            }
            if (nrow(dataAnalyed) > 1) {
                stop(
                    paste(
                        'Could not uniquely identify file type.',
                        'Does the files pointed to come from a mixture of multiple different tools?',
                        'That is neither recommended nor supported. Please use only one tool'
                    )
                )
            }
            if (!quiet) {
                message(paste('    The quantification algorithm used was:', dataAnalyed$orign, sep = ' '))
            }

            if( dataAnalyed$orign == 'StringTie' & is.null(readLength)) {
                stop(paste(
                    'When importing StringTie results the \'readLength\' argument',
                    'must be specified.\n',
                    'This argument must be set to the number of base pairs sequenced',
                    '(e.g. if the \n quantified data is 75 bp paired ends \'readLength\' should be set to 75.'
                ))
            }

        }

        ### Add names
        if(TRUE) {
            localFiles <- sampleVector
            fileNames <- names(localFiles)

            ### Add file names
            if( is.null( fileNames ) | any(is.na( fileNames )) ) {
                if(any(is.na( fileNames ))) {
                    if (!quiet) {
                        message('    NAs was found in the name vector IsoformSwitchAnalyzeR will try and create new once.')
                    }
                }

                fileNames <- sapply(strsplit(localFiles, '/'), function(x) {
                    tmp <- tail(x, 1)
                    tmp <- gsub('\\.tsv$|\\.sf$|\\.isoforms.results|\\.ctab$|\\.csv|\\.txt', '', tmp)
                    return(tmp)
                })
            }

            if(any(duplicated(fileNames)) ) {
                # fileNames <- c('t','t','b')
                nameCount <- as.data.frame(table(fileNames))

                fileNames <- plyr::ddply(nameCount, 'fileNames', function(aDF) {
                    if( aDF$Freq == 1) {
                        return(
                            data.frame(
                                newId = aDF$fileNames,
                                stringsAsFactors = FALSE
                            )
                        )
                    } else {
                        return(
                            data.frame(
                                newId = paste0(aDF$fileNames, '_', 1:aDF$Freq),
                                stringsAsFactors = FALSE
                            )
                        )
                    }
                })$newId
            }

            if( any(duplicated(fileNames)) ){
                stop(
                    paste(
                        'IsoformSwitchAnalyzeR could not fix the missing name problem.',
                        'Please assign names to the vector provided to',
                        'the \'sampleVector\' argument using the names() function.'
                    )
                )
            }

            names(localFiles) <- fileNames
        }
    }

    ### Import files with txtimport
    if(TRUE) {
        if (!quiet) {
            message('Step 2 of ', analysisCount, ': Reading data...')
        }

        ### Use Txtimporter to import data
        if (!quiet) {
            localDataList <- tximport::tximport(
                files = localFiles,
                type = tolower(dataAnalyed$orign),
                txOut = TRUE, # to get isoform expression
                countsFromAbundance = ifelse(
                    test = calculateCountsFromAbundance,
                    yes= 'scaledTPM',
                    no='no'
                ),
                ignoreAfterBar = ignoreAfterBar,
                readLength=readLength
            )
        } else {
            suppressMessages(
                localDataList <- tximport::tximport(
                    files = localFiles,
                    type = tolower(dataAnalyed$orign),
                    txOut = TRUE, # to get isoform expression
                    countsFromAbundance = ifelse(
                        test = calculateCountsFromAbundance,
                        yes= 'scaledTPM',
                        no='no'
                    ),
                    ignoreAfterBar = ignoreAfterBar,
                    readLength=readLength
                )
            )
        }
    }

    ### test for failed libraies
    if(TRUE) {
        allZero <- apply(localDataList$abundance, 2, function(x) sum(x) == 0)
        if(any(allZero)) {
            toRemove <- names(allZero)[which(allZero)]
            warning(
                paste(
                    'Some quantifications appared to not have worked (zero reads mapped).',
                    '\nThe following libraries were therefore removed:',
                    paste(toRemove, collapse = ', ')
                )
            )

            localDataList$abundance <- localDataList$abundance[,which(!allZero)]
            localDataList$counts <- localDataList$counts[,which(!allZero)]
            localDataList$length <- localDataList$length[,which(!allZero)]

            if( ncol(localDataList$abundance) == 0 ) {
                stop('No libraries left after failed quantifications were removed.')
            }
        }
    }

    ### Noralize TxPM values based on effective counts
    if(interLibNormTxPM) {
        if( ncol(localDataList$abundance) >= 2) {
            if (!quiet) {
                message('Step 3 of 3: Normalizing abundance values (not counts) via edgeR...')
            }

            okIso <- rownames(localDataList$abundance)[which(
                rowSums( localDataList$abundance > 1 ) > 1
            )]
            abundMat <- localDataList$abundance[which( rownames(localDataList$abundance) %in% okIso),]

            ### Calculate normalization factors
            localDGE <- suppressMessages( suppressWarnings( edgeR::DGEList(abundMat, remove.zeros = TRUE) ) )
            localDGE <- suppressMessages( suppressWarnings( edgeR::calcNormFactors(localDGE, method = normalizationMethod) ) )

            ### Apply normalization factors
            localDataList$abundance <- t(t(localDataList$abundance) / localDGE$samples$norm.factors)
        } else {
            if (!quiet) {
                message('Step 3 of 3: Normalizing skipped due to only 1 sample...')
            }

        }
    }

    ### Massage data
    if(TRUE) {
        ### massage
        localDataList$abundance <- as.data.frame(localDataList$abundance)
        localDataList$counts <- as.data.frame(localDataList$counts)
        localDataList$length <- as.data.frame(localDataList$length)
        localDataList$countsFromAbundance <- NULL # remove message of how it was imported

        ### Fix names
        if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {
            ### Test for duplication
            rownames(localDataList$abundance) <- fixNames(
                nameVec = rownames(localDataList$abundance),
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )

            rownames(localDataList$counts) <- fixNames(
                nameVec = rownames(localDataList$counts),
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )

            rownames(localDataList$length) <- fixNames(
                nameVec = rownames(localDataList$length),
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod

            )
        }

        ### Add isoform id as col
        if(addIsofomIdAsColumn) {
            localDataList <- lapply(localDataList, function(x) {
                x$isoform_id <- rownames(x)
                rownames(x) <- NULL
                return(x)
            })

            ### Reorder
            reorderCols <- function(x) {
                x[,c( ncol(x), 1:(ncol(x)-1) )]
            }

            localDataList$abundance <- reorderCols( localDataList$abundance)
            localDataList$counts    <- reorderCols( localDataList$counts   )
            localDataList$length    <- reorderCols( localDataList$length   )
        }

        ### Add options
        localDataList$importOptions <- list(
            'calculateCountsFromAbundance'= calculateCountsFromAbundance,
            'interLibNormTxPM'= interLibNormTxPM,
            'normalizationMethod'= normalizationMethod
        )

        if (!quiet) {
            message('GTF file imported successfully.\n')
        }
    }

    return(localDataList)
}


importRdata <- function(
    ### Core arguments
  isoformCountMatrix,
  isoformRepExpression = NULL,
  designMatrix,
  isoformExonAnnoation,
  isoformNtFasta = NULL,
  comparisonsToMake = NULL,
  
  ### Advanced arguments
  detectUnwantedEffects = TRUE,
  addAnnotatedORFs = TRUE,
  onlyConsiderFullORF = FALSE,
  removeNonConvensionalChr = FALSE,
  ignoreAfterBar = TRUE,
  ignoreAfterSpace = TRUE,
  ignoreAfterPeriod = FALSE,
  ignoreSurplusIsoforms = FALSE,
  removeTECgenes = TRUE,
  PTCDistance = 50,
  foldChangePseudoCount = 0.01,
  fixStringTieAnnotationProblem = TRUE,
  fixStringTieViaOverlapInMultiGenes = TRUE,
  fixStringTieMinOverlapSize = 50,
  fixStringTieMinOverlapFrac = 0.2,
  fixStringTieMinOverlapLog2RatioToContender = 0.65,
  estimateDifferentialGeneRange = TRUE,
  showProgress = TRUE,
  quiet = FALSE
) {
  ### Test existence of files
  if(TRUE) {
    if( !is.null(isoformNtFasta)) {
      if( ! is(isoformNtFasta, 'character') ) {
        stop('The \'isoformNtFasta\' argument must be a string (or vector of strings) pointing to the fasta file on the disk.')
      }
      
      if( any( isoformNtFasta == '') ) {
        stop(
          paste(
            'The \'isoformNtFasta\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
            '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
            'to import your own data? The system.file() should only be used',
            'to access the example data stored in the IsoformSwitchAnalyzeR package.',
            'To access your own data simply provide the string to the directory with the data as:',
            '"path/to/quantification/".',
            sep=' '
          )
        )
      }
      if( any( ! sapply(isoformNtFasta, file.exists) ) ) {
        stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to exist.')
      }
      if( any(! grepl('\\.fa|\\.fasta|\\.fa.gz|\\.fasta.gz', isoformNtFasta)) ) {
        stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to be a fasta file...')
      }
    }
    
    if( class(isoformExonAnnoation) == 'character' ) {
      if( isoformExonAnnoation == '' ) {
        stop(
          paste(
            'The \'isoformExonAnnoation\' argument does not lead anywhere (acutally you just suppled "" to the argument).',
            '\nDid you try to use the system.file("your/quant/dir/", package="IsoformSwitchAnalyzeR")',
            'to import your own data? The system.file() should only be used',
            'to access the example data stored in the IsoformSwitchAnalyzeR package.',
            'To access your own data simply provide the string to the directory with the data as:',
            '"path/to/quantification/".',
            sep=' '
          )
        )
      }
      if( ! (file.exists(isoformExonAnnoation) | RCurl::url.exists(isoformExonAnnoation)) ) {
        stop(
          paste(
            'The file pointed to with the \'isoformExonAnnoation\' argument does not exists.',
            '\nDid you accidentially make a spelling mistake or added a unwanted "/" infront of the text string?',
            sep=' '
          )
        )
      }
    }
  }
  
  ### Test whether input data fits together
  if (!quiet) { message('Step 1 of 10: Checking data...')}
  if (TRUE) {
    ### Set up progress
    if (showProgress &  !quiet) {
      progressBar <- 'text'
      progressBarLogic <- TRUE
    } else {
      progressBar <- 'none'
      progressBarLogic <- FALSE
    }
    
    ### Test supplied expression
    if(TRUE) {
      countsSuppled <- ! is.null(isoformCountMatrix)
      abundSuppled  <- ! is.null(isoformRepExpression)
      
      if( ! countsSuppled ) {
        stop('You must supply a count matrix to \'isoformCountMatrix\'.')
      }
      
      if( ! class(isoformCountMatrix)[1] %in% c('data.frame','matrix','tbl_df')) {
        stop('The input given as isoformCountMatrix must be a data.frame or matrix.')
      }
      
      
      if( abundSuppled ) {
        isoformRepExpression <- as.data.frame(isoformRepExpression)
        
        if( any( apply(isoformRepExpression[,which(colnames(isoformRepExpression) != 'isoform_id')],2, class) %in% c('character', 'factor') )) {
          stop('The isoformCountMatrix contains character/factor column(s) (other than the isoform_id column)')
        }
        
        extremeValues <- range( isoformRepExpression[,which( colnames(isoformRepExpression) != 'isoform_id')], na.rm = TRUE )
        if( max(extremeValues) < 30 ) {
          warning('The expression data supplied to \'isoformRepExpression\' seems very small - please double-check that it is NOT log-transformed')
        }
        if( min(extremeValues) < 0 ) {
          stop('The expression data supplied to \'isoformRepExpression\' contains negative values - please double-check that it is NOT log-transformed')
        }
        
      }
      
      if( countsSuppled ) {
        isoformCountMatrix <- as.data.frame(isoformCountMatrix)
        
        if( any( apply(isoformCountMatrix[,which(colnames(isoformCountMatrix) != 'isoform_id')],2, class) %in% c('character', 'factor') )) {
          stop('The isoformCountMatrix contains character/factor column(s) (other than the isoform_id column)')
        }
        
        extremeValues <- range( isoformCountMatrix[,which( colnames(isoformCountMatrix) != 'isoform_id')], na.rm = TRUE )
        if( max(extremeValues) < 30 ) {
          warning('The count data supplied to \'isoformCountMatrix\' seems very small - please double-check that it is NOT log-transformed')
        }
        if( min(extremeValues) < 0 ) {
          stop('The count data supplied to \'isoformCountMatrix\' contains negative values - please double-check that it is NOT log-transformed')
        }
      }
    }
    
    ### Contains the colums they should
    if (TRUE) {
      ### Colnames
      if( countsSuppled ) {
        if (!any(colnames(isoformCountMatrix) == 'isoform_id')) {
          #stop(paste(
          #    'The data.frame passed to the \'isoformCountMatrix\'',
          #    'argument must contain a \'isoform_id\' column'
          #))
          warning(
            paste(
              '    Using row.names as \'isoform_id\' for \'isoformCountMatrix\'.',
              'If not suitable you must add them manually.',
              sep=' '
            )
          )
          isoformCountMatrix$isoform_id <- rownames(isoformCountMatrix)
          
        }
        
        if(any(duplicated( isoformCountMatrix$isoform_id) )) {
          stop('The \'isoform_id\' of the count matrix must have unique ids.')
        }
      }
      if ( abundSuppled ) {
        if (!any(colnames(isoformRepExpression) == 'isoform_id')) {
          #stop(paste(
          #    'The data.frame passed to the \'isoformRepExpression\'',
          #    'argument must contain a \'isoform_id\' column'
          #))
          message(paste(
            '    Using row.names as \'isoform_id\' for \'isoformRepExpression\'. If not suitable you must add them manually.'
          ))
          isoformRepExpression$isoform_id <- rownames(isoformRepExpression)
        }
        if(any(duplicated( isoformRepExpression$isoform_id) )) {
          stop('The \'isoform_id\' of the expression matrix must have unique ids.')
        }
      }
      
      ### Potentially convert from tibble
      if( class(designMatrix)[1] == 'tbl_df') {
        designMatrix <- as.data.frame(designMatrix)
      }
      if (!all(c('sampleID', 'condition') %in% colnames(designMatrix))) {
        stop(paste(
          'The data.frame passed to the \'designMatrix\'',
          'argument must contain both a \'sampleID\' and a',
          '\'condition\' column'
        ))
      }
      if (length(unique(designMatrix$condition)) < 2) {
        stop('The supplied \'designMatrix\' only contains 1 condition')
      }
      # test information content in design matrix
      if( ncol(designMatrix) > 2 ) {
        otherDesign <- designMatrix[,which(
          ! colnames(designMatrix) %in% c('sampleID', 'condition')
        ),drop=FALSE]
        
        nonInformaticColms <- which(
          apply(otherDesign, 2, function(x) {
            length(unique(x)) == 1
          })
        )
        
        if(length(nonInformaticColms)) {
          stop(
            paste(
              'In the designMatrix the following column(s): ',
              paste(names(nonInformaticColms), collapse = ', '),
              '\n Contain constant information. Columns apart from \'sampleID\' and \'condition\'\n',
              'must describe cofounding effects not if interest. See ?importRdata and\n',
              'vignette ("How to handle cofounding effects (including batches)" section) for more information.',
              sep=' '
            )
          )
        }
      }
      
      # test comparisonsToMake
      if (!is.null(comparisonsToMake)) {
        if (!all(c('condition_1', 'condition_2') %in%
                 colnames(comparisonsToMake))) {
          stop(paste(
            'The data.frame passed to the \'comparisonsToMake\'',
            'argument must contain both a \'condition_1\' and a',
            '\'condition_2\' column indicating',
            'the comparisons to make'
          ))
        }
      }
    }
    
    ### Convert potential factors
    if (TRUE) {
      orgCond <- designMatrix$condition
      
      designMatrix$sampleID  <- as.character(designMatrix$sampleID)
      designMatrix$condition <- as.character(designMatrix$condition)
      
      if (!is.null(comparisonsToMake)) {
        comparisonsToMake$condition_1 <-
          as.character(comparisonsToMake$condition_1)
        comparisonsToMake$condition_2 <-
          as.character(comparisonsToMake$condition_2)
      }
      
      if (!is.null(isoformRepExpression)) {
        isoformRepExpression$isoform_id <-
          as.character(isoformRepExpression$isoform_id)
      }
      if (!is.null(isoformCountMatrix)) {
        isoformCountMatrix$isoform_id <-
          as.character(isoformCountMatrix$isoform_id)
      }
      
    }
    
    ### Check supplied data fits togehter
    if (TRUE) {
      if(countsSuppled) {
        if (!all(designMatrix$sampleID %in% colnames(isoformCountMatrix))) {
          stop(paste(
            'Each sample stored in \'designMatrix$sampleID\' must have',
            'a corresponding expression column in \'isoformCountMatrix\''
          ))
        }
      }
      if ( abundSuppled ) {
        if (!all(designMatrix$sampleID %in%
                 colnames(isoformRepExpression))) {
          stop(paste(
            'Each sample stored in \'designMatrix$sampleID\' must',
            'have a corresponding expression column',
            'in \'isoformRepExpression\''
          ))
        }
      }
      if( abundSuppled & countsSuppled ) {
        if( !  identical( colnames(isoformCountMatrix) , colnames(isoformRepExpression)) ) {
          stop('The column name and order of \'isoformCountMatrix\' and \'isoformRepExpression\' must be identical')
        }
        
        if( !  identical( isoformCountMatrix$isoform_id , isoformCountMatrix$isoform_id ) ) {
          stop('The ids and order of the \'isoform_id\' column in \'isoformCountMatrix\' and \'isoformRepExpression\' must be identical')
        }
      }
      
      if (!is.null(comparisonsToMake)) {
        if (!all(
          c(
            comparisonsToMake$condition_1,
            comparisonsToMake$condition_2
          ) %in% designMatrix$condition
        )) {
          stop(paste(
            'The conditions supplied in comparisonsToMake and',
            'designMatrix does not match'
          ))
        }
      } else {
        # create it
        comparisonsToMake <-
          allPairwiseFeatures(orgCond)
        colnames(comparisonsToMake) <-
          c('condition_1', 'condition_2')
      }
    }
    
    ### Test complexity of setup
    if(TRUE) {
      nCond <- length(unique(designMatrix$condition))
      n <- nrow(designMatrix)
      if(  nCond/n > 2/3  ) {
        warning(paste(
          'The experimental design seems to be of very low complexity - very few samples per replicate.',
          'Please check the supplied design matrixt to make sure no mistakes were made.'
        ))
      }
      
      nComp <- nrow(comparisonsToMake)
      if( nComp > 6 ) {
        warning(paste0(
          'The number of comparisons (n=', nComp,') is unusually high.',
          '\n - If this intended please note that with a large number of comparisons IsoformSwitchAnalyzeR might use quite a lot of memmory (aka running on a small computer might be problematic).',
          '\n - If this was not intended please check the supplied design matrixt to make sure no mistakes were made.'
        ))
      }
      
      ### Test conditions with n=1
      cndCnt <- table(designMatrix$condition)
      if( any(cndCnt == 1) ) {
        warning(
          paste0(
            '\n!!! NB !!! NB !!! NB !!!NB !!! NB !!!',
            '\nIsoformSwitchAnalyzeR is not made to work with conditions without indepdendet biological replicates and results will not be trustworthy!',
            '\nAt best data without replicates should be analyzed as a pilot study before investing in more replicates.',
            '\nPlase consult the "Analysing experiments without replicates" and "What constitute an independent biological replicate?" sections of the vignette.',
            '\n!!! NB !!! NB !!! NB !!!NB !!! NB !!!\n'
          )
        )
      }
      
      ### Test for full rank
      isFullRank <- testFullRank( designMatrix )
      
      if( ! isFullRank ) {
        stop(
          paste(
            'The supplied design matrix will result in a model matrix that is not full rank',
            '\nPlease make sure there are no co-linearities in the design'
          )
        )
      }
      
    }
    
    ### Test NT input
    if(TRUE) {
      if( !is.null( isoformNtFasta )) {
        if( !is.character( isoformNtFasta)) {
          stop('The \'isoformNtFasta\' argument must be a charachter string.')
        }
        
        if( any( ! sapply(isoformNtFasta, file.exists) ) ) {
          stop('At least one of the file(s) pointed to with \'isoformNtFasta\' seems not to exist.')
        }
        if( any(! grepl('\\.fa|\\.fasta|\\.fa.gz|\\.fasta.gz', isoformNtFasta)) ) {
          stop('The file pointed to via the \'isoformNtFasta\' argument does not seem to be a fasta file...')
        }
      }
    }
    
  }
  
  ### Giver proper R names
  if(TRUE) {
    ### Double check order
    designMatrix <- designMatrix[,c(
      match( c('sampleID','condition'), colnames(designMatrix) ),
      which( ! colnames(designMatrix) %in% c('sampleID','condition') )
    )]
    
    tmp <- designMatrix
    
    for( i in 2:ncol(designMatrix) ) { # i <- 2
      if( class(designMatrix[,i]) %in% c('character','factor') ) {
        designMatrix[,i] <- makeProperNames( designMatrix[,i] )
      }
    }
    
    if( ! identical(tmp, designMatrix) ) {
      message('Please note that some condition names were changed due to names not suited for modeling in R.')
    }
    
    if( !is.null(comparisonsToMake) ) {
      comparisonsToMake$condition_1 <- makeProperNames(
        comparisonsToMake$condition_1
      )
      comparisonsToMake$condition_2 <- makeProperNames(
        comparisonsToMake$condition_2
      )
    }
  }
  
  ### Fix names (done before input is handled and compared)
  if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {
    if( countsSuppled ) {
      isoformCountMatrix$isoform_id <- fixNames(
        nameVec = isoformCountMatrix$isoform_id,
        ignoreAfterBar = ignoreAfterBar,
        ignoreAfterSpace = ignoreAfterSpace,
        ignoreAfterPeriod = ignoreAfterPeriod
      )
    }
    if ( abundSuppled ) {
      isoformRepExpression$isoform_id <- fixNames(
        nameVec = isoformRepExpression$isoform_id,
        ignoreAfterBar = ignoreAfterBar,
        ignoreAfterSpace = ignoreAfterSpace,
        ignoreAfterPeriod = ignoreAfterPeriod
      )
    }
    
  }
  
  
  
  ### Obtain isoform annotation
  if (!quiet) { message('Step 2 of 10: Obtaining annotation...')}
  if (TRUE) {
    ### Massage annoation input
    if(TRUE) {
      ### Import GTF is nessesary
      if( class(isoformExonAnnoation) == 'character' ) {
        gtfImported <- TRUE
        
        ### Test input
        if(TRUE) {
          if (length(isoformExonAnnoation) != 1) {
            stop(paste(
              'You can only supply 1 file to isoformExonAnnoation'
            ))
          }
          if( ! grepl('\\.gtf$|\\.gtf\\.gz$', isoformExonAnnoation, ignore.case = TRUE) ) {
            warning('The file appearts not to be a GTF file as it does not end with \'.gtf\' or \'.gtf.gz\' - are you sure it is the rigth file?')
          }
          
          if (!quiet) {
            message('    importing GTF (this may take a while)...')
          }
          
        }
        
        ### Note: Isoform names are fixed by importGTF
        suppressWarnings(
          gtfSwichList <- importGTF(
            pathToGTF = isoformExonAnnoation,
            addAnnotatedORFs = addAnnotatedORFs,
            onlyConsiderFullORF = onlyConsiderFullORF,
            removeNonConvensionalChr = FALSE,
            ignoreAfterBar = ignoreAfterBar,
            ignoreAfterSpace = ignoreAfterSpace,
            ignoreAfterPeriod = ignoreAfterPeriod,
            removeTECgenes = FALSE,
            PTCDistance = PTCDistance,
            removeFusionTranscripts = FALSE,
            removeUnstrandedTranscripts = FALSE,
            quiet = TRUE
          )
        )
        
        ### Extract isoforms which are quantified
        if(TRUE) {
          ### Get genes with iso quantified
          if( countsSuppled ) {
            genesToKeep <- gtfSwichList$isoformFeatures$gene_id[which(
              gtfSwichList$isoformFeatures$isoform_id %in% isoformCountMatrix$isoform_id
            )]
            
            ### Ensure all isoforms quantified are kept
            isoToKeep <- union(
              gtfSwichList$isoformFeatures$isoform_id[which(
                gtfSwichList$isoformFeatures$gene_id %in% genesToKeep
              )],
              isoformCountMatrix$isoform_id
            )
          } else {
            genesToKeep <- gtfSwichList$isoformFeatures$gene_id[which(
              gtfSwichList$isoformFeatures$isoform_id %in% isoformRepExpression$isoform_id
            )]
            
            ### Ensure all isoforms quantified are kept
            isoToKeep <- union(
              gtfSwichList$isoformFeatures$isoform_id[which(
                gtfSwichList$isoformFeatures$gene_id %in% genesToKeep
              )],
              isoformRepExpression$isoform_id
            )
          }
        }
        
        ### Extract isoforms to remove due to non chanonical nature
        if(TRUE) {
          ### Identify isoforms to remove
          isoformsToRemove <- character()
          
          ### TEC genes
          if( removeTECgenes & any(!is.na( gtfSwichList$isoformFeatures$gene_biotype)) ) {
            isoformsToRemove <- c(
              isoformsToRemove,
              unique(gtfSwichList$isoformFeatures$isoform_id[which(
                gtfSwichList$isoformFeatures$gene_biotype == 'TEC'
              )])
            )
          }
          
          ### Strange chromosomes
          if( removeNonConvensionalChr ) {
            nonChanonicalChrsIso <- unique(
              gtfSwichList$exons$isoform_id[which(
                grepl('_|\\.'  , as.character(gtfSwichList$exons@seqnames))
              )]
            )
            
            isoformsToRemove <- unique(c(
              isoformsToRemove,
              nonChanonicalChrsIso
            ))
          }
          
          ### Unstranded transcripts
          if(TRUE) {
            unstrandedIso <- unique(
              gtfSwichList$exons$isoform_id[which(
                grepl('\\*'  , as.character(gtfSwichList$exons@strand))
              )]
            )
            
            isoformsToRemove <- unique(c(
              isoformsToRemove,
              unstrandedIso
            ))
            
            if(length(unstrandedIso)) {
              warning(
                paste0(
                  'We found ', length(unstrandedIso),
                  ' (', round(
                    length(unstrandedIso) / length(isoToKeep) * 100,
                    digits = 2
                  ),
                  '%) unstranded transcripts.',
                  '\n  These were removed as unstranded transcripts cannot be analysed'
                )
              )
            }
          }
          
          ### Note:
          # No need to extend to genes since they per definition are all genes
          
          ### Remove non chanonical isoforms
          if(length(isoformsToRemove)) {
            isoToKeep <- setdiff(
              isoToKeep,
              isoformsToRemove
            )
          }
        }
        
        ### Subset to used data
        if(TRUE) {
          if( countsSuppled ) {
            isoformCountMatrix <- isoformCountMatrix[which(
              isoformCountMatrix$isoform_id %in% isoToKeep
            ),]
          }
          if( abundSuppled ) {
            isoformRepExpression <- isoformRepExpression[which(
              isoformRepExpression$isoform_id %in% isoToKeep
            ),]
          }
          
          if(any(isoToKeep %in% gtfSwichList$isoformFeatures$isoform_id)) {
            gtfSwichList$isoformFeatures <- gtfSwichList$isoformFeatures[which(
              gtfSwichList$isoformFeatures$isoform_id %in% isoToKeep
            ),]
            gtfSwichList$exons <- gtfSwichList$exons[which(
              gtfSwichList$exons$isoform_id %in% isoToKeep
            ),]
            gtfSwichList$orfAnalysis <- gtfSwichList$orfAnalysis[which(
              gtfSwichList$orfAnalysis$isoform_id %in% isoToKeep
            ),]
          }
        }
        
        ### Extract wanted annotation files form the GTF switchAnalyzeR object
        if(TRUE) {
          isoformExonStructure <-
            gtfSwichList$exons[, c('isoform_id', 'gene_id')]
          isoformExonStructure <- sort(isoformExonStructure)
          
          colsToExtract <- c(
            'isoform_id', 'gene_id', 'gene_name',
            'ref_gene_id', # stringtie annotation
            'gene_biotype','iso_biotype'
          )
          isoformAnnotation <-
            unique(gtfSwichList$isoformFeatures[,na.omit(
              match(colsToExtract , colnames(gtfSwichList$isoformFeatures))
            )])
        }
        # where isoformAnnotation and isoformExonStructure is made
        
        if (addAnnotatedORFs & gtfImported) {
          isoORF <- gtfSwichList$orfAnalysis
          
          if( all( is.na(isoORF$PTC)) ) {
            warning(
              paste(
                '   No CDS annotation was found in the GTF files meaning ORFs could not be annotated.\n',
                '    (But ORFs can still be predicted with the analyzeORF() function)'
              )
            )
            
            addAnnotatedORFs <- FALSE
          }
        }
      }
      
      if( class(isoformExonAnnoation) != 'character' ) {
        gtfImported <- FALSE
        
        ### Test input
        if(TRUE) {
          if( ! is(object = isoformExonAnnoation, 'GRanges') ) {
            stop('When not using a GTF file (by supplying a text string with the path to the file) the "isoformExonAnnoation" argument must be a GRange.')
          }
          
          if( length(isoformExonAnnoation) == 0 ) {
            stop('The GRange supplied to the "isoformExonAnnoation" argument have zero enteries (rows).')
          }
          
          ### Test
          if( !all( c('isoform_id', 'gene_id') %in% colnames(isoformExonAnnoation@elementMetadata) )) {
            stop('The supplied annotation must contain to meta data collumns: \'isoform_id\' and \'gene_id\'')
          }
          
          ### Test for other than exons by annotation
          if(any(  colnames(isoformExonAnnoation@elementMetadata) == 'type' )) {
            stop(
              paste(
                'The \'type\' column of the data supplied to \'isoformExonAnnoation\'',
                'indicate there are multiple levels of data.',
                'Please fix this (providing only exon-level) or simply',
                '\nprovide a string with the path to the GTF file to the \'isoformExonAnnoation\' - ',
                'then IsoformSwitchAnalyzeR will import and massage the GTF file for you.'
              )
            )
          }
          
          ### Test for other than exons by overlap of transcript features
          localExonList <- split(isoformExonAnnoation@ranges, isoformExonAnnoation$isoform_id)
          localExonListReduced <- GenomicRanges::reduce(localExonList)
          if(
            any( sapply( width(localExonList), sum) != sapply( width(localExonListReduced), sum) )
          ) {
            stop(
              paste(
                'The data supplied to \'isoformExonAnnoation\' appears to be multi-leveled',
                '(Fx both containing exon and CDS information for transcripts - which a GTF file does).',
                'If your annotation data originate from a GTF file please supply a string',
                'indicating the path to the GTF file to the \'isoformExonAnnoation\' argument',
                'instead - then IsoformSwitchAnalyzeR will handle the multi-levels.'
              )
            )
          }
          
        }
        
        ### Fix names
        if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {
          isoformExonAnnoation$isoform_id <- fixNames(
            nameVec = isoformExonAnnoation$isoform_id,
            ignoreAfterBar = ignoreAfterBar,
            ignoreAfterSpace = ignoreAfterSpace,
            ignoreAfterPeriod = ignoreAfterPeriod
          )
        }
        
        ### Collaps ajecent exons (aka thouse without any intron between)
        if(TRUE) {
          ### Reduce ajecent exons
          tmp <- unlist(
            GenomicRanges::reduce(
              split(
                isoformExonAnnoation,
                isoformExonAnnoation$isoform_id
              )
            )
          )
          ### Add isoform id
          tmp$isoform_id <- tmp@ranges@NAMES
          tmp@ranges@NAMES <- NULL
          
          ### add gene id
          tmp$gene_id <-isoformExonAnnoation$gene_id[match(
            tmp$isoform_id, isoformExonAnnoation$isoform_id
          )]
          
          ### Add gene names if used
          if('gene_name' %in% colnames(isoformExonAnnoation@elementMetadata)) {
            tmp$gene_name <-isoformExonAnnoation$gene_name[match(
              tmp$isoform_id, isoformExonAnnoation$isoform_id
            )]
          }
          
          ### sort
          tmp <- tmp[sort.list(tmp$isoform_id),]
          
          ### Overwrite
          isoformExonAnnoation <- tmp
        }
        
        ### Extract isoforms which are quantified
        if(TRUE) {
          ### Get genes with iso quantified
          if( countsSuppled ) {
            genesToKeep <- isoformExonAnnoation$gene_id[which(
              isoformExonAnnoation$isoform_id %in% isoformCountMatrix$isoform_id
            )]
            
            ### Ensure all isoforms quantified are kept
            isoToKeep <- union(
              isoformExonAnnoation$isoform_id[which(
                isoformExonAnnoation$gene_id %in% genesToKeep
              )],
              isoformCountMatrix$isoform_id
            )
          } else {
            genesToKeep <- isoformExonAnnoation$gene_id[which(
              isoformExonAnnoation$isoform_id %in% isoformRepExpression$isoform_id
            )]
            
            ### Ensure all isoforms quantified are kept
            isoToKeep <- union(
              isoformExonAnnoation$isoform_id[which(
                isoformExonAnnoation$gene_id %in% genesToKeep
              )],
              isoformCountMatrix$isoform_id
            )
          }
        }
        
        ### Subset to used data
        if(TRUE) {
          if( countsSuppled ) {
            isoformCountMatrix <- isoformCountMatrix[which(
              isoformCountMatrix$isoform_id %in% isoToKeep
            ),]
          }
          if( abundSuppled ) {
            isoformRepExpression <- isoformRepExpression[which(
              isoformRepExpression$isoform_id %in% isoToKeep
            ),]
          }
          
          if(any(isoToKeep %in% isoformExonAnnoation$isoform_id)) {
            isoformExonAnnoation <- isoformExonAnnoation[which(
              isoformExonAnnoation$isoform_id %in% isoToKeep
            ),]
          }
        }
        
        ### Devide the data
        colsToUse <-  c(
          'isoform_id',
          'gene_id',
          'gene_name'
        )
        
        isoformExonStructure <-
          isoformExonAnnoation[,na.omit(match(
            colsToUse, colnames(isoformExonAnnoation@elementMetadata)
          ))]
        
        isoformAnnotation <-
          unique(as.data.frame(isoformExonAnnoation@elementMetadata))
        if (!'gene_name' %in% colnames(isoformAnnotation)) {
          isoformAnnotation$gene_name <- NA
        }
        
        isoformAnnotation <- isoformAnnotation[order(
          isoformAnnotation$gene_id,
          isoformAnnotation$gene_name,
          isoformAnnotation$isoform_id
        ),]
      }
      
    }
    
    ### Test the columns of obtained annoation
    if(TRUE) {
      if (!all(c('isoform_id', 'gene_id', 'gene_name') %in%
               colnames(isoformAnnotation))) {
        stop(paste(
          'The data.frame passed to the \'isoformAnnotation\' argument',
          'must contain the following columns \'isoform_id\',',
          '\'gene_id\' and \'gene_name\''
        ))
      }
      if (any(is.na(isoformAnnotation[, c('isoform_id', 'gene_id')]))) {
        stop(paste(
          'The \'isoform_id\' and \'gene_id\' columns in the data.frame',
          'passed to the \'isoformAnnotation\' argument are not allowed',
          'to contain NAs'
        ))
      }
      if (!'isoform_id' %in% colnames(isoformExonStructure@elementMetadata)) {
        stop(paste(
          'The GenomicRanges (GRanges) object passed to the',
          '\'isoformExonStructure\' argument must contain both a',
          '\'isoform_id\' and \'gene_id\' metadata column'
        ))
      }
    }
    
    ### Test overlap with expression data
    if(!ignoreSurplusIsoforms) {
      if( countsSuppled ) {
        j1 <- jaccardSimilarity(
          isoformCountMatrix$isoform_id,
          isoformAnnotation$isoform_id
        )
        
        expIso <- isoformCountMatrix$isoform_id
      } else {
        j1 <- jaccardSimilarity(
          isoformRepExpression$isoform_id,
          isoformAnnotation$isoform_id
        )
        
        expIso <- isoformRepExpression$isoform_id
      }
      
      jcCutoff <- 0.925
      
      onlyInExp <- setdiff(expIso, isoformAnnotation$isoform_id)
      
      if (j1 != 1 ) {
        if( j1 < jcCutoff | length(onlyInExp) ) {
          options(warning.length = 2000L)
          stop(
            paste(
              'The annotation and quantification (count/abundance matrix and isoform annotation)',
              'seems to be different (Jaccard similarity < 0.925).',
              '\nEither isforoms found in the annotation are',
              'not quantifed or vise versa.',
              '\nSpecifically:\n',
              length(unique(expIso)), 'isoforms were quantified.\n',
              length(unique(isoformAnnotation$isoform_id)), 'isoforms are annotated.\n',
              'Only', length(intersect(expIso, isoformAnnotation$isoform_id)), 'overlap.\n',
              length(setdiff(unique(expIso), isoformAnnotation$isoform_id)), 'isoforms quantifed had no corresponding annoation\n',
              '\nThis combination cannot be analyzed since it will',
              'cause discrepencies between quantification and annotation thereby skewing all analysis.\n',
              
              '\nIf there is no overlap (as in zero or close) there are two options:\n',
              '1) The files do not fit together (e.g. different databases, versions, etc)',
              '(no fix except using propperly paired files).\n',
              '2) It is somthing to do with how the isoform ids are stored in the different files.',
              'This problem might be solvable using some of the',
              '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
              '    Examples from expression matrix are :',
              paste0( sample(expIso, min(c(3, length(expIso)))), collapse = ', '),'\n',
              '    Examples of annoation are :',
              paste0( sample(isoformAnnotation$isoform_id, min(c(3, length(isoformAnnotation$isoform_id)))), collapse = ', '),'\n',
              '    Examples of isoforms which were only found im the quantification are  :',
              paste0( sample(onlyInExp, min(c(3, length(onlyInExp)))), collapse = ', '),'\n',
              
              '\nIf there is a large overlap but still far from complete there are 3 possibilites:\n',
              '1) The files do not fit together (e.g different databases versions etc.)',
              '(no fix except using propperly paired files).\n',
              '2) If you are using Ensembl data you have supplied the GTF without phaplotyps. You need to supply the',
              '<Ensembl_version>.chr_patch_hapl_scaff.gtf file - NOT the <Ensembl_version>.chr.gtf\n',
              '3) One file could contain non-chanonical chromosomes while the other do not',
              '(might be solved using the \'removeNonConvensionalChr\' argument.)\n',
              '4) It is somthing to do with how a subset of the isoform ids are stored in the different files.',
              'This problem might be solvable using some of the',
              '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n\n',
              
              '\nFor more info see the FAQ in the vignette.\n',
              sep=' '
            )
          )
        }
        if( j1 >= jcCutoff ) {
          warning(
            paste(
              'The annotation and quantification (count/abundance matrix and isoform annotation)',
              'Seem to be slightly different.',
              
              '\nSpecifically:\n',
              length(setdiff(isoformAnnotation$isoform_id, unique(expIso))), 'isoforms were only found in the annotation\n',
              
              '\nPlease make sure this is on purpouse since differences',
              'will cause inaccurate quantification and thereby skew all analysis.\n',
              
              'If you have quantified with Salmon this could be normal since it as default only keep one copy of identical sequnces (can be prevented using the --keepDuplicates option)\n',
              'We strongly encurage you to go back and figure out why this is the case.\n\n',
              sep=' '
            )
          )
          
          ### Reduce to those found in all
          if( countsSuppled ) {
            isoformsUsed <- intersect(
              isoformCountMatrix$isoform_id,
              isoformAnnotation$isoform_id
            )
          } else {
            isoformsUsed <- intersect(
              isoformRepExpression$isoform_id,
              isoformAnnotation$isoform_id
            )
          }
          
          isoformExonStructure <- isoformExonStructure[which(
            isoformExonStructure$isoform_id %in% isoformsUsed
          ), ]
          isoformAnnotation <-isoformAnnotation[which(
            isoformAnnotation$isoform_id %in% isoformsUsed
          ), ]
          
          if( countsSuppled ) {
            isoformCountMatrix <-isoformCountMatrix[which(
              isoformCountMatrix$isoform_id %in% isoformsUsed
            ), ]
          }
          if( abundSuppled ) {
            isoformRepExpression <-isoformRepExpression[which(
              isoformRepExpression$isoform_id %in% isoformsUsed
            ), ]
          }
          
        }
      }
      
    }
    
    ### Now test overlap with expression data (updated: overlap coefficient)
    if(ignoreSurplusIsoforms){
      
      if (countsSuppled) {
        overlapCoef <- overlapCoefficient(
          isoformCountMatrix$isoform_id,
          isoformAnnotation$isoform_id
        )
        expIso <- isoformCountMatrix$isoform_id
      } else {
        overlapCoef <- overlapCoefficient(
          isoformRepExpression$isoform_id,
          isoformAnnotation$isoform_id
        )
        expIso <- isoformRepExpression$isoform_id
      }
      
      overlapCutoff <- 0.95
      
      onlyInExp <- setdiff(expIso, isoformAnnotation$isoform_id)
      onlyInAnno <- setdiff(isoformAnnotation$isoform_id, expIso)
      
      if(overlapCoef == 1){
        if(length(onlyInExp)){
          options(warning.length = 2000L)
          stop(
            paste('The annotation and quantification (count/abundance matrix and isoform annotation)',
                  'seems to be different',length(unique(onlyInExp)),'are quantified but without any annotation information',
                  sep = ' '))
        } else{
          warning(
            paste(
              'Ignoring surplus isoforms as requested by `ignoreSurplusIsoforms = TRUE`.',
              '\nQuantified isoforms: ', length(unique(expIso)),
              '\nAnnotated isoforms: ', length(unique(isoformAnnotation$isoform_id)),
              '\n', length(onlyInAnno), 'isoforms in the annotation will be ignored.\n','Note that the ignoration of isoforms would slightly affect the accuracy of IF!\n',
              sep = ' '
            )
          )
        }
      }
      
      if (overlapCoef != 1 ) {
        if( overlapCoef < overlapCutoff | length(onlyInExp) ) {
          options(warning.length = 2000L)
          stop(
            paste(
              'The annotation and quantification (count/abundance matrix and isoform annotation)',
              'seems to be different (Overlap Coefficient < 0.925).',
              '\nEither isforoms found in the annotation are',
              'not quantifed or vise versa.',
              '\nSpecifically:\n',
              length(unique(expIso)), 'isoforms were quantified.\n',
              length(unique(isoformAnnotation$isoform_id)), 'isoforms are annotated.\n',
              'Only', length(intersect(expIso, isoformAnnotation$isoform_id)), 'overlap.\n',
              length(setdiff(unique(expIso), isoformAnnotation$isoform_id)), 'isoforms quantifed had no corresponding annoation\n',
              '\nThis combination cannot be analyzed since it will',
              'cause discrepencies between quantification and annotation thereby skewing all analysis.\n',
              
              '\nIf there is no overlap (as in zero or close) there are two options:\n',
              '1) The files do not fit together (e.g. different databases, versions, etc)',
              '(no fix except using propperly paired files).\n',
              '2) It is somthing to do with how the isoform ids are stored in the different files.',
              'This problem might be solvable using some of the',
              '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
              '    Examples from expression matrix are :',
              paste0( sample(expIso, min(c(3, length(expIso)))), collapse = ', '),'\n',
              '    Examples of annoation are :',
              paste0( sample(isoformAnnotation$isoform_id, min(c(3, length(isoformAnnotation$isoform_id)))), collapse = ', '),'\n',
              '    Examples of isoforms which were only found im the quantification are  :',
              paste0( sample(onlyInExp, min(c(3, length(onlyInExp)))), collapse = ', '),'\n',
              
              '\nIf there is a large overlap but still far from complete there are 3 possibilites:\n',
              '1) The files do not fit together (e.g different databases versions etc.)',
              '(no fix except using propperly paired files).\n',
              '2) If you are using Ensembl data you have supplied the GTF without phaplotyps. You need to supply the',
              '<Ensembl_version>.chr_patch_hapl_scaff.gtf file - NOT the <Ensembl_version>.chr.gtf\n',
              '3) One file could contain non-chanonical chromosomes while the other do not',
              '(might be solved using the \'removeNonConvensionalChr\' argument.)\n',
              '4) It is somthing to do with how a subset of the isoform ids are stored in the different files.',
              'This problem might be solvable using some of the',
              '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n\n',
              
              '\nFor more info see the FAQ in the vignette.\n',
              sep=' '
            )
          )
        }
        if( overlapCoef >= overlapCutoff ) {
          warning(
            paste(
              'The annotation and quantification (count/abundance matrix and isoform annotation)',
              'Seem to be slightly different.',
              
              '\nSpecifically:\n',
              length(setdiff(isoformAnnotation$isoform_id, unique(expIso))), 'isoforms were only found in the annotation\n',
              
              '\nPlease make sure this is on purpouse since differences',
              'will cause inaccurate quantification and thereby skew all analysis.\n',
              
              'If you have quantified with Salmon this could be normal since it as default only keep one copy of identical sequnces (can be prevented using the --keepDuplicates option)\n',
              'We strongly encurage you to go back and figure out why this is the case.\n\n',
              sep=' '
            )
          )
          
        }
      }
      
      ### Reduce to those found in all
      if( countsSuppled ) {
        isoformsUsed <- intersect(
          isoformCountMatrix$isoform_id,
          isoformAnnotation$isoform_id
        )
      } else {
        isoformsUsed <- intersect(
          isoformRepExpression$isoform_id,
          isoformAnnotation$isoform_id
        )
      }
      
      isoformExonStructure <- isoformExonStructure[which(
        isoformExonStructure$isoform_id %in% isoformsUsed
      ), ]
      isoformAnnotation <-isoformAnnotation[which(
        isoformAnnotation$isoform_id %in% isoformsUsed
      ), ]
      
      if( countsSuppled ) {
        isoformCountMatrix <-isoformCountMatrix[which(
          isoformCountMatrix$isoform_id %in% isoformsUsed
        ), ]
      }
      if( abundSuppled ) {
        isoformRepExpression <-isoformRepExpression[which(
          isoformRepExpression$isoform_id %in% isoformsUsed
        ), ]
      }
      
      
    }
  }
  
  ### Subset to libraries used
  if (TRUE) {
    designMatrix <-
      designMatrix[which(
        designMatrix$condition %in% c(
          comparisonsToMake$condition_1,
          comparisonsToMake$condition_2
        )
      ), ]
    
    if( countsSuppled ) {
      isoformCountMatrix <-
        isoformCountMatrix[, which(
          colnames(isoformCountMatrix) %in%
            c('isoform_id', designMatrix$sampleID))]
      isoformCountMatrix <-
        isoformCountMatrix[,c(
          which(colnames(isoformCountMatrix) == 'isoform_id'),
          which(colnames(isoformCountMatrix) != 'isoform_id')
        )]
      rownames(isoformCountMatrix) <- NULL
    }
    
    if ( abundSuppled ) {
      isoformRepExpression <-
        isoformRepExpression[, which(
          colnames(isoformRepExpression) %in%
            c('isoform_id', designMatrix$sampleID)
        )]
      
      isoformRepExpression <-
        isoformRepExpression[,c(
          which(colnames(isoformRepExpression) == 'isoform_id'),
          which(colnames(isoformRepExpression) != 'isoform_id')
        )]
      rownames(isoformRepExpression) <- NULL
    }
  }
  
  ### Remove isoforms not expressed
  if (TRUE) {
    if( countsSuppled ) {
      okIsoforms <- isoformCountMatrix$isoform_id[which(
        rowSums(isoformCountMatrix[,which( colnames(isoformCountMatrix) != 'isoform_id')]) > 0
      )]
      nTot <- nrow(isoformCountMatrix)
    } else {
      okIsoforms <-isoformRepExpression$isoform_id[which(
        rowSums(isoformRepExpression[,which( colnames(isoformRepExpression) != 'isoform_id')]) > 0
      )]
      nTot <- nrow(isoformRepExpression)
    }
    
    nOk <- length(okIsoforms)
    if( nOk != nTot ) {
      if (!quiet) {
        ### Message
        message(
          paste(
            '   ',
            nTot - nOk,
            paste0( '( ', round( (nTot - nOk) / nTot *100, digits = 2),'%)'),
            'isoforms were removed since they were not expressed in any samples.'
          )
        )
      }
      
      ### Subset expression
      if(abundSuppled) {
        isoformRepExpression <- isoformRepExpression[which(
          isoformRepExpression$isoform_id %in% okIsoforms
        ),]
      }
      if(countsSuppled) {
        isoformCountMatrix <- isoformCountMatrix[which(
          isoformCountMatrix$isoform_id %in% okIsoforms
        ),]
      }
      
      ### Annotation
      isoformExonStructure <- isoformExonStructure[which( isoformExonStructure$isoform_id %in% okIsoforms),]
      isoformAnnotation    <- isoformAnnotation[which( isoformAnnotation$isoform_id %in% okIsoforms),]
      
      if (addAnnotatedORFs & gtfImported) {
        isoORF <- isoORF[which( isoORF$isoform_id %in% okIsoforms),]
      }
      
    }
  }
  
  ### Handle sequence input
  if(TRUE) {
    addIsoformNt <- FALSE
    
    if(!is.null(isoformNtFasta)) {
      isoformNtSeq <- do.call(
        c,
        lapply(isoformNtFasta, function(aFile) {
          Biostrings::readDNAStringSet(
            filepath = isoformNtFasta, format = 'fasta'
          )
        })
      )
      
      if(!is(isoformNtSeq, "DNAStringSet")) {
        stop('The fasta file supplied to \'isoformNtFasta\' does not contain the nucleotide (DNA) sequence...')
      }
      
      ### Fix names
      if( ignoreAfterBar | ignoreAfterSpace | ignoreAfterPeriod) {
        
        names(isoformNtSeq) <- fixNames(
          nameVec = names(isoformNtSeq),
          ignoreAfterBar = ignoreAfterBar,
          ignoreAfterSpace = ignoreAfterSpace,
          ignoreAfterPeriod = ignoreAfterPeriod
        )
      }
      
      ### Subset to used
      isoSeqNames <- names(isoformNtSeq)
      isoformNtSeq <- isoformNtSeq[which(
        names(isoformNtSeq) %in% isoformCountMatrix$isoform_id
      )]
      
      ### Remove potential duplication
      isoformNtSeq <- isoformNtSeq[which(
        ! duplicated(names(isoformNtSeq))
      )]
      
      if(length(isoformNtSeq) == 0) {
        stop(
          paste(
            'No sequences in the fasta files had IDs matching the expression data.',
            'This problem might be solvable using some of the',
            '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
            '    3 Examples from expression matrix are :',
            paste0( sample(unique(isoformCountMatrix$isoform_id), min(c(3, length(isoformCountMatrix$isoform_id))) ), collapse = ', '),'\n',
            '    3 Examples of sequence annotation are :',
            paste0( sample(isoSeqNames, min(c(3, length( isoSeqNames ))) ), collapse = ', '),'\n',
            sep = ' '
          )
        )
        
      }
      
      if( ! all( isoformCountMatrix$isoform_id %in% names(isoformNtSeq) ) ) {
        options(warning.length = 2000L)
        warning(
          paste(
            'The fasta file supplied to \'isoformNtFasta\' does not contain the',
            'nucleotide (DNA) sequence for all isoforms quantified and will not be added!',
            '\nSpecifically:\n',
            length(unique(isoformCountMatrix$isoform_id)), 'isoforms were quantified.\n',
            length(unique(names(isoformNtSeq))), 'isoforms have a sequence.\n',
            'Only', length(intersect(names(isoformNtSeq), isoformCountMatrix$isoform_id)), 'overlap.\n',
            length(setdiff(unique(isoformCountMatrix$isoform_id), names(isoformNtSeq))), 'isoforms quantifed isoforms had no corresponding nucleotide sequence\n',
            
            '\nIf there is no overlap (as in zero or close) there are two options:\n',
            '1) The files do not fit together (different databases, versions etc)',
            '(no fix except using propperly paired files).\n',
            '2) It is somthing to do with how the isoform ids are stored in the different files.',
            'This problem might be solvable using some of the',
            '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n',
            '    3 Examples from expression matrix are :',
            paste0( sample(unique(isoformCountMatrix$isoform_id), min(c(3, length(isoformCountMatrix$isoform_id))) ), collapse = ', '),'\n',
            '    3 Examples of sequence annotation are :',
            paste0( sample(names(isoformNtSeq), min(c(3, length( isoformNtSeq ))) ), collapse = ', '),'\n',
            
            '\nIf there is a large overlap but still far from complete there are 3 possibilites:\n',
            '1) The files do not fit together (different databases versions)',
            '(no fix except using propperly paired files).\n',
            '2) The isoforms quantified have their nucleotide sequence stored in multiple fasta files (common for Ensembl).',
            'Just supply a vector with the path to each of them to the \'isoformNtFasta\' argument.\n',
            '3) One file could contain non-chanonical chromosomes while the other do not',
            '(might be solved using the \'removeNonConvensionalChr\' argument.)\n',
            '4) It is somthing to do with how a subset of the isoform ids are stored in the different files.',
            'This problem might be solvable using some of the',
            '\'ignoreAfterBar\', \'ignoreAfterSpace\' or \'ignoreAfterPeriod\' arguments.\n\n',
            sep = ' '
          )
        )
      } else {
        addIsoformNt <- TRUE
      }
    }
  }
  
  
  
  ### Rescue StringTie gene annotation
  if(TRUE) {
    if (!quiet) { message('Step 3 of 10: Fixing StringTie gene annoation problems...')}
    
    ### Add original gene_ids already assigned to gene_id back to ref_gene_id
    if(TRUE) {
      indexToModify <- which(
        is.na( isoformAnnotation$ref_gene_id  ) &
          ! is.na( isoformAnnotation$gene_name  )
      )
      isoformAnnotation$ref_gene_id[indexToModify] <- isoformAnnotation$gene_id[indexToModify]
    }
    
    ### Assign isoforms to ref_gene_id and gene_names
    if(   fixStringTieAnnotationProblem ) {
      ### variables for messages
      anyFixed1 <- FALSE
      anyFixed2 <- FALSE
      anyFixed3 <- FALSE
      anyFixed4 <- FALSE
      
      ### Fix missing ref_gene_id
      if( any(is.na(isoformAnnotation$ref_gene_id)) ) {
        
        ### Fix simple missing ref_gene_id (within single ref_gene_id gene_id)
        if( TRUE ) {
          ### Make list with ref_gene_id (same order)
          geneNameList <- split(isoformAnnotation$ref_gene_id, isoformAnnotation$gene_id)
          geneNameList <- geneNameList[unique(isoformAnnotation$gene_id)]
          
          ### Count problems
          nIsoWihoutNames <- sum(
            is.na(isoformAnnotation$ref_gene_id)
          )
          
          ### Add ref_gene_ids to novel StringTie transcripts when possible (only 1 ref_gene_id candidate)
          isoformAnnotation$ref_gene_id <- unlist(
            lapply(
              geneNameList,
              function(geneNameVec) {
                localGeneNames <- unique(na.omit( geneNameVec ))
                
                if( length( localGeneNames ) == 1 ) {
                  return(
                    rep(localGeneNames, times = length(geneNameVec))
                  )
                } else {
                  return(geneNameVec)
                }
              }
            )
          )
          
          ### Re-count problems
          nIsoWihoutNames2 <- sum(
            is.na(isoformAnnotation$ref_gene_id)
          )
          
          anyFixed1 <- nIsoWihoutNames - nIsoWihoutNames2 > 0
          
          if( anyFixed1 ) {
            if (!quiet) {
              message(
                paste(
                  '   ',
                  nIsoWihoutNames - nIsoWihoutNames2,
                  ' isoforms were assigned the ref_gene_id and gene_name of their associated gene_id.',
                  '\n        This was only done when the parent gene_id were associated with a single ref_gene_id/gene_name.',
                  #'\n',
                  sep = ''
                )
              )
            }
          }
          
        }
        
        ### Fix non-simple missing ref_gene_ids (within multi ref_gene_ids gene_id)
        if(TRUE) {
          ### Identify gene_ids with problems
          if(TRUE) {
            ### Summazie gene properties
            multiGeneDf <-
              isoformAnnotation %>%
              dplyr::select(isoform_id, gene_id, ref_gene_id) %>%
              dplyr::distinct()
            
            geneNameSummary <-
              multiGeneDf %>%
              group_by(gene_id) %>%
              dplyr::summarise(
                n_ref_gene_ids = n_distinct(na.omit(ref_gene_id)),
                n_iso_na = sum(is.na(ref_gene_id)),
                .groups = 'drop'
              )
            
            ### Identify genes with both missing and multiple gene names
            multiGenesWithNa <-
              geneNameSummary %>%
              dplyr::filter(
                n_ref_gene_ids >= 2 & # at least two genes
                  n_iso_na > 0           # no novel once
              )
          }
          
          ### Rescue via genomic overlap of known isoforms (aka with ref_gene_id annotation)
          if(nrow(multiGenesWithNa) & fixStringTieViaOverlapInMultiGenes) {
            ### Extract all isoforms with problems
            genesWithProblems <-
              multiGeneDf %>%
              dplyr::filter(gene_id %in% multiGenesWithNa$gene_id)
            
            ### Extract exons of interest
            exonsOi <- isoformExonStructure[which(
              isoformExonStructure$isoform_id %in% genesWithProblems$isoform_id
            ),]
            
            ### Modify seqnames to ensure only searching within same gene_id?
            exonsOi <- GRanges(
              seqnames = exonsOi$gene_id,
              ranges = IRanges(
                start = BiocGenerics::start(exonsOi),
                end   = BiocGenerics::end(exonsOi)
              ),
              strand = exonsOi@strand,
              isoform_id = exonsOi$isoform_id,
              gene_id = exonsOi$gene_id
            )
            
            ### Convert to list
            exonsOiList <- split(exonsOi, exonsOi$isoform_id)
            
            ### Devide into novel and known
            knownIsoforms <- genesWithProblems$isoform_id[which(
              ! is.na(genesWithProblems$ref_gene_id)
            )]
            novelIsoforms <- genesWithProblems$isoform_id[which(
              is.na(genesWithProblems$ref_gene_id)
            )]
            knownList <- exonsOiList[ knownIsoforms ]
            novelList <- exonsOiList[ novelIsoforms ]
            novelLength <- sapply(width(novelList), sum)
            
            ### Identify overlapping isoforms
            novelIsoOverlap <- findOverlaps(query = novelList, subject = knownList)
            novelIsoOverlapDf <- as.data.frame(novelIsoOverlap)
            novelIsoOverlapDf$novel_iso <- names(novelList)[novelIsoOverlapDf$queryHits]
            novelIsoOverlapDf$known_iso <- names(knownList)[novelIsoOverlapDf$subjectHits]
            novelIsoOverlapDf <- novelIsoOverlapDf[,c('novel_iso','known_iso')]
            novelIsoOverlapDf$known_ref_gene_id <- genesWithProblems$ref_gene_id[match(
              novelIsoOverlapDf$known_iso, genesWithProblems$isoform_id
            )]
            
            ### Calculate overlap
            novelOverlap <- intersect(novelList[queryHits(novelIsoOverlap)], knownList[subjectHits(novelIsoOverlap)])
            novelIsoOverlapDf$nt_overlap <- sapply(width(novelOverlap), sum)
            novelIsoOverlapDf$novel_length <- novelLength[match(
              novelIsoOverlapDf$novel_iso, names(novelLength)
            )]
            novelIsoOverlapDf$frac_overlap <- novelIsoOverlapDf$nt_overlap / novelIsoOverlapDf$novel_length
            
            ### For each novel isoform assign gene_name via cutoffs
            novelAssigned <-
              novelIsoOverlapDf %>%
              as_tibble() %>%
              group_by(novel_iso, known_ref_gene_id) %>%
              ### For each known_gene extract top contender
              dplyr::arrange(dplyr::desc(nt_overlap), .by_group = TRUE)  %>%
              dplyr::slice(1L) %>%
              ### For each isoform calculate ratios betwen top genes
              group_by(novel_iso) %>%
              dplyr::arrange(dplyr::desc(nt_overlap), .by_group = TRUE)  %>%
              mutate(
                log2_overlap_ratio = c(
                  log2( nt_overlap[-length(nt_overlap)] / nt_overlap[-1] ),
                  Inf # assign Inf if only 1 gene is overlapping
                )
              ) %>%
              ### For each isoform Filter
              dplyr::filter(
                nt_overlap         >= fixStringTieMinOverlapSize,
                frac_overlap       >= fixStringTieMinOverlapFrac,
                log2_overlap_ratio >= fixStringTieMinOverlapLog2RatioToContender
              ) %>%
              ### For each isoform : Extract top contender
              dplyr::slice(1L)
            
            ### Modify annoation
            toModify <- which(isoformAnnotation$isoform_id %in% novelAssigned$novel_iso)
            isoformAnnotation$ref_gene_id[toModify] <- novelAssigned$known_ref_gene_id[match(
              isoformAnnotation$isoform_id[toModify], novelAssigned$novel_iso
            )]
            
            ### Redo problem calculations
            nIsoWihoutNames3 <- sum(
              is.na(isoformAnnotation$ref_gene_id)
            )
            
            anyFixed2 <- nIsoWihoutNames2 - nIsoWihoutNames3
            
            if( anyFixed2) {
              if (!quiet) {
                message(
                  paste(
                    '   ',
                    nIsoWihoutNames2 - nIsoWihoutNames3,
                    ' isoforms were assigned the ref_gene_id and gene_name of the most similar',
                    '\n        annotated isoform (defined via overlap in genomic exon coordinates).',
                    '\n        This was only done if the overlap met the requriements',
                    '\n        indicated by the three fixStringTieViaOverlap* arguments.',
                    #'\n',
                    sep = ''
                  )
                )
              }
            }
          }
        }
      }
      
      ### Remove non-assigned isoforms within known genes
      if(TRUE) {
        genesWithProblems <-
          isoformAnnotation %>%
          dplyr::select(gene_id, ref_gene_id) %>%
          dplyr::distinct() %>%
          group_by(gene_id) %>%
          dplyr::summarise(
            has_ref_gene_id = any(! is.na(ref_gene_id)),
            has_novel_iso = any(  is.na(ref_gene_id)),
            .groups = 'drop'
          ) %>%
          dplyr::filter(
            has_ref_gene_id,
            has_novel_iso
          )
        
        ### Extract isoforms to remove
        isoToRemove <- isoformAnnotation$isoform_id[which(
          isoformAnnotation$gene_id %in% genesWithProblems$gene_id &
            is.na(isoformAnnotation$ref_gene_id)
        )]
        
        anyFixed3 <- length(isoToRemove) > 0
        
        if(length(isoToRemove)) {
          ### Remove
          isoformAnnotation <- isoformAnnotation[which(
            ! isoformAnnotation$isoform_id %in% isoToRemove
          ),]
          
          isoformExonStructure <- isoformExonStructure[which(
            ! isoformExonStructure$isoform_id %in% isoToRemove
          ),]
          
          if(! is.null(isoformCountMatrix)) {
            isoformCountMatrix <- isoformCountMatrix[which(
              ! isoformCountMatrix$isoform_id %in% isoToRemove
            ),]
          }
          
          if(! is.null(isoformRepExpression)) {
            isoformRepExpression <- isoformRepExpression[which(
              ! isoformRepExpression$isoform_id %in% isoToRemove
            ),]
          }
          
          ### Write message
          if (!quiet) {
            message(
              paste(
                '   We were unable to assign', length(isoToRemove),
                'isoforms (located within annotated genes) to a known ref_gene_id/gene_name.',
                '\n        These were removed to enable analysis of the rest of the isoform from within the merged genes.'
              )
            )
          }
        }
      }
      
      ### Split gene_ids of gene_id with mutiple gene_names
      if(TRUE) {
        ### Summarize problem
        if(TRUE) {
          multiGeneDf <-
            isoformAnnotation %>%
            dplyr::select(isoform_id, gene_id, ref_gene_id) %>%
            dplyr::distinct()
          
          multiGenes <-
            multiGeneDf %>%
            group_by(gene_id) %>%
            dplyr::summarise(
              n_ref_gene_ids = n_distinct(na.omit(ref_gene_id)),
              n_iso_na = sum(is.na(ref_gene_id)),
              .groups = 'drop'
            ) %>%
            dplyr::filter(
              n_ref_gene_ids >= 2 & # at least two genes
                n_iso_na == 0           # no novel once
            )
          
          nProblems <- nrow(multiGenes)
        }
        
        ### Split gene_ids
        if(nrow(multiGenes)) {
          ### Extract corresponding iso data
          multiGeneDf <-
            multiGeneDf %>%
            dplyr::filter(gene_id %in% multiGenes$gene_id)
          
          ### Create new gene_ids (by merging with ref_gene_id)
          multiGeneDf$new_gene_id <- stringr::str_c(
            multiGeneDf$gene_id,
            ':',
            multiGeneDf$ref_gene_id
          )
          
          ### Overwrite in annotation
          indexToModify <- which(
            isoformAnnotation$gene_id %in% multiGeneDf$gene_id
          )
          isoformAnnotation$gene_id[indexToModify] <-
            multiGeneDf$new_gene_id[match(
              isoformAnnotation$isoform_id[indexToModify], multiGeneDf$isoform_id
            )]
          
          ### Overwrite ref_gene_id and gene_ids in exon annotation
          isoformExonStructure$gene_id <- isoformAnnotation$gene_id[match(
            isoformExonStructure$isoform_id, isoformAnnotation$isoform_id
          )]
        }
        
        ### Message summary
        if(TRUE) {
          ### Redo problem calculations
          multiGenes <-
            isoformAnnotation %>%
            dplyr::select(gene_id, ref_gene_id) %>%
            dplyr::distinct() %>%
            group_by(gene_id) %>%
            dplyr::summarise(
              n_ref_gene_ids = n_distinct(na.omit(ref_gene_id)),
              n_iso_na = sum(is.na(ref_gene_id)),
              .groups = 'drop'
            ) %>%
            dplyr::filter(
              n_ref_gene_ids >= 2 & # at least two genes
                n_iso_na == 0           # no novel once
            )
          
          nProblems2 <- nrow(multiGenes)
          
          anyFixed4 <- nProblems - nProblems2 > 0
          
          if( anyFixed4 ) {
            if (!quiet) {
              message(
                paste(
                  '   ',
                  nProblems - nProblems2 ,
                  ' gene_ids which were associated with multiple ref_gene_id/gene_names',
                  '\n        were split into mutliple genes via their ref_gene_id/gene_names.',
                  #'\n',
                  sep = ''
                )
              )
            }
          }
        }
        
      }
      
      ### Generalize ref_gene_id assignment to gene_names and update both annotaion objects
      if(TRUE) {
        geneNameDf <-
          isoformAnnotation %>%
          dplyr::select(gene_name, ref_gene_id) %>%
          dplyr::filter(!is.na(gene_name)) %>%
          dplyr::distinct()
        
        isoformAnnotation$gene_name <- geneNameDf$gene_name[match(
          isoformAnnotation$ref_gene_id, geneNameDf$ref_gene_id
        )]
        
        isoformExonStructure$gene_name <- isoformAnnotation$gene_name[match(
          isoformExonStructure$isoform_id, isoformAnnotation$isoform_id
        )]
      }
      
      if(
        ! anyFixed1 &
        ! anyFixed2 &
        ! anyFixed3 &
        ! anyFixed4
      ) {
        message(
          paste(
            '    There were no need to rescue any annotation',
            sep = ' '
          )
        )
      }
      
      ### Overwrite with original gene ids if doable
      if('ref_gene_id' %in% colnames(isoformAnnotation)) {
        ### Figure out those with potential
        geneIdsWithRef <- isoformAnnotation$gene_id[which(
          ! is.na(isoformAnnotation$ref_gene_id)
        )]
        
        ### Devide
        isoAnnotAlreadCorrect <-
          isoformAnnotation %>%
          dplyr::filter(! gene_id %in% geneIdsWithRef)
        
        isoAnnotToCorrect <-
          isoformAnnotation %>%
          dplyr::filter(gene_id %in% geneIdsWithRef)
        
        ### Figure out which one can be corrected
        isoAnnotToCorrect <-
          isoAnnotToCorrect %>%
          dplyr::group_by(gene_id) %>%
          dplyr::mutate(
            n_ref = n_distinct(na.omit(ref_gene_id))
          ) %>%
          dplyr::ungroup()
        
        ### Devide into those that can be corrected and those that cannot
        isoAnnotCannotBeCorrected <-
          isoAnnotToCorrect %>%
          dplyr::filter(n_ref != 1)
        
        isoAnnotCanBeCorrected <-
          isoAnnotToCorrect %>%
          dplyr::filter(n_ref == 1)
        
        if(nrow(isoAnnotCanBeCorrected)) {
          message(
            paste(
              '    ',
              length(unique(isoAnnotCanBeCorrected$gene_id)),
              ' genes_id were assigned their original gene_id instead of the StringTie gene_id.',
              '\n        This was only done when it could be done unambiguous.',
              #'\n',
              sep = ''
            )
          )
        }
        
        ### Correct annnotation
        isoAnnotCorrected <-
          isoAnnotCanBeCorrected %>%
          dplyr::group_by(gene_id) %>%
          dplyr::mutate(
            gene_id = unique(na.omit(ref_gene_id))
          ) %>%
          dplyr::ungroup()
        
        isoAnnotCorrected$n_ref <- NULL
        isoAnnotCannotBeCorrected$n_ref <- NULL
        
        ### Combine the 3 datafames
        isoformAnnotationCorrected <- rbind(
          isoAnnotAlreadCorrect,
          isoAnnotCorrected,
          isoAnnotCannotBeCorrected
        )
        
        ### Reorder
        isoformAnnotationCorrected <-
          isoformAnnotationCorrected[match(
            isoformAnnotation$isoform_id,
            isoformAnnotationCorrected$isoform_id
          ),]
        isoformAnnotationCorrected$ref_gene_id <- NULL
        
        ### Overwrite
        isoformAnnotation <- isoformAnnotationCorrected
        isoformExonStructure$gene_id <- isoformAnnotation$gene_id[match(
          isoformExonStructure$isoform_id, isoformAnnotation$isoform_id
        )]
      }
    }
    if( ! fixStringTieAnnotationProblem ) {
      if (!quiet) { message('    Was skipped as instructed via the \"fixStringTieAnnotationProblem\" argument...')}
    }
    
    
  }
  
  
  
  ### If necessary calculate RPKM values
  if(TRUE) {
    if (!quiet) { message('Step 4 of 10: Calculating expression estimates from count data...') }
    if ( ! abundSuppled ) {
      ### Extract isoform lengths
      isoformLengths <- sapply(
        X = split(
          isoformExonStructure@ranges@width,
          f = isoformExonStructure$isoform_id
        ),
        FUN = sum
      )
      
      ### Calulate CPM
      # convert to matrix
      localCM <- isoformCountMatrix
      rownames(localCM) <- localCM$isoform_id
      localCM$isoform_id <- NULL
      localCM <- as.matrix(localCM)
      
      myCPM <- t(t(localCM) / colSums(localCM)) * 1e6
      
      ### Calculate RPKM
      isoformLengths <-
        isoformLengths[match(rownames(myCPM), names(isoformLengths))]
      
      isoformRepExpression <-
        as.data.frame(myCPM / (isoformLengths / 1e3))
      
      ### Massage
      isoformRepExpression$isoform_id <-
        rownames(isoformRepExpression)
      isoformRepExpression <-
        isoformRepExpression[, c(
          which(colnames(isoformRepExpression) == 'isoform_id'),
          which(colnames(isoformRepExpression) != 'isoform_id')
        )]
      rownames(isoformRepExpression) <- NULL
    }
    if (   abundSuppled ) {
      if (!quiet) { message('    Skipped as user supplied expression via the \"isoformRepExpression\" argument...')}
    }
    # isoformRepExpression
    
  }
  
  
  
  
  
  ### Run SVA
  if(TRUE) {
    if (!quiet) { message('Step 5 of 10: Testing for unwanted effects...') }
    
    ### Ensure there are enougth samples to run SVA
    enougthSamples <- min(table(designMatrix$condition)) >= 2
    
    if( ! enougthSamples ) {
      if (!quiet) { message('    Data was not corrected for unwanted effects since there are to few samples') }
    }
    if(   enougthSamples ) {
      
      ### Massage
      isoformRepExpressionLog <- isoformRepExpression
      rownames(isoformRepExpressionLog) <- isoformRepExpressionLog$isoform_id
      isoformRepExpressionLog$isoform_id <- NULL
      isoformRepExpressionLog <- log2(isoformRepExpressionLog + 1)
      
      ### Filter on expression
      smallestGroup <- min(table(designMatrix$condition))
      if(smallestGroup > 10) {
        smallestGroup <- smallestGroup * 0.7
      }
      minSamples <- max(c(
        2,
        smallestGroup
      ))
      
      isoformRepExpressionLogFilt <- isoformRepExpressionLog[which(
        rowSums( isoformRepExpressionLog > log2(1) ) >= minSamples
      ),]
      
      ### Make model matrix (which also take additional factors into account)
      if(TRUE) {
        localDesign <- designMatrix
        
        ### Convert group of interest to factors
        localDesign$condition <- factor(
          localDesign$condition,
          levels=unique(localDesign$condition)
        )
        
        localFormula <- '~ 0 + condition'
        
        
        ### Check co-founders for group vs continous variables and add to fomula
        if( ncol(localDesign) > 2 ) {
          for(i in 3:ncol(localDesign) ) { # i <- 4
            if( class(localDesign[,i]) %in% c('numeric', 'integer') ) {
              if( uniqueLength( localDesign[,i] ) * 2 <= length(localDesign[,i]) ) {
                localDesign[,i] <- factor(localDesign[,i])
              }
            } else {
              localDesign[,i] <- factor(localDesign[,i])
            }
          }
          
          ### Make formula for model
          localFormula <- paste(
            localFormula,
            '+',
            paste(
              colnames(localDesign)[3:ncol(localDesign)],
              collapse = ' + '
            ),
            sep=' '
          )
          
          
        }
        
        
        localFormula <- as.formula(localFormula)
        
        ### Make model
        localModel <- model.matrix(localFormula, data = localDesign)
        indexToModify <- 1:length(unique( localDesign$condition ))
        colnames(localModel)[indexToModify] <- gsub(
          pattern =  '^condition',
          replacement =  '',
          x =  colnames(localModel)[indexToModify]
        )
        
        ### Test model
        if( ! limma::is.fullrank(localModel) ) {
          stop('The design matrix suggested by the "designMatrix" is not full rank and hence cannot be analyzed.')
        }
        
      }
      # localModel
      
      ### Estimate SVAs
      nSv = sva::num.sv(
        dat = isoformRepExpressionLogFilt,
        mod = localModel,
      )
      svaAdded <- FALSE
      
      
      ### Test if to may SVAs
      if(TRUE) {
        
        nSvaCutoff <- min(c(
          10,
          nrow(designMatrix) * 0.5
        ))
        
        skipSvas <- nSv > nSvaCutoff
        
        if(skipSvas) {
          nSv <- 0
        }
      }
      
      ### Run SVA if necessary
      if( nSv > 0 &   detectUnwantedEffects ) {
        ### Run SVA
        tmp <- capture.output(
          localSv <- tryCatch({
            sva::sva(
              dat = as.matrix(isoformRepExpressionLogFilt),
              mod = localModel,
              n.sv = nSv
            )$sv
          }, error = {
            function(x) {
              NULL
            }
          })
        )
        
        ### Test SVs
        if( ! is.null(localSv) ) {
          ### Test for just diagnoal
          notDiagonal <- which( apply(
            localSv,
            MARGIN = 2,
            function(x) {
              sum( x != 1 ) > 1 & sum( x != 0 ) > 1
            }
          ) )
          
          ### Filter for correlation
          notToHighCor <- which( apply(
            localSv,
            MARGIN = 2,
            function(x) {
              abs(cor(
                x,
                as.integer(as.factor(designMatrix$condition))
              )) < 0.8
            }
          ) )
          
          ## Subset
          okSvs <- intersect(notDiagonal, notToHighCor)
        }
        if(   is.null(localSv) ) {
          okSvs <- integer()
        }
        
        
        ### Add to design
        if( length(okSvs) > 0 ) {
          
          ### Subset
          localSv <- localSv[,okSvs,drop=FALSE]
          colnames(localSv) <- paste0(
            'sv', 1:ncol(localSv)
          )
          
          ### Add SVs
          designMatrix <- cbind(
            designMatrix,
            localSv
          )
          
          svaAdded <- TRUE
        }
      }
      if( nSv > 0 & ! detectUnwantedEffects ) {
        if (!quiet) { message('    Skipped due to \"detectUnwantedEffects=FALSE\". ') }
        
        warning(
          paste(
            '    We detected', nSv, 'batch/covariates in your data.',
            '\n    These will not be corrected in any downstream analysis due to \"detectUnwantedEffects=FALSE\". ',
            '\n    Unless you REALLY know what you are doing we recomend setting \"detectUnwantedEffects=TRUE\"'
          )
        )
      }
      
      ### Send messages
      if( ! skipSvas ) {
        if(   svaAdded ) {
          if (!quiet) { message(paste('    Added', length(okSvs), 'batch/covariates to the design matrix')) }
        }
        if( ! svaAdded ) {
          if( exists('localSv' )) {
            if( is.null(localSv) ) {
              if (!quiet) { message(c(
                '    \n    SVA analysis failed. No unwanted effects were added.'
              )) }
              
              warning(paste0(
                '\n',
                'There were estimated unwanted effects in your dataset but the automatic sva run failed.',
                '\n    We highly reccomend you run sva yourself, add the nessesary surrogate variables',
                '\n    as extra columns in the \"designMatrix\" and re-run this function',
                '\n'
              ))
            } else {
              if (!quiet) { message('    No unwanted effects added') }
            }
          } else {
            if (!quiet) { message('    No unwanted effects added') }
          }
        }
      }
      if(   skipSvas ) {
        if( ! svaAdded ) {
          if (!quiet) { message('    Data was not corrected for unwanted effects') }
        }
        
        warning(paste0(
          '\n',
          'We found MANY unwanted effects in your dataset!',
          '\nTo many for IsoformSwitchAnalyzeR to be trusted with the correction.',
          '\nWe therefore highly reccomend you run sva yourself and add',
          '\nthe nessesary surrogate variables as extra columns in the \"designMatrix\"',
          '\n'
        ))
      }
      
    }
    
    
    
  }
  
  ### Batch correct if necessary
  if(TRUE) {
    if (!quiet) { message('Step 6 of 10: Batch correcting expression estimates...') }
    
    batchCorrectionNeeded <- ncol(designMatrix) >= 3 & enougthSamples
    
    if(   batchCorrectionNeeded ) {
      ### Make new model matrix (which also take SVs into account)
      if(TRUE) {
        localDesign <- designMatrix
        
        ### Convert group of interest to factors
        localDesign$condition <- factor(localDesign$condition, levels=unique(localDesign$condition))
        
        ### Check co-founders for group vs continous variables
        if( ncol(localDesign) > 2 ) {
          for(i in 3:ncol(localDesign) ) { # i <- 4
            if( class(localDesign[,i]) %in% c('numeric', 'integer') ) {
              if( uniqueLength( localDesign[,i] ) * 2 <= length(localDesign[,i]) ) {
                localDesign[,i] <- factor(localDesign[,i])
              }
            } else {
              localDesign[,i] <- factor(localDesign[,i])
            }
          }
        }
        
        ### Make formula for model
        localFormula <- paste(
          '~ 0 + condition',
          '+',
          paste(
            colnames(localDesign)[3:ncol(localDesign)],
            collapse = ' + '
          ),
          sep=' '
        )
        
        localFormula <- as.formula(localFormula)
        
        ### Make model
        localModel <- model.matrix(localFormula, data = localDesign)
        indexToModify <- 1:length(unique( localDesign$condition ))
        colnames(localModel)[indexToModify] <- gsub(
          pattern =  '^condition',
          replacement =  '',
          x =  colnames(localModel)[indexToModify]
        )
      }
      
      ### Batch correct expression matrix
      suppressWarnings(
        isoRepBatch <- as.data.frame( limma::removeBatchEffect(
          x = isoformRepExpressionLog,
          design     = localModel[,which(   colnames(localModel) %in% localDesign$condition), drop=FALSE],
          covariates = localModel[,which( ! colnames(localModel) %in% localDesign$condition), drop=FALSE],
          method='robust'
        ))
      )
      
      ### Overwrite expression
      isoformRepExpression <- 2^isoRepBatch - 1
      isoformRepExpression[which( isoformRepExpression < 0, arr.ind = TRUE)] <- 0
      
      ### Massage back
      isoformRepExpression$isoform_id <- rownames(isoformRepExpression)
      rownames(isoformRepExpression) <- NULL
      
      isoformRepExpression <- isoformRepExpression[,c(
        which(colnames(isoformRepExpression) == 'isoform_id'),
        which(colnames(isoformRepExpression) != 'isoform_id')
      )]
    }
    if( ! batchCorrectionNeeded ) {
      if (!quiet) { message('    Skipped as no batch effects were found or annoated...')}
    }
  }
  
  
  
  ### Calculate gene and IF
  if (!quiet) {
    message('Step 7 of 10: Extracting data from each condition...')
  }
  if(TRUE) {
    ### Sum to gene level gene expression - updated
    if(TRUE) {
      ### add gene_id
      isoformRepExpression$gene_id <-
        isoformAnnotation$gene_id[match(isoformRepExpression$isoform_id,
                                        isoformAnnotation$isoform_id)]
      
      ### Sum to gene level
      geneRepExpression <- isoformToGeneExp(
        isoformRepExpression =  isoformRepExpression,
        quiet = TRUE
      )
      
      ### Remove gene id
      isoformRepExpression$gene_id <- NULL
    }
    
    ### Calculate IF rep matrix
    if(TRUE) {
      isoformRepIF <- isoformToIsoformFraction(
        isoformRepExpression=isoformRepExpression,
        geneRepExpression=geneRepExpression,
        isoformGeneAnnotation=isoformAnnotation,
        quiet = TRUE
      )
    }
  }
  
  
  
  ### in each condition analyzed get mean and standard error of gene and isoforms
  if (TRUE) {
    conditionList <-
      split(designMatrix$sampleID, f = designMatrix$condition)
    conditionSummary <-
      plyr::llply(
        .data = conditionList,
        .progress = progressBar,
        .fun = function(sampleVec) {
          # sampleVec <- conditionList[[1]]
          ### Isoform and IF
          isoIndex <-
            which(colnames(isoformRepExpression) %in% sampleVec)
          
          isoIndex2 <-
            which(colnames(isoformRepIF) %in% sampleVec)
          
          isoSummary <- data.frame(
            isoform_id       = isoformRepExpression$isoform_id,
            iso_overall_mean = rowMeans(isoformRepExpression[,designMatrix$sampleID, drop=FALSE]),
            iso_value        = rowMeans(isoformRepExpression[, isoIndex, drop=FALSE]),
            iso_std          = apply(   isoformRepExpression[, isoIndex, drop=FALSE], 1, sd),
            IF_overall       = rowMeans(isoformRepIF[,designMatrix$sampleID, drop=FALSE], na.rm = TRUE),
            IF               = rowMeans(isoformRepIF[, isoIndex2, drop=FALSE], na.rm = TRUE),
            stringsAsFactors = FALSE
          )
          isoSummary$iso_stderr <-
            isoSummary$iso_std / sqrt(length(sampleVec))
          isoSummary$iso_std <- NULL
          
          ### Gene
          geneIndex <-
            which(colnames(geneRepExpression) %in% sampleVec)
          
          geneSummary <- data.frame(
            gene_id = geneRepExpression$gene_id,
            gene_overall_mean = rowMeans(geneRepExpression[,designMatrix$sampleID, drop=FALSE]),
            gene_value = rowMeans(geneRepExpression[, geneIndex, drop=FALSE]),
            gene_std = apply(geneRepExpression[, geneIndex, drop=FALSE], 1, sd),
            stringsAsFactors = FALSE
          )
          geneSummary$gene_stderr <-
            geneSummary$gene_std / sqrt(length(sampleVec))
          geneSummary$gene_std <- NULL
          
          ### Combine
          combinedData <-
            dplyr::inner_join(isoformAnnotation, geneSummary, by = 'gene_id')
          combinedData <-
            dplyr::inner_join(combinedData, isoSummary, by = 'isoform_id')
          ### return result
          return(combinedData)
        }
      )
  }
  
  ### Use comparisonsToMake to create the isoform comparisons
  if (!quiet) {
    message('Step 8 of 10: Making comparisons...')
  }
  if (TRUE) {
    isoAnnot <-
      plyr::ddply(
        .data = comparisonsToMake,
        .variables = c('condition_1', 'condition_2'),
        .drop = TRUE,
        .progress = progressBar,
        .fun = function(aDF) { # aDF <- comparisonsToMake[1,]
          ### Extract data
          cond1data <- conditionSummary[[aDF$condition_1]]
          cond2data <- conditionSummary[[aDF$condition_2]]
          
          ### modify colnames in condition 1
          matchIndex <-
            match(
              c(
                'gene_value',
                'gene_stderr',
                'iso_value',
                'iso_stderr'
              ),
              colnames(cond1data)
            )
          colnames(cond1data)[matchIndex] <-
            paste(colnames(cond1data)[matchIndex], '_1', sep = '')
          colnames(cond1data)[which( colnames(cond1data) == 'IF')] <- 'IF1'
          
          ### modify colnames in condition 2
          matchIndex <-
            match(
              c(
                'gene_value',
                'gene_stderr',
                'iso_value',
                'iso_stderr'
              ),
              colnames(cond2data)
            )
          colnames(cond2data)[matchIndex] <-
            paste(colnames(cond2data)[matchIndex], '_2', sep = '')
          colnames(cond2data)[which( colnames(cond2data) == 'IF')] <- 'IF2'
          
          combinedIso <- dplyr::inner_join(
            cond1data,
            cond2data[, c(
              'isoform_id',
              'gene_value_2',
              'gene_stderr_2',
              'iso_value_2',
              'iso_stderr_2',
              'IF2'
            )],
            by = 'isoform_id'
          )
          
          ### Add comparison data
          combinedIso$condition_1 <- aDF$condition_1
          combinedIso$condition_2 <- aDF$condition_2
          return(combinedIso)
        }
      )
    
    ### Add comparison data
    # Log2FC
    ps <- foldChangePseudoCount
    
    isoAnnot$gene_log2_fold_change <-
      log2((isoAnnot$gene_value_2 + ps) / (isoAnnot$gene_value_1 + ps))
    isoAnnot$iso_log2_fold_change  <-
      log2((isoAnnot$iso_value_2  + ps) / (isoAnnot$iso_value_1  + ps))
    
    # qValues
    isoAnnot$gene_q_value <- NA
    isoAnnot$iso_q_value  <- NA
    
    # Isoform fraction values
    isoAnnot$dIF <- isoAnnot$IF2 - isoAnnot$IF1
    
    # Swich values
    isoAnnot$isoform_switch_q_value <- NA
    isoAnnot$gene_switch_q_value    <- NA
    
    ### Sort
    matchVector <-
      c(
        'isoform_id',
        'gene_id',
        'condition_1',
        'condition_2',
        'gene_name',
        'class_code',
        'gene_biotype',
        'iso_biotype',
        'gene_overall_mean',
        'gene_value_1',
        'gene_value_2',
        'gene_stderr_1',
        'gene_stderr_2',
        'gene_log2_fold_change',
        'gene_q_value',
        'iso_overall_mean',
        'iso_value_1',
        'iso_value_2',
        'iso_stderr_1',
        'iso_stderr_2',
        'iso_log2_fold_change',
        'iso_q_value',
        'IF_overall',
        'IF1',
        'IF2',
        'dIF',
        'isoform_switch_q_value',
        'gene_switch_q_value'
      )
    matchVector <-
      na.omit(match(matchVector, colnames(isoAnnot)))
    
    isoAnnot <- isoAnnot[, matchVector]
  }
  
  
  
  ### Create the swichList
  if (!quiet) {
    message('Step 9 of 10: Making switchAnalyzeRlist object...')
  }
  if (TRUE) {
    if( countsSuppled ) {
      
      isoformCountMatrix <- isoformCountMatrix[colnames(isoformRepExpression)]
      
      ### Create switchList
      dfSwichList <- createSwitchAnalyzeRlist(
        isoformFeatures = isoAnnot,
        exons = isoformExonStructure,
        designMatrix = designMatrix,
        isoformCountMatrix = isoformCountMatrix,     # nessesary for drimseq
        isoformRepExpression = isoformRepExpression, # nessesary for limma
        sourceId = 'data.frames'
      )
    } else {
      ### Create switchList
      dfSwichList <- createSwitchAnalyzeRlist(
        isoformFeatures = isoAnnot,
        exons = isoformExonStructure,
        designMatrix = designMatrix,
        isoformRepExpression = isoformRepExpression, # nessesary for limma
        sourceId = 'data.frames'
      )
    }
    
    ### Add orf if extracted
    if (addAnnotatedORFs & gtfImported) {
      dfSwichList$isoformFeatures$PTC <-
        isoORF$PTC[match(dfSwichList$isoformFeatures$isoform_id,
                         isoORF$isoform_id)]
      
      isoORF <-
        isoORF[which(isoORF$isoform_id %in%
                       isoformRepExpression$isoform_id), ]
      
      dfSwichList$orfAnalysis <- isoORF
    }
    
    ### Add IF matrix
    dfSwichList$isoformRepIF <- isoformRepIF[,c('isoform_id',designMatrix$sampleID)]
    
    ### Add nucleotide sequence
    if(addIsoformNt) {
      dfSwichList$ntSequence <- isoformNtSeq[which(
        names(isoformNtSeq) %in% dfSwichList$isoformFeatures$isoform_id
      )]
      
    }
    
  }
  
  ### Estimate DTU
  if (!quiet) {
    message('Step 10 of 10: Guestimating differential usage...')
  }
  if(estimateDifferentialGeneRange & !quiet) {
    localEstimate <- estimateDifferentialRange(
      switchAnalyzeRlist = dfSwichList
    )
    if( !is.null(localEstimate)) {
      message('    The GUESSTIMATED number of genes with differential isoform usage are:')
      print(localEstimate)
    } else {
      message('    The estimation of DTU failed. Please proceed with the normal workflow.')
    }
  } else {
    if (!quiet) {
      message('    Skipping due to the \"estimateDifferentialGeneRange\" argument...')
    }
  }
  
  
  
  
  ### Return switchList
  if (!quiet) {
    message('Done\n')
  }
  return(dfSwichList)
}

### Supporting tximeta
prepareSalmonFileDataFrame <- function(
    ### Core arguments
    parentDir,

    ### Advanced arguments
    pattern='',
    invertPattern=FALSE,
    ignore.case=FALSE,
    quiet = FALSE
) {
    ### Initialize
    if(TRUE) {
        ### data.frame with nesseary info
        supportedTypes <- data.frame(
            orign          = c('Salmon'         ),
            fileName       = c('quant.sf'       ),
            eLengthCol     = c('EffectiveLength'),
            stringsAsFactors = FALSE
        )
        ### Add support for detection of compressed files
        supportedTypes2 <- supportedTypes
        supportedTypes2$fileName <- paste0(supportedTypes2$fileName, '.gz')
        supportedTypes <- rbind(
            supportedTypes,
            supportedTypes2
        )

        headerTypes <- list(
            Salmon = c('Name','Length','EffectiveLength','TPM','NumReads')
        )
    }

    ### Identify directories of interest
    if (TRUE) {
        dirList <- split(
            list.dirs(
                path = parentDir,
                full.names = FALSE,
                recursive = FALSE
            ),
            list.dirs(
                path = parentDir,
                full.names = FALSE,
                recursive = FALSE
            )
        )
        dirList <- dirList[which(sapply(dirList, nchar) > 0)]


        if(length(dirList) == 0) {
            stop('No subdirecories were found in the supplied folder. Please check and try again.')
        }

        ### Extract those where there are files of interest
        dirList <-
            dirList[sapply(
                dirList,
                FUN = function(aDir) {
                    # aDir <- dirList[[6]]
                    localFiles <-
                        list.files(
                            paste0(parentDir, '/', aDir),
                            recursive = FALSE
                        )

                    if(length( localFiles )) {
                        fileOfInterest <- any(
                            sapply(
                                paste(supportedTypes$fileName, '$', sep = ''),
                                function(aFileName) {
                                    grepl(pattern = aFileName, x =  localFiles)
                                })
                        )
                    } else{
                        fileOfInterest <- FALSE
                    }

                    return(fileOfInterest)
                }
            )]

        ### Remove hidden directories
        if( any( grepl('^\\.', names(dirList)) )  ) {
            nHidden <- sum( grepl('^\\.', names(dirList)) )
            nTotal <- length(dirList)
            warning(
                paste(
                    'The importIsoformExpression() function identified',
                    nHidden,
                    'hidden sub-directories',
                    paste0('(of a total ',nTotal,' sub-directories of interest)'),
                    '\nThese were identified as having the prefix "." and will be ignored.',
                    '\nIf you want to keep them you will have to re-name the sub-directories omitting the starting ".".',
                    sep=' '
                )
            )

            dirList <- dirList[which(
                ! grepl('^\\.', names(dirList))
            )]
        }

        if (length(dirList) == 0) {
            stop(
                paste(
                    'There were no directories containing the file names/suffixes',
                    'typically generated by Kallisto/Salmon/RSEM/StringTie.',
                    'Have you renamed the quantification files?',
                    '(if so you should probably use the "sampleVector" argument instead).',
                    sep=' '
                )
            )
        }

    }

    ### Identify input type
    if(TRUE) {
        dataAnalyed <- supportedTypes[which(
            sapply(
                paste0(supportedTypes$fileName,'$'),
                function(aFileName) {
                    any(grepl(
                        pattern = aFileName,
                        x = list.files(paste0( parentDir, '/', dirList[[1]] ))
                    ))
                })
        ), ]

        if (nrow(dataAnalyed) == 0) {
            stop(
                paste(
                    'Could not identify any Salmon files.',
                    sep=' '
                )
            )
        }
        if (nrow(dataAnalyed) > 1) {
            stop(
                paste(
                    'Could not identify any Salmon files.',
                    sep=' '
                )
            )
        }
    }

    ### Make paths for tximport
    if(TRUE) {
        ### make vector with paths
        localFiles <- sapply(
            dirList,
            function(aDir) {
                list.files(
                    path = paste0( parentDir, '/', aDir, '/' ),
                    pattern = paste0(dataAnalyed$fileName, '$'),
                    full.names = TRUE
                )

            }
        )
        names(localFiles) <- names(dirList)

        ### Subset to those of interest
        if( invertPattern ) {
            localFiles <- localFiles[which(
                ! grepl(
                    pattern = pattern,
                    x = localFiles,
                    ignore.case=ignore.case
                )
            )]
        } else {
            localFiles <- localFiles[which(
                grepl(
                    pattern = pattern,
                    x = localFiles,
                    ignore.case=ignore.case
                )
            )]
        }

        if( length(localFiles) == 0 ) {
            stop('No files were left after filtering via the \'pattern\' argument')
        }

        if (!quiet) {
            message(
                paste0(
                    'Found ',
                    length(localFiles),
                    ' Salmon quantifications of interest'
                )
            )
        }

        ### Test existence
        if(TRUE) {
            fileTest <- file.exists(localFiles)

            if( !all(fileTest)) {
                stop(
                    paste0(
                        '\nSomething went wrong with the file-path creation. Please contact developer with reproducible example.',
                        '\n One file which did not work out was:\n ',
                        localFiles[which( ! fileTest) [1]],
                        sep=''
                    )
                )
            }
        }
    }

    ### Make data.frame
    if(TRUE) {
        dfData <- data.frame(
            files = localFiles,
            names = names(localFiles),
            condition = NA,
            row.names = NULL,
            stringsAsFactors = FALSE
        )
    }

    ### Final message
    if (!quiet) {
        message(
            'Adding NAs as conditions. Please modify these manually.'
        )
    }

    return(dfData)
}

importSalmonData <- function(
    ### Core arguments
    salmonFileDataFrame,

    ### Advanced arguments
    comparisonsToMake=NULL,
    ignoreAfterBar = TRUE,
    ignoreAfterSpace = TRUE,
    ignoreAfterPeriod = FALSE,
    showProgress = TRUE,
    quiet = FALSE,
    ...
) {
    ### Test input data.frame
    if(TRUE) {
        colsToHave <- c(
            "files" ,"names","condition"
        )
        if( ! all(colsToHave %in% colnames(salmonFileDataFrame)) ) {
            stop(
                paste(
                    'The \'salmonFileDataFrame\' needs to also contain these column(s)',
                    setdiff(colsToHave, colnames(salmonFileDataFrame))

                )
            )
        }

        if( any(is.na(salmonFileDataFrame$condition)) ) {
            stop('The \'condition\' column cannot contain NAs')
        }

        if( length(unique(salmonFileDataFrame$condition)) == 1) {
            stop('There appear to be only 1 condition annoatated in the \'condition\' column.')
        }
    }

    ### Use tximeta to import data and meta data
    if(TRUE) {
        ### Message
        if (!quiet) { message('Importing quantification data...') }
        suppressMessages(
            suppressWarnings(
                localSe <- tximeta(
                    coldata = salmonFileDataFrame,
                    countsFromAbundance = 'scaledTPM'
                )
            )
        )

        gtfPath <- metadata(localSe)$txomeInfo$gtf

        if (!quiet) { message('Importing annoation data...') }
        #suppressMessages(
        #    suppressWarnings(
        #        localSe <- addExons(localSe)
        #    )
        #)
        #suppressMessages(
        #    suppressWarnings(
        #        localSe <- addCDS(localSe)
        #    )
        #)
        suppressMessages(
            suppressWarnings(
                localNtSeq <- retrieveCDNA(localSe)
            )
        )
    }

    ### Massage imported data
    if(TRUE) {
        if (!quiet) { message('Massaging data...') }

        ### Count data
        if(TRUE) {
            localCm <-
                assay(localSe, "counts") %>%
                as.data.frame() %>%
                rownames_to_column('isoform_id')
        }

        ### NT sequence
        if(TRUE) {
            names(localNtSeq) <- fixNames(
                names(localNtSeq),
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )
        }

        ### Exon data
        if(FALSE) {
            localGr <- unlist(rowRanges(localSe))
            localGr$isoform_id <- names(localGr)
            localGr$exon_id <- NULL
            localGr$exon_rank <- NULL
            localGr$gene_id <- rowData(localSe)$gene_id[match(
                localGr$isoform_id, rowData(localSe)$tx_id
            )]
            localGr$gene_name <- rowData(localSe)$gene_name[match(
                localGr$isoform_id, rowData(localSe)$tx_id
            )]
            names(localGr) <- NULL
            localGr$isoform_id <- fixNames(
                localGr$isoform_id,
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )
        }

        ### Coding regions
        if(FALSE) {
            localCds <- unlist(rowData(localSe)$cds[which(rowData(localSe)$coding)])
            localCds$isoform_id <- names(localCds)
            localCds$exon_id <- NULL
            localCds$exon_rank <- NULL
            localCds$isoform_id <- fixNames(
                localCds$isoform_id,
                ignoreAfterBar = ignoreAfterBar,
                ignoreAfterSpace = ignoreAfterSpace,
                ignoreAfterPeriod = ignoreAfterPeriod
            )

            ### Analyze CDS
            orfInfo <- analyseCds(
                myCDS = localCds,
                localExons = localGr,
                onlyConsiderFullORF = FALSE,
                PTCDistance = 50
            )

            # make sure all ORFs are annotated (with NAs)
            orfInfo <-
                dplyr::full_join(
                    orfInfo,
                    localCm[, 'isoform_id', drop = FALSE],
                    by = 'isoform_id'
                )

            # Annotate ORF origin
            orfInfo$orf_origin <- 'Annotation'
        }

        ### Design
        if(TRUE) {
            localDesign <- data.frame(
                sampleID = salmonFileDataFrame$names,
                condition = salmonFileDataFrame$condition,
                stringsAsFactors = FALSE
            )
        }
    }

    ### Make switchAnalyzeRlist
    if(TRUE) {
        if (!quiet) { message('Making switchAnalyzeRlist...') }

        localSL <- importRdata(
            isoformCountMatrix = localCm,
            designMatrix = localDesign,
            #isoformExonAnnoation = localGr,
            isoformExonAnnoation = gtfPath,
            isoformNtFasta = NULL,
            comparisonsToMake=comparisonsToMake,
            addAnnotatedORFs=FALSE,
            ignoreAfterBar = ignoreAfterBar,
            ignoreAfterSpace = ignoreAfterSpace,
            ignoreAfterPeriod = ignoreAfterPeriod,
            showProgress=showProgress,
            quiet=quiet,
            ...
        )

    }

    ### Add extra annotation data
    if(TRUE) {
        localNtSeq <- localNtSeq[which(
            names(localNtSeq) %in% localSL$isoformFeatures$isoform_id
        )]
        if( ! all(localSL$isoformFeatures$isoform_id %in% names(localNtSeq)) ) {
            stop('Something went wrong with obtaining the nucleotide sequence. Please make sure you link fasta files as well.')
        }

        localSL$ntSequence <- localNtSeq

        #localSL$orfAnalysis <- orfInfo
    }

    ### Subset to ensure everything is aligned
    if(TRUE) {
        localSL <- subsetSwitchAnalyzeRlist(
            localSL,
            TRUE
        )
    }

    ### Return
    return(localSL)
}

### Prefilter
preFilter <- function(
    switchAnalyzeRlist,
    isoCount = 10,
    min.Count.prop = 0.7,
    IFcutoff = 0.1,
    min.IF.prop = 0.5,
    acceptedGeneBiotype = NULL,
    acceptedIsoformClassCode = NULL,
    removeSingleIsoformGenes = TRUE,
    reduceToSwitchingGenes = FALSE,
    reduceFurtherToGenesWithConsequencePotential = FALSE,
    onlySigIsoforms = FALSE,
    keepIsoformInAllConditions = FALSE,
    alpha = 0.05,
    dIFcutoff = 0.1,
    quiet = FALSE
) {
  ### Test input
  if (TRUE) {
    if (class(switchAnalyzeRlist) != 'switchAnalyzeRlist') {
      stop(paste(
        'The object supplied to \'switchAnalyzeRlist\'',
        'must be a \'switchAnalyzeRlist\''
      ))
    }
    
    if( switchAnalyzeRlist$sourceId == 'preDefinedSwitches' ) {
      stop(
        paste(
          'The switchAnalyzeRlist is made from pre-defined isoform switches.',
          'All filtering should be done before you used the approach',
          'that gave rise to the isoform pairs you have pre-defined',
          sep = ' '
        )
      )
    }
    
    if( switchAnalyzeRlist$sourceId == 'gtf') {
      warning(
        paste(
          'The switchAnalyzeRlist seems to be created from a gtf file',
          'wereby expression is probably not annotated.',
          'Running preFilter() might not be what you want.',
          '\nIf expression info was manually added afterwards',
          'please ignore this warning.',
          sep=' '
        )
      )
    }
    
    if( switchAnalyzeRlist$sourceId == 'preDefinedSwitches') {
      if( all(is.na(switchAnalyzeRlist$isoformFeatures$IF_overall)) ) {
        stop('The switchAnalyzeRlist was created without expression data whereby it cannot be filtered')
      }
    }
    
    
    if (!is.null(isoCount)) {
      if (!is.numeric(isoCount)) {
        stop('The isoCount argument must be a numeric')
      }
    }
    if (!is.null(min.Count.prop)) {
      if (!is.numeric(min.Count.prop)) {
        stop('The min.Count.prop argument must be a numeric')
      }
    }
    if (!is.null(IFcutoff)) {
      if (!is.numeric(IFcutoff)) {
        stop('The IFcutoff argument must be a numeric')
      }
    }
    
    if (!is.null(min.IF.prop)) {
      if (!is.numeric(min.IF.prop)) {
        stop('The min.IF.prop argument must be a numeric')
      }
    }
    
    if (!is.null(acceptedIsoformClassCode)) {
      if (!'class_code' %in%
          colnames(switchAnalyzeRlist$isoformFeatures)) {
        stop(paste(
          'The filter on class codes can only be used if',
          'the switchAnalyzeRlist was generated from cufflinks data'
        ))
      }
    }
    
    if( !is.null(acceptedGeneBiotype) & 'gene_biotype' %in% colnames(switchAnalyzeRlist$isoformFeatures) ) {
      okBiotypes <- unique(switchAnalyzeRlist$isoformFeatures$gene_biotype)
      
      if( !all(acceptedGeneBiotype %in% okBiotypes) ) {
        notAnnot <- setdff(acceptedGeneBiotype, okBiotypes)
        
        warning(
          paste(
            'Some of the supplied biotypes are not found in the isoforms supplied and will be ignored\n',
            'These are:', paste(notAnnot, collapse = ', ')
          )
        )
      }
    }
    
    if (!is.logical(removeSingleIsoformGenes)) {
      stop('The removeSingleIsoformGenes must be either TRUE or FALSE')
    }
    
    if (reduceToSwitchingGenes) {
      hasTest <- any(!is.na(
        c(
          switchAnalyzeRlist$isoformFeatures$isoform_switch_q_value,
          switchAnalyzeRlist$isoformFeatures$gene_switch_q_value
        )
      ))
      if( ! hasTest) {
        stop(
          paste(
            'The switchAnalyzeRlist does not contain the result',
            'of a switch analysis.\nPlease turn off','
                        the "reduceToSwitchingGenes" argument try again.',
            sep=' '
          )
        )
      }
      
      
      if (alpha < 0 |
          alpha > 1) {
        stop('The alpha parameter must be between 0 and 1 ([0,1]).')
      }
      if (alpha > 0.05) {
        warning(paste(
          'Most journals and scientists consider an alpha larger',
          'than 0.05 untrustworthy. We therefore recommend using',
          'alpha values smaller than or queal to 0.05'
        ))
      }
    }
  }
  
  ### Find which isoforms to keep
  if (TRUE) {
    ### Extract data to filter on
    if(TRUE) {
      columnsToExtraxt <-
        c(
          'iso_ref',
          'gene_ref',
          'isoform_id',
          'gene_id',
          'gene_biotype',
          'condition_1',
          'condition_2',
          'class_code',
          'gene_value_1',
          'gene_value_2',
          'iso_overall_mean',
          'iso_value_1',
          'iso_value_2',
          'IF_overall',
          'IF1',
          'IF2',
          'dIF',
          'gene_switch_q_value',
          'isoform_switch_q_value'
        )
      columnsToExtraxt <-
        na.omit(match(
          columnsToExtraxt,
          colnames(switchAnalyzeRlist$isoformFeature)
        ))
      #localData <- unique( switchAnalyzeRlist$isoformFeature[, columnsToExtraxt ] ) # no need
      isoformCountData <- switchAnalyzeRlist$isoformCountMatrix %>%
        rename_with(~ paste0("Count_", .), -isoform_id)
      IFData <- switchAnalyzeRlist$isoformRepIF %>%
        #rownames_to_column(var = "isoform_id") %>%
        rename_with(~ paste0("IF_", .), -isoform_id)
      localData <-
        switchAnalyzeRlist$isoformFeature[, columnsToExtraxt]
      localData <- localData %>%
        inner_join(isoformCountData, by = "isoform_id") %>%
        inner_join(IFData, by = "isoform_id")
      
      # Count features
      transcriptCount <- length(unique(localData$isoform_id))
    }
    
    ### Reduce to genes with switches
    if (reduceToSwitchingGenes ) {
      if( reduceFurtherToGenesWithConsequencePotential ) {
        tmp <- extractSwitchPairs(
          switchAnalyzeRlist,
          alpha = alpha,
          dIFcutoff = dIFcutoff,
          onlySigIsoforms = onlySigIsoforms
        )
        deGenes <- unique(tmp$gene_ref)
        
      } else {
        isoResTest <-
          any(!is.na(
            localData$isoform_switch_q_value
          ))
        if (isoResTest) {
          deGenes <- localData$gene_ref[which(
            localData$isoform_switch_q_value < alpha &
              abs(localData$dIF) > dIFcutoff
          )]
        } else {
          deGenes <- localData$gene_ref[which(
            localData$gene_switch_q_value < alpha &
              abs(localData$dIF) > dIFcutoff
          )]
        }
      }
      
      localData <- localData[which(
        localData$gene_ref %in% deGenes
      ),]
      
      if (!nrow(localData)) {
        stop('No genes were left after filtering for switching genes')
      }
      
      
    }
    
    ### Select distinct pairs of conditions as comparisons
    comparisons <- localData %>% dplyr::select(condition_1, condition_2) %>% distinct()
    
    ### Function to filter based on the cutoff and min proportion 
    filter_comparison <- function(data, comparison, cutoff, min.prop, designMatrix,prefix = "") {
      
      sampleIDs <- designMatrix %>%
        filter(condition %in% c(comparison$condition_1, comparison$condition_2)) %>%
        pull(sampleID)
      
      filter_logic <- function(row) {
        sample_values <- row[match(paste0(prefix, sampleIDs), names(row))]
        count_total <- sum(sample_values > cutoff, na.rm = TRUE)
        total_samples <- length(sampleIDs)
        count_total >= 3 & count_total >= min.prop * total_samples
      }
      
      filteredData <- data %>%
        filter(condition_1 == comparison$condition_1 & condition_2 == comparison$condition_2) %>%
        rowwise() %>%
        filter(filter_logic(across()))
      return(filteredData)
    }
    
    ### Filter within the comparisons to keep the isoforms appear in > min.Count.prop of samples with more than isoCount counts
    if (!is.null(isoCount)) {
      if (!is.null(min.Count.prop)){
        localData <- comparisons %>%
          rowwise() %>%
          do(filter_comparison(localData, ., isoCount, min.Count.prop, switchAnalyzeRlist$designMatrix, prefix = "Count_")) %>%
          ungroup()
        if (!nrow(localData)) {
          stop('No genes were left after filtering for gene expression')
        }
      }
    }
    
    ### Filter within the comparisons to keep the isoforms appear in > min.IF.prop of samples with more than IFcutoff
    if (!is.null(IFcutoff)) {
      if (!is.null(min.IF.prop)){
        localData <- comparisons %>%
          rowwise() %>%
          do(filter_comparison(localData, ., IFcutoff, min.IF.prop, switchAnalyzeRlist$designMatrix, prefix = "IF_")) %>%
          ungroup()
        if (!nrow(localData)) {
          stop('No genes were left after filtering for gene expression')
        }
      }
    }
    
    
    if (!is.null(acceptedGeneBiotype)) {
      if( 'gene_biotype' %in% colnames(localData) ) {
        localData <- localData[which(localData$gene_biotype %in% acceptedGeneBiotype), ]
        
        if (!nrow(localData)) {
          stop('No genes were left after filtering for acceptedGeneBiotype.')
        }
      } else {
        warning('Gene biotypes were not annotated so the \'acceptedGeneBiotype\' argument was ignored.')
      }
      
    }
    
    if (!is.null(acceptedIsoformClassCode)) {
      localData <- localData[which(localData$class_code %in% acceptedIsoformClassCode), ]
      
      if (!nrow(localData)) {
        stop('No genes were left after filtering for isoform class codes')
      }
    }
    
    if (removeSingleIsoformGenes) {
      transcriptsPrGene <-
        split(localData$iso_ref, f = localData$gene_ref)
      transcriptsPrGene <- lapply(transcriptsPrGene, unique)
      
      genesToKeep <-
        names(transcriptsPrGene)[which(sapply(transcriptsPrGene, function(x)
          length(x) > 1))]
      
      if (!length(genesToKeep)) {
        stop('No genes were left after filtering for mutlipe transcrip genes')
      }
      
      localData <-
        localData[which(localData$gene_ref %in% genesToKeep),]
    }
    
  }
  
  ### Do filtering
  if (keepIsoformInAllConditions) {
    switchAnalyzeRlist <- subsetSwitchAnalyzeRlist(
      switchAnalyzeRlist,
      switchAnalyzeRlist$isoformFeatures$isoform_id %in%
        localData$isoform_id
    )
  } else {
    switchAnalyzeRlist <- subsetSwitchAnalyzeRlist(
      switchAnalyzeRlist,
      switchAnalyzeRlist$isoformFeatures$iso_ref %in% localData$iso_ref
    )
  }
  
  ### Report filtering
  transcriptsLeft <- length(unique(localData$isoform_id))
  transcriptsRemoved <- transcriptCount - transcriptsLeft
  percentRemoved <-
    round(transcriptsRemoved / transcriptCount, digits = 4) * 100
  
  if (!quiet) {
    message(
      paste(
        'The filtering removed ',
        transcriptsRemoved,
        ' ( ',
        percentRemoved,
        '% of ) transcripts. There is now ',
        transcriptsLeft,
        ' isoforms left',
        sep = ''
      )
    )
  }
  
  ### return result
  return(switchAnalyzeRlist)
}

importPairedGSEA <- function(
    # Core arguments (intermidiate and final output of pairedGSEA)
  splicing_results,
  diff_results,
  pathToGTF,
  condition_labels = c("B", "C"), # Default to "B" (baseline) and "C" (case)
  
  # importRdata() function relevant
  detectUnwantedEffects = TRUE,
  addAnnotatedORFs = TRUE,
  onlyConsiderFullORF = FALSE,
  removeNonConvensionalChr = FALSE,
  ignoreAfterBar = TRUE,
  ignoreAfterSpace = TRUE,
  ignoreAfterPeriod = FALSE,
  ignoreSurplusIsoforms = TRUE,
  removeTECgenes = TRUE,
  PTCDistance = 50,
  foldChangePseudoCount = 0.01,
  fixStringTieAnnotationProblem = TRUE,
  fixStringTieViaOverlapInMultiGenes = TRUE,
  fixStringTieMinOverlapSize = 50,
  fixStringTieMinOverlapFrac = 0.2,
  fixStringTieMinOverlapLog2RatioToContender = 0.65,
  estimateDifferentialGeneRange = TRUE,
  
  # preFilter() function relevant
  isoCount = 10,
  min.Count.prop = 0.7,
  IFcutoff = 0.1,
  min.IF.prop = 0.5,
  acceptedGeneBiotype = NULL,
  acceptedIsoformClassCode = NULL,
  removeSingleIsoformGenes = TRUE,
  reduceToSwitchingGenes = FALSE,
  reduceFurtherToGenesWithConsequencePotential = FALSE,
  onlySigIsoforms = FALSE,
  keepIsoformInAllConditions = FALSE,
  alpha = 0.05,
  dIFcutoff = 0.1,
  
  # Advanced arguments
  showProgress = TRUE,
  quiet = FALSE
  
) {
  ### Step 1: Extract Count Matrix and Design Matrix
  message("Extracting count matrix and design matrix from splicing_results...")
  
  # Extract count matrix from splicing_results
  tryCatch({
    count_matrix <- as.data.frame(splicing_results@listData$countData)
    sample_data <- as.data.frame(splicing_results@sampleData)
  }, error = function(e) {
    stop(
      paste0(
        "The input splicing_results appears to be invalid. ",
        "Please ensure that it is the transcript-level DGS splicing_reaults, which is the intermediate output of paired_diff() from pairedGSEA.\n"
      )
    )
  })
  ### count_matrix <- as.data.frame(splicing_results@listData$countData)
  # colnames(count_matrix) <- gsub("^.*\\.", "", colnames(count_matrix)) # Ensure column names match sample IDs
  count_matrix$isoform_id <- sapply(rownames(count_matrix), function(x) strsplit(x, ":")[[1]][2])
  
  # Extract design matrix from splicing_results
  ### sample_data <- as.data.frame(splicing_results@sampleData)
  design_matrix <- sample_data[, c("sample", "condition")]
  colnames(design_matrix) <- c("sampleID", "condition") # Rename for compatibility
  
  # Validate custom condition labels
  if (length(condition_labels) != 2) {
    stop("Error: condition_labels must be a vector of length 2, e.g., c('Healthy', 'Lung_cancer').")
  }
  
  # Replace "B" and "C" with user-specified condition labels
  design_matrix$condition <- ifelse(
    design_matrix$condition == "B", condition_labels[1],
    ifelse(design_matrix$condition == "C", condition_labels[2], design_matrix$condition)
  )
  
  ### Step 2: Apply `importRdata()` to Create Initial SwitchAnalyzeRlist
  message("Creating SwitchAnalyzeRlist...")
  switch_list <- importRdata(
    isoformCountMatrix = count_matrix,
    designMatrix = design_matrix,
    isoformExonAnnoation = pathToGTF,
    detectUnwantedEffects = detectUnwantedEffects,
    addAnnotatedORFs = addAnnotatedORFs,
    onlyConsiderFullORF = onlyConsiderFullORF,
    removeNonConvensionalChr = removeNonConvensionalChr,
    ignoreAfterBar = ignoreAfterBar,
    ignoreAfterSpace = ignoreAfterSpace,
    ignoreAfterPeriod = ignoreAfterPeriod,
    ignoreSurplusIsoforms = ignoreSurplusIsoforms, # default to TRUE
    removeTECgenes = removeTECgenes,
    PTCDistance = PTCDistance,
    foldChangePseudoCount = foldChangePseudoCount,
    fixStringTieAnnotationProblem = fixStringTieAnnotationProblem,
    fixStringTieViaOverlapInMultiGenes = fixStringTieViaOverlapInMultiGenes,
    fixStringTieMinOverlapSize = fixStringTieMinOverlapSize,
    fixStringTieMinOverlapFrac = fixStringTieMinOverlapFrac,
    fixStringTieMinOverlapLog2RatioToContender = fixStringTieMinOverlapLog2RatioToContender,
    estimateDifferentialGeneRange = estimateDifferentialGeneRange,
    showProgress = showProgress,
    quiet = quiet
  )
  
  message("Integrating adjusted p-values from splicing_results...")
  
  # Extract isoform-level p-values (padj) from splicing_results
  dexseq_results <- as.data.frame(splicing_results)
  dexseq_results <- dexseq_results[!is.na(dexseq_results$padj), ] # Filter rows with valid p-values
  
  # Map DEXSeq results to `SwitchAnalyzeRlist`
  switch_list$isoformFeatures$isoform_switch_q_value <- dexseq_results$padj[
    match(switch_list$isoformFeatures$isoform_id, dexseq_results$featureID)
  ]
  
  # Aggregate isoform-level p-values to gene-level p-values
  gene_padj <- tapply(
    dexseq_results$padj, dexseq_results$groupID,
    function(x) min(x, na.rm = TRUE) # Gene-level p-value is the minimum adjusted p-value of its isoforms
  )
  switch_list$isoformFeatures$gene_switch_q_value <- gene_padj[
    match(switch_list$isoformFeatures$gene_id, names(gene_padj))
  ]
  
  # Add DEXSeq results to `isoformSwitchAnalysis` for further analysis
  ### switch_list$isoformSwitchAnalysis <- dexseq_results
  
  ### Step 4: Integrating Gene-Level Differential Expression Results
  message("Integrating gene-level differential expression results...")
  
  # Ensure that diff_results has the required columns
  required_cols <- c("gene", "lfc_expression", "pvalue_expression", "padj_expression")
  tryCatch({
    missing_cols <- setdiff(required_cols, colnames(diff_results))
    if (length(missing_cols) > 0) {
      stop(
        paste0(
          "The input diff_results is missing required columns: ",
          paste(missing_cols, collapse = ", "), ".\n",
          "Please provide a differential expression result table containing at least ",
          "'gene', 'lfc_expression', 'pvalue_expression', and 'padj_expression'."
        )
      )
    }
  }, error = function(e) {
    stop(
      paste0(
        "The input diff_results appears to be invalid. ",
        "Please ensure it is a valid paired DGE/DGS analysis result, which is the output of paired_diff() from pairedGSEA."
      )
    )
  })
  
  # Match and add gene-level results to isoformFeatures
  switch_list$isoformFeatures$gene_log2_fold_change <- diff_results$lfc_expression[
    match(switch_list$isoformFeatures$gene_id, diff_results$gene)
  ]
  
  switch_list$isoformFeatures$gene_p_value <- diff_results$pvalue_expression[
    match(switch_list$isoformFeatures$gene_id, diff_results$gene)
  ]
  
  switch_list$isoformFeatures$gene_q_value <- diff_results$padj_expression[
    match(switch_list$isoformFeatures$gene_id, diff_results$gene)
  ]
  
  switch_list$isoformFeatures$gene_significant <- ifelse(
    switch_list$isoformFeatures$gene_q_value < alpha, "yes", "no"
  )
  
  ### Step 5: Apply PreFilter to Filter SwitchAnalyzeRlist
  message("Applying preFilter...")
  
  switch_list <- preFilter(
    switchAnalyzeRlist = switch_list,
    isoCount = isoCount,
    min.Count.prop = min.Count.prop,
    IFcutoff = IFcutoff,
    min.IF.prop = min.IF.prop,
    removeSingleIsoformGenes = removeSingleIsoformGenes,
    reduceToSwitchingGenes = reduceToSwitchingGenes,
    reduceFurtherToGenesWithConsequencePotential = reduceFurtherToGenesWithConsequencePotential,
    alpha = alpha,
    dIFcutoff = dIFcutoff
  )
  
  return(switch_list)
}
kvittingseerup/IsoformSwitchAnalyzeR documentation built on Feb. 6, 2025, 3:26 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
kvittingseerup/IsoformSwitchAnalyzeR
Identify, Annotate and Visualize Isoform Switches with Functional Consequences from both short- and long-read RNA-seq data

R/import_data.R
In kvittingseerup/IsoformSwitchAnalyzeR: Identify, Annotate and Visualize Isoform Switches with Functional Consequences from both short- and long-read RNA-seq data

Defines functions preFilter importSalmonData prepareSalmonFileDataFrame importRdata importIsoformExpression importGTF importCufflinksFiles

Documented in importCufflinksFiles importGTF importIsoformExpression importRdata importSalmonData preFilter prepareSalmonFileDataFrame

R Package Documentation

Browse R Packages

We want your feedback!

kvittingseerup/IsoformSwitchAnalyzeR Identify, Annotate and Visualize Isoform Switches with Functional Consequences from both short- and long-read RNA-seq data

R/import_data.R In kvittingseerup/IsoformSwitchAnalyzeR: Identify, Annotate and Visualize Isoform Switches with Functional Consequences from both short- and long-read RNA-seq data

Defines functions preFilter importSalmonData prepareSalmonFileDataFrame importRdata importIsoformExpression importGTF importCufflinksFiles

Documented in importCufflinksFiles importGTF importIsoformExpression importRdata importSalmonData preFilter prepareSalmonFileDataFrame

R Package Documentation

Browse R Packages

We want your feedback!

kvittingseerup/IsoformSwitchAnalyzeR
Identify, Annotate and Visualize Isoform Switches with Functional Consequences from both short- and long-read RNA-seq data

R/import_data.R
In kvittingseerup/IsoformSwitchAnalyzeR: Identify, Annotate and Visualize Isoform Switches with Functional Consequences from both short- and long-read RNA-seq data