R/CollateData.R

Defines functions .collateData_cleanup .loadOntology .loadIREvents .loadSpliceEvents .loadTranscripts .getGeneList .loadViewRef .prepare_covplot_data .collateData_simplify_assay_paths .collateData_simplify_assay_path .collateData_save_NxtSE .collateData_initialise_HDF5 .collateData_write_colData .collateData_write_stats .collateData_H5_write_assays .collateData_H5_initialize .collateData_compile_assays_from_fst .collateData_process_assays_as_fst .collateData_process_splice_depth .collateData_process_splice .collateData_process_sw .collateData_process_junc .collateData_seed_init .copy_DT .collateData_compile_agglist .collateData_rowEvent_full .collateData_rowEvent_splice_option .collateData_rowEvent_mapGenes .collateData_rowEvent_brief .collateData_rowEvent .collateData_splice_anno .collateData_sw_group .collateData_junc_group .collateData_construct_novelGTF .collateData_novel_injectPutativeTJ .collateData_novel_assemble_transcripts .collateData_assemble_reference .collateData_tj_annotate .collateData_junc_annotate .collateData_annotate_BPPARAM .collateData_annotate .collateData_stop_sw_mismatch .collateData_tj_merge .collateData_sw_merge .collateData_junc_count .collateData_junc_stats_merge_fn .collateData_junc_stats .collateData_junc_merge .collateData_stats_QC .collateData_stats_reads .collateData_stats .collateData_jobs .collateData_expr .collateData_COV .collateData_validate collateData

Documented in collateData

#' Collates a dataset from (processBAM) output files of individual samples
#'
#' `collateData()` creates a dataset from a collection of [processBAM] 
#' output files belonging to an experiment.
#'
#' @details
#' In Windows, collateData runs using only 1 thread, as 
#' BiocParallel's MulticoreParam is not supported. 
#' 
#' It is assumed that all sample [processBAM] outputs were generated using the 
#' same reference.
#'
#' The combination of junction counts and IR quantification from
#' [processBAM] is used to calculate percentage spliced in (PSI) of alternative
#' splice events, and intron retention ratios (IR-ratio) of retained introns. 
#' Also, QC information is collated. Data is organised in a H5file and FST files
#' for memory and processor efficient downstream access using [makeSE].
#'
#'   The original IRFinder algorithm, see the following
#'   [wiki](https://github.com/williamritchie/IRFinder/wiki/IRFinder-Output),
#'   uses `SpliceMax` to estimate abundance of spliced transcripts.
#'   This calculates the number of mapped splice events
#'   that share the boundary coordinate of either the left or right flanking
#'   exon `SpliceLeft,SpliceRight`, estimating splice abundance as the larger
#'   of the two values.
#'
#'   SpliceWiz proposes a new algorithm, `SpliceOver`,
#'   to account for the possibility that the major isoform shares neither
#'   boundary, but arises from either of the flanking exon clusters. Exon
#'   clusters are contiguous regions covered by exons from any transcript
#'   (except those designated as `retained_intron` or
#'   `sense_intronic`), and are separated by
#'   obligate intronic regions (genomic regions that are introns for all
#'   transcripts). For introns that are internal to a single exon cluster
#'   (i.e. akin to "known-exon" introns from IRFinder), `SpliceOver`
#'   uses [GenomicRanges::findOverlaps] to sum all splice reads that overlap
#'   the same genomic region as the intron of interest.
#'
#'   Detection of novel ASEs:  When `novelSplicing` is set to `TRUE`, 
#'   novel junctions (split reads across unannotated junctions from samples
#'   of the dataset being collated) are used in conjunction with the reference
#'   to compile a list of novel ASEs. To avoid being overwhelmed by a large
#'   number of false positive novel junctions (often due to mis-alignments),
#'   a simple filtering strategy is used. This involves including novel 
#'   junctions only if it occurs in a minimum number of samples (default 3),
#'   or if the number of split reads of a novel junction is above a pre-defined
#'   threshold (default 10) in a certain number of samples (default 1). These
#'   parameters can be set using `novelSplicing_minSamples`,
#'   `novelSplicing_countThreshold` and `novelSplicing_minSamplesAboveThreshold`
#'   respectively.
#'
#' @param Experiment (Required) A 2 or 3 column data frame, ideally generated by
#'   [findSpliceWizOutput] or [findSamples].
#'   The first column designate the sample names, and the 2nd column
#'   contains the path to the [processBAM] output file (of type 
#'   `sample.txt.gz`).
#'   (Optionally) a 3rd column contains the coverage files (of type
#'   `sample.cov`) of the corresponding samples.
#'   NB: all other columns are ignored.
#' @param reference_path (Required) The path to the reference generated by
#'   [Build-Reference-methods]
#' @param output_path (Required)  The path to contain the output files for the
#'   collated dataset
#' @param IRMode (default `SpliceOver`) The algorithm to calculate
#'   'splice abundance' in IR quantification.
#'  Valid options are `SpliceOver` and `SpliceMax`.
#'  See details
#' @param lowMemoryMode (default `TRUE`) `collateData()` will perform
#'   optimizations to conserve memory if this is set to `TRUE`. Otherwise,
#'   will prioritise performance.
#' @param n_threads (default `1`) The number of threads to use. If you run out
#'   of memory, try lowering the number of threads
#' @param overwrite (default `FALSE`) If `collateData()` has previously been run
#'   using the same set of samples, it will not be overwritten unless this is
#'   set to `TRUE`.
#' @param novelSplicing (default FALSE) Whether collateData will use
#'   novel junction reads detected in samples to infer novel splice variants.
#'   All tandem split reads (those bridging two consecutive splice junctions)
#'   are used, as well as novel split reads that satisfy abundance criteria
#'   (see `novelSplicing_minSamples`, `novelSplicing_minSamplesAboveThreshold`, and
#'   `novelSplicing_countThreshold`) are used to synthesise a dataset-specific
#'   SpliceWiz reference. See details.
#' @param novelSplicing_minSamples (default 3) Novel junctions are included in
#'   building of novel reference  if number samples with non-zero counts
#'   exceeds this number.
#' @param novelSplicing_countThreshold (default 10) Threshold of split-reads across
#'   novel junctions; used in conjunction with 
#'   `novelSplicing_minSamplesAboveThreshold`
#' @param novelSplicing_minSamplesAboveThreshold (default 1) Novel junctions are 
#'   included in building of novel reference if novel junction reads are
#'   above a pre-defined threshold exceeds this number
#' @param novelSplicing_requireOneAnnotatedSJ (default `TRUE`) The default requires
#'   novel junctions to have one annotated splice site. If this is disabled,
#'   collateData will include novel junctions where neither splice site is
#'   annotated.
#' @param novelSplicing_useTJ (default `TRUE`) For novel splicing, should
#'   SpliceWiz use reads with 2 or more junctions to find novel exons? Ignored
#'   if novelSplicing is set to `FALSE`.
#' @param forceStrandAgnostic (default `FALSE`) In poorly-prepared stranded
#'   libraries, it may be better to quantify in unstranded mode. Set this to
#'   `TRUE` if your stranded libraries may be contaminated with unstranded reads
#' @param packageCOVfiles (default `FALSE`) Whether COV files should be copied
#'   over to the NxtSE object. This is useful if one wishes to transfer the
#'   NxtSE folder to a collaborator, who can then open the NxtSE object with
#'   valid COV file paths.
#' @return `collateData()` writes to the directory given by `output_path`.
#'   This output directory is portable (i.e. it can be moved to a different
#'   location after running `collateData()` before running [makeSE]), but
#'   individual files within the output folder should not be moved.\cr\cr
#'   Also, the [processBAM] and [collateData] output folders should be copied to
#'   the same destination and their relative paths preserved. Otherwise, the
#'   locations of the "COV" files will not be recorded in the collated data and
#'   will have to be re-assigned using `covfile(se)<-`. See [makeSE]
#' @examples
#' buildRef(
#'     reference_path = file.path(tempdir(), "Reference"),
#'     fasta = chrZ_genome(),
#'     gtf = chrZ_gtf()
#' )
#'
#' bams <- SpliceWiz_example_bams()
#' processBAM(bams$path, bams$sample,
#'   reference_path = file.path(tempdir(), "Reference"),
#'   output_path = file.path(tempdir(), "SpliceWiz_Output")
#' )
#'
#' expr <- findSpliceWizOutput(file.path(tempdir(), "SpliceWiz_Output"))
#' collateData(expr,
#'   reference_path = file.path(tempdir(), "Reference"),
#'   output_path = file.path(tempdir(), "Collated_output")
#' )
#'
#' # Enable novel splicing:
#'
#' collateData(expr,
#'   reference_path = file.path(tempdir(), "Reference"),
#'   output_path = file.path(tempdir(), "Collated_output"),
#'   novelSplicing = TRUE
#' )
#'
#' @seealso [processBAM], [makeSE]
#' @md
#' @export
collateData <- function(Experiment, reference_path, output_path,
        IRMode = c("SpliceOver", "SpliceMax"),
        packageCOVfiles = FALSE,
        novelSplicing = FALSE, forceStrandAgnostic = FALSE,
        novelSplicing_minSamples = 3, novelSplicing_countThreshold = 10,
        novelSplicing_minSamplesAboveThreshold = 1,
        novelSplicing_requireOneAnnotatedSJ = TRUE,
        novelSplicing_useTJ = TRUE,
        # novelSplicing_extrapolateTJ = FALSE,
        overwrite = FALSE, n_threads = 1,
        lowMemoryMode = TRUE
) {
    # TODO - dynamic memory management
    # - see https://stackoverflow.com/questions/6457290/how-to-check-the-amount-of-ram

    # TODO - benchmark putative TJ before releasing it
    # @param novelSplicing_extrapolateTJ (default `FALSE`) Whether to allow any
    #   pair of observed (and known) junctions and observed (and known) exons
    #   to construct "putative" tandem junctions. Assists in identifying novel
    #   exon skipping and MXE events at a cost of increased false positives.
    novelSplicing_extrapolateTJ <- FALSE

    if(lowMemoryMode) n_threads <- min(n_threads, 4)
    IRMode <- match.arg(IRMode)
    if (IRMode == "")
        .log(paste("In collateData(),",
            "IRMode must be either 'SpliceOver' (default) or 'SpliceMax'"))

    BPPARAM_mod <- .validate_threads(n_threads)
    if(!is.numeric(novelSplicing_minSamples)) 
        novelSplicing_minSamples <- 0
    if(!is.numeric(novelSplicing_countThreshold)) 
        novelSplicing_countThreshold <- 0
    if(!is.numeric(novelSplicing_minSamplesAboveThreshold))
        novelSplicing_minSamplesAboveThreshold <- 0
    if(!is.logical(novelSplicing_requireOneAnnotatedSJ))
        novelSplicing_requireOneAnnotatedSJ <- TRUE
    if(!is.logical(novelSplicing)) novelSplicing <- FALSE

    N_steps <- 8
    dash_progress("Validating Experiment; checking COV files...", N_steps)
    .log("Validating Experiment; checking COV files...", "message")
    norm_output_path <- .collateData_validate(Experiment,
        reference_path, output_path)
    coverage_files <- .make_path_relative(
        .collateData_COV(Experiment), output_path)
    df.internal <- .collateData_expr(Experiment)
    if (!overwrite && file.exists(file.path(output_path, "seed.Rds"))) {
        se <- .makeSE_load_NxtSE(output_path, N = 7)
        if (all(colnames(se) %in% df.internal$sample) &
            all(df.internal$sample %in% colnames(se))
        ) {
            .log(paste("SpliceWiz already collated this experiment",
                "in given directory"), "message")
            return()
        }
    }

    originalSWthreads <- .getSWthreads()
    # tryCatch({
        setSWthreads(1) # try this to prevent memory leak

        jobs <- .split_vector(seq_len(nrow(df.internal)), BPPARAM_mod$workers)
        
        dash_progress("Compiling Sample Stats", N_steps)
        .log("Compiling Sample Stats", "message")
        df.internal <- .collateData_stats(df.internal, jobs, BPPARAM_mod)
        stranded <- !any(df.internal$strand == 0) & !forceStrandAgnostic

        dash_progress("Compiling Junction List", N_steps)
        .collateData_junc_merge(df.internal, 
            jobs, BPPARAM_mod, norm_output_path)
        .collateData_junc_stats(df.internal, jobs, 
            BPPARAM_mod, norm_output_path,
            juncThreshold = novelSplicing_countThreshold)

        dash_progress("Compiling Intron Retention List", N_steps)
        oldScipen <- options(scipen=999)
        .collateData_sw_merge(df.internal, jobs, BPPARAM_mod, 
            norm_output_path, stranded)
        options(oldScipen)

        # Tandem junction compilation
        if(novelSplicing & novelSplicing_useTJ) {
            .collateData_tj_merge(df.internal, jobs, 
                BPPARAM_mod, norm_output_path)
        }
        
    # Reassign +/- based on junctions.fst annotation
        # Annotate junctions
        dash_progress("Tidying up splice junctions and intron retentions", 
            N_steps)
        .log("Tidying up splice junctions and intron retentions...", "message")
        
        if(BPPARAM_mod$workers == 1) {
            .collateData_annotate(reference_path, norm_output_path, 
                stranded, novelSplicing, lowMemoryMode,
                minSamplesWithJunc = novelSplicing_minSamples, 
                minSamplesAboveJuncThreshold = 
                    novelSplicing_minSamplesAboveThreshold,
                novelSplicing_requireOneAnnotatedSJ =
                    novelSplicing_requireOneAnnotatedSJ,
                novelSplicing_useTJ =
                    novelSplicing_useTJ,
                novelSplicing_extrapolateTJ =
                    novelSplicing_extrapolateTJ
            )
        } else {
            # perform task inside child thread, so we can dump the memory later
            .collateData_annotate_BPPARAM(reference_path, norm_output_path, 
                stranded, novelSplicing, lowMemoryMode,
                minSamplesWithJunc = novelSplicing_minSamples, 
                minSamplesAboveJuncThreshold = 
                    novelSplicing_minSamplesAboveThreshold,
                novelSplicing_requireOneAnnotatedSJ =
                    novelSplicing_requireOneAnnotatedSJ,
                novelSplicing_useTJ =
                    novelSplicing_useTJ,
                novelSplicing_extrapolateTJ =
                    novelSplicing_extrapolateTJ
            )
        }
        message("done\n")

        dash_progress("Generating NxtSE assays", N_steps)
        .log("Generating NxtSE assays", "message")

        # Re-define threads
        n_threads_collate_assays <- n_threads
        
        # One job per chunk - more memory efficient
        jobs_2 <- .split_vector(
            seq_len(nrow(df.internal)),
            nrow(df.internal)
        )
        BPPARAM_mod_progress <- .validate_threads(
            n_threads_collate_assays,
            progressbar = TRUE,
            tasks = nrow(df.internal)
        )
        agg.list <- BiocParallel::bplapply(
            seq_len(nrow(df.internal)),
            .collateData_compile_agglist,
            jobs = jobs_2, df.internal = df.internal,
            norm_output_path = norm_output_path, IRMode = IRMode,
            useProgressBar = FALSE,
            BPPARAM = BPPARAM_mod_progress
        )
        gc()

        dash_progress("Building Final NxtSE Object", N_steps)
        .log("Building Final NxtSE Object", "message")
        .log("...consolidating assays to H5 file", "message")
        samples_per_block <- 16
        # if(lowMemoryMode) samples_per_block <- 4
        assays <- .collateData_compile_assays_from_fst(df.internal,
            norm_output_path, samples_per_block)
        .collateData_write_stats(df.internal, norm_output_path)
        .collateData_write_colData(df.internal, 
            coverage_files, norm_output_path, packageCOVfiles)
        
        .log("...packaging reference", "message")
        if(novelSplicing) {
            cov_data <- .prepare_covplot_data(reference_path,
                file.path(norm_output_path, "Reference"))
        } else {
            cov_data <- .prepare_covplot_data(reference_path)
        }
        saveRDS(cov_data, file.path(norm_output_path, "cov_data.Rds"))

        .log("...synthesising NxtSE", "message")
        # NEW compile NxtSE:
        colData.Rds <- readRDS(file.path(norm_output_path, "colData.Rds"))
        colData <- .makeSE_colData_clean(colData.Rds$df.anno)
        se <- .collateData_initialise_HDF5(norm_output_path, colData, assays)

        .collateData_save_NxtSE(se, file.path(norm_output_path, "seed.Rds"))
        .collateData_cleanup(norm_output_path)
        
        # Test run of loading NxtSE
        .log("...determining how overlapping introns should be removed", 
            "message")
        tryCatch({
            tmpse <- makeSE(norm_output_path, verbose = FALSE,
                RemoveOverlapping = FALSE)
            tmpse2 <- .makeSE_iterate_IR_new(tmpse, verbose = FALSE)
            tmpFiltered <- (
                rowData(tmpse)$EventName %in% 
                rowData(tmpse2)$EventName
            )
            saveRDS(tmpFiltered, 
                file.path(norm_output_path, "filteredIntrons.Rds"))
            rm(tmpse, tmpse2)
            gc()
        }, error = function(err) {
            .log("Test loading of NxtSE object failed.")
        })
        
        dash_progress("SpliceWiz (NxtSE) Collation Finished", N_steps)
        .log("SpliceWiz (NxtSE) Collation Finished", "message")

    # }, error = function(e) {
        # stop("In collateData(...)", e, call. = FALSE)
    # }, finally = {
        .restore_threads(originalSWthreads)
    # })
    
    # Return invisible
    invisible(NULL)
}


################################################################################
# collateData helper functions:

# Checks Experiment data.frame. Checks paths valid and have existing parent
#   folders. Creates required folders.
.collateData_validate <- function(Experiment, reference_path, output_path) {
    if (!is(Experiment, "data.frame")) {
        .log(paste("In collateData(),",
            "Experiment object needs to be a data frame"))
    }
    if (ncol(Experiment) < 2) {
        .log(paste("In collateData(),",
            "Experiment needs to contain at least two columns,",
            "with the first 2 columns containing",
            "(1) sample name and (2) processBAM output"))
    }
    .validate_reference(reference_path)
    if (!dir.exists(dirname(output_path))) {
        .log(paste("In collateData(),",
            "Parent directory of output path:",
            dirname(output_path), "needs to exist"))
    }
    base_output_path <- normalizePath(dirname(output_path))
    norm_output_path <- file.path(base_output_path, basename(output_path))
    if (!dir.exists(norm_output_path)) {
        dir.create(norm_output_path)
    }
    temp_output_path <- file.path(norm_output_path, "temp")
    if (!dir.exists(temp_output_path)) {
        dir.create(temp_output_path)
    }
    if (!dir.exists(file.path(norm_output_path, "annotation"))) {
        dir.create(file.path(norm_output_path, "annotation"))
    }
    return(norm_output_path)
}

# Check all given COV files are valid COV format
.collateData_COV <- function(Experiment) {
    coverage_files <- ""
    Experiment <- as.data.frame(Experiment)
    if (ncol(Experiment) == 2) return(NULL)
    
    if (ncol(Experiment) > 2 && all(file.exists(Experiment[, 3]))) {
        coverage_files <- Experiment[, 3]
    }
    if (!isCOV(coverage_files)) {
        message("Some coverage files do not exist or are corrupted")
        coverage_files <- ""
    }
    if (length(coverage_files) != nrow(Experiment)) {
        message("The number of coverage files must equal the number of samples")
        coverage_files <- ""
    }
    return(coverage_files)
}

# Checks all processBAM output files exist. Checks repeat entries.
.collateData_expr <- function(Experiment) {
    Experiment <- as.data.frame(Experiment)
    colnames(Experiment)[c(1, 2)] <- c("sample", "path")
    if (!all(vapply(Experiment$path, file.exists, logical(1)))) {
        .log(paste("In collateData(),",
            "Some files in Experiment do not exist"))
    }
    if (length(Experiment$sample) != length(unique(Experiment$sample))) {
        .log(paste("In collateData(),",
            "Repeated sample names are not allowed"))
    }

    df.internal <- as.data.table(Experiment[, c(1, 2)])
    df.internal$paired <- FALSE
    df.internal$strand <- 0
    df.internal$depth <- 0
    df.internal$mean_frag_size <- 0
    df.internal$directionality_strength <- 0
    df.internal$Intergenic_Fraction <- 0
    df.internal$rRNA_Fraction <- 0
    df.internal$NonPolyA_Fraction <- 0
    df.internal$Mitochondrial_Fraction <- 0
    df.internal$Unanno_Jn_Fraction <- 0
    df.internal$NMD_Jn_Fraction <- 0
    df.internal$Fraction_Splice_Reads <- 0
    df.internal$Fraction_Span_Reads <- 0

    df.internal$IRBurden_clean <- 0
    df.internal$IRBurden_exitrons <- 0
    df.internal$IRBurden_clean_unstranded <- 0
    df.internal$IRBurden_exitrons_unstranded <- 0
    df.internal$IRBurden_antisense <- 0
    return(df.internal)
}

# Designates equal jobs per thread
.collateData_jobs <- function(n_expr, BPPARAM_mod, samples_per_block) {
    n_jobs <- min(ceiling(n_expr / samples_per_block),
        BPPARAM_mod$workers)
    jobs <- .split_vector(seq_len(n_expr), n_jobs)
    return(jobs)
}

################################################################################
# Sub

# Collates statistics of processBAM QC, returns a data frame of these QC stats
.collateData_stats <- function(df.internal, jobs, BPPARAM_mod) {
    n_jobs <- length(jobs)
    df.internal <- rbindlist(
        BiocParallel::bplapply(seq_len(n_jobs),
            function(x, jobs, df.internal) {
                suppressPackageStartupMessages({
                    requireNamespace("data.table")
                    requireNamespace("stats")
                })
                work <- jobs[[x]]
                block <- df.internal[work]
                for (i in seq_len(length(work))) {

                    data.list <- get_multi_DT_from_gz(
                        normalizePath(block$path[i]),
                        c("BAM", "Directionality", "QC"))
                    stats <- data.list[["BAM"]]
                    direct <- data.list[["Directionality"]]
                    QC <- data.list[["QC"]]

                    block <- .collateData_stats_reads(block, i, stats, direct)
                    block <- .collateData_stats_QC(block, i, QC, direct)
                }
                return(block)
            }, jobs = jobs, df.internal = df.internal, BPPARAM = BPPARAM_mod
        )
    )
    return(df.internal)
}

################################################################################

.collateData_stats_reads <- function(block, i, stats, direct) {
    statsVec        <- unname(unlist(stats[,2]))
    names(statsVec) <- unname(unlist(stats[,1]))
    numNuc      <- as.numeric(statsVec["Total nucleotides"])
    numSingle   <- as.numeric(statsVec["Total singles processed"])
    numPairs    <- as.numeric(statsVec["Total pairs processed"])
    
    if (numSingle == 0 & numPairs > 0) {
        block$paired[i] <- TRUE
        block$depth[i] <- numPairs
        block$mean_frag_size[i] <- numNuc / numPairs
    } else if (numSingle > 0 && numPairs / numSingle > 10) {
        block$paired[i] <- TRUE
        block$depth[i] <- numPairs
        block$mean_frag_size[i] <- numNuc / numPairs
    } else {
        block$paired[i] <- FALSE
        block$depth[i] <- numSingle
        block$mean_frag_size[i] <- numNuc / numSingle
    }
    block$strand[i] <- as.numeric(
        direct$Value[direct[,1] == "Overall Directionality"])
    return(block)
}

.collateData_stats_QC <- function(block, i, QC, direct) {
    block$directionality_strength[i] <- as.numeric(
        direct$Value[grepl("Dir score known junctions", unlist(direct[,1]))])
    block$Intergenic_Fraction[i] <-
        QC$Value[QC$QC == "Intergenic Reads"] / block$depth[i]
    block$rRNA_Fraction[i] <-
        QC$Value[QC$QC == "rRNA Reads"] / block$depth[i]
    block$NonPolyA_Fraction[i] <-
        QC$Value[QC$QC == "NonPolyA Reads"] / block$depth[i]
    block$Mitochondrial_Fraction[i] <-
        QC$Value[QC$QC == "Mitochondrial Reads"] / block$depth[i]
    block$Unanno_Jn_Fraction[i] <-
        QC$Value[QC$QC == "Unannotated Junctions"] /
        (
            QC$Value[QC$QC == "Unannotated Junctions"] +
            QC$Value[QC$QC == "Annotated Junctions"]
        )
    block$NMD_Jn_Fraction[i] <-
        QC$Value[QC$QC == "NMD Junctions"] /
        QC$Value[QC$QC == "Annotated Junctions"]
    block$Fraction_Splice_Reads[i] <-
        QC$Value[QC$QC == "Annotated Junctions"] / block$depth[i]
    block$Fraction_Span_Reads[i] <-
        QC$Value[QC$QC == "Spans Reads"] / block$depth[i]
    # IRBurden calculations
    block$IRBurden_clean_unstranded[i] <-
        QC$Value[QC$QC == "Non-Directional Clean IntronDepth Sum"] /
        (
            QC$Value[QC$QC == "Non-Directional Clean IntronDepth Sum"] +
            QC$Value[QC$QC == "Annotated Junctions"]
        )
    block$IRBurden_exitrons_unstranded[i] <-
        QC$Value[QC$QC == "Non-Directional Known-Exon IntronDepth Sum"] /
        (
            QC$Value[QC$QC == "Non-Directional Known-Exon IntronDepth Sum"] +
            QC$Value[QC$QC == "Annotated Junctions"]
        )
    block$IRBurden_antisense[i] <-
        QC$Value[QC$QC == "Non-Directional Anti-Sense IntronDepth Sum"] /
        (
            QC$Value[QC$QC == "Non-Directional Anti-Sense IntronDepth Sum"] +
            QC$Value[QC$QC == "Annotated Junctions"]
        )
    if (block$strand[i] != 0) {
        block$IRBurden_clean[i] <-
            QC$Value[QC$QC == "Directional Clean IntronDepth Sum"] /
            (
                QC$Value[QC$QC == "Directional Clean IntronDepth Sum"] +
                QC$Value[QC$QC == "Annotated Junctions"]
            )
        block$IRBurden_exitrons[i] <-
            QC$Value[QC$QC == "Directional Known-Exon IntronDepth Sum"] /
            (
                QC$Value[QC$QC == "Directional Known-Exon IntronDepth Sum"] +
                QC$Value[QC$QC == "Annotated Junctions"]
            )
    }
    return(block)
}

################################################################################
# Sub

# Compiles a unified list of detected splice junctions
.collateData_junc_merge <- function(
        df.internal, jobs, BPPARAM_mod,
        output_path
) {
    temp_output_path <- file.path(output_path, "temp")
    n_jobs <- length(jobs)

    .log("Compiling Junction List...", "message", appendLF = FALSE)

    # Extract junctions from processBAM output, save temp FST file
    junc.list <- BiocParallel::bplapply(
        seq_len(n_jobs),
        function(x, jobs, df.internal, temp_output_path) {
            suppressPackageStartupMessages({
                requireNamespace("data.table")
                requireNamespace("stats")
            })
            work <- jobs[[x]]
            block <- df.internal[work]
            junc.segment <- NULL
            for (i in seq_len(length(work))) {

                data.list <- get_multi_DT_from_gz(
                    normalizePath(block$path[i]), c("JC_seqname"))
                junc <- data.list[["JC_seqname"]]

                setnames(junc, "JC_seqname", "seqnames")
                junc[, c("seqnames") := as.character(get("seqnames"))]
                if (is.null(junc.segment)) {
                    junc.segment <- junc[, seq_len(4), with = FALSE]
                } else {
                    junc.segment <- merge(junc.segment,
                        junc[, seq_len(4), with = FALSE], all = TRUE)
                }
                # Write temp file
                fst::write.fst(as.data.frame(junc),
                    file.path(temp_output_path, paste(block$sample[i],
                    "junc.fst.tmp", sep = ".")))
            }
            return(junc.segment)
        }, jobs = jobs, df.internal = df.internal,
            temp_output_path = temp_output_path, BPPARAM = BPPARAM_mod
    )

    # Combine list of individual junction dfs into unified list of junction
    message("merging...", appendLF = FALSE)
    junc.common <- NULL
    for (i in seq_len(length(junc.list))) {
        if (is.null(junc.common)) {
            junc.common <- junc.list[[i]]
        } else {
            junc.common <- merge(junc.common, junc.list[[i]],
                all = TRUE, by = colnames(junc.common))
        }
    }
    junc.common[, c("start") := get("start") + 1]
    message("done")
    write.fst(as.data.frame(junc.common), file.path(output_path, 
        "annotation", "junc.common.fst"))
    rm(junc.common)
    gc()
    
    invisible()
}

# Returns a junc_common with junction abundance statistics
.collateData_junc_stats <- function(
        df.internal, jobs, BPPARAM_mod,
        output_path,
        juncThreshold = 10 # min junctions for counting samples
) {
    temp_output_path <- file.path(output_path, "temp")
    n_jobs <- length(jobs)

    .log("Compiling Junction Stats...", "message", appendLF = FALSE)

    # Extract junctions from processBAM output, save temp FST file
    junc.list <- BiocParallel::bplapply(
        seq_len(n_jobs),
        function(x, jobs, df.internal, output_path, juncThreshold) {
            suppressPackageStartupMessages({
                requireNamespace("data.table")
                requireNamespace("stats")
            })
            work <- jobs[[x]]
            block <- df.internal[work]
            junc.common <- as.data.table(read.fst(file.path(output_path, 
                "annotation", "junc.common.fst")))
            junc.common[, c("seqnames") := as.character(get("seqnames"))]
            final.df <- data.frame(
                countJuncInSamples = rep(0, nrow(junc.common)),
                countJuncThresholdSamples = rep(0, nrow(junc.common))
            )
            
            for (i in seq_len(length(work))) {
                value <- .collateData_junc_count(block$sample[i],
                    junc.common, file.path(output_path, "temp"))
                final.df$countJuncInSamples <- final.df$countJuncInSamples + 
                    ifelse(value > 0, 1, 0)
                final.df$countJuncThresholdSamples <- 
                    final.df$countJuncThresholdSamples + 
                    ifelse(value >= juncThreshold, 1, 0)
            }
            return(final.df)
        }, jobs = jobs, df.internal = df.internal,
            output_path = output_path, 
            juncThreshold = juncThreshold,
            BPPARAM = BPPARAM_mod
    )

    # Combine list of individual junction dfs into unified list of junction
    message("merging...", appendLF = FALSE)
    final.df <- Reduce(.collateData_junc_stats_merge_fn, junc.list)
    message("done")

    write.fst(as.data.frame(final.df), file.path(output_path, 
        "annotation", "junc.common.counts.fst"))
    rm(junc.list, final.df)
    gc()
    invisible()
}

.collateData_junc_stats_merge_fn <- function(x, y) {
    as.data.frame(as.matrix(x) + as.matrix(y))
}

# Counts junctions for one sample
.collateData_junc_count <- function(
        sample, # strand,
        junc.common, temp_output_path
) {
    junc <- as.data.table(
        read.fst(file.path(temp_output_path,
            paste(sample, "junc.fst.tmp", sep = ".")))
    )
    junc[, c("seqnames") := as.character(get("seqnames"))]
    junc[, c("start") := get("start") + 1]
    junc$strand <- NULL # Use strand from junc.common

    junc <- junc[junc.common, on = colnames(junc.common)[c(1, 2, 3)]]
    junc$count <- junc$total
    junc[is.na(get("count")), c("count") := 0]
    
    final <- junc$count
    rm(junc)
    gc()
    return(final)
}


# Assume all processBAM outputs are created from same ref. Checks this
.collateData_sw_merge <- function(df.internal, jobs, BPPARAM_mod,
        output_path, stranded) {
    # Check names of all introns are the same in all samples
    temp_output_path <- file.path(output_path, "temp")
    n_jobs <- length(jobs)

    .log("Compiling Intron Retention List...", "message", appendLF = FALSE)

    sw.list <- BiocParallel::bplapply(seq_len(n_jobs),
        function(x, jobs, df.internal, temp_output_path, stranded) {
            suppressPackageStartupMessages({
                requireNamespace("data.table")
                requireNamespace("stats")
            })
            work <- jobs[[x]]
            block <- df.internal[work]
            sw.names <- c()
            phrase <- ifelse(stranded, "Dir_Chr", "Nondir_Chr")
            for (i in seq_len(length(work))) {

                data.list <- get_multi_DT_from_gz(
                    normalizePath(block$path[i]), phrase)
                sw <- data.list[[phrase]]

                setnames(sw, c(phrase, "Start", "End", "Strand"),
                    c("seqnames", "start", "end", "strand"))
                sw[, c("seqnames") := as.character(get("seqnames"))]
                if (is.null(sw.names)) {
                    sw.names <- sw$Name
                } else {
                    if (!identical(sw.names, sw$Name))
                        .collateData_stop_sw_mismatch()
                }
                fst::write.fst(as.data.frame(sw),
                    file.path(temp_output_path,
                        paste(block$sample[i], "sw.fst.tmp", sep = ".")))
            }
            return(sw.names)
        }, jobs = jobs, df.internal = df.internal,
            temp_output_path = temp_output_path,
            stranded = stranded, BPPARAM = BPPARAM_mod
    )
    # Checks common columns are the same
    if (length(sw.list) > 1) {
        for (i in seq_len(length(sw.list))) {
            if (!identical(sw.list[[i]], sw.list[[1]]))
                .collateData_stop_sw_mismatch()
        }
    }
    # Returns common columns
    sw <- fst::read.fst(file.path(temp_output_path,
        paste(df.internal$sample[1], "sw.fst.tmp", sep = ".")),
        as.data.table = TRUE)
    sw.common <- sw[, seq_len(6), with = FALSE]
    sw.common[, c("start") := get("start") + 1]

    message("done")
    write.fst(as.data.frame(sw.common), file.path(output_path, 
        "annotation", "sw.common.fst"))
    rm(sw.common)
    gc()
    invisible()
}

# Compiles a unified list of detected splice junctions
.collateData_tj_merge <- function(df.internal, jobs, BPPARAM_mod,
        output_path) {
    temp_output_path <- file.path(output_path, "temp")
    n_jobs <- length(jobs)

    .log("Compiling Tandem Junction List...", "message", appendLF = FALSE)

    # Extract junctions from processBAM output, save temp FST file
    tj.list <- BiocParallel::bplapply(
        seq_len(n_jobs),
        function(x, jobs, df.internal, temp_output_path) {
            suppressPackageStartupMessages({
                requireNamespace("data.table")
                requireNamespace("stats")
            })
            work <- jobs[[x]]
            block <- df.internal[work]
            tj.segment <- NULL
            for (i in seq_len(length(work))) {

                data.list <- get_multi_DT_from_gz(
                    normalizePath(block$path[i]), c("TJ_seqname"))
                tj <- data.list[["TJ_seqname"]]
                setnames(tj, "TJ_seqname", "seqnames")
                # Filter for novel tandem junctions only
                tj <- tj[get("strand") == "."]
                tj[, c("seqnames") := as.character(get("seqnames"))]
                
                # 0-base to 1-base conversion
                tj[, c("start1") := get("start1") + 1]
                tj[, c("start2") := get("start2") + 1]
                
                if (is.null(tj.segment)) {
                    tj.segment <- tj[, seq_len(6), with = FALSE]
                } else {
                    tj.segment <- merge(tj.segment,
                        tj[, seq_len(6), with = FALSE], all = TRUE)
                }
            }
            return(tj.segment)
        }, jobs = jobs, df.internal = df.internal,
            temp_output_path = temp_output_path, BPPARAM = BPPARAM_mod
    )

    # Combine list of individual junction dfs into unified list of junction
    message("merging...", appendLF = FALSE)
    tj.common <- NULL
    for (i in seq_len(length(tj.list))) {
        if (is.null(tj.common)) {
            tj.common <- tj.list[[i]]
        } else {
            tj.common <- merge(tj.common, tj.list[[i]],
                all = TRUE, by = colnames(tj.common))
        }
    }

    message("done")
    write.fst(as.data.frame(tj.common), file.path(output_path, 
        "annotation", "tj.common.fst"))
    rm(tj.common)
    gc()
    invisible()
}

.collateData_stop_sw_mismatch <- function() {
    stopmsg <- paste(
        "Some processBAM outputs were generated",
        "with a different SpliceWiz reference.",
        "Suggest re-run processBAM on all sample files",
        "using the same reference"
    )
    .log(stopmsg)
}

################################################################################
# Sub

# Annotate processBAM introns and junctions according to given reference
.collateData_annotate <- function(reference_path, norm_output_path,
        stranded, novelSplicing, lowMemoryMode = TRUE,
        minSamplesWithJunc = 3, minSamplesAboveJuncThreshold = 1,
        novelSplicing_requireOneAnnotatedSJ = TRUE,
        novelSplicing_useTJ = TRUE,
        novelSplicing_extrapolateTJ = FALSE
) {
    message("...annotating splice junctions")
    .collateData_junc_annotate(2, reference_path, norm_output_path,
        lowMemoryMode)

    if(novelSplicing & novelSplicing_useTJ) {
        message("...looking for novel exons")
        .collateData_tj_annotate(2, reference_path, norm_output_path)
    }
    
    # Assemble intron_novel_transcript reference from novel junctions and
    # novel tandem junctions
    .collateData_assemble_reference(
        2, reference_path, 
        norm_output_path, lowMemoryMode, novelSplicing,
        minSamplesWithJunc, minSamplesAboveJuncThreshold,
        novelSplicing_requireOneAnnotatedSJ,
        novelSplicing_useTJ,
        novelSplicing_extrapolateTJ,
        verbose = TRUE
    )
    use_ref_path <- file.path(norm_output_path, "Reference")

    if(novelSplicing) {    
        .log("Tidying up splice junctions and intron retentions (part 2)...",
            "message")
    }
        
    message("...grouping splice junctions")
    .collateData_junc_group(2, use_ref_path, norm_output_path)

    message("...grouping introns")
    .collateData_sw_group(2, use_ref_path, norm_output_path, stranded)

    message("...loading splice events")
    .collateData_splice_anno(2, use_ref_path, norm_output_path)

    message("...compiling rowEvents")
    .collateData_rowEvent(2, use_ref_path, norm_output_path)
}

.collateData_annotate_BPPARAM <- function(reference_path, norm_output_path,
        stranded, novelSplicing, lowMemoryMode = TRUE,
        minSamplesWithJunc = 3, minSamplesAboveJuncThreshold = 1,
        novelSplicing_requireOneAnnotatedSJ = TRUE,
        novelSplicing_useTJ = TRUE,
        novelSplicing_extrapolateTJ = FALSE
) {
    BPPARAM_annotate <- .validate_threads(2)
    message("...annotating splice junctions")
    tmp <- BiocParallel::bplapply(
        seq_len(2),
        .collateData_junc_annotate,
        reference_path = reference_path, 
        norm_output_path = norm_output_path,
        lowMemoryMode = lowMemoryMode,
        BPPARAM = BPPARAM_annotate
    )
    if(novelSplicing) {
        message("...looking for novel exons")
        if(novelSplicing_useTJ) {
            tmp <- BiocParallel::bplapply(
                seq_len(2),
                .collateData_tj_annotate,
                reference_path = reference_path, 
                norm_output_path = norm_output_path,
                BPPARAM = BPPARAM_annotate
            )        
        }
        .log(paste("Assembling novel splicing reference in dedicated thread,",
            "this may take up to 10 minutes..."), "message", appendLF = FALSE)
        tmp <- BiocParallel::bplapply(
            seq_len(2),
            .collateData_assemble_reference,
            reference_path = reference_path, 
            norm_output_path = norm_output_path,
            lowMemoryMode = lowMemoryMode,
            novelSplicing = novelSplicing,
            minSamplesWithJunc = minSamplesWithJunc,
            minSamplesAboveJuncThreshold = minSamplesAboveJuncThreshold,
            novelSplicing_requireOneAnnotatedSJ = novelSplicing_requireOneAnnotatedSJ,
            novelSplicing_useTJ = novelSplicing_useTJ,
            novelSplicing_extrapolateTJ = novelSplicing_extrapolateTJ,
            BPPARAM = BPPARAM_annotate
        )
        message("done")
        .log("Tidying up splice junctions and intron retentions (part 2)...",
            "message")
    } else {
        # No need to thread a simply file copy function
        .collateData_assemble_reference(
            2, reference_path, 
            norm_output_path, lowMemoryMode, novelSplicing,
            minSamplesWithJunc, minSamplesAboveJuncThreshold,
            novelSplicing_requireOneAnnotatedSJ,
            novelSplicing_useTJ,
            novelSplicing_extrapolateTJ,
            verbose = TRUE
        )
    }  
    use_ref_path <- file.path(norm_output_path, "Reference")
    
    message("...grouping splice junctions")
    tmp <- BiocParallel::bplapply(
        seq_len(2),
        .collateData_junc_group,
        reference_path = use_ref_path, 
        norm_output_path = norm_output_path,
        BPPARAM = BPPARAM_annotate
    )
    message("...grouping introns")
    tmp <- BiocParallel::bplapply(
        seq_len(2),
        .collateData_sw_group,
        reference_path = use_ref_path, 
        norm_output_path = norm_output_path,
        stranded = stranded,
        BPPARAM = BPPARAM_annotate
    )
    message("...loading splice events")
    tmp <- BiocParallel::bplapply(
        seq_len(2),
        .collateData_splice_anno,
        reference_path = use_ref_path, 
        norm_output_path = norm_output_path,
        BPPARAM = BPPARAM_annotate
    )
    message("...compiling rowEvents")
    tmp <- BiocParallel::bplapply(
        seq_len(2),
        .collateData_rowEvent,
        reference_path = use_ref_path, 
        norm_output_path = norm_output_path,
        BPPARAM = BPPARAM_annotate
    )
}

################################################################################

# Annotate junction splice motifs
.collateData_junc_annotate <- function(
        threadID, reference_path, norm_output_path,
        lowMemoryMode = TRUE
) {
    if(threadID != 2) return()
    junc.strand <- read.fst(
        path = file.path(reference_path, "fst", "junctions.fst"),
        columns = c("seqnames", "start", "end", "strand", "splice_motif"),
        as.data.table = TRUE
    )
    junc.strand <- unique(junc.strand)
    junc.strand[, c("seqnames") := as.character(get("seqnames"))]

    junc.common <- as.data.table(read.fst(file.path(norm_output_path, 
        "annotation", "junc.common.fst")))
    junc.stats <- as.data.table(read.fst(file.path(norm_output_path, 
        "annotation", "junc.common.counts.fst")))

    junc.common[, c("seqnames") := as.character(get("seqnames"))]

    junc.common <- cbind(junc.common, junc.stats)
    
    # Determine strand of junc.common junctions
    junc.common[, c("strand") := NULL]
    junc.common <- merge(junc.common, junc.strand,
        all = TRUE, by = c("seqnames", "start", "end"))

    # Novel junctions are assigned *
    junc.common[is.na(get("strand")), c("strand") := "*"]
    junc.common.unanno <- junc.common[get("strand") == "*"]
    junc.common.anno <- junc.common[get("strand") != "*"]

    # make sure valid seqnames
    junc.common.unanno <- junc.common.unanno[!is.na(get("seqnames"))]
    junc.common.unanno <- junc.common.unanno[get("seqnames") != ""]

    # Determine strandedness based on splice junction motif
    if (nrow(junc.common.unanno) != 0) {
        genome <- Get_Genome(reference_path, as_DNAStringSet = !lowMemoryMode)
        seqinfo <- as.data.frame(seqinfo(genome))
        
        # Test 2bit inefficiency
        genome <- .check_2bit_performance(reference_path, genome)    
        
        # Filter unanno by available sequences
        junc.common.unanno <- junc.common.unanno[
            seqnames %in% rownames(seqinfo)]
        
        # sanity check: remove unannotated junctions that lie outside genome
        junc.common.unanno.gr <- .grDT(junc.common.unanno)
        seqlevels(junc.common.unanno.gr, pruning.mode = "coarse") <-
            seqlevels(genome)
        
        seqinfo.gr <- as(seqinfo(genome), "GenomicRanges")

        OL <- findOverlaps(junc.common.unanno.gr, seqinfo.gr, type = "within")
        junc.common.unanno <- junc.common.unanno[
            unique(from(OL)), which = FALSE]

        if (nrow(junc.common.unanno) != 0) {
            # Create left and right motif GRanges
            left.gr <- with(junc.common.unanno,
                GRanges(seqnames = as.character(seqnames),
                ranges = IRanges(start = start, end = start + 1),
                strand = "+"))
            right.gr <- with(junc.common.unanno,
                GRanges(seqnames = as.character(seqnames),
                ranges = IRanges(start = end - 1, end = end),
                strand = "+"))
            
            junc.common.unanno[, c("splice_motif") := paste0(
                as.character(getSeq(genome, left.gr)),
                as.character(getSeq(genome, right.gr))
            )]
            
            splice_motifs <- data.frame(
                seqs = c("GTAG", "GCAG", "ATAC", "ATAG"),
                seqs_r = c("CTAC", "CTGC", "GTAT", "CTAT")
            )
            junc.common.unanno[get("splice_motif") %in% splice_motifs$seqs,
                c("strand") := "+"]
            junc.common.unanno[get("splice_motif") %in% splice_motifs$seqs_r,
                c("strand") := "-"]
            # Do not accept un-annotated GTACs - too confusing
            # Exclude unannotated non-splice motifs
            junc.common.unanno <- junc.common.unanno[
                get("strand") != "*"]
            junc.common.unanno[get("strand") == "-",
                c("splice_motif") := splice_motifs$seqs[match(
                    get("splice_motif"), splice_motifs$seqs_r)]]
            junc.final <- rbindlist(list(junc.common.anno, junc.common.unanno))
        } else {
            junc.final <- junc.common.anno
        }
        # Cleanup memory:
        rm(genome)
        gc()
    } else {
        junc.final <- junc.common.anno
    }
    # Assign region names to junctions:
    junc.final[, c("Event") := paste0(get("seqnames"), ":",
        get("start"), "-", get("end"), "/", get("strand"))]
    junc.final <- junc.final[, 
        c("seqnames", "start", "end", "strand", 
            "splice_motif", "Event", 
            "countJuncInSamples", "countJuncThresholdSamples")
    ]

    write.fst(as.data.frame(junc.final), file.path(norm_output_path, 
        "annotation", "junc.common.annotated.fst"))
    
    if(exists("junc.common.unanno")) rm(junc.common.unanno)
    rm(junc.final, junc.common.anno, junc.common, junc.strand)
    gc()
}

# Annotate junction splice motifs
.collateData_tj_annotate <- function(
        threadID, reference_path, norm_output_path,
        lowMemoryMode = TRUE
) {
    if(threadID != 2) return()

    # candidate.introns <- as.data.table(
        # read.fst(file.path(reference_path, "fst", "junctions.fst"))
    # )
    # candidate.introns[, c("seqnames") := as.character(get("seqnames"))]
        
    junc.common <- as.data.table(read.fst(file.path(norm_output_path, 
        "annotation", "junc.common.annotated.fst")))
        
    tj.common <- as.data.table(read.fst(file.path(norm_output_path, 
        "annotation", "tj.common.fst")))
    
    junc.common[, c("seqnames") := as.character(get("seqnames"))]
    tj.common[, c("seqnames") := as.character(get("seqnames"))]

    ### - Strand assignment based on annotated junctions - ###
    # First remove any tandem junctions not in junc.common

    jc_events_unstranded <- paste0(junc.common$seqnames, ":", 
        junc.common$start, "-", junc.common$end)
    
    tj_event1 <- paste0(tj.common$seqnames, ":", 
        tj.common$start1, "-", tj.common$end1)
    tj_event2 <- paste0(tj.common$seqnames, ":", 
        tj.common$start2, "-", tj.common$end2)
    
    tj.common <- tj.common[tj_event1 %in% jc_events_unstranded &
        tj_event2 %in% jc_events_unstranded]

    # Then, assemble a strand table
    jc.strand <- data.frame(
        event = jc_events_unstranded,
        strand = junc.common$strand
    )

    # Match strands based on event1 / 2
    tj_event1 <- paste0(tj.common$seqnames, ":", 
        tj.common$start1, "-", tj.common$end1)
    tj_event2 <- paste0(tj.common$seqnames, ":", 
        tj.common$start2, "-", tj.common$end2)
    
    tj.strand <- as.data.table(data.frame(
        strand1 = jc.strand$strand[match(tj_event1, jc.strand$event)],
        strand2 = jc.strand$strand[match(tj_event2, jc.strand$event)],
        stringsAsFactors = FALSE
    ))
    tj.strand$status <- ""
    tj.strand[get("strand1") == "+" & get("strand2") == "+", 
        c("status") := "+"]
    tj.strand[get("strand1") == "-" & get("strand2") == "-", 
        c("status") := "-"]
    tj.strand[get("strand1") == "*", 
        c("status") := get("strand2")]
    tj.strand[get("strand2") == "*", 
        c("status") := get("strand1")]

    # Filter TJ based on matching strand
    tj.common$strand <- tj.strand$status
    tj.common <- tj.common[strand != ""]
    
    ### - Not sure we need to filter out non-novel exons - ###
    # - tandem junctions can also help find novel A5/3SS and skipped exons
    
    # exons_brief <- read.fst(
        # path = file.path(reference_path, "fst", "Exons.fst"),
        # columns = c("seqnames", "start", "end", "strand"),
        # as.data.table = TRUE
    # )
    # exons_brief <- unique(exons_brief)
    # exons_brief[, c("seqnames") := as.character(get("seqnames"))]
    # exons_left <- paste0(exons_brief$seqnames, ":", exons_brief$start)
    # tj_exon_left <- paste0(tj.common$seqnames, ":", tj.common$end1 + 1)
    # exons_right <- paste0(exons_brief$seqnames, ":", exons_brief$end)
    # tj_exon_right <- paste0(tj.common$seqnames, ":", tj.common$start2 - 1)
    
    # tj.common.new <- tj.common[!(tj_exon_left %in% exons_left) &
        # !(tj_exon_right %in% exons_right)]

    write.fst(as.data.frame(tj.common), file.path(norm_output_path, 
        "annotation", "tj.common.annotated.fst"))

    # Cleanup
    rm(junc.common, tj.common, jc.strand, tj.strand)
    gc()
}

################################################################################

# Assemble novel transcripts and generate new reference

.collateData_assemble_reference <- function(
    threadID, reference_path, norm_output_path, lowMemoryMode, novelSplicing,
    minSamplesWithJunc, minSamplesAboveJuncThreshold,
    novelSplicing_requireOneAnnotatedSJ,
    novelSplicing_useTJ = TRUE,
    novelSplicing_extrapolateTJ = FALSE,
    verbose = FALSE
) {
    if(threadID != 2) return()
    
    if(novelSplicing) {
        .log(paste("Assembling novel splicing reference:"), "message")
        fst2Copy <- c(
            "Ontology.fst", 
            "Introns.Dir.fst", "Introns.ND.fst", "IR.NMD.fst"
        )
    } else {
        message("...copying splicing reference")    
        fst2Copy <- c(
            "Exons.fst", "Exons.Group.fst", "Genes.fst", "gtf_fixed.fst",
            "junctions.fst", "Transcripts.fst",
            "Misc.fst", "Ontology.fst", "Proteins.fst", 
            "Splice.Extended.fst", "Splice.fst", "Splice.options.fst",      
            "Introns.Dir.fst", "Introns.ND.fst", "IR.NMD.fst"
        )
    }
    
    novel_ref_path <- file.path(norm_output_path, "Reference")
    .validate_path(novel_ref_path, subdirs = "resource")
    .validate_path(novel_ref_path, subdirs = "fst")

    # Copy over files that don't need to be generated
    file.copy(file.path(reference_path, "SpliceWiz.ref.gz"),
        file.path(novel_ref_path, "SpliceWiz.ref.gz"), overwrite = TRUE)       
    for(fstFile in fst2Copy) {
        if(file.exists(file.path(reference_path, "fst", fstFile))) {
            file.copy(file.path(reference_path, "fst", fstFile),
                file.path(novel_ref_path, "fst", fstFile), overwrite = TRUE)    
        }        
    }

    if(novelSplicing) {
        if(verbose) message("...loading reference FASTA/GTF")
        if(file.exists(file.path(reference_path, "fst/gtf_fixed.fst"))) {
            reference_data <- list(
                genome = Get_Genome(reference_path, validate = FALSE,
                    as_DNAStringSet = !lowMemoryMode),
                gtf_gr = .grDT(
                    read.fst(file.path(reference_path, "fst/gtf_fixed.fst")),
                    keep.extra.columns = TRUE
                )
            )
        } else {
            reference_data <- .get_reference_data(
                reference_path = reference_path,
                fasta = "", gtf = "", verbose = FALSE,
                overwrite = FALSE, force_download = FALSE,
                pseudo_fetch_fasta = lowMemoryMode, pseudo_fetch_gtf = FALSE)
            reference_data$gtf_gr <- .validate_gtf_chromosomes(
                reference_data$genome, reference_data$gtf_gr)
            reference_data$gtf_gr <- .fix_gtf(reference_data$gtf_gr)
        }

        if(verbose) message("...injecting novel transcripts to GTF")
        # Insert novel gtf here
        # reference_data$gtf_gr is a GRanges object
        novel_gtf <- .collateData_novel_assemble_transcripts(
            reference_path, norm_output_path, reference_data$gtf_gr,
            minSamplesWithJunc, minSamplesAboveJuncThreshold,
            novelSplicing_requireOneAnnotatedSJ,
            novelSplicing_useTJ
        )
        # Finish inserting novel gtf
        unique_seqlevels <- unique(c(
            seqlevels(reference_data$gtf_gr), seqlevels(novel_gtf)
        ))
        seqlevels(reference_data$gtf_gr) <- unique_seqlevels
        seqlevels(novel_gtf) <- unique_seqlevels
        
        # Putative TJ injection
        if(novelSplicing_extrapolateTJ) {
            putTJ_gtf <- .collateData_novel_injectPutativeTJ(
                reference_path, norm_output_path, 
                reference_data$gtf_gr, novel_gtf
            )
            unique_seqlevels <- unique(c(
                seqlevels(reference_data$gtf_gr), seqlevels(novel_gtf),
                seqlevels(putTJ_gtf)
            ))
            seqlevels(reference_data$gtf_gr) <- unique_seqlevels
            seqlevels(novel_gtf) <- unique_seqlevels
            seqlevels(putTJ_gtf) <- unique_seqlevels
            novel_gtf <- c(novel_gtf, putTJ_gtf)
        }
        
        if(verbose) message("...processing GTF")
        .process_gtf(c(reference_data$gtf_gr, novel_gtf), 
            novel_ref_path, verbose = FALSE)
        # extra_files$genome_style <- .gtf_get_genome_style(reference_data$gtf_gr)
        reference_data$gtf_gr <- NULL # To save memory, remove original gtf
        
        novelGTFfile <- file.path(norm_output_path, "novelTranscripts.gtf")
        rtracklayer::export(novel_gtf, novelGTFfile, "gtf")
        gzip(filename = novelGTFfile, destname = paste0(novelGTFfile, ".gz"),
            overwrite = TRUE)
        if (file.exists(novelGTFfile) & file.exists(paste0(novelGTFfile, ".gz"))
        ) {
            file.remove(novelGTFfile)
        }
        rm(novel_gtf)
        gc()
        
        if(verbose) message("...processing introns from GTF")
        reference_data$genome <- .check_2bit_performance(reference_path,
            reference_data$genome, verbose = FALSE)
        .process_introns(novel_ref_path, reference_data$genome, 
            useExtendedTranscripts = TRUE, verbose = FALSE)
        rm(reference_data)
        gc()

        if(verbose) message("...annotating alternative splicing events")
        .gen_splice(novel_ref_path, verbose = FALSE)
        
        message("done")
    }

    settings.list <- readRDS(file.path(reference_path, "settings.Rds"))
    # (TODO) - modify settings.Rds
    saveRDS(settings.list, file.path(novel_ref_path, "settings.Rds"))
}

.collateData_novel_assemble_transcripts <- function(
    reference_path, norm_output_path, gtf,
    minSamplesWithJunc, minSamplesAboveJuncThreshold,
    novelSplicing_requireOneAnnotatedSJ = TRUE,
    novelSplicing_useTJ = TRUE
    
) {
    # Load junc.common
    junc.common <- as.data.table(read.fst(file.path(norm_output_path, 
        "annotation", "junc.common.annotated.fst")))
       
    # Filter out known junctions
    known.junctions <- unique(as.data.table(read.fst(
        file.path(reference_path, "fst", "junctions.fst"), 
        columns = c("seqnames", "start", "end", "strand", "Event"))))
    
    junc.common[, c("seqnames") := as.character(get("seqnames"))]
    known.junctions[, c("seqnames") := as.character(get("seqnames"))]
    
    junc.novel <- junc.common[!(get("Event") %in% known.junctions$Event)]
    if(nrow(junc.novel) == 0) {
        rm(junc.common, known.junctions)
        gc()
        return(NULL)    
    }
    
    # filter novel junctions
    junc.novel <- junc.novel[
            get("countJuncInSamples") >= minSamplesWithJunc |
            get("countJuncThresholdSamples") >= minSamplesAboveJuncThreshold
    ]
    if(nrow(junc.novel) == 0) {
        rm(junc.common, known.junctions)
        gc()
        return(NULL)    
    }
    
    # Filter for when there is at least 1 common splice site with knowns
    if(novelSplicing_requireOneAnnotatedSJ) {
        kj.left <- with(known.junctions, GRanges(seqnames = seqnames,
            ranges = IRanges(start, start), strand = strand))
        nj.left <- with(junc.novel, GRanges(seqnames = seqnames,
            ranges = IRanges(start, start), strand = strand))
        kj.right <- with(known.junctions, GRanges(seqnames = seqnames,
            ranges = IRanges(end, end), strand = strand))
        nj.right <- with(junc.novel, GRanges(seqnames = seqnames,
            ranges = IRanges(end, end), strand = strand))

        OL_left <- .findOverlaps_merge(nj.left, kj.left)
        OL_right <- .findOverlaps_merge(nj.right, kj.right)
        
        at_least_one_end <- unique(c(from(OL_left), from(OL_right)))
        junc.novel <- junc.novel[at_least_one_end]    

        if(nrow(junc.novel) == 0) {
            rm(junc.common, known.junctions)
            gc()
            return(NULL)    
        }
    }
    
    rm(junc.common, known.junctions)
    gc()

    if(novelSplicing_useTJ) {
        # No need to filter tandem junctions as this was done in earlier step
        tj.novel <- as.data.table(read.fst(file.path(norm_output_path, 
            "annotation", "tj.common.annotated.fst")))
        if(nrow(tj.novel) > 0) {
            tj.novel[, c("seqnames") := as.character(get("seqnames"))]
            # Filter tj.novel by junc.novel
            setnames(tj.novel, c("start1", "end1"), c("start", "end"))
            tmp <- tj.novel[!junc.novel, 
                on = c("seqnames", "start", "end", "strand")]
            tj.novel <- tj.novel[!tmp, on = colnames(tj.novel)]
            setnames(tj.novel, c("start", "end"), c("start1", "end1"))

            setnames(tj.novel, c("start2", "end2"), c("start", "end"))
            tmp <- tj.novel[!junc.novel, 
                on = c("seqnames", "start", "end", "strand")]
            tj.novel <- tj.novel[!tmp, on = colnames(tj.novel)]
            setnames(tj.novel, c("start", "end"), c("start2", "end2"))

            # Remove events in junc.novel that are also featured in tj.novel
            # tmp_gr <- GRanges(tj.novel$seqnames, 
                # IRanges(tj.novel$start1, tj.novel$end1), tj.novel$strand)
            # OL <- .findOverlaps_merge(.grDT(junc.novel), tmp_gr, type = "equal")
            # if(length(OL@from) > 0) junc.novel <- junc.novel[-unique(OL@from)]

            # tmp_gr <- GRanges(tj.novel$seqnames, 
                # IRanges(tj.novel$start2, tj.novel$end2), tj.novel$strand)
            # OL <- .findOverlaps_merge(.grDT(junc.novel), tmp_gr, type = "equal")
            # if(length(OL@from) > 0) junc.novel <- junc.novel[-unique(OL@from)]
        }
    } else {
        tj.novel <- data.table()
    }
    
    n_trans <- nrow(junc.novel) + nrow(tj.novel)
    if(n_trans < 1) return(NULL)

    new_gtf <- .collateData_construct_novelGTF(
        gtf, junc.novel, tj.novel,
        novelTr_source = "novel",
        novelGeneID_header = "novelGene",
        novelGeneBT = "novel_gene",
        novelTrID_header = "novelTrID",
        novelTrName_suffix = "-novelTr",
        novelTrBT = "novel_transcript"
    )

    return(new_gtf)
}

.collateData_novel_injectPutativeTJ <- function(
    reference_path, norm_output_path, ref_gtf, novel_gtf
) {
    # Generate a table of all junctions (known + novel)
    exons <- c(
        ref_gtf[ref_gtf$type == "exon"],
        novel_gtf[novel_gtf$type == "exon"]
    )
    allJunctions <- .grlGaps(split(
          exons, exons$transcript_id
    ))
    allJunctions <- as.data.table(allJunctions)
    allJunctions[, c("group", "group_name") := list(NULL,NULL)]
    allJunctions <- unique(as.data.table(sort(.grDT(allJunctions))))
    
    # Generate a table of all annotated + novel TJ's
    
    # Known TJ's from SpliceWiz reference
    suppressWarnings({
        knownTJ <- fread(file.path(reference_path, "SpliceWiz.ref.gz"), 
            skip = "ref-tj")
    })
    colnames(knownTJ) <- 
        c("seqnames", "start1", "end1", "start2", "end2", "strand")
    knownTJ[, c("start1", "start2") := list(
        get("start1") + 1, get("start2") + 1)]

    tandem_gtf <- novel_gtf[novel_gtf$type == "exon"]
    tandem_gtf <- tandem_gtf[tandem_gtf$exon_number == 3]
    tandemTrID <- unique(tandem_gtf$transcript_id)
    tandemTr <- novel_gtf[novel_gtf$transcript_id %in% tandemTrID & 
        novel_gtf$type == "transcript"]
    
    # Retrieve novel TJ's
    novelTJ <- data.table(
        seqnames = as.character(seqnames(tandemTr)),
        start1 = 0,
        end1 = 0,
        start2 = 0,
        end2 = 0,
        strand = as.character(strand(tandemTr))
    )
    novelTandemExons <- novel_gtf[novel_gtf$type == "exon" & 
        novel_gtf$transcript_id %in% tandemTrID]
    exon1 <- novelTandemExons[
        (novelTandemExons$exon_number == 1 & 
            as.character(strand(novelTandemExons)) == "+") |
        (novelTandemExons$exon_number == 3 & 
            as.character(strand(novelTandemExons)) == "-")
    ]
    exon2 <- novelTandemExons[
        (novelTandemExons$exon_number == 2)
    ]
    exon3 <- novelTandemExons[
        (novelTandemExons$exon_number == 1 & 
            as.character(strand(novelTandemExons)) == "-") |
        (novelTandemExons$exon_number == 3 & 
            as.character(strand(novelTandemExons)) == "+")
    ]

    names(exon1) <- exon1$transcript_id
    names(exon2) <- exon2$transcript_id
    names(exon3) <- exon3$transcript_id

    exon1 <- exon1[tandemTr$transcript_id]
    exon2 <- exon2[tandemTr$transcript_id]
    exon3 <- exon3[tandemTr$transcript_id]

    novelTJ$start1 <- BiocGenerics::end(exon1) + 1
    novelTJ$end1 <- BiocGenerics::start(exon2) - 1
    novelTJ$start2 <- BiocGenerics::end(exon2) + 1
    novelTJ$end2 <- BiocGenerics::start(exon3) - 1
    
    # Create putative novel TJ's
    #   junction1 - exon - junction2

    leftJn <- copy(allJunctions[, c("seqnames", "start", "end", "strand")])
    rightJn <- copy(allJunctions[, c("seqnames", "start", "end", "strand")])
    putExons <- copy(as.data.table(exons)[,
        c("seqnames", "start", "end", "strand")])

    leftJn <- unique(leftJn)
    rightJn <- unique(rightJn)
    putExons <- unique(putExons)

    setnames(leftJn, c("start", "end"), c("start1", "end1"))
    setnames(rightJn, c("start", "end"), c("start2", "end2"))
    setnames(putExons, c("start", "end"), c("end1", "start2"))
    putExons[, c("end1", "start2") := list(get("end1") - 1, get("start2") + 1)]

    putTJ <- putExons[leftJn, on = c("seqnames", "end1", "strand")]
    putTJ <- putTJ[rightJn, on = c("seqnames", "start2", "strand")]
    putTJ <- putTJ[, c("seqnames", "start1", "end1", "start2", "end2", "strand")]
    putTJ <- putTJ[!is.na(get("end2"))]

    putTJ <- putTJ[complete.cases(putTJ)]
    putTJ <- unique(putTJ)
    # anti-join against knowns

    novelPutTJ <- fsetdiff(putTJ, knownTJ)

    nontandemTr <- novel_gtf[
        !(novel_gtf$transcript_id %in% tandemTrID) & novel_gtf$type == "transcript"
    ]
    nontandemExons <- novel_gtf[
        novel_gtf$type == "exon" & novel_gtf$transcript_id %in% nontandemTr$transcript_id
    ]

    novelJunctions <- .grlGaps(split(
      nontandemExons,
      nontandemExons$transcript_id
    ))

    novelJunctions <- as.data.table(novelJunctions)
    novelJunctions[, c("group", "group_name") := list(NULL,NULL)]
    novelJunctions <- .grDT(novelJunctions)
    novelJunctions <- sort(novelJunctions)
    novelJunctions <- as.data.table(novelJunctions)
    novelJunctions <- unique(novelJunctions)
    novelJunctions <- novelJunctions[, c("seqnames", "start", "end", "strand"), with = FALSE]

    knownJunctions <- as.data.table(fst::read.fst(file.path(
      reference_path, "fst/junctions.fst"
    ), columns = c("seqnames", "start", "end", "strand")))
    knownJunctions <- unique(knownJunctions)

    novelJunctions <- fsetdiff(novelJunctions, knownJunctions)
        
    filterByJunctions <- rbind(knownJunctions, novelJunctions)

    # Now package novelPutTJ into novel_gtf
    if(nrow(novelPutTJ) == 0) return(NULL)
    
    new_gtf <- .collateData_construct_novelGTF(
        c(ref_gtf, novel_gtf), NULL, novelPutTJ,
        novelTr_source = "novelPutative",
        novelGeneID_header = "novelGenePutative",
        novelGeneBT = "novel_genePutative",
        novelTrID_header = "novelPutTrID",
        novelTrName_suffix = "-novelPutTr",
        novelTrBT = "novel_transcript_putative"
    )

    return(new_gtf)    
    
}

.collateData_construct_novelGTF <- function(
    gtf, junc.novel = NULL, tj.novel = NULL,
    novelTr_source = "novel",
    novelGeneID_header = "novelGene",
    novelGeneBT = "novel_gene",
    novelTrID_header = "novelTrID",
    novelTrName_suffix = "-novelTr",
    novelTrBT = "novel_transcript"
) {
    n_junc <- ifelse(is(junc.novel, "data.frame"), nrow(junc.novel), 0)
    n_tj <- ifelse(is(tj.novel, "data.frame"), nrow(tj.novel), 0)
    n_trans <- n_junc + n_tj
    if(n_trans < 1) return(NULL)
    
    new_gtf <- GRanges()
    gr_transcript <- GRanges()
    # Initialize novel transcripts and determine genes
    suppressWarnings({
        # gr_transcript <- c(
            # GRanges(
                # junc.novel$seqnames,
                # IRanges(junc.novel$start - 50, junc.novel$end + 50),
                # junc.novel$strand
            # ),
            # GRanges(
                # tj.novel$seqnames,
                # IRanges(tj.novel$start1 - 50, tj.novel$end2 + 50),
                # tj.novel$strand
            # )
        # )
        if(n_junc > 0) {
            gr_transcript <- c(gr_transcript,
                GRanges(
                    junc.novel$seqnames,
                    IRanges(junc.novel$start - 50, junc.novel$end + 50),
                    junc.novel$strand
                )
            )
        }
        if(n_tj > 0) {
            gr_transcript <- c(gr_transcript,
                GRanges(
                    tj.novel$seqnames,
                    IRanges(tj.novel$start1 - 50, tj.novel$end2 + 50),
                    tj.novel$strand
                )
            )
        }
    })
    Genes <- as.data.table(gtf[gtf$type == "gene"])
    if(any(grepl(novelGeneID_header, Genes$gene_id))) {
        .log("Issue with given gene_id header", "message")
        novelGeneID_header <- paste0(novelGeneID_header, "2")
    }
    if(any(grepl(novelGeneBT, Genes$gene_biotype))) {
        .log("Issue with given gene_id header", "message")
        novelGeneBT <- paste0(novelGeneBT, "2")
    }
    Transcripts <- as.data.table(gtf[gtf$type == "transcript"])
    if(any(grepl(novelTrID_header, Transcripts$transcript_id))) {
        .log("Issue with given transcript_id header", "message")
        novelTrID_header <- paste0(novelTrID_header, "2")
    }
    if(any(grepl(novelTrName_suffix, Transcripts$transcript_name))) {
        .log("Issue with given transcript_name suffix", "message")
        novelTrName_suffix <- paste0(novelTrName_suffix, "2")
    }
    if(any(grepl(novelTrBT, Transcripts$transcript_biotype))) {
        .log("Issue with given transcript_biotype header", "message")
        novelTrBT <- paste0(novelTrBT, "2")
    }

    # Annotate novel junctions as belonging to known genes if they overlap
    OL <- .findOverlaps_merge(gr_transcript, .grDT(Genes))
    OL.DT <- data.table(from = OL@from, to = OL@to)
    OL.DT <- unique(OL.DT, by = "from")
    junc_genes <- data.table(juncID = seq_len(length(gr_transcript)))
    junc_genes$gene_id <- junc_genes$gene_name <- junc_genes$gene_biotype <- NA
    junc_genes$gene_id[OL.DT$from] <- Genes$gene_id[OL.DT$to]
    junc_genes$gene_name[OL.DT$from] <- Genes$gene_name[OL.DT$to]
    junc_genes$gene_biotype[OL.DT$from] <- Genes$gene_biotype[OL.DT$to]

    # Construct a list of novel genes
    if(any(is.na(junc_genes$gene_id))) {
        tr_novelGene <- sort(reduce(
            gr_transcript[is.na(junc_genes$gene_id)]
        ))
        empty_mCol_nG <- data.frame(matrix(ncol = ncol(mcols(gtf)), 
            nrow = length(tr_novelGene)))
        colnames(empty_mCol_nG) <- colnames(mcols(gtf))

        empty_mCol_nG$source <- novelTr_source
        empty_mCol_nG$gene_id <- paste0(novelGeneID_header, 
            formatC(seq_len(length(tr_novelGene)),
                width = 6, format = "d", flag = "0"))
        empty_mCol_nG$gene_name <- empty_mCol_nG$gene_id

        empty_mCol_nG$gene_biotype <- novelGeneBT
        
        mcols(tr_novelGene) <- empty_mCol_nG
        mcols(tr_novelGene)$type = "gene"
        new_gtf <- c(new_gtf, tr_novelGene)
        
        # Reannotate gene list
        OL <- findOverlaps(gr_transcript, tr_novelGene)
        OL.DT <- data.table(from = OL@from, to = OL@to)
        OL.DT <- unique(OL.DT, by = "from")
        junc_genes$gene_id[OL.DT$from] <- empty_mCol_nG$gene_id[OL.DT$to]
        junc_genes$gene_name[OL.DT$from] <- empty_mCol_nG$gene_name[OL.DT$to]
        junc_genes$gene_biotype[OL.DT$from] <- novelGeneBT
    }
    
    junc_genes[, c("transcript_number") := seq_len(.N), by = "gene_id"]
    junc_genes[, c("transcript_number_str") := formatC(
        get("transcript_number"), width = 3, format = "d", flag = "0")]
    junc_genes[, c("transcript_name") := paste0(
        get("gene_name"), novelTrName_suffix, get("transcript_number_str"))]
    junc_genes[, c("transcript_id") := paste0(novelTrID_header, formatC(
        seq_len(.N), width = 6, format = "d", flag = "0"))]

    # Construct a list of novel transcripts and exons
    empty_mCol <- data.frame(matrix(ncol = ncol(mcols(gtf)), 
        nrow = length(gr_transcript)))
    colnames(empty_mCol) <- colnames(mcols(gtf))
    empty_mCol$source <- novelTr_source
    empty_mCol$gene_id <- junc_genes$gene_id
    empty_mCol$gene_name <- junc_genes$gene_name
    empty_mCol$transcript_id <- junc_genes$transcript_id
    empty_mCol$transcript_name <- junc_genes$transcript_name
    
    empty_mCol$gene_biotype <- junc_genes$gene_biotype
    empty_mCol$transcript_biotype <- novelTrBT
    
    mcols(gr_transcript) <- empty_mCol
    mcols(gr_transcript)$type = "transcript"
    new_gtf <- c(new_gtf, gr_transcript)

    # Construct exons and transcripts
    if(n_junc > 0) {
        gr_novel_leftExon <- with(junc.novel, GRanges(seqnames = seqnames,
            ranges = IRanges(start = start - 50, end = start - 1),
            strand = strand))
        gr_novel_rightExon <- with(junc.novel, GRanges(seqnames = seqnames,
            ranges = IRanges(start = end + 1, end = end + 50),
            strand = strand))
        mcols(gr_novel_leftExon) <- empty_mCol[seq_len(n_junc),]
        mcols(gr_novel_rightExon) <- empty_mCol[seq_len(n_junc),]
        mcols(gr_novel_leftExon)$exon_number <- ifelse(
            strand(gr_novel_leftExon) == "+", 1, 2)
        mcols(gr_novel_rightExon)$exon_number <- ifelse(
            strand(gr_novel_rightExon) == "-", 1, 2)
        mcols(gr_novel_leftExon)$exon_id <- paste0(
            mcols(gr_novel_leftExon)$transcript_id,
            "-exon",  mcols(gr_novel_leftExon)$exon_number
        )
        mcols(gr_novel_rightExon)$exon_id <- paste0(
            mcols(gr_novel_rightExon)$transcript_id,
            "-exon",  mcols(gr_novel_rightExon)$exon_number
        )
        mcols(gr_novel_leftExon)$type <- "exon"
        mcols(gr_novel_rightExon)$type <- "exon"
        new_gtf <- c(new_gtf, gr_novel_leftExon, gr_novel_rightExon)
    }

    if(n_tj > 0) {
        gr_tjnovel_leftExon <- GRanges(tj.novel$seqnames,
            IRanges(tj.novel$start1 - 50, tj.novel$start1 - 1),
            tj.novel$strand)
        gr_tjnovel_middleExon <- GRanges(tj.novel$seqnames,
            IRanges(tj.novel$end1 + 1, tj.novel$start2 - 1),
            tj.novel$strand)
        gr_tjnovel_rightExon <- GRanges(tj.novel$seqnames,
            IRanges(tj.novel$end2 + 1, tj.novel$end2 + 50),
            tj.novel$strand)
        mcols(gr_tjnovel_leftExon) <- empty_mCol[
            seq(n_junc + 1, nrow(empty_mCol)),]
        mcols(gr_tjnovel_middleExon) <- empty_mCol[
            seq(n_junc + 1, nrow(empty_mCol)),]
        mcols(gr_tjnovel_rightExon) <- empty_mCol[
            seq(n_junc + 1, nrow(empty_mCol)),]
        mcols(gr_tjnovel_leftExon)$exon_number <- ifelse(
            strand(gr_tjnovel_leftExon) == "+", 1, 3)
        mcols(gr_tjnovel_middleExon)$exon_number <- 2
        mcols(gr_tjnovel_rightExon)$exon_number <- ifelse(
            strand(gr_tjnovel_rightExon) == "-", 1, 3)
        mcols(gr_tjnovel_leftExon)$exon_id <- paste0(
            mcols(gr_tjnovel_leftExon)$transcript_id,
            "-exon",  mcols(gr_tjnovel_leftExon)$exon_number
        )
        mcols(gr_tjnovel_middleExon)$exon_id <- paste0(
            mcols(gr_tjnovel_middleExon)$transcript_id,
            "-exon",  mcols(gr_tjnovel_middleExon)$exon_number
        )
        mcols(gr_tjnovel_rightExon)$exon_id <- paste0(
            mcols(gr_tjnovel_rightExon)$transcript_id,
            "-exon",  mcols(gr_tjnovel_rightExon)$exon_number
        )
        mcols(gr_tjnovel_leftExon)$type <- "exon"
        mcols(gr_tjnovel_middleExon)$type <- "exon"
        mcols(gr_tjnovel_rightExon)$type <- "exon"

        new_gtf <- c(new_gtf, gr_tjnovel_leftExon, 
            gr_tjnovel_middleExon, gr_tjnovel_rightExon)
    }

    return(new_gtf)
}

################################################################################

# Use Exon Groups file to designate flanking exon islands to ALL junctions
.collateData_junc_group <- function(
        threadID, reference_path, norm_output_path
) {
    if(threadID != 2) return()
    junc.common <- as.data.table(read.fst(file.path(norm_output_path, 
        "annotation", "junc.common.annotated.fst")))

    Exon.Groups <- as.data.table(
        read.fst(file.path(reference_path, "fst", "Exons.Group.fst"))
    )
    junc.common[, c("seqnames") := as.character(get("seqnames"))]
    Exon.Groups[, c("seqnames") := as.character(get("seqnames"))]
    # Always calculate stranded for junctions
    Exon.Groups.S <- Exon.Groups[get("strand") != "*"]

    # Same method as in BuildRef
    junc.common <- .process_introns_group_overlap(
        junc.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "exon_group", "gene_group", "exon_group")
    )
    junc.common <- .process_introns_group_fix_RI(
        junc.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "intron_number", "gene_group", "intron_number")
    )
    junc.common[,
        c("JG_up", "JG_down") := list(
            paste(get("gene_group_up"), get("exon_group_up"), sep = "_"),
            paste(get("gene_group_down"), get("exon_group_down"), sep = "_")
        )
    ]

    # Remove temp columns
    junc.common$gene_group_up <- NULL
    junc.common$gene_group_down <- NULL
    junc.common$exon_group_up <- NULL
    junc.common$exon_group_down <- NULL
    
    write.fst(as.data.frame(junc.common), file.path(norm_output_path, 
        "annotation", "Junc.fst"))
        
    junc_PSI <- junc.common[, c("seqnames", "start", "end", "strand")]
    write.fst(as.data.frame(junc_PSI), 
        file.path(norm_output_path, "junc_PSI_index.fst"))
    
    rm(junc.common, Exon.Groups, Exon.Groups.S, junc_PSI)
    gc()
}

# Use Exon Groups file to designate flanking exon islands to ALL introns
.collateData_sw_group <- function(threadID,
        reference_path, norm_output_path, stranded = TRUE
) {
    if(threadID != 2) return()
    sw.common <- as.data.table(read.fst(file.path(norm_output_path, 
        "annotation", "sw.common.fst")))

    # Use Exon Groups file to designate exon groups to all junctions
    Exon.Groups <- as.data.table(
        read.fst(file.path(reference_path, "fst", "Exons.Group.fst")))

    sw.common[, c("seqnames") := as.character(get("seqnames"))]
    Exon.Groups[, c("seqnames") := as.character(get("seqnames"))]
    
    # Always calculate stranded for junctions
    Exon.Groups.S <- Exon.Groups[get("strand") != "*"]

    sw.common <- .process_introns_group_overlap(
        sw.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "exon_group", "gene_group", "exon_group")
    )
    sw.common <- .process_introns_group_fix_RI(
        sw.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "intron_number", "gene_group", "intron_number")
    )
    sw.common[,
        c("JG_up", "JG_down") := list(
            paste(get("gene_group_up"), get("exon_group_up"), sep = "_"),
            paste(get("gene_group_down"), get("exon_group_down"), sep = "_")
        )
    ]

    sw.common$gene_group_up <- NULL
    sw.common$gene_group_down <- NULL
    sw.common$exon_group_up <- NULL
    sw.common$exon_group_down <- NULL

    if (!stranded) {
        Exon.Groups.S <- Exon.Groups[get("strand") == "*"]
    } else {
        Exon.Groups.S <- Exon.Groups[get("strand") != "*"]
    }

    sw.common <- .process_introns_group_overlap(
        sw.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "exon_group", "gene_group", "exon_group")
    )
    sw.common <- .process_introns_group_fix_RI(
        sw.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "intron_number", "gene_group", "intron_number")
    )
    sw.common[,
        c("IRG_up", "IRG_down") := list(
            paste(get("gene_group_up"), get("exon_group_up"), sep = "_"),
            paste(get("gene_group_down"), get("exon_group_down"), sep = "_")
        )
    ]

    sw.common[, c("EventRegion") :=
        paste0(get("seqnames"), ":", get("start"), "-",
            get("end"), "/", get("strand"))]

    write.fst(as.data.frame(sw.common),
        file.path(norm_output_path, "annotation", "IR.fst"))
    
    rm(sw.common, Exon.Groups, Exon.Groups.S)
    gc()
}

# Annotates splice junctions with ID's of flanking exon islands
.collateData_splice_anno <- function(threadID,
        reference_path, norm_output_path
) {
    if(threadID != 2) return()
    if(!file.exists(file.path(reference_path, "fst", "Splice.fst"))) return()

    candidate.introns <- as.data.table(
        read.fst(file.path(reference_path, "fst", "junctions.fst")))

    sw.common <- as.data.table(read.fst(file.path(norm_output_path, 
        "annotation", "IR.fst")))

    candidate.introns[, c("seqnames") := as.character(get("seqnames"))]
    sw.common[, c("seqnames") := as.character(get("seqnames"))]
    
    # Order introns by importance (WHY?)
    candidate.introns[, c("transcript_biotype_2") := get("transcript_biotype")]
    candidate.introns[
        !(get("transcript_biotype") %in%
            c("protein_coding", "processed_transcript",
            "lincRNA", "antisense", "nonsense_mediated_decay")),
        c("transcript_biotype_2") := "other"]
    candidate.introns[, c("transcript_biotype_2") :=
        factor(get("transcript_biotype_2"),
            c("protein_coding", "processed_transcript",
            "lincRNA", "antisense", "other", "nonsense_mediated_decay"),
        ordered = TRUE)]
    if ("transcript_support_level" %in% colnames(candidate.introns)) {
        setorderv(candidate.introns,
            c("transcript_biotype_2", "transcript_support_level"))
    } else {
        setorder(candidate.introns, "transcript_biotype_2")
    }
    candidate.introns[, c("Event1a") := get("Event")]
    candidate.introns[, c("Event2a") := get("Event")]

    # Annotate exon islands for splice events
    Splice.Anno <- read.fst(file.path(reference_path, "fst", "Splice.fst"),
        as.data.table = TRUE)

    # Remove retained introns not assayed in sw.common
    Splice.Anno <- Splice.Anno[
        get("Event1a") %in% sw.common$EventRegion |
        get("EventType") != "RI"
    ]

    Splice.Anno[candidate.introns, on = "Event1a",
        c("up_1a") := paste(get("i.gene_group_stranded"),
        get("i.exon_group_stranded_upstream"), sep = "_")]
    Splice.Anno[candidate.introns, on = "Event1a",
        c("down_1a") := paste(get("i.gene_group_stranded"),
        get("i.exon_group_stranded_downstream"), sep = "_")]
    Splice.Anno[candidate.introns, on = "Event2a",
        c("down_2a") := paste(get("i.gene_group_stranded"),
        get("i.exon_group_stranded_downstream"), sep = "_")]

    Splice.Anno[get("EventType") %in% c("MXE", "SE", "ALE", "A3SS", "RI"),
        c("JG_up") := get("up_1a")]
    Splice.Anno[get("EventType") %in% c("SE", "AFE", "A5SS", "RI"),
        c("JG_down") := get("down_1a")]
    Splice.Anno[get("EventType") %in% c("MXE"),
        c("JG_down") := get("down_2a")]

    Splice.Anno$up_1a <- NULL
    Splice.Anno$down_1a <- NULL
    Splice.Anno$down_2a <- NULL
    Splice.Anno[, c("strand") :=
        tstrsplit(get("Event1a"), split = "/")[[2]]]
    
    write.fst(as.data.frame(Splice.Anno),
        file.path(norm_output_path, "annotation", "Splice.fst"))
    rm(Splice.Anno, candidate.introns, sw.common)
    gc()
}

# Generate rowData annotations
.collateData_rowEvent <- function(threadID, reference_path, norm_output_path) {
    if(threadID != 2) return()
    .collateData_rowEvent_brief(norm_output_path)
    .collateData_rowEvent_mapGenes(norm_output_path)
    .collateData_rowEvent_splice_option(reference_path, norm_output_path)
    .collateData_rowEvent_full(reference_path, norm_output_path)
}

# Truncated rowEvent, with EventName, EventType, EventRegion
.collateData_rowEvent_brief <- function(norm_output_path) {
    # make rowEvent brief here
    sw.common <- as.data.table(read.fst(
        file.path(norm_output_path, "annotation", "IR.fst")))
    sw.common[, c("seqnames") := as.character(get("seqnames"))]
    
    sw.anno.brief <- sw.common[, c("Name", "EventRegion")]
    setnames(sw.anno.brief, "Name", "EventName")
    sw.anno.brief[, c("EventType") := "IR"]
    sw.anno.brief <- sw.anno.brief[,
        c("EventName", "EventType", "EventRegion")]

    rowEvent <- sw.anno.brief
    spliceFile <- file.path(norm_output_path, "annotation", "Splice.fst")
    if(file.exists(spliceFile)) {
        Splice.Anno <- as.data.table(read.fst(spliceFile))
        splice.anno.brief <- Splice.Anno[,
            c("EventName", "EventType", "EventRegion")]
        rowEvent <- rbind(rowEvent, splice.anno.brief)    
    }    
    
    write.fst(as.data.frame(rowEvent), 
        file.path(norm_output_path, "rowEvent.brief.fst"))
    
    rm(sw.common, Splice.Anno, sw.anno.brief, splice.anno.brief)
    gc()
}

# Create mapping of EventNames to Genes (for Gene Ontology)
.collateData_rowEvent_mapGenes <- function(norm_output_path) {

    reference_path <- file.path(norm_output_path, "Reference")
    spliceFile <- file.path(reference_path, "fst/Splice.fst")
    TrFile <- file.path(reference_path, "fst/Transcripts.fst")
    IRdirFile <- file.path(reference_path, "fst/Introns.Dir.fst")
    IRnondirFile <- file.path(reference_path, "fst/Introns.ND.fst")
    
    # Create mappers
    IR_trid <- rbind(
        read.fst(
            IRdirFile, columns = c("EventName", "transcript_id")
        ),
        read.fst(
            IRnondirFile, columns = c("EventName", "transcript_id")
        )
    )
    Tr2Gene <- read.fst(TrFile, columns = c("transcript_id", "gene_id"))
    IR_trid <- as.data.table(IR_trid)
    Tr2Gene <- as.data.table(Tr2Gene)
    IR_trid <- Tr2Gene[IR_trid, on = "transcript_id"]
    IR_trid$gene_id_b <- IR_trid$gene_id
    allEvents <- IR_trid[, c("EventName", "gene_id", "gene_id_b"), with = FALSE]

    if(file.exists(spliceFile)) {
        splice_geneid <- read.fst(
            spliceFile, columns = c("EventName", "gene_id", "gene_id_b")
        )
        splice_geneid$gene_id <- as.character(splice_geneid$gene_id)
        splice_geneid$gene_id_b <- as.character(splice_geneid$gene_id_b)

        
        splice_geneid$gene_id <- as.character(splice_geneid$gene_id)
        splice_geneid$gene_id_b <- as.character(splice_geneid$gene_id_b)    
    }

    allEvents <- rbind(allEvents, splice_geneid)
    allEvents <- as.data.table(allEvents)
    
    # Order by events in rowEvent
    rowEvent <- read.fst(
        file.path(norm_output_path, "rowEvent.brief.fst")
    )
    rowEvent$gene_id <- allEvents$gene_id[match(
        rowEvent$EventName, allEvents$EventName)]
    rowEvent$gene_id_b <- allEvents$gene_id_b[match(
        rowEvent$EventName, allEvents$EventName)]
    
    write.fst(as.data.frame(rowEvent), 
        file.path(norm_output_path, "rowEvent.brief.fst"))
    
    rm(rowEvent, splice_geneid, IR_trid, Tr2Gene, allEvents)
    gc()
}

# Generate annotation based on importance of involved transcripts
.collateData_rowEvent_splice_option <- function(
        reference_path, norm_output_path
) {
    if(!file.exists(file.path(reference_path, "fst", "Splice.options.fst"))) 
        return()
    
    Splice.Options <- as.data.table(read.fst(
        file.path(reference_path, "fst", "Splice.options.fst")))
    Transcripts <- as.data.table(read.fst(
        file.path(reference_path, "fst", "Transcripts.fst")))
    Splice.Anno.Brief <- read.fst(
        file.path(reference_path, "fst", "Splice.fst"),
        as.data.table = TRUE, columns = c("EventName", "EventID"))
    # Splice.Options[Splice.Anno.Brief, on = "EventID",
        # c("EventName") := get("i.EventName")]
    # Splice.Options[Transcripts, on = "transcript_id",
        # c("transcript_biotype") := get("i.transcript_biotype")]
    Splice.Options$EventName <- Splice.Anno.Brief$EventName[match(
        Splice.Options$EventID, Splice.Anno.Brief$EventID)]
    Splice.Options$transcript_biotype <- Transcripts$transcript_biotype[match(
        Splice.Options$transcript_id, Transcripts$transcript_id)]

    Splice.Options.Summary <- copy(Splice.Options)
    # Splice.Options.Summary[,
        # c("tsl_min") := min(get("transcript_support_level")),
        # by = c("EventID", "isoform")]
    # Splice.Options.Summary[,
        # c("any_is_PC") := any(get("is_protein_coding")),
        # by = c("EventID", "isoform")]
    # Splice.Options.Summary[,
        # c("all_is_NMD") := all(grepl("decay", get("transcript_biotype"))),
        # by = c("EventID", "isoform")]
    Splice.Options.Summary[,
        c("tsl_min", "any_is_PC", "all_is_NMD") := list(
            min(get("transcript_support_level")),
            any(get("is_protein_coding")),
            all(grepl("decay", get("transcript_biotype")))
        ), by = c("EventID", "isoform")]

    write.fst(as.data.frame(Splice.Options.Summary),
        file.path(norm_output_path, "annotation", "Splice.Options.Summary.fst"))
    rm(Splice.Options.Summary, Splice.Options, Transcripts, Splice.Anno.Brief)
    gc()
}

# Annotate full rowEvent. Includes:
# - whether Inc/Exc is a protein coding splicing event
# - whether Inc/Exc represent an event causing NMD
# - the highest-ranking TSL for each alternative (A/B) of event
.collateData_rowEvent_full <- function(reference_path, norm_output_path) {

    Splice.Options.Summary <- as.data.table(read.fst(
        file.path(norm_output_path, "annotation", 
        "Splice.Options.Summary.fst")))
    Splice.Anno <- as.data.table(read.fst(
        file.path(norm_output_path, "annotation", "Splice.fst")))

    rowEvent.Extended <- read.fst(
        file.path(norm_output_path, "rowEvent.brief.fst"),
        as.data.table = TRUE)
    
    # IR-NMD: not guaranteed to exist:
    IRNMDfile <- file.path(reference_path, "fst", "IR.NMD.fst")
    IR_NMD <- NULL
    if(file.exists(IRNMDfile)) {
        IR_NMD <- read.fst(IRNMDfile, as.data.table = TRUE)
    }

    candidate.introns <- as.data.table(
        read.fst(file.path(reference_path, "fst", "junctions.fst")))   

    rowEvent.Extended[get("EventType") == "IR",
        c("intron_id") := tstrsplit(get("EventName"), split = "/")[[2]]]

    rowEvent.Extended.IR <- rowEvent.Extended[get("EventType") == "IR"]
    rowEvent.Extended.splice <- rowEvent.Extended[get("EventType") != "IR"]

    rowEvent.Extended.IR[, 
        c("Inc_Is_Protein_Coding", "Exc_Is_Protein_Coding") := 
        list(FALSE,FALSE)
    ]
    rowEvent.Extended.splice[, 
        c("Inc_Is_Protein_Coding", "Exc_Is_Protein_Coding") := 
        list(FALSE,FALSE)
    ]

    if(!is.null(IR_NMD)) {
        rowEvent.Extended.IR[
            get("intron_id") %in% IR_NMD$intron_id[IR_NMD$intron_type == "CDS"],
            c("Inc_Is_Protein_Coding", "Exc_Is_Protein_Coding") := TRUE
        ]    
    }

    rowEvent.Extended.splice[
        get("EventName") %in% Splice.Options.Summary$EventName[
            Splice.Options.Summary$isoform == "A" &
            Splice.Options.Summary$any_is_PC == TRUE
        ],
        c("Inc_Is_Protein_Coding") := TRUE
    ]
    rowEvent.Extended.splice[
        get("EventName") %in% Splice.Options.Summary$EventName[
            Splice.Options.Summary$isoform == "B" &
            Splice.Options.Summary$any_is_PC == TRUE
        ],
        c("Exc_Is_Protein_Coding") := TRUE
    ]

    # rowEvent.Extended[Splice.Options.Summary[get("isoform") == "A"],
        # on = "EventName", c("Inc_Is_Protein_Coding") := get("i.any_is_PC")]
    # rowEvent.Extended[Splice.Options.Summary[get("isoform") == "B"],
        # on = "EventName", c("Exc_Is_Protein_Coding") := get("i.any_is_PC")]


    rowEvent.Extended.IR[, c("Exc_Is_NMD", "Inc_Is_NMD") := 
        list(FALSE, FALSE)]
    rowEvent.Extended.splice[, c("Exc_Is_NMD", "Inc_Is_NMD") := 
        list(FALSE, FALSE)]

    tmpA <- Splice.Options.Summary[get("isoform") == "A"]
    tmpB <- Splice.Options.Summary[get("isoform") == "B"]

    ## Important changes:
    ## - for IR, NMD is
    ##   - reported for all introns calculated in reference (not just CDS)
    ##   - NA if intron not tested (not FALSE, as in version <= 1.1.2)
    ##   - only true for spliced transcripts if true for all transcripts
    ##     containing the same intron

    if(!is.null(IR_NMD)) {
        # rowEvent.Extended[IR_NMD[!is.na(get("splice_is_NMD"))], on = "intron_id",
            # c("Exc_Is_NMD") := get("i.splice_is_NMD")]
        # rowEvent.Extended[IR_NMD, on = "intron_id",
            # c("Inc_Is_NMD") := get("i.IRT_is_NMD")]
        rowEvent.Extended.IR$Inc_Is_NMD <- IR_NMD$IRT_is_NMD[match(
            rowEvent.Extended.IR$intron_id, IR_NMD$intron_id)]
        rowEvent.Extended.IR$tmpExc_Is_NMD <- IR_NMD$splice_is_NMD[match(
            rowEvent.Extended.IR$intron_id, IR_NMD$intron_id)]
        rowEvent.Extended.IR[, 
            c("sumExc_Is_NMD") := sum(get("tmpExc_Is_NMD") == TRUE, na.rm = TRUE),
            by = "EventRegion"
        ]
        rowEvent.Extended.IR[, c("Exc_Is_NMD") := get("sumExc_Is_NMD") > 0]
        rowEvent.Extended.IR$sumExc_Is_NMD <- NULL   
        rowEvent.Extended.IR$tmpExc_Is_NMD <- NULL

        rowEvent.Extended.IR[!(get("intron_id") %in% IR_NMD$intron_id),
            c("Exc_Is_NMD", "Inc_Is_NMD") := list(NA, NA)]    
        
        # rowEvent.Extended[Splice.Options.Summary[get("isoform") == "A"],
            # on = "EventName", c("Inc_Is_NMD") := get("i.all_is_NMD")]
        # rowEvent.Extended[Splice.Options.Summary[get("isoform") == "B"],
            # on = "EventName", c("Exc_Is_NMD") := get("i.all_is_NMD")]

        rowEvent.Extended.splice$Inc_Is_NMD <- tmpA$all_is_NMD[match(
            rowEvent.Extended.splice$EventName, tmpA$EventName)]
        rowEvent.Extended.splice$Exc_Is_NMD <- tmpB$all_is_NMD[match(
            rowEvent.Extended.splice$EventName, tmpB$EventName)]
    }
    
    rowEvent.Extended.IR[, c("Inc_TSL", "Exc_TSL") := list(NA, NA)]
    rowEvent.Extended.splice[, c("Inc_TSL", "Exc_TSL") := list(NA, NA)]

    # rowEvent.Extended[candidate.introns, on = "intron_id",
        # c("Inc_TSL") := get("i.transcript_support_level")]
    # rowEvent.Extended[candidate.introns, on = "intron_id",
        # c("Exc_TSL") := get("i.transcript_support_level")]
    rowEvent.Extended.IR$Inc_TSL <- 
        candidate.introns$transcript_support_level[match(
            rowEvent.Extended.IR$intron_id, candidate.introns$intron_id)]
    rowEvent.Extended.IR$Exc_TSL <- 
        candidate.introns$transcript_support_level[match(
            rowEvent.Extended.IR$intron_id, candidate.introns$intron_id)]

    # rowEvent.Extended[Splice.Options.Summary[get("isoform") == "A"],
        # on = "EventName", c("Inc_TSL") := get("i.tsl_min")]
    # rowEvent.Extended[Splice.Options.Summary[get("isoform") == "B"],
        # on = "EventName", c("Exc_TSL") := get("i.tsl_min")]
    rowEvent.Extended.splice$Inc_TSL <- tmpA$tsl_min[match(
        rowEvent.Extended.splice$EventName, tmpA$EventName)]
    rowEvent.Extended.splice$Exc_TSL <- tmpB$tsl_min[match(
        rowEvent.Extended.splice$EventName, tmpB$EventName)]
        
    # Designate exclusive first intron / last intron
    candidate.introns[, c("inverse_intron_number") :=
        max(get("intron_number")) - get("intron_number") + 1, 
        by = "transcript_id"]
    candidate.introns[, 
        c("max_intron_number", "max_inv_intron_number") := list(
            max(get("intron_number")), max(get("inverse_intron_number"))), 
        by = "Event"
    ]

    rowEvent.Extended <- rbind(rowEvent.Extended.IR, rowEvent.Extended.splice)

    # define Event1 / Event2
    # rowEvent.Extended[get("EventType") == "IR",
        # c("Event1a") := get("EventRegion")]
    # rowEvent.Extended[Splice.Anno, on = "EventName",
        # c("Event1a", "Event2a", "Event1b", "Event2b") :=
        # list(get("i.Event1a"), get("i.Event2a"),
            # get("i.Event1b"), get("i.Event2b"))]
    rowEvent.Extended$Event1a <- Splice.Anno$Event1a[match(
        rowEvent.Extended$EventName, Splice.Anno$EventName)]
    rowEvent.Extended$Event2a <- Splice.Anno$Event2a[match(
        rowEvent.Extended$EventName, Splice.Anno$EventName)]
    rowEvent.Extended$Event1b <- Splice.Anno$Event1b[match(
        rowEvent.Extended$EventName, Splice.Anno$EventName)]
    rowEvent.Extended$Event2b <- Splice.Anno$Event2b[match(
        rowEvent.Extended$EventName, Splice.Anno$EventName)]
    rowEvent.Extended[get("EventType") == "IR",
        c("Event1a") := get("EventRegion")]
    
    # rowEvent.Extended[candidate.introns, on = "Event1a",
        # c("iafi_A") := (get("i.max_intron_number") == 1)]
    # rowEvent.Extended[candidate.introns, on = "Event1b",
        # c("iafi_B") := (get("i.max_intron_number") == 1)]
    # rowEvent.Extended[, c("is_always_first_intron") :=
        # get("iafi_A") & get("iafi_B")]
    # rowEvent.Extended[, c("iafi_A", "iafi_B") := list(NULL, NULL)]

    rowEvent.Extended$is_always_first_intron <-
        rowEvent.Extended$Event1a %in% candidate.introns$Event[
            candidate.introns$max_intron_number == 1] &
        rowEvent.Extended$Event1b %in% candidate.introns$Event[
            candidate.introns$max_intron_number == 1]       

    # rowEvent.Extended[candidate.introns, on = "Event1a",
        # c("iali_A") := (get("i.max_inv_intron_number") == 1)]
    # rowEvent.Extended[candidate.introns, on = "Event1b",
        # c("iali_B") := (get("i.max_inv_intron_number") == 1)]
    # rowEvent.Extended[, c("is_always_last_intron") :=
        # get("iali_A") & get("iali_B")]
    # rowEvent.Extended[, c("iali_A", "iali_B") := list(NULL, NULL)]

    rowEvent.Extended$is_always_last_intron <-
        rowEvent.Extended$Event1a %in% candidate.introns$Event[
            candidate.introns$max_inv_intron_number == 1] &
        rowEvent.Extended$Event1b %in% candidate.introns$Event[
            candidate.introns$max_inv_intron_number == 1]       
    
    rowEvent.Extended[get("EventType") %in% c("MXE", "SE"),
        c("is_always_first_intron", "is_always_last_intron") := list(NA,NA)]

    # Convert IR to `annotated`
    rowEvent.Extended[, c("is_annotated_IR") := FALSE]

    Exons <- read.fst(file.path(reference_path, "fst", "Exons.fst"),
        as.data.table = TRUE)
    OL <- findOverlaps(
        coord2GR(unlist(
            rowEvent.Extended[get("EventType") == "IR", "EventRegion"]
        )), 
        .grDT(Exons),
        type = "within"
    )
    rowEvent.Extended[unique(OL@from), c("is_annotated_IR") := TRUE]

    # Annotate NMD direction
    rowEvent.Extended[, c("NMD_direction") := 0]
    if(!is.null(IR_NMD)) {
        rowEvent.Extended[get("Inc_Is_NMD") & !get("Exc_Is_NMD"), 
            c("NMD_direction") := 1]
        rowEvent.Extended[!get("Inc_Is_NMD") & get("Exc_Is_NMD"), 
            c("NMD_direction") := -1]
    }
    
    write.fst(as.data.frame(rowEvent.Extended), 
        file.path(norm_output_path, "rowEvent.fst"))

    rm(rowEvent.Extended, rowEvent.Extended.splice, rowEvent.Extended.IR,   
        candidate.introns, IR_NMD, Splice.Options.Summary, Splice.Anno)
    gc()
}

################################################################################
# Sub

# Collates data from junction counts and processBAM coverage
.collateData_compile_agglist <- function(x, jobs, df.internal,
        norm_output_path, IRMode,
        useProgressBar = TRUE) {

    # Load dataframe headers (left-most columns containing annotations)
    rowEvent <- as.data.table(read.fst(
        file.path(norm_output_path, "rowEvent.brief.fst")))
    junc.common <- as.data.table(read.fst(
        file.path(norm_output_path, "annotation", "Junc.fst"),
        columns = c("seqnames", "start", "end", "strand", 
            "Event", "JG_up", "JG_down")))
    sw.common <- as.data.table(read.fst(
        file.path(norm_output_path, "annotation", "IR.fst"),
        columns = c("seqnames", "start", "end", "Name", "Null", "strand",
        "EventRegion")))
    Splice.Anno <- as.data.table(read.fst(
        file.path(norm_output_path, "annotation", "Splice.fst")))
    junc_PSI <- as.data.table(read.fst(
        file.path(norm_output_path, "junc_PSI_index.fst")))
    
    junc.common[, c("seqnames") := as.character(get("seqnames"))]
    sw.common[, c("seqnames") := as.character(get("seqnames"))]

    work <- jobs[[x]]
    block <- df.internal[work]
    templates <- .collateData_seed_init(rowEvent, junc_PSI) # list of DT

    pb <- NULL
    if(useProgressBar) 
        pb <- progress_bar$new(
            format = " generating arrays [:bar] :percent eta: :eta",
            total = length(work), clear = FALSE, width= 100)
    for (i in seq_len(length(work))) {
        junc <- .collateData_process_junc(
            block$sample[i], block$strand[i],
            junc.common, norm_output_path)

        junc_tmp <- junc[sw.common, 
            on = c("seqnames", "start", "end", "strand"),
            c("seqnames", "start", "end", "strand", "SO_L", "SO_R")]

        sw <- .collateData_process_sw(
            block$sample[i], block$strand[i], 
            junc_tmp, sw.common, norm_output_path)

        junc_tmp2 <- junc[get("Event") %in%
            c(Splice.Anno$Event1a, Splice.Anno$Event1b,
            Splice.Anno$Event2a, Splice.Anno$Event2b)]

        splice <- .collateData_process_splice(
            junc_tmp2, sw, Splice.Anno)

        splice <- .collateData_process_splice_depth(
            splice, sw)

        # Dump unneeded columns to preserve memory
        junc <- junc[, c("seqnames", "start", "end", "strand",
            "count", "count_unstranded", "SO_L", "SO_R")]
        sw_cols_keep <- c("EventRegion", "strand", "IntronDepth", 
                "ExonToIntronReadsLeft", "ExonToIntronReadsRight",
                "TotalDepth", "Coverage")
        if (IRMode == "SpliceOver") {
            sw <- sw[, c(sw_cols_keep, "SpliceOver"), with = FALSE]
        } else {
            sw <- sw[, c(sw_cols_keep, "SpliceMax"), with = FALSE]
        }
        gc()
            
        .collateData_process_assays_as_fst(rowEvent, junc_PSI, block$sample[i], 
            junc, sw, splice, IRMode, norm_output_path)

        # remove temp files - raw extracted junc / SW output from processBAM
        file.remove(file.path(norm_output_path, "temp",
            paste(block$sample[i], "junc.fst.tmp", sep = ".")))
        file.remove(file.path(norm_output_path, "temp",
            paste(block$sample[i], "sw.fst.tmp", sep = ".")))
        if(!is.null(pb)) pb$tick()
    } # end FOR loop
    return(NULL)
}

# INTERNAL: copies a list of data.table
.copy_DT <- function(templates) {
    copy <- lapply(templates, copy)
    names(copy) <- names(templates)
    return(copy)
}

# Curate list of subset headers
.collateData_seed_init <- function(rowEvent, junc_PSI) {
    templates <- list(
        assay = copy(rowEvent),
        inc = rowEvent[get("EventType") %in% c("IR", "MXE", "SE", "RI")],
        exc = rowEvent[get("EventType") %in% c("MXE")],
        junc = copy(junc_PSI)
    )
    return(templates)
}

# Collates junction counts, given sample strandedness and junction strand anno
# - calculates SpliceLeft, SpliceRight, SpliceOver sums
.collateData_process_junc <- function(sample, strand,
        junc.common, norm_output_path) {
    junc <- as.data.table(
        read.fst(file.path(norm_output_path, "temp",
            paste(sample, "junc.fst.tmp", sep = ".")))
    )
    junc[, c("seqnames") := as.character(get("seqnames"))]
    junc[, c("start") := get("start") + 1]
    junc$strand <- NULL # Use strand from junc.common

    junc <- junc[junc.common, on = colnames(junc.common)[c(1, 2, 3)]]
    junc$count_unstranded <- junc$total
    if (strand == 0) {
        junc$count <- junc$total
    } else if (strand == -1) {
        junc$count <- 0
        junc[get("strand") == "+", c("count") := get("neg")]
        junc[get("strand") == "-", c("count") := get("pos")]
        junc[get("strand") == "*", c("count") := get("total")]
    } else {
        junc$count <- 0
        junc[get("strand") == "+", c("count") := get("pos")]
        junc[get("strand") == "-", c("count") := get("neg")]
        junc[get("strand") == "*", c("count") := get("total")]
    }
    junc[is.na(get("count")), c("count") := 0]
    junc <- junc[, c("seqnames", "start", "end", "strand", 
        "Event", "count", "count_unstranded")]
    junc <- cbind(junc, junc.common[, c("JG_up", "JG_down")])
    junc[, c("SO_L") := 0]
    junc[, c("SO_R") := 0]
    junc[, c("SO_I") := 0]

    # SpliceLeft and SpliceRight calculations
    junc[, c("SL") := sum(get("count")),
        by = c("seqnames", "start", "strand")]
    junc[, c("SR") := sum(get("count")),
        by = c("seqnames", "end", "strand")]

    # first overlap any junction that has non-same-island junctions
    junc[get("JG_up") != get("JG_down") &
            get("JG_up") != "" & get("strand") == "+",
        c("SO_L") := sum(get("count")), by = "JG_up"]
    junc[get("JG_up") != get("JG_down") &
            get("JG_down") != "" & get("strand") == "+",
        c("SO_R") := sum(get("count")), by = "JG_down"]
    junc[get("JG_up") != get("JG_down") &
            get("JG_up") != "" & get("strand") == "-",
        c("SO_R") := sum(get("count")), by = "JG_up"]
    junc[get("JG_up") != get("JG_down") &
            get("JG_down") != "" & get("strand") == "-",
        c("SO_L") := sum(get("count")), by = "JG_down"]

    # Then use a simple overlap method to account for the remainder

    # Filter for same-island junctions
    junc.subset <- junc[get("JG_up") == get("JG_down") &
        get("JG_up") != "" & get("JG_down") != ""]
    junc.subset <- junc.subset[!(grepl("NA_NA", get("JG_up")) &
        !grepl("NA_NA", get("JG_down")))]

    if (nrow(junc.subset) > 0) {
        subject <- junc[get("count") > 0]
        OL <- .findOverlaps_merge_DT(junc.subset, subject)

        splice.overlaps.DT <- data.table(from = from(OL), to = to(OL))
        splice.overlaps.DT[,
            c("count") := subject$count[to(OL)]]
        splice.overlaps.DT[,
            c("count_sum") := sum(get("count")), by = "from"]
        splice.summa <- unique(
            splice.overlaps.DT[, c("from", "count_sum")])

        junc.subset[splice.summa$from,
            c("SO_I") := splice.summa$count_sum]

        junc[junc.subset, on = c("Event"), c("SO_I") := get("i.SO_I")]

        # For annotated junctions, take SpliceOver as max of
        # SpliceLeft, SpliceRight, or SpliceOver
        junc[get("SO_L") < get("SO_I"), c("SO_L") := get("SO_I")]
        junc[get("SO_R") < get("SO_I"), c("SO_R") := get("SO_I")]
        # Finally, for extreme cases, make SO_L = SL if underestimates
        junc[get("SO_L") < get("SL"), c("SO_L") := get("SL")]
        junc[get("SO_R") < get("SR"), c("SO_R") := get("SR")]
        junc[, c("SO_I") := NULL]
        rm(OL, subject, junc.subset)
        gc()
    }

    return(junc)
}

# Imports processBAM data, calculates SpliceOver from junction counts
.collateData_process_sw <- function(sample, strand, junc,
        sw.common, norm_output_path) {
    sw <- as.data.table(
        read.fst(file.path(norm_output_path, "temp",
            paste(sample, "sw.fst.tmp", sep = ".")))
    )
    sw[, c("seqnames") := as.character(get("seqnames"))]
    sw[, c("start") := get("start") + 1]
    sw <- sw[sw.common, on = colnames(sw.common)[seq_len(6)],
        c("EventRegion") := get("i.EventRegion")]

    # Extra statistics:
    sw[, c("SpliceMax") := 0]
    sw[get("SpliceLeft") >= get("SpliceRight"),
        c("SpliceMax") := get("SpliceLeft")]
    sw[get("SpliceLeft") < get("SpliceRight"),
        c("SpliceMax") := get("SpliceRight")]

    sw[junc, on = c("seqnames", "start", "end", "strand"),
        c("SpliceOverLeft") := get("SO_L")]
    sw[junc, on = c("seqnames", "start", "end", "strand"),
        c("SpliceOverRight") := get("SO_R")]
    sw[get("SpliceOverLeft") >= get("SpliceOverRight"),
        c("SpliceOver") := get("SpliceOverLeft")]
    sw[get("SpliceOverLeft") < get("SpliceOverRight"),
        c("SpliceOver") := get("SpliceOverRight")]

    sw[, c("IROratio") := 0]
    sw[get("IntronDepth") < 1 & get("IntronDepth") > 0 &
        (get("Coverage") + get("SpliceOver")) > 0,
        c("IROratio") := get("Coverage") / (
            get("Coverage") + get("SpliceOver"))]
    sw[get("IntronDepth") >= 1,
        c("IROratio") := get("IntronDepth") /
            (get("IntronDepth") + get("SpliceOver"))]

    sw[, c("TotalDepth") := get("IntronDepth") + get("SpliceOver")]
    setnames(sw, "Name", "EventName")
    return(sw)
}

# Calculates junction counts for each annotated splice event. Also calculates:
# - participation: The proportion of junctions at the region for each ASE
# - coverage: The total number of junctions covered at that ASE region
.collateData_process_splice <- function(junc, sw, Splice.Anno) {
    splice <- copy(Splice.Anno)

    # Summarises counts for all splice events as defined by Event(1/2)(a/b)
    splice[, c("count_Event1a", "count_Event2a",
        "count_Event1b", "count_Event2b") := list(0, 0, 0, 0)]
    splice[!is.na(get("Event1a")),
        c("count_Event1a") := junc$count[match(get("Event1a"), junc$Event)]]
    splice[is.na(get("count_Event1a")),
        c("count_Event1a") := 0]
    splice[!is.na(get("Event1a")) & get("EventType") == "RI",
        c("count_Event1a") :=
            sw$IntronDepth[match(get("Event1a"), sw$EventRegion)]
    ]

    splice[!is.na(get("Event2a")),
        c("count_Event2a") := junc$count[match(get("Event2a"), junc$Event)]]
    splice[is.na(get("count_Event2a")),
        c("count_Event2a") := 0]

    splice[!is.na(get("Event1b")),
        c("count_Event1b") := junc$count[match(get("Event1b"), junc$Event)]]
    splice[is.na(get("count_Event1b")),
        c("count_Event1b") := 0]

    splice[!is.na(get("Event2b")),
        c("count_Event2b") := junc$count[match(get("Event2b"), junc$Event)]]
    splice[is.na(get("count_Event2b")),
        c("count_Event2b") := 0]

    # Hack: for RI, ExonToIntronReadsLeft/Right is stored in count_Event2a/b
    splice[get("EventType") == "RI" &
            tstrsplit(get("Event1a"), split = "/", fixed = TRUE)[[2]] == "+",
        c("count_Event2a", "count_Event2b") := list(
            sw$ExonToIntronReadsLeft[match(get("Event1a"), sw$EventRegion)],
            sw$ExonToIntronReadsRight[match(get("Event1a"), sw$EventRegion)]
        )
    ]
    splice[get("EventType") == "RI" &
            tstrsplit(get("Event1a"), split = "/", fixed = TRUE)[[2]] == "-",
        c("count_Event2a", "count_Event2b") := list(
            sw$ExonToIntronReadsRight[match(get("Event1a"), sw$EventRegion)],
            sw$ExonToIntronReadsLeft[match(get("Event1a"), sw$EventRegion)]
        )
    ]

    # Calculates total splice events that participates from the up/down
    #   stream exon islands
    splice[, c("count_JG_up", "count_JG_down") := list(0, 0)]
    # splice[!is.na(get("JG_up")) & get("strand") == "+",
        # c("count_JG_up") := junc$SO_L[match(get("JG_up"), junc$JG_up)]]
    # splice[!is.na(get("JG_up")) & get("strand") == "-",
        # c("count_JG_up") := junc$SO_R[match(get("JG_up"), junc$JG_up)]]
    # splice[is.na(get("count_JG_up")), c("count_JG_up") := 0]
    # splice[!is.na(get("JG_down")) & get("strand") == "-",
        # c("count_JG_down") := junc$SO_L[match(get("JG_down"), junc$JG_down)]]
    # splice[!is.na(get("JG_down")) & get("strand") == "+",
        # c("count_JG_down") := junc$SO_R[match(get("JG_down"), junc$JG_down)]]
    # splice[is.na(get("count_JG_down")), c("count_JG_down") := 0]
    
    # New method of determing depths
    # Use Event1b's SO_L and SO_R for single events
    # Use Event1a's SO_L and SO_R for IR/RI
    # For SE, use Event1a and Event2a's SO
    # For MXE, use Event1a and Event2b's SO

    single_events <- c("AFE", "ALE", "A5SS", "A3SS")
    splice[get("strand") == "+", 
        c("count_JG_up_1") := junc$SO_L[match(get("Event1a"), junc$Event)]]
    splice[get("strand") == "-", 
        c("count_JG_up_1") := junc$SO_R[match(get("Event1a"), junc$Event)]]
    splice[get("strand") == "+" & !is.na(get("Event1b")), 
        c("count_JG_up_2") := junc$SO_L[match(get("Event1b"), junc$Event)]]
    splice[get("strand") == "-" & !is.na(get("Event1b")), 
        c("count_JG_up_2") := junc$SO_R[match(get("Event1b"), junc$Event)]]
    splice[get("strand") == "+" & is.na(get("Event1b")), 
        c("count_JG_up_2") := junc$SO_L[match(get("Event1a"), junc$Event)]]
    splice[get("strand") == "-" & is.na(get("Event1b")), 
        c("count_JG_up_2") := junc$SO_R[match(get("Event1a"), junc$Event)]]
    splice[get("count_JG_up_1") >= get("count_JG_up_2"),
        c("count_JG_up") := get("count_JG_up_1")]
    splice[get("count_JG_up_1") < get("count_JG_up_2"),
        c("count_JG_up") := get("count_JG_up_2")]

    splice[get("strand") == "+" & is.na(get("Event2a")), 
        c("count_JG_down_1") := junc$SO_R[match(get("Event1a"), junc$Event)]]
    splice[get("strand") == "-" & is.na(get("Event2a")), 
        c("count_JG_down_1") := junc$SO_L[match(get("Event1a"), junc$Event)]]
    splice[get("strand") == "+" & !is.na(get("Event2a")), 
        c("count_JG_down_1") := junc$SO_R[match(get("Event2a"), junc$Event)]]
    splice[get("strand") == "-" & !is.na(get("Event2a")), 
        c("count_JG_down_1") := junc$SO_L[match(get("Event2a"), junc$Event)]]
    splice[get("strand") == "+" & is.na(get("Event2b")), 
        c("count_JG_down_2") := junc$SO_R[match(get("Event1b"), junc$Event)]]
    splice[get("strand") == "-" & is.na(get("Event2b")), 
        c("count_JG_down_2") := junc$SO_L[match(get("Event1b"), junc$Event)]]
    splice[get("strand") == "+" & !is.na(get("Event2b")), 
        c("count_JG_down_2") := junc$SO_R[match(get("Event2b"), junc$Event)]]
    splice[get("strand") == "-" & !is.na(get("Event2b")), 
        c("count_JG_down_2") := junc$SO_L[match(get("Event2b"), junc$Event)]]
    splice[get("count_JG_down_1") >= get("count_JG_down_2"),
        c("count_JG_down") := get("count_JG_down_1")]
    splice[get("count_JG_down_1") < get("count_JG_down_2"),
        c("count_JG_down") := get("count_JG_down_2")]

    splice$count_JG_up_1 <- splice$count_JG_up_2 <- NULL
    splice$count_JG_down_1 <- splice$count_JG_down_2 <- NULL
        
    splice[get("strand") == "+" & !(get("EventType") %in% single_events), 
        c("count_JG_up") := junc$SO_L[match(get("Event1a"), junc$Event)]]
    splice[get("strand") == "-" & !(get("EventType") %in% single_events), 
        c("count_JG_up") := junc$SO_R[match(get("Event1a"), junc$Event)]]
    splice[get("strand") == "+" & get("EventType") %in% single_events, 
        c("count_JG_up") := junc$SO_L[match(get("Event1b"), junc$Event)]]
    splice[get("strand") == "-" & get("EventType") %in% single_events, 
        c("count_JG_up") := junc$SO_R[match(get("Event1b"), junc$Event)]]
        
    splice[get("strand") == "+" & get("EventType") %in% c("IR", "RI"), 
        c("count_JG_down") := junc$SO_R[match(get("Event1a"), junc$Event)]]
    splice[get("strand") == "-" & get("EventType") %in% c("IR", "RI"), 
        c("count_JG_down") := junc$SO_L[match(get("Event1a"), junc$Event)]]
    splice[get("strand") == "+" & get("EventType") %in% single_events, 
        c("count_JG_down") := junc$SO_R[match(get("Event1b"), junc$Event)]]
    splice[get("strand") == "-" & get("EventType") %in% single_events, 
        c("count_JG_down") := junc$SO_L[match(get("Event1b"), junc$Event)]]
    splice[get("strand") == "+" & get("EventType") == "SE", 
        c("count_JG_down") := junc$SO_R[match(get("Event2a"), junc$Event)]]
    splice[get("strand") == "-" & get("EventType") == "SE", 
        c("count_JG_down") := junc$SO_L[match(get("Event2a"), junc$Event)]]
    splice[get("strand") == "+" & get("EventType") == "MXE", 
        c("count_JG_down") := junc$SO_R[match(get("Event2b"), junc$Event)]]
    splice[get("strand") == "-" & get("EventType") == "MXE", 
        c("count_JG_down") := junc$SO_L[match(get("Event2b"), junc$Event)]]
        
    # Splice participation
    # - this calculates the number of events that participates in splicing
    #   across the upstream / downstream exon island that belong to
    #   either isoform A or B
    splice[, c("partic_up", "partic_down") := list(0, 0)]
    splice[get("EventType") %in% c("MXE", "SE", "ALE", "A3SS", "RI"),
        c("partic_up") := get("count_Event1a") + get("count_Event1b")]
    splice[get("EventType") %in% c("MXE"),
        c("partic_down") := get("count_Event2a") + get("count_Event2b")]
    splice[get("EventType") %in% c("SE"),
        c("partic_down") := get("count_Event2a") + get("count_Event1b")]
    splice[get("EventType") %in% c("AFE", "A5SS", "RI"),
        c("partic_down") := get("count_Event1a") + get("count_Event1b")]

    # Splice "coverage" = participation / max_JG
    splice[, c("cov_up", "cov_down") := list(0, 0)]
    splice[get("count_JG_up") > 0,
        c("cov_up") := get("partic_up") / get("count_JG_up")]
    splice[get("count_JG_down") > 0,
        c("cov_down") := get("partic_down") / get("count_JG_down")]
    splice[get("EventType") %in% c("MXE", "SE", "RI") &
        get("cov_up") < get("cov_down"),
        c("coverage") := get("cov_up")]
    splice[get("EventType") %in% c("MXE", "SE", "RI") &
        get("cov_up") >= get("cov_down"),
        c("coverage") := get("cov_down")]
    splice[get("EventType") %in% c("ALE", "A3SS"),
        c("coverage") := get("cov_up")]
    splice[get("EventType") %in% c("AFE", "A5SS"),
        c("coverage") := get("cov_down")]

    return(splice)
}

# Calculates TotalDepth which includes local IR levels.
# - Used to normalize Coverage plots
.collateData_process_splice_depth <- function(splice, sw) {

    # Calculate depth for splicing where EventRegion doesn't cover a single
    #   intron or where processBAM has decided not to assess the intron
    splice.no_region <- splice[!(get("EventRegion") %in% sw$EventRegion)]
    splice.no_region[, c("Depth1a") :=
        sw$TotalDepth[match(get("Event1a"), sw$EventRegion)]]
    splice.no_region[, c("Depth2a") :=
        sw$TotalDepth[match(get("Event2a"), sw$EventRegion)]]
    splice.no_region[, c("Depth1b") :=
        sw$TotalDepth[match(get("Event1b"), sw$EventRegion)]]
    splice.no_region[, c("Depth2b") :=
        sw$TotalDepth[match(get("Event2b"), sw$EventRegion)]]
    splice.no_region[is.na(get("Depth1a")), c("Depth1a") := 0]
    splice.no_region[is.na(get("Depth1b")), c("Depth1b") := 0]
    splice.no_region[is.na(get("Depth2a")), c("Depth2a") := 0]
    splice.no_region[is.na(get("Depth2b")), c("Depth2b") := 0]

    # First guess depth based on depths of any junction that is assessed
    #   by processBAM
    # - DepthA, DepthB: upstream and downstream IR+Splice depths
    splice.no_region[, c("DepthUp", "DepthDown") := list(0,0)]
    splice.no_region[get("Depth1a") > get("Depth1b"),
        c("DepthUp") := get("Depth1a")]
    splice.no_region[get("Depth1a") <= get("Depth1b"),
        c("DepthUp") := get("Depth1b")]
    # DepthDown only matters where EventType %in% c("MXE", "SE")
    splice.no_region[!(get("EventType") %in% c("MXE", "SE")),
         c("DepthDown") := get("DepthUp")]
    splice.no_region[get("EventType") %in% c("SE") & 
            get("Depth1b") > get("Depth2a"),
        c("DepthDown") := get("Depth1b")]
    splice.no_region[get("EventType") %in% c("SE") & 
            get("Depth1b") <= get("Depth2a"),
        c("DepthDown") := get("Depth2a")]
    splice.no_region[get("EventType") %in% c("MXE") & 
            get("Depth2b") > get("Depth2a"),
        c("DepthDown") := get("Depth2b")]
    splice.no_region[get("EventType") %in% c("MXE") & 
            get("Depth2b") <= get("Depth2a"),
        c("DepthDown") := get("Depth2a")]

    splice.no_region[, c("Depth") := 0]
    splice.no_region[get("DepthUp") < get("count_JG_up"),
        c("DepthUp") := get("count_JG_up")]
    splice.no_region[get("DepthDown") < get("count_JG_down"),
        c("DepthDown") := get("count_JG_down")]
    splice.no_region[get("DepthUp") > get("DepthDown"),
        c("Depth") := get("DepthUp")]
    splice.no_region[get("DepthUp") <= get("DepthDown"),
        c("Depth") := get("DepthDown")]

    splice[, c("TotalDepth") := 0]
    splice[sw, on = "EventRegion",
        c("TotalDepth") := get("i.TotalDepth")]
    splice[splice.no_region, on = "EventName",
        c("TotalDepth") := get("i.Depth")]
    return(splice)
}

# Compiles all the data as assays, write as temp FST files
# - This acts as "on-disk memory" to avoid using too much memory
.collateData_process_assays_as_fst <- function(rowEvent, junc_PSI,
        sample, junc, sw, splice, IRMode, norm_output_path) {

    assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
    inc.todo <- c("Up_Inc", "Down_Inc")
    exc.todo <- c("Up_Exc", "Down_Exc")
    junc.todo <- c("junc_PSI", "junc_counts", "junc_counts_uns")
    templates <- .collateData_seed_init(rowEvent, junc_PSI)
    
    # Included / Excluded counts for IR and splicing
    templates$assay[, c("Included") := c(
        sw$IntronDepth,
        0.5 * (splice$count_Event1a[splice$EventType %in% c("SE", "MXE")] +
            splice$count_Event2a[splice$EventType %in% c("SE", "MXE")]),
        splice$count_Event1a[!splice$EventType %in% c("SE", "MXE")]
    )]
    if (IRMode == "SpliceOver") {
        templates$assay[, c("Excluded") := c(
            sw$SpliceOver,
            0.5 * (splice$count_Event1b[splice$EventType %in% c("MXE")] +
                splice$count_Event2b[splice$EventType %in% c("MXE")]),
            splice$count_Event1b[!splice$EventType %in% c("MXE")]
        )]
    } else {
        templates$assay[, c("Excluded") := c(
            sw$SpliceMax,
            0.5 * (splice$count_Event1b[splice$EventType %in% c("MXE")] +
                splice$count_Event2b[splice$EventType %in% c("MXE")]),
            splice$count_Event1b[!splice$EventType %in% c("MXE")]
        )]
    }

    # Validity checking for IR, MXE, SE
    sw[get("strand") == "+", c("Up_Inc") := get("ExonToIntronReadsLeft")]
    sw[get("strand") == "-", c("Up_Inc") := get("ExonToIntronReadsRight")]
    sw[get("strand") == "+", c("Down_Inc") := get("ExonToIntronReadsRight")]
    sw[get("strand") == "-", c("Down_Inc") := get("ExonToIntronReadsLeft")]
    templates$inc[, c("Up_Inc") := c(sw$Up_Inc,
            splice$count_Event1a[splice$EventType %in% c("MXE", "SE")],
            splice$count_Event2a[splice$EventType %in% c("RI")]
    )]
    templates$inc[, c("Down_Inc") := c(sw$Down_Inc,
            splice$count_Event2a[splice$EventType %in% c("MXE", "SE")],
            splice$count_Event2b[splice$EventType %in% c("RI")]
    )]
    templates$exc[, c("Up_Exc") :=
        splice$count_Event1b[splice$EventType %in% c("MXE")]]
    templates$exc[, c("Down_Exc") :=
        splice$count_Event2b[splice$EventType %in% c("MXE")]]

    templates$assay[, c("Depth") := c(sw$TotalDepth, splice$TotalDepth)]
    templates$assay[, c("Coverage") := c(sw$Coverage, splice$coverage)]
    
    # copy RI's coverage from corresponding IR's coverage
    templates$assay[get("EventType") == "RI",
        c("Coverage") := sw$Coverage[match(get("EventRegion"), sw$EventRegion)]]

    splice[get("EventType") %in% c("MXE", "SE") &
        get("cov_up") < get("cov_down"),
        c("minDepth") := get("count_JG_up")]
    splice[get("EventType") %in% c("MXE", "SE") &
        get("cov_up") >= get("cov_down"),
        c("minDepth") := get("count_JG_down")]
    splice[get("EventType") %in% c("ALE", "A3SS", "RI"),
        c("minDepth") := get("count_JG_up")]
    splice[get("EventType") %in% c("AFE", "A5SS"),
        c("minDepth") := get("count_JG_down")]
    templates$assay[, c("minDepth") := c(sw$IntronDepth, splice$minDepth)]

    junc[get("count") == 0, c("PSI") := 0]
    junc[get("SO_L") > get("SO_R"),
        c("PSI") := get("count") / get("SO_L")]
    junc[get("SO_R") >= get("SO_L") & get("SO_R") > 0,
        c("PSI") := get("count") / get("SO_R")]
    
    templates$junc <- junc[, c("seqnames", "start", "end", "strand", 
        "PSI", "count", "count_unstranded")]
    colnames(templates$junc)[c(5,6,7)] <- 
        c("junc_PSI", "junc_counts", "junc_counts_uns")

    fst::write.fst(as.data.frame(templates$assay[, assay.todo, with = FALSE]),
        file.path(norm_output_path, "temp",
                paste("assays", sample, "fst.tmp", sep = ".")))
    fst::write.fst(as.data.frame(templates$inc[, inc.todo, with = FALSE]),
        file.path(norm_output_path, "temp",
                paste("included", sample, "fst.tmp", sep = ".")))
    fst::write.fst(as.data.frame(templates$exc[, exc.todo, with = FALSE]),
        file.path(norm_output_path, "temp",
                paste("excluded", sample, "fst.tmp", sep = ".")))
    fst::write.fst(as.data.frame(templates$junc[, junc.todo, with = FALSE]),
        file.path(norm_output_path, "temp",
                paste("junc_psi", sample, "fst.tmp", sep = ".")))

}

################################################################################

# Reads the "on-disk memory" FST files and compile to H5
.collateData_compile_assays_from_fst <- function(df.internal,
        norm_output_path, samples_per_block) {

    rowname_lists <-
        .collateData_H5_initialize(nrow(df.internal), norm_output_path)

    .collateData_H5_write_assays(df.internal, norm_output_path,
        samples_per_block)

    # Retrieve assays:
    assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
    inc.todo <- c("Up_Inc", "Down_Inc")
    exc.todo <- c("Up_Exc", "Down_Exc")
    junc.todo <- c("junc_PSI", "junc_counts", "junc_counts_uns")
    stuff.todo <- c(assay.todo, inc.todo, exc.todo, junc.todo)

    h5filename <- file.path(norm_output_path, "data.h5")
    assays <- list()
    for (assay in assay.todo) {
        # h5writeDimnames(list(rowname_lists$EventName, df.internal$sample),
            # h5filename, assay)
        assays[[assay]] <- HDF5Array(h5filename, assay)
    }
    for (inc in inc.todo) {
        # h5writeDimnames(list(rowname_lists$Inc_Events, df.internal$sample),
            # h5filename, inc)
        assays[[inc]] <- HDF5Array(h5filename, inc)
    }
    for (exc in exc.todo) {
        # h5writeDimnames(list(rowname_lists$Exc_Events, df.internal$sample),
            # h5filename, exc)
        assays[[exc]] <- HDF5Array(h5filename, exc)
    }
    
    # if(length(rowname_lists$junc_rownames) > 1000000)
        # .log(paste(
            # "Please ignore any messages below about compression and chunking;",
            # "The H5 file chunk size is already optimized."
        # ), "message")
    for (junc in junc.todo) {
        # h5writeDimnames(list(rowname_lists$junc_rownames, df.internal$sample),
            # h5filename, junc)
        assays[[junc]] <- HDF5Array(h5filename, junc)
    }
    return(assays)
}

# Initializes H5 arrays using rowData; return rownames as a list
.collateData_H5_initialize <- function(
        num_samples, norm_output_path
) {
    assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
    inc.todo <- c("Up_Inc", "Down_Inc")
    exc.todo <- c("Up_Exc", "Down_Exc")
    junc.todo <- c("junc_PSI", "junc_counts", "junc_counts_uns")
    stuff.todo <- c(assay.todo, inc.todo, exc.todo, junc.todo)

    rowData <- as.data.table(
        read.fst(file.path(norm_output_path, "rowEvent.fst")))
    Inc_Events <- rowData$EventName[rowData$EventType %in%
        c("IR", "MXE", "SE", "RI")]
    Exc_Events <- rowData$EventName[rowData$EventType %in% "MXE"]

    junc_index <- fst::read.fst(file.path(
        norm_output_path, "junc_PSI_index.fst"
    ))
    junc_rownames <- with(junc_index,
        paste0(seqnames, ":", start, "-", end, "/", strand))

    h5filename <- file.path(norm_output_path, "data.h5")
    if (file.exists(h5filename)) file.remove(h5filename)
    h5createFile(h5filename)
    for (assay in assay.todo) {
        chunk_row <- nrow(rowData)
        # 100 kb 1-column chunks
        if(chunk_row > 1024 * 100 / 8) chunk_row <- 1024 * 100 / 8
        h5createDataset(file = h5filename,
            dataset = assay,
            dims = c(nrow(rowData), num_samples),
            storage.mode = "double", 
            chunk = c(chunk_row, 1), level = 6
        )
    }
    for (inc in inc.todo) {
        chunk_row <- length(Inc_Events)
        # 100 kb 1-column chunks
        if(chunk_row > 1024 * 100 / 8) chunk_row <- 1024 * 100 / 8
        h5createDataset(file = h5filename,
            dataset = inc,
            dims = c(length(Inc_Events), num_samples),
            storage.mode = "double", 
            chunk = c(chunk_row, 1), level = 6
        )
    }
    for (exc in exc.todo) {
        # 100 kb 1-column chunks
        if(chunk_row > 1024 * 100 / 8) chunk_row <- 1024 * 100 / 8
        chunk_row <- length(Exc_Events)
        h5createDataset(file = h5filename, dataset = exc,
            dims = c(length(Exc_Events), num_samples),
            storage.mode = "double", 
            chunk = c(chunk_row, 1), level = 6
        )
    }
    for (junc in junc.todo) {
        # 100 kb 1-column chunks
        if(chunk_row > 1024 * 100 / 8) chunk_row <- 1024 * 100 / 8
        chunk_row <- length(junc_rownames)
        h5createDataset(file = h5filename, dataset = junc,
            dims = c(length(junc_rownames), num_samples),
            storage.mode = "double", 
            chunk = c(chunk_row, 1), level = 6
        )
    }
    rowname_lists <- list(
        EventName = rowData$EventName,
        Inc_Events = Inc_Events, Exc_Events = Exc_Events,
        junc_rownames = junc_rownames
    )
    return(rowname_lists)
}

# Read temp FST files; add to H5 assays
.collateData_H5_write_assays <- function(
        df.internal, norm_output_path, samples_per_block
) {
    assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
    inc.todo <- c("Up_Inc", "Down_Inc")
    exc.todo <- c("Up_Exc", "Down_Exc")
    junc.todo <- c("junc_PSI", "junc_counts", "junc_counts_uns")
    stuff.todo <- c(assay.todo, inc.todo, exc.todo, junc.todo)

    h5filename <- file.path(norm_output_path, "data.h5")

    # Make a nested list
    matrices <- list()
    for (stuff in stuff.todo) {
        matrices[[stuff]] <- list()
    }
    buf_i <- 0 # Counts the number of in-memory samples
    
    pb <- progress_bar$new(
        format = " finalising H5 database [:bar] :percent eta: :eta",
        total = nrow(df.internal), clear = FALSE, width= 100)
    for (i in seq_len(nrow(df.internal))) {
                                
        sample <- df.internal$sample[i]
        buf_i <- buf_i + 1

        mat <- fst::read.fst(file.path(norm_output_path, "temp",
            paste("assays", sample, "fst.tmp", sep = ".")))
        for (stuff in assay.todo) {
            matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
        }
        mat <- fst::read.fst(file.path(norm_output_path, "temp",
            paste("included", sample, "fst.tmp", sep = ".")))
        for (stuff in inc.todo) {
            matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
        }
        mat <- fst::read.fst(file.path(norm_output_path, "temp",
            paste("excluded", sample, "fst.tmp", sep = ".")))
        for (stuff in exc.todo) {
            matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
        }
        mat <- fst::read.fst(file.path(norm_output_path, "temp",
            paste("junc_psi", sample, "fst.tmp", sep = ".")))
        for (stuff in junc.todo) {
            matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
        }
        if (buf_i >= samples_per_block | i == nrow(df.internal)) {
            for (stuff in stuff.todo) {
                # write
                mat <- as.matrix(do.call(cbind, matrices[[stuff]]))
                mat[is.na(mat)] <- 0
                h5write(mat,
                    file = h5filename, name = stuff,
                    index = list(NULL, seq(i - buf_i + 1, i))
                )
                # reset
                matrices[[stuff]] <- list()
            }
            buf_i <- 0
            gc()
        }
        pb$tick()
    }
                            
             
}

################################################################################

# Writes sample stats to FST
.collateData_write_stats <- function(df.internal, norm_output_path) {
    outfile <- file.path(norm_output_path, "stats.fst")
    # Convert path to relative path, for reproducibility:
    df.internal$path <- .make_path_relative(df.internal$path, norm_output_path)
    write.fst(as.data.frame(df.internal), outfile)
}

# Writes default colData to RDS
.collateData_write_colData <- function(
        df.internal, coverage_files,
        norm_output_path, copyCOV = FALSE
) {
    if(!is.null(coverage_files)) {
        suppressWarnings({
            covfiles_full <- normalizePath(file.path(
                norm_output_path, coverage_files))        
        })
        # Create barebones colData.Rds - save coverage files as well
        if (
            length(coverage_files) == nrow(df.internal) & 
            all(file.exists(covfiles_full)) &&
            isCOV(covfiles_full)
        ) {
            if(copyCOV) {
                dirCOV <- file.path(norm_output_path, "COV")
                if(!dir.exists(dirCOV)) dir.create(dirCOV)
                for(i in seq_len(length(covfiles_full))) {
                    file.copy(covfiles_full[i], dirCOV, overwrite = TRUE)
                    covfiles_full[i] <- file.path(dirCOV, 
                        basename(covfiles_full[i]))
                }
            }
            if(!all(file.exists(covfiles_full)))
                .log(paste("Some files failed to copy over to", dirCOV))
            coverage_files <- .make_path_relative(
                covfiles_full, norm_output_path)
            df.files <- data.table(
                sample = df.internal$sample,
                bam_file = "",
                sw_file = df.internal$path,
                cov_file = coverage_files
            )

        } else {
            df.files <- data.table(
                sample = df.internal$sample,
                bam_file = "",
                sw_file = df.internal$path,
                cov_file = ""
            )
        }
    } else {
        df.files <- data.table(
            sample = df.internal$sample,
            bam_file = "",
            sw_file = df.internal$path,
            cov_file = ""
        )    
    }

    #
    colData <- list(
        df.files = df.files,
        df.anno = data.table(sample = df.internal$sample)
    )
    saveRDS(colData, file.path(norm_output_path, "colData.Rds"))
}

# Loads the newly created H5 database so we can save it as a
#   SummarizedExperiment for quick recall later
.collateData_initialise_HDF5 <- function(collate_path, colData, assays) {
    item.todo <- c("rowEvent", "Included", "Excluded", "Depth", "Coverage",
        "minDepth", "Up_Inc", "Down_Inc", "Up_Exc", "Down_Exc")

    rowData <- read.fst(file.path(collate_path, "rowEvent.fst"))
    
    # Convert to rds to save file space at small expense of increased load time
    saveRDS(rowData, file.path(collate_path, "rowEvent.Rds"))

    # To save space, only use EventName and EventType for now
    rowData <- rowData[, c("EventName", "EventType")]

    colData <- as.data.frame(colData)
    colData_use <- colData[, -1, drop = FALSE]
    rownames(colData_use) <- colData$sample

    se <- SummarizedExperiment(
        assays = SimpleList(
            Included = assays[["Included"]],
            Excluded = assays[["Excluded"]],
            Depth = assays[["Depth"]],
            Coverage = assays[["Coverage"]],
            minDepth = assays[["minDepth"]]
        ), rowData = rowData, colData = colData_use
    )
    rownames(se) <- rowData(se)$EventName

    metadata(se)$Up_Inc <- assays[["Up_Inc"]]
    metadata(se)$Down_Inc <- assays[["Down_Inc"]]
    rownames(metadata(se)$Up_Inc) <- rowData(se)$EventName[
        rowData(se)$EventType %in% c("IR", "MXE", "SE", "RI")]
    rownames(metadata(se)$Down_Inc) <- rowData(se)$EventName[
        rowData(se)$EventType %in% c("IR", "MXE", "SE", "RI")]
    
    metadata(se)$Up_Exc <- assays[["Up_Exc"]]
    metadata(se)$Down_Exc <- assays[["Down_Exc"]]
    rownames(metadata(se)$Up_Exc) <- rowData(se)$EventName[
        rowData(se)$EventType %in% c("MXE")]
    rownames(metadata(se)$Down_Exc) <- rowData(se)$EventName[
        rowData(se)$EventType %in% c("MXE")]
        
    se@metadata[["junc_PSI"]] <- assays[["junc_PSI"]]
    se@metadata[["junc_counts"]] <- assays[["junc_counts"]]
    se@metadata[["junc_counts_uns"]] <- assays[["junc_counts_uns"]]

    junc_gr_df <- read.fst(file.path(collate_path, "annotation/Junc.fst"))
    rownames(se@metadata[["junc_PSI"]]) <- junc_gr_df$Event
    rownames(se@metadata[["junc_counts"]]) <- junc_gr_df$Event
    rownames(se@metadata[["junc_counts_uns"]]) <- junc_gr_df$Event
    se@metadata[["junc_gr"]] <- .grDT(junc_gr_df)

    colnames(metadata(se)$Up_Inc) <- colData$sample
    colnames(metadata(se)$Down_Inc) <- colData$sample
    colnames(metadata(se)$Up_Exc) <- colData$sample
    colnames(metadata(se)$Down_Exc) <- colData$sample
    colnames(metadata(se)$junc_PSI) <- colData$sample
    colnames(metadata(se)$junc_counts) <- colData$sample
    colnames(metadata(se)$junc_counts_uns) <- colData$sample
    
    colData.Rds <- readRDS(file.path(collate_path, "colData.Rds"))
    if ("df.files" %in% names(colData.Rds) &&
        "cov_file" %in% colnames(colData.Rds$df.files)) {
        metadata(se)$cov_file <- colData.Rds$df.files$cov_file
    }
    stats.df <- fst::read.fst(file.path(collate_path, "stats.fst"))
    metadata(se)$sampleQC <- stats.df[, -1, drop = FALSE]
    rownames(metadata(se)$sampleQC) <- colnames(se)

    metadata(se)$BuildVersion <- collateData_version

    # Remove remaining columns to save more space in Rds
    rowData(se)$EventType <- NULL
    rowData(se)$EventName <- NULL

    return(se)
}

.collateData_save_NxtSE <- function(se, filepath) {
    se@assays <- .collateData_simplify_assay_paths(se@assays)
    se@metadata[["Up_Inc"]] <- .collateData_simplify_assay_path(
        se@metadata[["Up_Inc"]])
    se@metadata[["Down_Inc"]] <- .collateData_simplify_assay_path(
        se@metadata[["Down_Inc"]])
    se@metadata[["Up_Exc"]] <- .collateData_simplify_assay_path(
        se@metadata[["Up_Exc"]])
    se@metadata[["Down_Exc"]] <- .collateData_simplify_assay_path(
        se@metadata[["Down_Exc"]])
    se@metadata[["junc_PSI"]] <- .collateData_simplify_assay_path(
        se@metadata[["junc_PSI"]])
    se@metadata[["junc_counts"]] <- .collateData_simplify_assay_path(
        se@metadata[["junc_counts"]])
    se@metadata[["junc_counts_uns"]] <- .collateData_simplify_assay_path(
        se@metadata[["junc_counts_uns"]])
    saveRDS(se, filepath)
}

# Internals for save_NxtSE; adapted from HDF5Array saveHDF5SummarizedExperiment
# - These are needed if the collateData folder has been moved since last access
# - allows relative paths to be used

# Saving a NxtSE object

# Add single assay
.collateData_simplify_assay_path <- function(assay) {
    DelayedArray::modify_seeds(assay,
        function(x) {
            x@filepath <- basename(x@filepath)
            x
        }
    )
}

# Add list of assays
.collateData_simplify_assay_paths <- function(assays) {
    nassay <- length(assays)
    for (i in seq_len(nassay)) {
        a <- .collateData_simplify_assay_path(getListElement(assays, i))
        assays <- setListElement(assays, i, a)
    }
    return(assays)
}

# Loading a NxtSE object

# Fix a single assay
# .collateData_expand_assay_path <- function(assay, path) {
    # DelayedArray::modify_seeds(assay,
        # function(x) {
            # x@filepath <- file.path(path, x@filepath)
            # x
        # }
    # )
# }

# Fix a list of assays
# .collateData_expand_assay_paths <- function(assays, path) {

    # nassay <- length(assays)
    # for (i in seq_len(nassay)) {
        # a <- .collateData_expand_assay_path(getListElement(assays, i), path)
        # assays <- setListElement(assays, i, a)
    # }
    # return(assays)
# }

# Internals - compile reference data from genome, for quick access

.prepare_covplot_data <- function(
        reference_path, use_ref_path = reference_path) {
    .validate_reference(reference_path)
    genome <- Get_Genome(reference_path)
    data <- list(
        seqInfo = seqinfo(genome),
        geneList = .getGeneList(use_ref_path),
        elements = .loadViewRef(use_ref_path),
        transcripts = .loadTranscripts(use_ref_path),
        spliceList = .loadSpliceEvents(use_ref_path),
        IRList = .loadIREvents(use_ref_path),
        ontology = .loadOntology(reference_path)
    )
    return(data)
}

.loadViewRef <- function(reference_path) {
    .validate_reference(reference_path)

    dir_path <- file.path(reference_path, "fst")
    fetchCols <- c("seqnames", "start", "end", "strand", "type", 
        "transcript_id", "exon_id", "exon_number")
    fetchColsProt <- c("seqnames", "start", "end", "strand", "type", 
        "transcript_id", "protein_id", "ccds_id", "exon_number")
    fetchColsIntron_A <- c("seqnames", "start", "end", "strand")
    fetchColsIntron_B <- c("transcript_id", "intron_id", "intron_number")
    
    exons.DT <- as.data.table(read.fst(file.path(dir_path, "Exons.fst"),
        columns = fetchCols))
    # exons.DT <- exons.DT[get("transcript_biotype") != "protein_coding"]

    introns.DT <- as.data.table(read.fst(file.path(dir_path, "junctions.fst"),
            columns = fetchColsIntron_A))
    introns.DT$type <- "intron"
    introns.DT <- cbind(introns.DT, 
        as.data.table(read.fst(file.path(dir_path, "junctions.fst"),
            columns = fetchColsIntron_B)
        )
    )

    if(file.exists(file.path(dir_path, "Proteins.fst"))) {
        # peek columns in Proteins.fst
        protein_test <- read.fst(file.path(dir_path, "Proteins.fst"), to = 1)
        protCols <- colnames(protein_test)
        fetchColsProt <- intersect(fetchColsProt, protCols)
        
        protein.DT <- as.data.table(
            read.fst(file.path(dir_path, "Proteins.fst"),
            columns = fetchColsProt))
        
        if(any(c("protein_id", "ccds_id") %in% fetchColsProt)) {
            if("ccds_id" %in% fetchColsProt) {
                setnames(protein.DT, "ccds_id", "exon_id")
                if("protein_id" %in% fetchColsProt)
                    protein.DT$protein_id <- NULL
            } else {
                setnames(protein.DT, "protein_id", "exon_id")
            }
            misc.DT <- as.data.table(read.fst(file.path(dir_path, "Misc.fst"),
                columns = fetchCols))

            total.DT <- rbindlist(list(
                exons.DT[, fetchCols, with = FALSE],
                protein.DT[, fetchCols, with = FALSE],
                misc.DT[, fetchCols, with = FALSE]
            ))
        } else {
            total.DT <- exons.DT
        }
    } else {
        total.DT <- exons.DT
    }
    setnames(total.DT, c("exon_id", "exon_number"), c("feature_id", "aux_id"))
    setnames(introns.DT, c("intron_id", "intron_number"), 
        c("feature_id", "aux_id"))
    return(rbind(total.DT,introns.DT))
}

.getGeneList <- function(reference_path) {
    .validate_reference(reference_path)

    file_path <- file.path(reference_path, "fst", "Genes.fst")
    if (!file.exists(file_path)) return(NULL)

    df <- as.data.table(read.fst(file_path))
    df$gene_display_name <- paste0(df$gene_name, " (", df$gene_id, ")")

    return(df)
}

.loadTranscripts <- function(reference_path) {
    .validate_reference(reference_path)

    file_path <- file.path(reference_path, "fst", "Transcripts.fst")

    Transcripts.DT <- as.data.table(read.fst(file_path))

    if ("transcript_support_level" %in% colnames(Transcripts.DT)) {
        Transcripts.DT$transcript_support_level <-
            tstrsplit(Transcripts.DT$transcript_support_level, split = " ")[[1]]
        Transcripts.DT$transcript_support_level[
            is.na(Transcripts.DT$transcript_support_level)] <- "NA"
    } else {
        Transcripts.DT$transcript_support_level <- 1
    }

    return(Transcripts.DT)
}

.loadSpliceEvents <- function(reference_path) {
    .validate_reference(reference_path)

    file_path <- file.path(reference_path, "fst", "Splice.fst")

    splice.DT <- as.data.table(read.fst(file_path, columns = c(
        "EventName", "EventType", "EventRegion",
        "Event1a", "Event1b", "Event2a", "Event2b"
    )))

    return(splice.DT)
}

.loadIREvents <- function(reference_path) {
    .validate_reference(reference_path)

    Dir_path <- file.path(reference_path, "fst", "Introns.Dir.fst")
    ND_path <- file.path(reference_path, "fst", "Introns.ND.fst")

    IR.Dir.DT <- as.data.table(read.fst(Dir_path, columns = c(
        "EventName", "intron_id", "IRname"
    )))
    IR.ND.DT <- as.data.table(read.fst(ND_path, columns = c(
        "EventName", "intron_id", "IRname"
    )))

    return(rbind(
        IR.Dir.DT,
        IR.ND.DT
    ))
}

.loadOntology <- function(reference_path) {
    .validate_reference(reference_path)

    file_path <- file.path(reference_path, "fst", "Ontology.fst")
    if(!file.exists(file_path)) return(NULL)
    
    return(as.data.table(read.fst(file_path)))
}

################################################################################

.collateData_cleanup <- function(collate_data) {
    if (dir.exists(file.path(collate_data, "temp")))
        unlink(file.path(collate_data, "temp"), recursive = TRUE)

    if (dir.exists(file.path(collate_data, "annotation")))
        unlink(file.path(collate_data, "annotation"), recursive = TRUE)
    
    if (dir.exists(file.path(collate_data, "Reference")))
        unlink(file.path(collate_data, "Reference"), recursive = TRUE)
    
    unlink(file.path(collate_data, "junc_PSI_index.fst"))
    unlink(file.path(collate_data, "rowEvent.brief.fst"))
    unlink(file.path(collate_data, "rowEvent.fst"))
    unlink(file.path(collate_data, "stats.fst"))
}
alexchwong/SpliceWiz documentation built on March 17, 2024, 3:16 a.m.