R/CollateData.R

#' Processes data from IRFinder output
#'
#' CollateData unifies a list of [IRFinder] output files belonging to an
#' experiment.
#'
#' @details
#' All sample IRFinder outputs must be generated using the same
#' reference.
#'
#' The combination of junction counts and IR quantification from
#' IRFinder is used to calculate percentage spliced in (PSI) of alternative
#' splice events, and percent intron retention (PIR) of retained introns. Also,
#' QC information is extracted. Data is organised into an H5 file and FST files
#' for memory- and processor-efficient downstream access using [MakeSE].
#'
#'   The original IRFinder algorithm (see the
#'   [IRFinder wiki](https://github.com/williamritchie/IRFinder/wiki/IRFinder-Output))
#'   uses `SpliceMax` to estimate the abundance of spliced transcripts.
#'   This calculates the number of mapped splice events
#'   that share the boundary coordinate of either the left or right flanking
#'   exon (`SpliceLeft`, `SpliceRight`), estimating splice abundance as the
#'   larger of the two values.
#'
#'   NxtIRF proposes a new algorithm, `SpliceOverMax`,
#'   to account for the possibility that the major isoform shares neither
#'   boundary, but arises from either of the flanking "exon islands". Exon
#'   islands are contiguous regions covered by exons from any transcript
#'   (except those designated as `retained_intron` or
#'   `sense_intronic`), and are separated by
#'   obligate intronic regions (genomic regions that are introns for all
#'   transcripts). For introns that are internal to a single exon island
#'   (i.e. akin to "known-exon" introns from IRFinder), `SpliceOverMax`
#'   uses [GenomicRanges::findOverlaps] to sum all splice reads that overlap
#'   the same genomic region as the intron of interest.
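#'
#'   As a minimal illustration (hypothetical coordinates and counts, not the
#'   internal implementation), the `SpliceOverMax` sum for a single intron
#'   could be sketched as:
#' ```
#' library(GenomicRanges)
#' intron <- GRanges("chr1", IRanges(100, 200), strand = "+")
#' junctions <- GRanges("chr1", IRanges(c(100, 150), c(200, 250)), strand = "+")
#' junction_counts <- c(10, 5)
#' ol <- findOverlaps(intron, junctions)
#' SpliceOver <- sum(junction_counts[subjectHits(ol)])  # 15
#' ```
#'   In contrast, `SpliceMax` only counts junctions sharing the intron's exact
#'   start or end coordinate (here, only the first junction, giving 10).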
#'
#' @param Experiment (Required) A 2 or 3 column data frame, ideally generated by
#'   [Find_IRFinder_Output] or [Find_Samples].
#'   The first column designates the sample names, and the second column
#'   contains the path to the IRFinder output file (of type `sample.txt.gz`).
#'   (Optionally) a 3rd column contains the coverage files (of type
#'   `sample.cov`) of the corresponding samples.
#'   NB: all other columns are ignored.
#' @param reference_path (Required) The path to the reference generated by
#'   [BuildReference]
#' @param output_path (Required) The path to contain the output files for this
#'   function
#' @param IRMode (default `SpliceOverMax`) The algorithm to calculate
#'   'splice abundance' in IR quantification.
#'   Valid options are `SpliceOverMax` and `SpliceMax`.
#'   See details.
#' @param samples_per_block (default `16`) The maximum number of samples to
#'    process per thread. Setting this to a lower value may help in
#'    memory-constrained systems.
#' @param n_threads (default `1`) The number of threads to use. On low-memory
#'   systems, reduce both `n_threads` and `samples_per_block`.
#' @param overwrite (default `FALSE`) If `CollateData()` has previously been run
#'   on the same set of samples, the existing output will not be overwritten
#'   unless this is set to `TRUE`.
#' @return `CollateData()` writes to the directory given by `output_path`.
#'   This output directory is portable (i.e. it can be moved to a different
#'   location after running `CollateData()` before running [MakeSE]), but
#'   individual files within the output folder should not be moved.\cr\cr
#'   Also, the [IRFinder] and [CollateData] output folders should be copied to
#'   the same destination and their relative paths preserved. Otherwise, the
#'   locations of the "COV" files will not be recorded in the collated data and
#'   will have to be re-assigned using `covfile(se)<-`. See [MakeSE]
#' @examples
#' BuildReference(
#'     reference_path = file.path(tempdir(), "Reference"),
#'     fasta = chrZ_genome(),
#'     gtf = chrZ_gtf()
#' )
#'
#' bams <- NxtIRF_example_bams()
#' IRFinder(bams$path, bams$sample,
#'   reference_path = file.path(tempdir(), "Reference"),
#'   output_path = file.path(tempdir(), "IRFinder_output")
#' )
#'
#' expr <- Find_IRFinder_Output(file.path(tempdir(), "IRFinder_output"))
#' CollateData(expr,
#'   reference_path = file.path(tempdir(), "Reference"),
#'   output_path = file.path(tempdir(), "NxtIRF_output")
#' )
#' @seealso [IRFinder], [MakeSE]
#' @md
#' @export
CollateData <- function(Experiment, reference_path, output_path,
        IRMode = c("SpliceOverMax", "SpliceMax"),
        overwrite = FALSE, n_threads = 1,
        samples_per_block = 16
) {
    IRMode <- match.arg(IRMode)
    if (IRMode == "")
        .log(paste("In CollateData(),",
            "IRMode must be either 'SpliceOverMax' (default) or 'SpliceMax'"))

    N_steps <- 8
    dash_progress("Validating Experiment; checking COV files...", N_steps)
    .log("Validating Experiment; checking COV files...", "message")
    BPPARAM_mod <- .validate_threads(n_threads)
    norm_output_path <- .collateData_validate(Experiment,
        reference_path, output_path)
    coverage_files <- .make_path_relative(
        .collateData_COV(Experiment), output_path)
    df.internal <- .collateData_expr(Experiment)
    if (!overwrite && file.exists(file.path(output_path, "NxtSE.rds"))) {
        se <- .MakeSE_load_NxtSE(file.path(output_path, "NxtSE.rds"))
        if (all(colnames(se) %in% df.internal$sample) &
            all(df.internal$sample %in% colnames(se))
        ) {
            .log("NxtIRF already collated this experiment in given directory",
                "message")
            return()
        }
    }
    jobs <- .collateData_jobs(nrow(df.internal), BPPARAM_mod, samples_per_block)

    dash_progress("Compiling Sample Stats", N_steps)
    .log("Compiling Sample Stats", "message")
    df.internal <- .collateData_stats(df.internal, jobs, BPPARAM_mod)
    stranded <- !any(df.internal$strand == 0)

    dash_progress("Compiling Junction List", N_steps)
    junc.common <- .collateData_junc_merge(df.internal, jobs, BPPARAM_mod,
        output_path)
    # saveRDS(junc.common, file.path(output_path,"junc.common.Rds"))
    gc()

    dash_progress("Compiling Intron Retention List", N_steps)
    irf.common <- .collateData_irf_merge(df.internal, jobs, BPPARAM_mod,
        output_path, stranded)
    # saveRDS(irf.common, file.path(output_path,"irf.common.Rds"))
    gc()

# Reassign +/- based on junctions.fst annotation
    # Annotate junctions
    dash_progress("Tidying up splice junctions and intron retentions", N_steps)
    .log("Tidying up splice junctions and intron retentions...", "message")
    .collateData_annotate(reference_path, norm_output_path,
        junc.common, irf.common, stranded)
    message("done\n")
    rm(junc.common, irf.common)
    gc()

    dash_progress("Generating NxtIRF assays", N_steps)
    .log("Generating NxtIRF assays", "message")

    # Use 1 sample per job, with progress BPPARAM
    jobs_2 <- .split_vector(seq_len(nrow(df.internal)),
        nrow(df.internal))
    BPPARAM_mod_progress <- .validate_threads(n_threads, progressbar = TRUE,
        tasks = nrow(df.internal))
    agg.list <- BiocParallel::bplapply(
        seq_len(nrow(df.internal)),
        .collateData_compile_agglist,
        jobs = jobs_2, df.internal = df.internal,
        norm_output_path = norm_output_path, IRMode = IRMode,
        BPPARAM = BPPARAM_mod_progress
    )
    gc()

    dash_progress("Building Final SummarizedExperiment Object", N_steps)
    message("Building Final SummarizedExperiment Object")

    assays <- .collateData_compile_assays_from_fst(df.internal,
        norm_output_path, samples_per_block)

    .collateData_write_stats(df.internal, norm_output_path)
    .collateData_write_colData(df.internal, coverage_files, norm_output_path)
    cov_data <- .prepare_covplot_data(reference_path)
    saveRDS(cov_data, file.path(norm_output_path, "annotation", "cov_data.Rds"))

    # NEW compile NxtSE:
    colData.Rds <- readRDS(file.path(norm_output_path, "colData.Rds"))
    colData <- .makeSE_colData_clean(colData.Rds$df.anno)
    se <- .collateData_initialise_HDF5(norm_output_path, colData, assays)

    .collateData_save_NxtSE(se, file.path(norm_output_path, "NxtSE.rds"))
    if (dir.exists(file.path(norm_output_path, "temp"))) {
        unlink(file.path(norm_output_path, "temp"), recursive = TRUE)
    }
    dash_progress("NxtIRF Collation Finished", N_steps)
    message("NxtIRF Collation Finished")
}


################################################################################
# CollateData helper functions:

# Checks the Experiment data.frame. Checks that paths are valid and have
#   existing parent folders. Creates required folders.
.collateData_validate <- function(Experiment, reference_path, output_path) {
    if (!is(Experiment, "data.frame")) {
        .log(paste("In CollateData(),",
            "Experiment object needs to be a data frame"))
    }
    if (ncol(Experiment) < 2) {
        .log(paste("In CollateData(),",
            "Experiment needs to contain at least two columns,",
            "with the first 2 columns containing",
            "(1) sample name and (2) IRFinder output"))
    }
    .validate_reference(reference_path)
    if (!dir.exists(dirname(output_path))) {
        .log(paste("In CollateData(),",
            "Parent directory of output path:",
            dirname(output_path), "needs to exist"))
    }
    base_output_path <- normalizePath(dirname(output_path))
    norm_output_path <- file.path(base_output_path, basename(output_path))
    if (!dir.exists(norm_output_path)) {
        dir.create(norm_output_path)
    }
    temp_output_path <- file.path(norm_output_path, "temp")
    if (!dir.exists(temp_output_path)) {
        dir.create(temp_output_path)
    }
    if (!dir.exists(file.path(norm_output_path, "annotation"))) {
        dir.create(file.path(norm_output_path, "annotation"))
    }
    return(norm_output_path)
}

# Check all given COV files are valid COV format
.collateData_COV <- function(Experiment) {
    coverage_files <- ""
    Experiment <- as.data.frame(Experiment)
    if (ncol(Experiment) == 2) return(NULL)
    
    if (ncol(Experiment) > 2 && all(file.exists(Experiment[, 3]))) {
        coverage_files <- Experiment[, 3]
    }
    if (!IsCOV(coverage_files)) {
        message("Some coverage files do not exist or are corrupted")
        coverage_files <- ""
    }
    if (length(coverage_files) != nrow(Experiment)) {
        message("The number of coverage files must equal the number of samples")
        coverage_files <- ""
    }
    return(coverage_files)
}

# Checks all IRFinder output files exist. Checks for repeated sample names.
.collateData_expr <- function(Experiment) {
    Experiment <- as.data.frame(Experiment)
    colnames(Experiment)[c(1, 2)] <- c("sample", "path")
    if (!all(vapply(Experiment$path, file.exists, logical(1)))) {
        .log(paste("In CollateData(),",
            "Some files in Experiment do not exist"))
    }
    if (length(Experiment$sample) != length(unique(Experiment$sample))) {
        .log(paste("In CollateData(),",
            "Repeated sample names are not allowed"))
    }

    df.internal <- as.data.table(Experiment[, c(1, 2)])
    df.internal$paired <- FALSE
    df.internal$strand <- 0
    df.internal$depth <- 0
    df.internal$mean_frag_size <- 0
    df.internal$directionality_strength <- 0
    df.internal$Intergenic_Fraction <- 0
    df.internal$rRNA_Fraction <- 0
    df.internal$NonPolyA_Fraction <- 0
    df.internal$Mitochondrial_Fraction <- 0
    df.internal$Unanno_Jn_Fraction <- 0
    df.internal$NMD_Jn_Fraction <- 0
    df.internal$Fraction_Splice_Reads <- 0
    df.internal$Fraction_Span_Reads <- 0

    df.internal$IRBurden_clean <- 0
    df.internal$IRBurden_exitrons <- 0
    df.internal$IRBurden_clean_unstranded <- 0
    df.internal$IRBurden_exitrons_unstranded <- 0
    df.internal$IRBurden_antisense <- 0
    return(df.internal)
}

# Designates equal jobs per thread
.collateData_jobs <- function(n_expr, BPPARAM_mod, samples_per_block) {
    n_jobs <- min(ceiling(n_expr / samples_per_block),
        BPPARAM_mod$workers)
    jobs <- .split_vector(seq_len(n_expr), n_jobs)
    return(jobs)
}
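
# A minimal sketch of the splitting behaviour assumed of the internal
# `.split_vector()` helper (`.sketch_split_vector` below is a hypothetical
# name used only for illustration and is not part of the package):
.sketch_split_vector <- function(vec, n_chunks) {
    # split a vector into `n_chunks` contiguous, near-equal groups
    split(vec, ceiling(seq_along(vec) / ceiling(length(vec) / n_chunks)))
}
# e.g. .sketch_split_vector(seq_len(10), 3) gives chunks of sizes 4, 4 and 2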

################################################################################
# Sub

# Collates statistics of IRFinder QC, returns a data frame of these QC stats
.collateData_stats <- function(df.internal, jobs, BPPARAM_mod) {
    n_jobs <- length(jobs)
    df.internal <- rbindlist(
        BiocParallel::bplapply(seq_len(n_jobs),
            function(x, jobs, df.internal) {
                suppressPackageStartupMessages({
                    requireNamespace("data.table")
                    requireNamespace("stats")
                })
                work <- jobs[[x]]
                block <- df.internal[work]
                for (i in seq_len(length(work))) {

                    data.list <- get_multi_DT_from_gz(
                        normalizePath(block$path[i]),
                        c("BAM", "Directionality", "QC"))
                    stats <- data.list$BAM
                    direct <- data.list$Directionality
                    QC <- data.list$QC

                    block <- .collateData_stats_reads(block, i, stats, direct)
                    block <- .collateData_stats_QC(block, i, QC, direct)
                }
                return(block)
            }, jobs = jobs, df.internal = df.internal, BPPARAM = BPPARAM_mod
        )
    )
    return(df.internal)
}

################################################################################

.collateData_stats_reads <- function(block, i, stats, direct) {
    if (stats$Value[3] == 0 & stats$Value[4] > 0) {
        block$paired[i] <- TRUE
        block$depth[i] <- stats$Value[4]
        block$mean_frag_size[i] <- stats$Value[2] /
            stats$Value[4]
    } else if (stats$Value[3] > 0 &&
            stats$Value[4] > stats$Value[3] / 1000) {
        block$paired[i] <- TRUE
        block$depth[i] <- stats$Value[4]
        block$mean_frag_size[i] <- stats$Value[2] /
            stats$Value[4]
    } else {
        block$paired[i] <- FALSE
        block$depth[i] <- stats$Value[3]
        block$mean_frag_size[i] <- stats$Value[2] /
            stats$Value[3]
    }
    block$strand[i] <- direct$Value[9]
    return(block)
}

.collateData_stats_QC <- function(block, i, QC, direct) {
    block$directionality_strength[i] <- direct$Value[8]
    block$Intergenic_Fraction[i] <-
        QC$Value[QC$QC == "Intergenic Reads"] /
            block$depth[i]
    block$rRNA_Fraction[i] <-
        QC$Value[QC$QC == "rRNA Reads"] /
            block$depth[i]
    block$NonPolyA_Fraction[i] <-
        QC$Value[QC$QC == "NonPolyA Reads"] /
            block$depth[i]
    block$Mitochondrial_Fraction[i] <-
        QC$Value[QC$QC == "Mitochondrial Reads"] /
            block$depth[i]
    block$Unanno_Jn_Fraction[i] <-
        QC$Value[QC$QC == "Unannotated Junctions"] /
        (QC$Value[QC$QC == "Unannotated Junctions"] +
        QC$Value[QC$QC == "Annotated Junctions"])
    block$NMD_Jn_Fraction[i] <-
        QC$Value[QC$QC == "NMD Junctions"] /
        QC$Value[QC$QC == "Annotated Junctions"]
    block$Fraction_Splice_Reads[i] <-
        QC$Value[QC$QC == "Annotated Junctions"] /
        block$depth[i]
    block$Fraction_Span_Reads[i] <-
        QC$Value[QC$QC == "Spans Reads"] /
            block$depth[i]
    # IRBurden calculations
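    # Worked illustration (hypothetical values): if the clean IntronDepth sum
    # is 2000 and there are 8000 annotated junction reads, the IR burden is
    # 2000 / (2000 + 8000) = 0.2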
    block$IRBurden_clean_unstranded[i] <-
        QC$Value[QC$QC == "Non-Directional Clean IntronDepth Sum"] /
        (QC$Value[QC$QC == "Non-Directional Clean IntronDepth Sum"] +
        QC$Value[QC$QC == "Annotated Junctions"])
    block$IRBurden_exitrons_unstranded[i] <-
        QC$Value[QC$QC == "Non-Directional Known-Exon IntronDepth Sum"] /
        (QC$Value[QC$QC == "Non-Directional Known-Exon IntronDepth Sum"] +
        QC$Value[QC$QC == "Annotated Junctions"])
    block$IRBurden_antisense[i] <-
        QC$Value[QC$QC == "Non-Directional Anti-Sense IntronDepth Sum"] /
        (QC$Value[QC$QC == "Non-Directional Anti-Sense IntronDepth Sum"] +
        QC$Value[QC$QC == "Annotated Junctions"])
    if (block$strand[i] != 0) {
        block$IRBurden_clean[i] <-
            QC$Value[QC$QC == "Directional Clean IntronDepth Sum"] /
            (QC$Value[QC$QC == "Directional Clean IntronDepth Sum"] +
            QC$Value[QC$QC == "Annotated Junctions"])
        block$IRBurden_exitrons[i] <-
            QC$Value[QC$QC == "Directional Known-Exon IntronDepth Sum"] /
            (QC$Value[QC$QC == "Directional Known-Exon IntronDepth Sum"] +
            QC$Value[QC$QC == "Annotated Junctions"])
    }
    return(block)
}

################################################################################
# Sub

# Compiles a unified list of detected splice junctions
.collateData_junc_merge <- function(df.internal, jobs, BPPARAM_mod,
        output_path) {
    temp_output_path <- file.path(output_path, "temp")
    n_jobs <- length(jobs)

    .log("Compiling Junction List...", "message", appendLF = FALSE)

    # Extract junctions from IRFinder output, save temp FST file
    junc.list <- BiocParallel::bplapply(
        seq_len(n_jobs),
        function(x, jobs, df.internal, temp_output_path) {
            suppressPackageStartupMessages({
                requireNamespace("data.table")
                requireNamespace("stats")
            })
            work <- jobs[[x]]
            block <- df.internal[work]
            junc.segment <- NULL
            for (i in seq_len(length(work))) {

                data.list <- get_multi_DT_from_gz(
                    normalizePath(block$path[i]), c("JC_seqname"))
                junc <- data.list[["JC_seqname"]]

                setnames(junc, "JC_seqname", "seqnames")
                if (is.null(junc.segment)) {
                    junc.segment <- junc[, seq_len(4), with = FALSE]
                } else {
                    junc.segment <- merge(junc.segment,
                        junc[, seq_len(4), with = FALSE], all = TRUE)
                }
                # Write temp file
                fst::write.fst(as.data.frame(junc),
                    file.path(temp_output_path, paste(block$sample[i],
                    "junc.fst.tmp", sep = ".")))
            }
            return(junc.segment)
        }, jobs = jobs, df.internal = df.internal,
            temp_output_path = temp_output_path, BPPARAM = BPPARAM_mod
    )

    # Combine list of individual junction dfs into a unified list of junctions
    message("merging...", appendLF = FALSE)
    junc.common <- NULL
    for (i in seq_len(length(junc.list))) {
        if (is.null(junc.common)) {
            junc.common <- junc.list[[i]]
        } else {
            junc.common <- merge(junc.common, junc.list[[i]],
                all = TRUE, by = colnames(junc.common))
        }
    }
    junc.common[, c("start") := get("start") + 1]
    message("done")
    return(junc.common)
}

# Assumes all IRFinder outputs were created from the same reference; checks this
.collateData_irf_merge <- function(df.internal, jobs, BPPARAM_mod,
        output_path, stranded) {
    # Check names of all introns are the same in all samples
    temp_output_path <- file.path(output_path, "temp")
    n_jobs <- length(jobs)

    .log("Compiling Intron Retention List...", "message", appendLF = FALSE)

    irf.list <- BiocParallel::bplapply(seq_len(n_jobs),
        function(x, jobs, df.internal, temp_output_path, stranded) {
            suppressPackageStartupMessages({
                requireNamespace("data.table")
                requireNamespace("stats")
            })
            work <- jobs[[x]]
            block <- df.internal[work]
            irf.names <- c()
            phrase <- ifelse(stranded, "Dir_Chr", "Nondir_Chr")
            for (i in seq_len(length(work))) {

                data.list <- get_multi_DT_from_gz(
                    normalizePath(block$path[i]), phrase)
                irf <- data.list[[phrase]]

                setnames(irf, c(phrase, "Start", "End", "Strand"),
                    c("seqnames", "start", "end", "strand"))
                if (is.null(irf.names)) {
                    irf.names <- irf$Name
                } else {
                    if (!identical(irf.names, irf$Name))
                        .collateData_stop_irf_mismatch()
                }
                fst::write.fst(as.data.frame(irf),
                    file.path(temp_output_path,
                        paste(block$sample[i], "irf.fst.tmp", sep = ".")))
            }
            return(irf.names)
        }, jobs = jobs, df.internal = df.internal,
            temp_output_path = temp_output_path,
            stranded = stranded, BPPARAM = BPPARAM_mod
    )
    # Checks common columns are the same
    if (length(irf.list) > 1) {
        for (i in seq_len(length(irf.list))) {
            if (!identical(irf.list[[i]], irf.list[[1]]))
                .collateData_stop_irf_mismatch()
        }
    }
    # Returns common columns
    irf <- fst::read.fst(file.path(temp_output_path,
        paste(df.internal$sample[1], "irf.fst.tmp", sep = ".")),
        as.data.table = TRUE)
    irf.common <- irf[, seq_len(6), with = FALSE]
    irf.common[, c("start") := get("start") + 1]

    message("done")
    return(irf.common)
}

.collateData_stop_irf_mismatch <- function() {
    stopmsg <- paste(
        "Some IRFinder outputs were generated",
        "with a different NxtIRF reference.",
        "Suggest re-run IRFinder on all sample files",
        "using thesame reference"
    )
    .log(stopmsg)
}

################################################################################
# Sub

# Annotate IRFinder introns and junctions according to given reference
.collateData_annotate <- function(reference_path, norm_output_path,
        junc.common, irf.common, stranded) {

    message("...annotating splice junctions")
    junc.common <- .collateData_junc_annotate(junc.common, reference_path)
    message("...grouping splice junctions")
    junc.common <- .collateData_junc_group(junc.common, reference_path)

    message("...grouping introns")
    irf.common <- .collateData_irf_group(irf.common, reference_path, stranded)
    irf.common[, c("EventRegion") :=
        paste0(get("seqnames"), ":", get("start"), "-",
            get("end"), "/", get("strand"))]

    message("...loading splice events")
    Splice.Anno <- .collateData_splice_anno(reference_path, irf.common)

    message("...saving annotations")
    # Save annotation
    write.fst(as.data.frame(junc.common),
        file.path(norm_output_path, "annotation", "Junc.fst"))
    write.fst(as.data.frame(irf.common),
        file.path(norm_output_path, "annotation", "IR.fst"))
    write.fst(as.data.frame(Splice.Anno),
        file.path(norm_output_path, "annotation", "Splice.fst"))

    # Write junc_PSI index
    junc_PSI <- junc.common[, c("seqnames", "start", "end", "strand")]
    write.fst(junc_PSI, file.path(norm_output_path, "junc_PSI_index.fst"))

    message("...compiling rowEvents")
    .collateData_rowEvent(irf.common, Splice.Anno,
        norm_output_path, reference_path)
}

################################################################################

# Annotate junction splice motifs
.collateData_junc_annotate <- function(junc.common, reference_path) {
    junc.strand <- read.fst(
        path = file.path(reference_path, "fst", "junctions.fst"),
        columns = c("seqnames", "start", "end", "strand", "splice_motif"),
        as.data.table = TRUE
    )
    junc.strand <- unique(junc.strand)

    # Determine strand of junc.common junctions
    junc.common[, c("strand") := NULL]
    junc.common <- unique(junc.common)
    junc.common <- merge(junc.common, junc.strand,
        all = TRUE, by = c("seqnames", "start", "end"))

    # Novel junctions are assigned *
    junc.common[is.na(get("strand")), c("strand") := "*"]
    junc.common.unanno <- junc.common[get("strand") == "*"]
    junc.common.anno <- junc.common[get("strand") != "*"]

    # make sure valid seqnames
    junc.common.unanno <- junc.common.unanno[!is.na(get("seqnames"))]
    junc.common.unanno <- junc.common.unanno[get("seqnames") != ""]

    # Determine strandedness based on splice junction motif
    if (nrow(junc.common.unanno) != 0) {

        genome <- Get_Genome(reference_path, as_DNAStringSet = TRUE)

        # Filter unanno by available sequences
        junc.common.unanno <- junc.common.unanno[seqnames %in% names(genome)]
        
        # sanity check: remove unannotated junctions that lie outside genome
        junc.common.unanno.gr <- makeGRangesFromDataFrame(junc.common.unanno)
        seqinfo <- as.data.frame(seqinfo(genome))
        seqinfo.gr <- GRanges(rownames(seqinfo), IRanges(1, seqinfo$seqlengths), "*")
        OL <- findOverlaps(junc.common.unanno.gr, seqinfo.gr, type = "within")
        junc.common.unanno <- junc.common.unanno[unique(from(OL)), which = FALSE]

        # Create left and right motif GRanges
        left.gr <- with(junc.common.unanno,
            GRanges(seqnames = as.character(seqnames),
            ranges = IRanges(start = start, end = start + 1),
            strand = "+"))
        right.gr <- with(junc.common.unanno,
            GRanges(seqnames = as.character(seqnames),
            ranges = IRanges(start = end - 1, end = end),
            strand = "+"))
        
        junc.common.unanno[, c("splice_motif") := paste0(
            as.character(getSeq(genome, left.gr)),
            as.character(getSeq(genome, right.gr))
        )]
        
        splice_motifs <- data.frame(
            seqs = c("GTAG", "GCAG", "ATAC", "ATAG"),
            seqs_r = c("CTAC", "CTGC", "GTAT", "CTAT")
        )
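        # seqs_r are the reverse complements of seqs: e.g. a "-" strand GT..AG
        # intron reads CT..AC on the reference (+) strand, so a motif matching
        # seqs_r assigns strand "-" and is mapped back to its sense-strand
        # equivalent below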
        junc.common.unanno[get("splice_motif") %in% splice_motifs$seqs,
            c("strand") := "+"]
        junc.common.unanno[get("splice_motif") %in% splice_motifs$seqs_r,
            c("strand") := "-"]
        # Do not accept un-annotated GTACs - too confusing
        # Exclude unannotated non-splice motifs
        junc.common.unanno <- junc.common.unanno[
            get("strand") != "*"]
        junc.common.unanno[get("strand") == "-",
            c("splice_motif") := splice_motifs$seqs[match(
                get("splice_motif"), splice_motifs$seqs_r)]]
        junc.final <- rbindlist(list(junc.common.anno, junc.common.unanno))
    } else {
        junc.final <- junc.common.anno
    }
    # Assign region names to junctions:
    junc.final[, c("Event") := paste0(get("seqnames"), ":",
        get("start"), "-", get("end"), "/", get("strand"))]
    return(junc.final)
}

# Use Exon Groups file to designate flanking exon islands to ALL junctions
.collateData_junc_group <- function(junc.common, reference_path) {

    Exon.Groups <- as.data.table(
        read.fst(file.path(reference_path, "fst", "Exons.Group.fst"))
    )
    # Always calculate stranded for junctions
    Exon.Groups.S <- Exon.Groups[get("strand") != "*"]

    # Same method as in BuildRef
    junc.common <- .process_introns_group_overlap(
        junc.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "exon_group", "gene_group", "exon_group")
    )
    junc.common <- .process_introns_group_fix_RI(
        junc.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "intron_number", "gene_group", "intron_number")
    )
    junc.common[,
        c("JG_up", "JG_down") := list(
            paste(get("gene_group_up"), get("exon_group_up"), sep = "_"),
            paste(get("gene_group_down"), get("exon_group_down"), sep = "_")
        )
    ]

    # Remove temp columns
    junc.common$gene_group_up <- NULL
    junc.common$gene_group_down <- NULL
    junc.common$exon_group_up <- NULL
    junc.common$exon_group_down <- NULL

    return(junc.common)
}

# Use Exon Groups file to designate flanking exon islands to ALL introns
.collateData_irf_group <- function(
        irf.common, reference_path, stranded = TRUE
) {
    # Use Exon Groups file to designate exon groups to all junctions
    Exon.Groups <- as.data.table(
        read.fst(file.path(reference_path, "fst", "Exons.Group.fst")))

    # Always calculate stranded for junctions
    Exon.Groups.S <- Exon.Groups[get("strand") != "*"]

    irf.common <- .process_introns_group_overlap(
        irf.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "exon_group", "gene_group", "exon_group")
    )
    irf.common <- .process_introns_group_fix_RI(
        irf.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "intron_number", "gene_group", "intron_number")
    )
    irf.common[,
        c("JG_up", "JG_down") := list(
            paste(get("gene_group_up"), get("exon_group_up"), sep = "_"),
            paste(get("gene_group_down"), get("exon_group_down"), sep = "_")
        )
    ]

    irf.common$gene_group_up <- NULL
    irf.common$gene_group_down <- NULL
    irf.common$exon_group_up <- NULL
    irf.common$exon_group_down <- NULL

    if (!stranded) {
        Exon.Groups.S <- Exon.Groups[get("strand") == "*"]
    } else {
        Exon.Groups.S <- Exon.Groups[get("strand") != "*"]
    }

    irf.common <- .process_introns_group_overlap(
        irf.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "exon_group", "gene_group", "exon_group")
    )
    irf.common <- .process_introns_group_fix_RI(
        irf.common, Exon.Groups.S,
        c("gene_group_up", "exon_group_up",
            "gene_group_down", "exon_group_down"),
        c("gene_group", "intron_number", "gene_group", "intron_number")
    )
    irf.common[,
        c("IRG_up", "IRG_down") := list(
            paste(get("gene_group_up"), get("exon_group_up"), sep = "_"),
            paste(get("gene_group_down"), get("exon_group_down"), sep = "_")
        )
    ]

    return(irf.common)
}

# Annotates splice events with the IDs of their flanking exon islands
.collateData_splice_anno <- function(reference_path, irf.common) {
    candidate.introns <- as.data.table(
        read.fst(file.path(reference_path, "fst", "junctions.fst")))

    # Order introns by importance (WHY?)
    candidate.introns[, c("transcript_biotype_2") := get("transcript_biotype")]
    candidate.introns[
        !(get("transcript_biotype") %in%
            c("protein_coding", "processed_transcript",
            "lincRNA", "antisense", "nonsense_mediated_decay")),
        c("transcript_biotype_2") := "other"]
    candidate.introns[, c("transcript_biotype_2") :=
        factor(get("transcript_biotype_2"),
            c("protein_coding", "processed_transcript",
            "lincRNA", "antisense", "other", "nonsense_mediated_decay"),
        ordered = TRUE)]
    if ("transcript_support_level" %in% colnames(candidate.introns)) {
        setorderv(candidate.introns,
            c("transcript_biotype_2", "transcript_support_level"))
    } else {
        setorder(candidate.introns, "transcript_biotype_2")
    }
    candidate.introns[, c("Event1a") := get("Event")]
    candidate.introns[, c("Event2a") := get("Event")]

    # Annotate exon islands for splice events
    Splice.Anno <- read.fst(file.path(reference_path, "fst", "Splice.fst"),
        as.data.table = TRUE)

    # Remove retained introns not assayed in irf.common
    Splice.Anno <- Splice.Anno[
        get("Event1a") %in% irf.common$EventRegion |
        get("EventType") != "RI"
    ]

    Splice.Anno[candidate.introns, on = "Event1a",
        c("up_1a") := paste(get("i.gene_group_stranded"),
        get("i.exon_group_stranded_upstream"), sep = "_")]
    Splice.Anno[candidate.introns, on = "Event1a",
        c("down_1a") := paste(get("i.gene_group_stranded"),
        get("i.exon_group_stranded_downstream"), sep = "_")]
    Splice.Anno[candidate.introns, on = "Event2a",
        c("down_2a") := paste(get("i.gene_group_stranded"),
        get("i.exon_group_stranded_downstream"), sep = "_")]

    Splice.Anno[get("EventType") %in% c("MXE", "SE", "ALE", "A3SS", "RI"),
        c("JG_up") := get("up_1a")]
    Splice.Anno[get("EventType") %in% c("SE", "AFE", "A5SS", "RI"),
        c("JG_down") := get("down_1a")]
    Splice.Anno[get("EventType") %in% c("MXE"),
        c("JG_down") := get("down_2a")]

    Splice.Anno$up_1a <- NULL
    Splice.Anno$down_1a <- NULL
    Splice.Anno$down_2a <- NULL
    Splice.Anno[, c("strand") :=
        tstrsplit(get("Event1a"), split = "/")[[2]]]
    return(Splice.Anno)
}

# Generate rowData annotations
.collateData_rowEvent <- function(irf.common, Splice.Anno,
        norm_output_path, reference_path) {
    .collateData_rowEvent_brief(irf.common, Splice.Anno,
        norm_output_path)
    Splice.Options.Summary <- .collateData_rowEvent_splice_option(
        reference_path)
    .collateData_rowEvent_full(Splice.Options.Summary, Splice.Anno,
        norm_output_path, reference_path)
}

# Truncated rowEvent, with EventName, EventType, EventRegion
.collateData_rowEvent_brief <- function(irf.common, Splice.Anno,
        norm_output_path) {
    # make rowEvent brief here
    irf.anno.brief <- irf.common[, c("Name", "EventRegion")]
    setnames(irf.anno.brief, "Name", "EventName")
    irf.anno.brief[, c("EventType") := "IR"]
    irf.anno.brief <- irf.anno.brief[,
        c("EventName", "EventType", "EventRegion")]
    splice.anno.brief <- Splice.Anno[,
        c("EventName", "EventType", "EventRegion")]

    rowEvent <- rbind(irf.anno.brief, splice.anno.brief)
    write.fst(rowEvent, file.path(norm_output_path, "rowEvent.brief.fst"))
}

# Generate annotation based on importance of involved transcripts
.collateData_rowEvent_splice_option <- function(reference_path) {
    Splice.Options <- as.data.table(read.fst(
        file.path(reference_path, "fst", "Splice.options.fst")))
    Transcripts <- as.data.table(read.fst(
        file.path(reference_path, "fst", "Transcripts.fst")))
    Splice.Anno.Brief <- read.fst(
        file.path(reference_path, "fst", "Splice.fst"),
        as.data.table = TRUE, columns = c("EventName", "EventID"))
    Splice.Options[Splice.Anno.Brief, on = "EventID",
        c("EventName") := get("i.EventName")]
    Splice.Options[Transcripts, on = "transcript_id",
        c("transcript_biotype") := get("i.transcript_biotype")]

    Splice.Options.Summary <- copy(Splice.Options)
    Splice.Options.Summary[,
        c("tsl_min") := min(get("transcript_support_level")),
        by = c("EventID", "isoform")]
    Splice.Options.Summary[,
        c("any_is_PC") := any(get("is_protein_coding")),
        by = c("EventID", "isoform")]
    Splice.Options.Summary[,
        c("all_is_NMD") := all(grepl("decay", get("transcript_biotype"))),
        by = c("EventID", "isoform")]
    return(Splice.Options.Summary)
}

# Annotate full rowEvent. Includes:
# - whether Inc/Exc is a protein coding splicing event
# - whether Inc/Exc represent an event causing NMD
# - the highest-ranking TSL for each alternative (A/B) of event
.collateData_rowEvent_full <- function(Splice.Options.Summary, Splice.Anno,
        norm_output_path, reference_path) {
    rowEvent.Extended <- read.fst(
        file.path(norm_output_path, "rowEvent.brief.fst"),
        as.data.table = TRUE)
    IR_NMD <- read.fst(file.path(reference_path, "fst", "IR.NMD.fst"),
        as.data.table = TRUE)
    candidate.introns <- as.data.table(
        read.fst(file.path(reference_path, "fst", "junctions.fst")))

    # Prioritise candidate.introns based on transcript importance

    rowEvent.Extended[get("EventType") == "IR",
        c("intron_id") := tstrsplit(get("EventName"), split = "/")[[2]]]
    rowEvent.Extended[, c("Inc_Is_Protein_Coding") := FALSE]
    rowEvent.Extended[, c("Exc_Is_Protein_Coding") := FALSE]
    rowEvent.Extended[IR_NMD, on = "intron_id",
        c("Exc_Is_Protein_Coding") := TRUE]
    rowEvent.Extended[IR_NMD, on = "intron_id",
        c("Inc_Is_Protein_Coding") := (get("i.intron_type") == "CDS")]
    rowEvent.Extended[get("EventType") == "IR" &
        get("Exc_Is_Protein_Coding") == FALSE, c("Exc_Is_NMD") := NA]
    rowEvent.Extended[get("EventType") == "IR" &
        get("Inc_Is_Protein_Coding") == FALSE, c("Inc_Is_NMD") := NA]

    rowEvent.Extended[Splice.Options.Summary[get("isoform") == "A"],
        on = "EventName", c("Inc_Is_Protein_Coding") := get("i.any_is_PC")]
    rowEvent.Extended[Splice.Options.Summary[get("isoform") == "B"],
        on = "EventName", c("Exc_Is_Protein_Coding") := get("i.any_is_PC")]
    rowEvent.Extended[, c("Inc_Is_NMD", "Exc_Is_NMD") := list(FALSE, FALSE)]
    rowEvent.Extended[IR_NMD[!is.na(get("splice_is_NMD"))], on = "intron_id",
        c("Exc_Is_NMD") := get("i.splice_is_NMD")]
    rowEvent.Extended[IR_NMD, on = "intron_id",
        c("Inc_Is_NMD") := get("i.IRT_is_NMD")]

    rowEvent.Extended[Splice.Options.Summary[get("isoform") == "A"],
        on = "EventName", c("Inc_Is_NMD") := get("i.all_is_NMD")]
    rowEvent.Extended[Splice.Options.Summary[get("isoform") == "B"],
        on = "EventName", c("Exc_Is_NMD") := get("i.all_is_NMD")]
    rowEvent.Extended[candidate.introns, on = "intron_id",
        c("Inc_TSL") := get("i.transcript_support_level")]
    rowEvent.Extended[candidate.introns, on = "intron_id",
        c("Exc_TSL") := get("i.transcript_support_level")]
    rowEvent.Extended[Splice.Options.Summary[get("isoform") == "A"],
        on = "EventName", c("Inc_TSL") := get("i.tsl_min")]
    rowEvent.Extended[Splice.Options.Summary[get("isoform") == "B"],
        on = "EventName", c("Exc_TSL") := get("i.tsl_min")]
        
    # Designate exclusive first intron / last intron
    candidate.introns[, c("max_intron_number") :=
        max(get("intron_number")), by = "Event"]
    candidate.introns[, c("inverse_intron_number") :=
        max(get("intron_number")) - get("intron_number") + 1, 
        by = "transcript_id"]
    candidate.introns[, c("max_inv_intron_number") :=
        max(get("inverse_intron_number")), by = "Event"]
    candidate.introns[, c("Event1a") := get("Event")]
    candidate.introns[, c("Event1b") := get("Event")]

    # define Event1 / Event2
    rowEvent.Extended[get("EventType") == "IR",
        c("Event1a") := get("EventRegion")]
    rowEvent.Extended[Splice.Anno, on = "EventName",
        c("Event1a", "Event2a", "Event1b", "Event2b") :=
        list(get("i.Event1a"), get("i.Event2a"),
            get("i.Event1b"), get("i.Event2b"))]

    rowEvent.Extended[candidate.introns, on = "Event1a",
        c("iafi_A") := (get("i.max_intron_number") == 1)]
    rowEvent.Extended[candidate.introns, on = "Event1b",
        c("iafi_B") := (get("i.max_intron_number") == 1)]
    rowEvent.Extended[, c("is_always_first_intron") :=
        get("iafi_A") & get("iafi_B")]
    rowEvent.Extended[, c("iafi_A", "iafi_B") := list(NULL, NULL)]

    rowEvent.Extended[candidate.introns, on = "Event1a",
        c("iali_A") := (get("i.max_inv_intron_number") == 1)]
    rowEvent.Extended[candidate.introns, on = "Event1b",
        c("iali_B") := (get("i.max_inv_intron_number") == 1)]
    rowEvent.Extended[, c("is_always_last_intron") :=
        get("iali_A") & get("iali_B")]
    rowEvent.Extended[, c("iali_A", "iali_B") := list(NULL, NULL)]
    
    rowEvent.Extended[get("EventType") %in% c("MXE", "SE"),
        c("is_always_first_intron", "is_always_last_intron") := list(NA,NA)]
    write.fst(rowEvent.Extended, file.path(norm_output_path, "rowEvent.fst"))
}

################################################################################
# Sub

# Collates data from junction counts and IRFinder coverage
.collateData_compile_agglist <- function(x, jobs, df.internal,
        norm_output_path, IRMode) {

    # Load dataframe headers (left-most columns containing annotations)
    rowEvent <- as.data.table(read.fst(
        file.path(norm_output_path, "rowEvent.brief.fst")))
    junc.common <- as.data.table(read.fst(
        file.path(norm_output_path, "annotation", "Junc.fst")))
    irf.common <- as.data.table(read.fst(
        file.path(norm_output_path, "annotation", "IR.fst")))
    Splice.Anno <- as.data.table(read.fst(
        file.path(norm_output_path, "annotation", "Splice.fst")))
    junc_PSI <- as.data.table(read.fst(
        file.path(norm_output_path, "junc_PSI_index.fst")))

    work <- jobs[[x]]
    block <- df.internal[work]
    templates <- .collateData_seed_init(rowEvent, junc_PSI) # list of DT

    for (i in seq_len(length(work))) {
        junc <- .collateData_process_junc(
            block$sample[i], block$strand[i],
            junc.common, norm_output_path)

        irf <- .collateData_process_irf(
            block$sample[i], block$strand[i], junc,
            irf.common, norm_output_path)

        splice <- .collateData_process_splice(
            junc, irf, Splice.Anno)

        splice <- .collateData_process_splice_depth(
            splice, irf)

        .collateData_process_assays_as_fst(.copy_DT(templates),
            block$sample[i], junc, irf, splice, IRMode, norm_output_path)

        # remove temp files - raw extracted junc / IRF output from IRFinder
        file.remove(file.path(norm_output_path, "temp",
            paste(block$sample[i], "junc.fst.tmp", sep = ".")))
        file.remove(file.path(norm_output_path, "temp",
            paste(block$sample[i], "irf.fst.tmp", sep = ".")))
    } # end FOR loop
    return(NULL)
}

# INTERNAL: copies a list of data.table
.copy_DT <- function(templates) {
    copy <- lapply(templates, copy)
    names(copy) <- names(templates)
    return(copy)
}

# Curate list of subset headers
.collateData_seed_init <- function(rowEvent, junc_PSI) {
    templates <- list(
        assay = copy(rowEvent),
        inc = rowEvent[get("EventType") %in% c("IR", "MXE", "SE", "RI")],
        exc = rowEvent[get("EventType") %in% c("MXE")],
        junc = copy(junc_PSI)
    )
    return(templates)
}

# Collates junction counts, given sample strandedness and junction strand anno
# - calculates SpliceLeft, SpliceRight, SpliceOver sums
.collateData_process_junc <- function(sample, strand,
        junc.common, norm_output_path) {
    junc <- as.data.table(
        read.fst(file.path(norm_output_path, "temp",
            paste(sample, "junc.fst.tmp", sep = ".")))
    )
    junc[, c("start") := get("start") + 1]
    junc$strand <- NULL # Use strand from junc.common

    junc <- junc[junc.common, on = colnames(junc.common)[c(1, 2, 3)]]
    if (strand == 0) {
        junc$count <- junc$total
    } else if (strand == -1) {
        junc$count <- 0
        junc[get("strand") == "+", c("count") := get("neg")]
        junc[get("strand") == "-", c("count") := get("pos")]
        junc[get("strand") == "*", c("count") := get("total")]
    } else {
        junc$count <- 0
        junc[get("strand") == "+", c("count") := get("pos")]
        junc[get("strand") == "-", c("count") := get("neg")]
        junc[get("strand") == "*", c("count") := get("total")]
    }
    junc[is.na(get("count")), c("count") := 0]
    junc <- junc[, c("seqnames", "start", "end", "strand", "Event", "count")]
    junc <- cbind(junc, junc.common[, c("JG_up", "JG_down")])
    junc[, c("SO_L") := 0]
    junc[, c("SO_R") := 0]
    junc[, c("SO_I") := 0]

    # SpliceLeft and SpliceRight calculations
    junc[, c("SL") := sum(get("count")),
        by = c("seqnames", "start", "strand")]
    junc[, c("SR") := sum(get("count")),
        by = c("seqnames", "end", "strand")]

    # First, sum counts over junctions whose two ends fall on different
    # exon islands
    junc[get("JG_up") != get("JG_down") &
            get("JG_up") != "" & get("strand") == "+",
        c("SO_L") := sum(get("count")), by = "JG_up"]
    junc[get("JG_up") != get("JG_down") &
            get("JG_down") != "" & get("strand") == "+",
        c("SO_R") := sum(get("count")), by = "JG_down"]
    junc[get("JG_up") != get("JG_down") &
            get("JG_up") != "" & get("strand") == "-",
        c("SO_R") := sum(get("count")), by = "JG_up"]
    junc[get("JG_up") != get("JG_down") &
            get("JG_down") != "" & get("strand") == "-",
        c("SO_L") := sum(get("count")), by = "JG_down"]

    # Then use a simple overlap method to account for the remainder

    # Filter for same-island junctions
    junc.subset <- junc[get("JG_up") == get("JG_down") &
        get("JG_up") != "" & get("JG_down") != ""]
    junc.subset <- junc.subset[!grepl("NA_NA", get("JG_up")) &
        !grepl("NA_NA", get("JG_down"))]

    if (nrow(junc.subset) > 0) {
        OL <- findOverlaps(.grDT(junc.subset), .grDT(junc))

        splice.overlaps.DT <- data.table(from = from(OL), to = to(OL))
        splice.overlaps.DT[,
            c("count") := junc$count[to(OL)]]
        splice.overlaps.DT[,
            c("count_sum") := sum(get("count")), by = "from"]
        splice.summa <- unique(
            splice.overlaps.DT[, c("from", "count_sum")])

        junc.subset[splice.summa$from,
            c("SO_I") := splice.summa$count_sum]

        junc[junc.subset, on = c("Event"), c("SO_I") := get("i.SO_I")]

        # For annotated junctions, take SpliceOver as max of
        # SpliceLeft, SpliceRight, or SpliceOver
        junc[get("SO_L") < get("SO_I"), c("SO_L") := get("SO_I")]
        junc[get("SO_R") < get("SO_I"), c("SO_R") := get("SO_I")]
        # Finally, for extreme cases, set SO_L = SL if SO_L underestimates it
        junc[get("SO_L") < get("SL"), c("SO_L") := get("SL")]
        junc[get("SO_R") < get("SR"), c("SO_R") := get("SR")]
        junc[, c("SO_I") := NULL]
    }
    return(junc)
}

# Imports IRFinder data, calculates SpliceOver from junction counts
.collateData_process_irf <- function(sample, strand, junc,
        irf.common, norm_output_path) {
    irf <- as.data.table(
        read.fst(file.path(norm_output_path, "temp",
            paste(sample, "irf.fst.tmp", sep = ".")))
    )

    irf[, c("start") := get("start") + 1]
    irf <- irf[irf.common, on = colnames(irf.common)[seq_len(6)],
        c("EventRegion") := get("i.EventRegion")]

    # Extra statistics:
    irf[, c("SpliceMax") := 0]
    irf[get("SpliceLeft") >= get("SpliceRight"),
        c("SpliceMax") := get("SpliceLeft")]
    irf[get("SpliceLeft") < get("SpliceRight"),
        c("SpliceMax") := get("SpliceRight")]

    irf[junc, on = c("seqnames", "start", "end", "strand"),
        c("SpliceOverLeft") := get("SO_L")]
    irf[junc, on = c("seqnames", "start", "end", "strand"),
        c("SpliceOverRight") := get("SO_R")]
    irf[get("SpliceOverLeft") >= get("SpliceOverRight"),
        c("SpliceOverMax") := get("SpliceOverLeft")]
    irf[get("SpliceOverLeft") < get("SpliceOverRight"),
        c("SpliceOverMax") := get("SpliceOverRight")]

    irf[, c("IROratio") := 0]
    irf[get("IntronDepth") < 1 & get("IntronDepth") > 0 &
        (get("Coverage") + get("SpliceOverMax")) > 0,
        c("IROratio") := get("Coverage") / (
            get("Coverage") + get("SpliceOverMax"))]
    irf[get("IntronDepth") >= 1,
        c("IROratio") := get("IntronDepth") /
            (get("IntronDepth") + get("SpliceOverMax"))]

    irf[, c("TotalDepth") := get("IntronDepth") + get("SpliceOverMax")]
    setnames(irf, "Name", "EventName")
    return(irf)
}

# Calculates junction counts for each annotated splice event. Also calculates:
# - participation: the number of junction reads at the region that belong to
#   either isoform of the ASE
# - coverage: participation as a fraction of all junction reads overlapping
#   the ASE region
.collateData_process_splice <- function(junc, irf, Splice.Anno) {
    splice <- copy(Splice.Anno)

    # Summarises counts for all splice events as defined by Event(1/2)(a/b)
    splice[, c("count_Event1a", "count_Event2a",
        "count_Event1b", "count_Event2b") := list(0, 0, 0, 0)]
    splice[!is.na(get("Event1a")),
        c("count_Event1a") := junc$count[match(get("Event1a"), junc$Event)]]
    splice[is.na(get("count_Event1a")),
        c("count_Event1a") := 0]
    splice[!is.na(get("Event1a")) & get("EventType") == "RI",
        c("count_Event1a") :=
            irf$IntronDepth[match(get("Event1a"), irf$EventRegion)]
    ]

    splice[!is.na(get("Event2a")),
        c("count_Event2a") := junc$count[match(get("Event2a"), junc$Event)]]
    splice[is.na(get("count_Event2a")),
        c("count_Event2a") := 0]

    splice[!is.na(get("Event1b")),
        c("count_Event1b") := junc$count[match(get("Event1b"), junc$Event)]]
    splice[is.na(get("count_Event1b")),
        c("count_Event1b") := 0]

    splice[!is.na(get("Event2b")),
        c("count_Event2b") := junc$count[match(get("Event2b"), junc$Event)]]
    splice[is.na(get("count_Event2b")),
        c("count_Event2b") := 0]

    # Hack: for RI, ExonToIntronReadsLeft/Right is stored in count_Event2a/b
    splice[get("EventType") == "RI" &
            tstrsplit(get("Event1a"), split = "/", fixed = TRUE)[[2]] == "+",
        c("count_Event2a", "count_Event2b") := list(
            irf$ExonToIntronReadsLeft[match(get("Event1a"), irf$EventRegion)],
            irf$ExonToIntronReadsRight[match(get("Event1a"), irf$EventRegion)]
        )
    ]
    splice[get("EventType") == "RI" &
            tstrsplit(get("Event1a"), split = "/", fixed = TRUE)[[2]] == "-",
        c("count_Event2a", "count_Event2b") := list(
            irf$ExonToIntronReadsRight[match(get("Event1a"), irf$EventRegion)],
            irf$ExonToIntronReadsLeft[match(get("Event1a"), irf$EventRegion)]
        )
    ]

    # Calculates the total splice events that participate from the upstream /
    #   downstream exon islands
    splice[, c("count_JG_up", "count_JG_down") := list(0, 0)]
    splice[!is.na(get("JG_up")) & get("strand") == "+",
        c("count_JG_up") := junc$SO_L[match(get("JG_up"), junc$JG_up)]]
    splice[!is.na(get("JG_up")) & get("strand") == "-",
        c("count_JG_up") := junc$SO_R[match(get("JG_up"), junc$JG_up)]]
    splice[is.na(get("count_JG_up")), c("count_JG_up") := 0]
    splice[!is.na(get("JG_down")) & get("strand") == "-",
        c("count_JG_down") := junc$SO_L[match(get("JG_down"), junc$JG_down)]]
    splice[!is.na(get("JG_down")) & get("strand") == "+",
        c("count_JG_down") := junc$SO_R[match(get("JG_down"), junc$JG_down)]]
    splice[is.na(get("count_JG_down")), c("count_JG_down") := 0]

    # Splice participation
    # - this calculates the number of events that participate in splicing
    #   across the upstream / downstream exon islands that belong to
    #   either isoform A or B
    splice[, c("partic_up", "partic_down") := list(0, 0)]
    splice[get("EventType") %in% c("MXE", "SE", "ALE", "A3SS", "RI"),
        c("partic_up") := get("count_Event1a") + get("count_Event1b")]
    splice[get("EventType") %in% c("MXE"),
        c("partic_down") := get("count_Event2a") + get("count_Event2b")]
    splice[get("EventType") %in% c("SE"),
        c("partic_down") := get("count_Event2a") + get("count_Event1b")]
    splice[get("EventType") %in% c("AFE", "A5SS", "RI"),
        c("partic_down") := get("count_Event1a") + get("count_Event1b")]

    # Splice "coverage" = participation / max_JG
    splice[, c("cov_up", "cov_down") := list(0, 0)]
    splice[get("count_JG_up") > 0,
        c("cov_up") := get("partic_up") / get("count_JG_up")]
    splice[get("count_JG_down") > 0,
        c("cov_down") := get("partic_down") / get("count_JG_down")]
    splice[get("EventType") %in% c("MXE", "SE", "RI") &
        get("cov_up") < get("cov_down"),
        c("coverage") := get("cov_up")]
    splice[get("EventType") %in% c("MXE", "SE", "RI") &
        get("cov_up") >= get("cov_down"),
        c("coverage") := get("cov_down")]
    splice[get("EventType") %in% c("ALE", "A3SS"),
        c("coverage") := get("cov_up")]
    splice[get("EventType") %in% c("AFE", "A5SS"),
        c("coverage") := get("cov_down")]

    return(splice)
}

# Calculates TotalDepth which includes local IR levels.
# - Used to normalize Coverage plots
.collateData_process_splice_depth <- function(splice, irf) {

    # Calculate depth for splicing where EventRegion doesn't cover a single
    #   intron or where IRFinder has decided not to assess the intron
    splice.no_region <- splice[!(get("EventRegion") %in% irf$EventRegion)]
    splice.no_region[, c("Depth1a") :=
        irf$TotalDepth[match(get("Event1a"), irf$EventRegion)]]
    splice.no_region[, c("Depth2a") :=
        irf$TotalDepth[match(get("Event2a"), irf$EventRegion)]]
    splice.no_region[, c("Depth1b") :=
        irf$TotalDepth[match(get("Event1b"), irf$EventRegion)]]
    splice.no_region[, c("Depth2b") :=
        irf$TotalDepth[match(get("Event2b"), irf$EventRegion)]]
    splice.no_region[is.na(get("Depth1a")), c("Depth1a") := 0]
    splice.no_region[is.na(get("Depth1b")), c("Depth1b") := 0]
    splice.no_region[is.na(get("Depth2a")), c("Depth2a") := 0]
    splice.no_region[is.na(get("Depth2b")), c("Depth2b") := 0]

    # First guess depth based on depths of any junction that is assessed
    #   by IRFinder
    splice.no_region[, c("Depth") := 0]
    splice.no_region[get("Depth1a") > get("Depth2a"),
        c("DepthA") := get("Depth1a")]
    splice.no_region[get("Depth1b") > get("Depth2b"),
        c("DepthB") := get("Depth1b")]
    splice.no_region[get("Depth1a") <= get("Depth2a"),
        c("DepthA") := get("Depth2a")]
    splice.no_region[get("Depth1b") <= get("Depth2b"),
        c("DepthB") := get("Depth2b")]
    splice.no_region[get("DepthA") > get("DepthB"),
        c("Depth") := get("DepthA")]
    splice.no_region[get("DepthA") <= get("DepthB"),
        c("Depth") := get("DepthB")]

    # If this fails, guess depth based on JG_up / JG_down
    splice.no_region[get("Depth") == 0 &
        get("count_JG_up") > get("count_JG_down"),
        c("Depth") := get("count_JG_up")]
    splice.no_region[get("Depth") == 0 &
        get("count_JG_up") <= get("count_JG_down"),
        c("Depth") := get("count_JG_down")]

    splice[, c("TotalDepth") := 0]
    splice[irf, on = "EventRegion",
        c("TotalDepth") := get("i.TotalDepth")]
    splice[splice.no_region, on = "EventName",
        c("TotalDepth") := get("i.Depth")]
    return(splice)
}

# Compiles all the data as assays, write as temp FST files
# - This acts as "on-disk memory" to avoid using too much memory
.collateData_process_assays_as_fst <- function(templates,
        sample, junc, irf, splice, IRMode, norm_output_path) {

    assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
    inc.todo <- c("Up_Inc", "Down_Inc")
    exc.todo <- c("Up_Exc", "Down_Exc")
    junc.todo <- c("junc_PSI", "junc_counts")

    # Included / Excluded counts for IR and splicing
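    # For SE / MXE events, inclusion is supported by two junctions (Event1a
    # and Event2a), so Included below is their average; e.g. (hypothetical)
    # counts of 8 and 12 give 0.5 * (8 + 12) = 10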
    templates$assay[, c("Included") := c(
        irf$IntronDepth,
        0.5 * (splice$count_Event1a[splice$EventType %in% c("SE", "MXE")] +
            splice$count_Event2a[splice$EventType %in% c("SE", "MXE")]),
        splice$count_Event1a[!splice$EventType %in% c("SE", "MXE")]
    )]
    if (IRMode == "SpliceOverMax") {
        templates$assay[, c("Excluded") := c(
            irf$SpliceOverMax,
            0.5 * (splice$count_Event1b[splice$EventType %in% c("MXE")] +
                splice$count_Event2b[splice$EventType %in% c("MXE")]),
            splice$count_Event1b[!splice$EventType %in% c("MXE")]
        )]
    } else {
        templates$assay[, c("Excluded") := c(
            irf$SpliceMax,
            0.5 * (splice$count_Event1b[splice$EventType %in% c("MXE")] +
                splice$count_Event2b[splice$EventType %in% c("MXE")]),
            splice$count_Event1b[!splice$EventType %in% c("MXE")]
        )]
    }

    # Validity checking for IR, MXE, SE
    irf[get("strand") == "+", c("Up_Inc") := get("ExonToIntronReadsLeft")]
    irf[get("strand") == "-", c("Up_Inc") := get("ExonToIntronReadsRight")]
    irf[get("strand") == "+", c("Down_Inc") := get("ExonToIntronReadsRight")]
    irf[get("strand") == "-", c("Down_Inc") := get("ExonToIntronReadsLeft")]
    templates$inc[, c("Up_Inc") := c(irf$Up_Inc,
            splice$count_Event1a[splice$EventType %in% c("MXE", "SE")],
            splice$count_Event2a[splice$EventType %in% c("RI")]
    )]
    templates$inc[, c("Down_Inc") := c(irf$Down_Inc,
            splice$count_Event2a[splice$EventType %in% c("MXE", "SE")],
            splice$count_Event2b[splice$EventType %in% c("RI")]
    )]
    templates$exc[, c("Up_Exc") :=
        splice$count_Event1b[splice$EventType %in% c("MXE")]]
    templates$exc[, c("Down_Exc") :=
        splice$count_Event2b[splice$EventType %in% c("MXE")]]

    templates$assay[, c("Depth") := c(irf$TotalDepth, splice$TotalDepth)]
    templates$assay[, c("Coverage") := c(irf$Coverage, splice$coverage)]

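    # minDepth: for SE/MXE events, take the junction-group count on the side
    #   with lower coverage; for ALE/A3SS/RI use count_JG_up, and for
    #   AFE/A5SS use count_JG_down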
    splice[get("EventType") %in% c("MXE", "SE") &
        get("cov_up") < get("cov_down"),
        c("minDepth") := get("count_JG_up")]
    splice[get("EventType") %in% c("MXE", "SE") &
        get("cov_up") >= get("cov_down"),
        c("minDepth") := get("count_JG_down")]
    splice[get("EventType") %in% c("ALE", "A3SS", "RI"),
        c("minDepth") := get("count_JG_up")]
    splice[get("EventType") %in% c("AFE", "A5SS"),
        c("minDepth") := get("count_JG_down")]
    templates$assay[, c("minDepth") := c(irf$IntronDepth, splice$minDepth)]

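    # Per-junction PSI: junction count divided by the larger of the two
    #   SpliceOver totals (SO_L / SO_R); zero-count junctions are set to 0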
    junc[get("count") == 0, c("PSI") := 0]
    junc[get("SO_L") > get("SO_R"),
        c("PSI") := get("count") / get("SO_L")]
    junc[get("SO_R") >= get("SO_L") & get("SO_R") > 0,
        c("PSI") := get("count") / get("SO_R")]
    templates$junc[junc, on = c("seqnames", "start", "end", "strand"),
        c("junc_PSI", "junc_counts") := list(get("i.PSI"), get("i.count"))]

    fst::write.fst(as.data.frame(templates$assay[, assay.todo, with = FALSE]),
        file.path(norm_output_path, "temp",
                paste("assays", sample, "fst.tmp", sep = ".")))
    fst::write.fst(as.data.frame(templates$inc[, inc.todo, with = FALSE]),
        file.path(norm_output_path, "temp",
                paste("included", sample, "fst.tmp", sep = ".")))
    fst::write.fst(as.data.frame(templates$exc[, exc.todo, with = FALSE]),
        file.path(norm_output_path, "temp",
                paste("excluded", sample, "fst.tmp", sep = ".")))
    fst::write.fst(as.data.frame(templates$junc[, junc.todo, with = FALSE]),
        file.path(norm_output_path, "temp",
                paste("junc_psi", sample, "fst.tmp", sep = ".")))
}
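# Illustrative sketch (commented out; file names as written by the function
# above, the sample name "sample01" is hypothetical): each per-sample temp
# file is a plain columnar table, one row per event, inspectable with fst:
#   tmp <- fst::read.fst(file.path(norm_output_path, "temp",
#       paste("assays", "sample01", "fst.tmp", sep = ".")))
#   head(tmp[, c("Included", "Excluded", "Depth", "Coverage", "minDepth")])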

################################################################################

# Reads the "on-disk memory" FST files and compiles them into the H5 file
.collateData_compile_assays_from_fst <- function(df.internal,
        norm_output_path, samples_per_block) {

    rowname_lists <-
        .collateData_H5_initialize(nrow(df.internal), norm_output_path)

    .collateData_H5_write_assays(df.internal, norm_output_path,
        samples_per_block)

    # Retrieve assays:
    assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
    inc.todo <- c("Up_Inc", "Down_Inc")
    exc.todo <- c("Up_Exc", "Down_Exc")
    junc.todo <- c("junc_PSI", "junc_counts")
    stuff.todo <- c(assay.todo, inc.todo, exc.todo, junc.todo)

    h5filename <- file.path(norm_output_path, "data.h5")
    assays <- list()
    for (assay in assay.todo) {
        h5writeDimnames(list(rowname_lists$EventName, df.internal$sample),
            h5filename, assay)
        assays[[assay]] <- HDF5Array(h5filename, assay)
    }
    for (inc in inc.todo) {
        h5writeDimnames(list(rowname_lists$Inc_Events, df.internal$sample),
            h5filename, inc)
        assays[[inc]] <- HDF5Array(h5filename, inc)
    }
    for (exc in exc.todo) {
        h5writeDimnames(list(rowname_lists$Exc_Events, df.internal$sample),
            h5filename, exc)
        assays[[exc]] <- HDF5Array(h5filename, exc)
    }
    for (junc in junc.todo) {
        h5writeDimnames(list(rowname_lists$junc_rownames, df.internal$sample),
            h5filename, junc)
        assays[[junc]] <- HDF5Array(h5filename, junc)
    }
    return(assays)
}
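# Illustrative sketch (commented out, hypothetical file "toy.h5"): pairing
# h5writeDimnames() (from HDF5Array) with HDF5Array() as above gives an
# on-disk matrix whose dimnames are recovered lazily, assuming rhdf5 and
# HDF5Array are attached:
#   h5createFile("toy.h5")
#   h5createDataset("toy.h5", "Included", dims = c(3, 2),
#       storage.mode = "double")
#   h5writeDimnames(list(paste0("event", 1:3), c("s1", "s2")),
#       "toy.h5", "Included")
#   m <- HDF5Array("toy.h5", "Included")
#   dimnames(m)    # row / column names read back from companion datasets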

# Initializes the H5 datasets using rowData; returns the rownames as a list
.collateData_H5_initialize <- function(
        num_samples, norm_output_path
) {
    assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
    inc.todo <- c("Up_Inc", "Down_Inc")
    exc.todo <- c("Up_Exc", "Down_Exc")
    junc.todo <- c("junc_PSI", "junc_counts")
    stuff.todo <- c(assay.todo, inc.todo, exc.todo, junc.todo)

    rowData <- as.data.table(
        read.fst(file.path(norm_output_path, "rowEvent.fst")))
    Inc_Events <- rowData$EventName[rowData$EventType %in%
        c("IR", "MXE", "SE", "RI")]
    Exc_Events <- rowData$EventName[rowData$EventType %in% "MXE"]

    junc_index <- fst::read.fst(file.path(
        norm_output_path, "junc_PSI_index.fst"
    ))
    junc_rownames <- with(junc_index,
        paste0(seqnames, ":", start, "-", end, "/", strand))

    h5filename <- file.path(norm_output_path, "data.h5")
    if (file.exists(h5filename)) file.remove(h5filename)
    h5createFile(h5filename)
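    # Chunk layout: one full column (all rows x 1 sample) per chunk, so each
    #   per-sample write touches only whole chunks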
    for (assay in assay.todo) {
        chunk_row <- nrow(rowData)
        h5createDataset(file = h5filename,
            dataset = assay,
            dims = c(nrow(rowData), num_samples),
            storage.mode = "double", 
            chunk = c(chunk_row, 1), level = 6
        )
    }
    for (inc in inc.todo) {
        chunk_row <- length(Inc_Events)
        h5createDataset(file = h5filename,
            dataset = inc,
            dims = c(length(Inc_Events), num_samples),
            storage.mode = "double", 
            chunk = c(chunk_row, 1), level = 6
        )
    }
    for (exc in exc.todo) {
        chunk_row <- length(Exc_Events)
        h5createDataset(file = h5filename, dataset = exc,
            dims = c(length(Exc_Events), num_samples),
            storage.mode = "double", 
            chunk = c(chunk_row, 1), level = 6
        )
    }
    for (junc in junc.todo) {
        chunk_row <- length(junc_rownames)
        h5createDataset(file = h5filename, dataset = junc,
            dims = c(length(junc_rownames), num_samples),
            storage.mode = "double", 
            chunk = c(chunk_row, 1), level = 6
        )
    }
    rowname_lists <- list(
        EventName = rowData$EventName,
        Inc_Events = Inc_Events, Exc_Events = Exc_Events,
        junc_rownames = junc_rownames
    )
    return(rowname_lists)
}

# Reads the temporary FST files and adds them to the H5 assays
.collateData_H5_write_assays <- function(
        df.internal, norm_output_path, samples_per_block
) {
    assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
    inc.todo <- c("Up_Inc", "Down_Inc")
    exc.todo <- c("Up_Exc", "Down_Exc")
    junc.todo <- c("junc_PSI", "junc_counts")
    stuff.todo <- c(assay.todo, inc.todo, exc.todo, junc.todo)

    h5filename <- file.path(norm_output_path, "data.h5")

    # Make a nested list
    matrices <- list()
    for (stuff in stuff.todo) {
        matrices[[stuff]] <- list()
    }
    buf_i <- 0 # Counts the number of in-memory samples
    pb <- txtProgressBar(max = nrow(df.internal), style = 3)
    for (i in seq_len(nrow(df.internal))) {
        setTxtProgressBar(pb, i)
        sample <- df.internal$sample[i]
        buf_i <- buf_i + 1

        mat <- fst::read.fst(file.path(norm_output_path, "temp",
            paste("assays", sample, "fst.tmp", sep = ".")))
        for (stuff in assay.todo) {
            matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
        }
        mat <- fst::read.fst(file.path(norm_output_path, "temp",
            paste("included", sample, "fst.tmp", sep = ".")))
        for (stuff in inc.todo) {
            matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
        }
        mat <- fst::read.fst(file.path(norm_output_path, "temp",
            paste("excluded", sample, "fst.tmp", sep = ".")))
        for (stuff in exc.todo) {
            matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
        }
        mat <- fst::read.fst(file.path(norm_output_path, "temp",
            paste("junc_psi", sample, "fst.tmp", sep = ".")))
        for (stuff in junc.todo) {
            matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
        }
        if (buf_i >= samples_per_block || i == nrow(df.internal)) {
            for (stuff in stuff.todo) {
                # write
                h5write(as.matrix(do.call(cbind, matrices[[stuff]])),
                    file = h5filename, name = stuff,
                    index = list(NULL, seq(i - buf_i + 1, i))
                )
                # reset
                matrices[[stuff]] <- list()
            }
            buf_i <- 0
            gc()
        }
    }
    setTxtProgressBar(pb, i)
    close(pb)
}
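# Illustrative sketch (commented out, hypothetical file "toy.h5" containing an
# "Included" dataset with at least 4 sample columns): the buffered writes
# above place each block of samples using the `index` argument of
# rhdf5::h5write(), e.g. filling columns 3..4 only:
#   block <- matrix(runif(6), nrow = 3)    # 3 events x 2 samples
#   h5write(block, file = "toy.h5", name = "Included",
#       index = list(NULL, 3:4))           # NULL selects all rows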

################################################################################

# Writes sample stats to FST
.collateData_write_stats <- function(df.internal, norm_output_path) {
    outfile <- file.path(norm_output_path, "stats.fst")
    # Convert path to relative path, for reproducibility:
    df.internal$path <- .make_path_relative(df.internal$path, norm_output_path)
    write.fst(as.data.frame(df.internal), outfile)
}

# Writes default colData to RDS
.collateData_write_colData <- function(df.internal, coverage_files,
        norm_output_path) {
    # Create barebones colData.Rds; record coverage files only if they are
    #   supplied, match the number of samples, and validate as COV files
    df.files <- data.table(
        sample = df.internal$sample,
        bam_file = "",
        irf_file = df.internal$path,
        cov_file = ""
    )
    if (!is.null(coverage_files)) {
        covfiles_full <- normalizePath(
            file.path(norm_output_path, coverage_files))
        if (length(coverage_files) == nrow(df.internal) &&
                IsCOV(covfiles_full)) {
            df.files[, c("cov_file") := coverage_files]
        }
    }

    colData <- list(
        df.files = df.files,
        df.anno = data.table(sample = df.internal$sample)
    )
    saveRDS(colData, file.path(norm_output_path, "colData.Rds"))
}
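# Illustrative sketch (commented out): the saved colData.Rds is a two-element
# list; reading it back gives the per-sample file paths and a minimal
# annotation table (sample names only at this stage):
#   cd <- readRDS(file.path(norm_output_path, "colData.Rds"))
#   cd$df.files    # sample / bam_file / irf_file / cov_file
#   cd$df.anno     # sample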

# Loads the newly created H5 database so we can save it as a
#   SummarizedExperiment for quick recall later
.collateData_initialise_HDF5 <- function(collate_path, colData, assays) {
    item.todo <- c("rowEvent", "Included", "Excluded", "Depth", "Coverage",
        "minDepth", "Up_Inc", "Down_Inc", "Up_Exc", "Down_Exc")

    # Annotate NMD direction
    rowData <- as.data.table(read.fst(file.path(collate_path, "rowEvent.fst")))
    rowData[, c("NMD_direction") := 0]
    rowData[get("Inc_Is_NMD") & !get("Exc_Is_NMD"), c("NMD_direction") := 1]
    rowData[!get("Inc_Is_NMD") & get("Exc_Is_NMD"), c("NMD_direction") := -1]
    rowData <- as.data.frame(rowData)

    colData <- as.data.frame(colData)
    colData_use <- colData[, -1, drop = FALSE]
    rownames(colData_use) <- colData$sample

    se <- SummarizedExperiment(
        assays = SimpleList(
            Included = assays[["Included"]],
            Excluded = assays[["Excluded"]],
            Depth = assays[["Depth"]],
            Coverage = assays[["Coverage"]],
            minDepth = assays[["minDepth"]]
        ), rowData = rowData, colData = colData_use
    )
    rownames(se) <- rowData(se)$EventName

    metadata(se)$Up_Inc <- assays[["Up_Inc"]]
    metadata(se)$Down_Inc <- assays[["Down_Inc"]]
    metadata(se)$Up_Exc <- assays[["Up_Exc"]]
    metadata(se)$Down_Exc <- assays[["Down_Exc"]]

    colData.Rds <- readRDS(file.path(collate_path, "colData.Rds"))
    if ("df.files" %in% names(colData.Rds) &&
        "cov_file" %in% colnames(colData.Rds$df.files)) {
        metadata(se)$cov_file <- colData.Rds$df.files$cov_file
    }
    stats.df <- fst::read.fst(file.path(collate_path, "stats.fst"))
    metadata(se)$sampleQC <- stats.df[, -1, drop = FALSE]
    rownames(metadata(se)$sampleQC) <- colnames(se)

    # Add reference last
    metadata(se)$ref <- readRDS(file.path(
        collate_path, "annotation", "cov_data.Rds"
    ))

    return(se)
}
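# Illustrative sketch (commented out): downstream code can retrieve the
# Up/Down inclusion and exclusion matrices and the per-sample QC table from
# the returned object via its metadata, e.g.:
#   metadata(se)$Up_Inc      # on-disk (HDF5Array) matrix of upstream inclusion counts
#   metadata(se)$sampleQC    # data.frame, one row per sample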

.collateData_save_NxtSE <- function(se, filepath) {
    se@assays <- .collateData_simplify_assay_paths(se@assays)
    se@metadata[["Up_Inc"]] <- .collateData_simplify_assay_path(
        se@metadata[["Up_Inc"]])
    se@metadata[["Down_Inc"]] <- .collateData_simplify_assay_path(
        se@metadata[["Down_Inc"]])
    se@metadata[["Up_Exc"]] <- .collateData_simplify_assay_path(
        se@metadata[["Up_Exc"]])
    se@metadata[["Down_Exc"]] <- .collateData_simplify_assay_path(
        se@metadata[["Down_Exc"]])
    saveRDS(se, filepath)
}

# Internals for save_NxtSE; adapted from HDF5Array saveHDF5SummarizedExperiment
# - These are needed if the CollateData folder has been moved since last access
# - allows relative paths to be used

# Saving a NxtSE object

# Simplify the file path of a single assay seed (store the basename only)
.collateData_simplify_assay_path <- function(assay) {
    DelayedArray::modify_seeds(assay,
        function(x) {
            x@filepath <- basename(x@filepath)
            x
        }
    )
}

# Simplify the file paths of a list of assays
.collateData_simplify_assay_paths <- function(assays) {
    nassay <- length(assays)
    for (i in seq_len(nassay)) {
        a <- .collateData_simplify_assay_path(getListElement(assays, i))
        assays <- setListElement(assays, i, a)
    }
    return(assays)
}

# Loading a NxtSE object

# Expand the file path of a single assay seed (re-anchor it to the given path)
.collateData_expand_assay_path <- function(assay, path) {
    DelayedArray::modify_seeds(assay,
        function(x) {
            x@filepath <- file.path(path, x@filepath)
            x
        }
    )
}

# Expand the file paths of a list of assays
.collateData_expand_assay_paths <- function(assays, path) {

    nassay <- length(assays)
    for (i in seq_len(nassay)) {
        a <- .collateData_expand_assay_path(getListElement(assays, i), path)
        assays <- setListElement(assays, i, a)
    }
    return(assays)
}
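# Illustrative sketch (commented out, hypothetical paths): simplify / expand
# are inverses, so a seed pointing at "/old/location/data.h5" is saved with
# just "data.h5" and later re-anchored to wherever the collated folder lives:
#   basename("/old/location/data.h5")        # "data.h5"
#   file.path("/new/location", "data.h5")    # "/new/location/data.h5"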

# Internals - compile reference data from genome, for quick access

.prepare_covplot_data <- function(reference_path) {
    .validate_reference(reference_path)
    genome <- Get_Genome(reference_path)
    data <- list(
        seqInfo = seqinfo(genome),
        gene_list = .getGeneList(reference_path),
        elem.DT = .loadViewRef(reference_path),
        transcripts.DT = .loadTranscripts(reference_path)
    )
    return(data)
}

.loadViewRef <- function(reference_path) {
    .validate_reference(reference_path)

    dir_path <- file.path(reference_path, "fst")

    exons.DT <- as.data.table(read.fst(file.path(dir_path, "Exons.fst"),
        c("seqnames", "start", "end", "strand", "type", "transcript_id")))
    exons.DT <- exons.DT[get("transcript_id") != "protein_coding"]

    protein.DT <- as.data.table(read.fst(file.path(dir_path, "Proteins.fst"),
        c("seqnames", "start", "end", "strand", "type", "transcript_id")))
    misc.DT <- as.data.table(read.fst(file.path(dir_path, "Misc.fst"),
        c("seqnames", "start", "end", "strand", "type", "transcript_id")))

    total.DT <- rbindlist(list(
        exons.DT[, c("seqnames", "start", "end", "strand", "type",
            "transcript_id")],
        protein.DT[, c("seqnames", "start", "end", "strand", "type",
            "transcript_id")],
        misc.DT[, c("seqnames", "start", "end", "strand", "type",
            "transcript_id")]
    ))
    return(total.DT)
}

.getGeneList <- function(reference_path) {
    .validate_reference(reference_path)

    file_path <- file.path(reference_path, "fst", "Genes.fst")
    if (!file.exists(file_path)) {
        return(NULL)
    }

    df <- as.data.table(read.fst(file_path))
    return(df)
}

.loadTranscripts <- function(reference_path) {
    .validate_reference(reference_path)

    file_path <- file.path(reference_path, "fst", "Transcripts.fst")

    Transcripts.DT <- as.data.table(read.fst(file_path))

    if ("transcript_support_level" %in% colnames(Transcripts.DT)) {
        Transcripts.DT$transcript_support_level <-
            tstrsplit(Transcripts.DT$transcript_support_level, split = " ")[[1]]
        Transcripts.DT$transcript_support_level[
            is.na(Transcripts.DT$transcript_support_level)] <- "NA"
    } else {
        Transcripts.DT$transcript_support_level <- 1
    }

    return(Transcripts.DT)
}
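# Illustrative sketch (commented out): tstrsplit() keeps only the token before
# the first space, so Ensembl-style values such as
# "1 (assigned to previous version 5)" reduce to "1"; genuine NA values are
# then converted to the string "NA" by the assignment above:
#   tsl <- c("1 (assigned to previous version 5)", "2", NA)
#   data.table::tstrsplit(tsl, split = " ")[[1]]    # "1" "2" NA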