#' Collates a dataset from (processBAM) output files of individual samples
#'
#' `collateData()` creates a dataset from a collection of [processBAM]
#' output files belonging to an experiment.
#'
#' @details
#' In Windows, collateData runs using only 1 thread, as
#' BiocParallel's MulticoreParam is not supported.
#'
#' It is assumed that all sample [processBAM] outputs were generated using the
#' same reference.
#'
#' The combination of junction counts and IR quantification from
#' [processBAM] is used to calculate percentage spliced in (PSI) of alternative
#' splice events, and intron retention ratios (IR-ratio) of retained introns.
#' Also, QC information is collated. Data is organised in a H5file and FST files
#' for memory and processor efficient downstream access using [makeSE].
#'
#' The original IRFinder algorithm, see the following
#' [wiki](https://github.com/williamritchie/IRFinder/wiki/IRFinder-Output),
#' uses `SpliceMax` to estimate abundance of spliced transcripts.
#' This calculates the number of mapped splice events
#' that share the boundary coordinate of either the left or right flanking
#' exon `SpliceLeft,SpliceRight`, estimating splice abundance as the larger
#' of the two values.
#'
#' SpliceWiz proposes a new algorithm, `SpliceOver`,
#' to account for the possibility that the major isoform shares neither
#' boundary, but arises from either of the flanking exon clusters. Exon
#' clusters are contiguous regions covered by exons from any transcript
#' (except those designated as `retained_intron` or
#' `sense_intronic`), and are separated by
#' obligate intronic regions (genomic regions that are introns for all
#' transcripts). For introns that are internal to a single exon cluster
#' (i.e. akin to "known-exon" introns from IRFinder), `SpliceOver`
#' uses [GenomicRanges::findOverlaps] to sum all splice reads that overlap
#' the same genomic region as the intron of interest.
#'
#' Detection of novel ASEs: When `novelSplicing` is set to `TRUE`,
#' novel junctions (split reads across unannotated junctions from samples
#' of the dataset being collated) are used in conjunction with the reference
#' to compile a list of novel ASEs. To avoid being overwhelmed by a large
#' number of false positive novel junctions (often due to mis-alignments),
#' a simple filtering strategy is used. This involves including novel
#' junctions only if it occurs in a minimum number of samples (default 3),
#' or if the number of split reads of a novel junction is above a pre-defined
#' threshold (default 10) in a certain number of samples (default 1). These
#' parameters can be set using `novelSplicing_minSamples`,
#' `novelSplicing_countThreshold` and `novelSplicing_minSamplesAboveThreshold`
#' respectively.
#'
#' @param Experiment (Required) A 2 or 3 column data frame, ideally generated by
#' [findSpliceWizOutput] or [findSamples].
#' The first column designate the sample names, and the 2nd column
#' contains the path to the [processBAM] output file (of type
#' `sample.txt.gz`).
#' (Optionally) a 3rd column contains the coverage files (of type
#' `sample.cov`) of the corresponding samples.
#' NB: all other columns are ignored.
#' @param reference_path (Required) The path to the reference generated by
#' [Build-Reference-methods]
#' @param output_path (Required) The path to contain the output files for the
#' collated dataset
#' @param IRMode (default `SpliceOver`) The algorithm to calculate
#' 'splice abundance' in IR quantification.
#' Valid options are `SpliceOver` and `SpliceMax`.
#' See details
#' @param lowMemoryMode (default `TRUE`) `collateData()` will perform
#' optimizations to conserve memory if this is set to `TRUE`. Otherwise,
#' will prioritise performance.
#' @param n_threads (default `1`) The number of threads to use. If you run out
#' of memory, try lowering the number of threads
#' @param overwrite (default `FALSE`) If `collateData()` has previously been run
#' using the same set of samples, it will not be overwritten unless this is
#' set to `TRUE`.
#' @param novelSplicing (default FALSE) Whether collateData will use
#' novel junction reads detected in samples to infer novel splice variants.
#' All tandem split reads (those bridging two consecutive splice junctions)
#' are used, as well as novel split reads that satisfy abundance criteria
#' (see `novelSplicing_minSamples`, `novelSplicing_minSamplesAboveThreshold`, and
#' `novelSplicing_countThreshold`) are used to synthesise a dataset-specific
#' SpliceWiz reference. See details.
#' @param novelSplicing_minSamples (default 3) Novel junctions are included in
#' building of novel reference if number samples with non-zero counts
#' exceeds this number.
#' @param novelSplicing_countThreshold (default 10) Threshold of split-reads across
#' novel junctions; used in conjunction with
#' `novelSplicing_minSamplesAboveThreshold`
#' @param novelSplicing_minSamplesAboveThreshold (default 1) Novel junctions are
#' included in building of novel reference if novel junction reads are
#' above a pre-defined threshold exceeds this number
#' @param novelSplicing_requireOneAnnotatedSJ (default `TRUE`) The default requires
#' novel junctions to have one annotated splice site. If this is disabled,
#' collateData will include novel junctions where neither splice site is
#' annotated.
#' @param novelSplicing_useTJ (default `TRUE`) For novel splicing, should
#' SpliceWiz use reads with 2 or more junctions to find novel exons? Ignored
#' if novelSplicing is set to `FALSE`.
#' @param forceStrandAgnostic (default `FALSE`) In poorly-prepared stranded
#' libraries, it may be better to quantify in unstranded mode. Set this to
#' `TRUE` if your stranded libraries may be contaminated with unstranded reads
#' @param packageCOVfiles (default `FALSE`) Whether COV files should be copied
#' over to the NxtSE object. This is useful if one wishes to transfer the
#' NxtSE folder to a collaborator, who can then open the NxtSE object with
#' valid COV file paths.
#' @return `collateData()` writes to the directory given by `output_path`.
#' This output directory is portable (i.e. it can be moved to a different
#' location after running `collateData()` before running [makeSE]), but
#' individual files within the output folder should not be moved.\cr\cr
#' Also, the [processBAM] and [collateData] output folders should be copied to
#' the same destination and their relative paths preserved. Otherwise, the
#' locations of the "COV" files will not be recorded in the collated data and
#' will have to be re-assigned using `covfile(se)<-`. See [makeSE]
#' @examples
#' buildRef(
#' reference_path = file.path(tempdir(), "Reference"),
#' fasta = chrZ_genome(),
#' gtf = chrZ_gtf()
#' )
#'
#' bams <- SpliceWiz_example_bams()
#' processBAM(bams$path, bams$sample,
#' reference_path = file.path(tempdir(), "Reference"),
#' output_path = file.path(tempdir(), "SpliceWiz_Output")
#' )
#'
#' expr <- findSpliceWizOutput(file.path(tempdir(), "SpliceWiz_Output"))
#' collateData(expr,
#' reference_path = file.path(tempdir(), "Reference"),
#' output_path = file.path(tempdir(), "Collated_output")
#' )
#'
#' # Enable novel splicing:
#'
#' collateData(expr,
#' reference_path = file.path(tempdir(), "Reference"),
#' output_path = file.path(tempdir(), "Collated_output"),
#' novelSplicing = TRUE
#' )
#'
#' @seealso [processBAM], [makeSE]
#' @md
#' @export
collateData <- function(Experiment, reference_path, output_path,
IRMode = c("SpliceOver", "SpliceMax"),
packageCOVfiles = FALSE,
novelSplicing = FALSE, forceStrandAgnostic = FALSE,
novelSplicing_minSamples = 3, novelSplicing_countThreshold = 10,
novelSplicing_minSamplesAboveThreshold = 1,
novelSplicing_requireOneAnnotatedSJ = TRUE,
novelSplicing_useTJ = TRUE,
# novelSplicing_extrapolateTJ = FALSE,
overwrite = FALSE, n_threads = 1,
lowMemoryMode = TRUE
) {
# TODO - dynamic memory management
# - see https://stackoverflow.com/questions/6457290/how-to-check-the-amount-of-ram
# TODO - benchmark putative TJ before releasing it
# @param novelSplicing_extrapolateTJ (default `FALSE`) Whether to allow any
# pair of observed (and known) junctions and observed (and known) exons
# to construct "putative" tandem junctions. Assists in identifying novel
# exon skipping and MXE events at a cost of increased false positives.
novelSplicing_extrapolateTJ <- FALSE
if(lowMemoryMode) n_threads <- min(n_threads, 4)
IRMode <- match.arg(IRMode)
if (IRMode == "")
.log(paste("In collateData(),",
"IRMode must be either 'SpliceOver' (default) or 'SpliceMax'"))
BPPARAM_mod <- .validate_threads(n_threads)
if(!is.numeric(novelSplicing_minSamples))
novelSplicing_minSamples <- 0
if(!is.numeric(novelSplicing_countThreshold))
novelSplicing_countThreshold <- 0
if(!is.numeric(novelSplicing_minSamplesAboveThreshold))
novelSplicing_minSamplesAboveThreshold <- 0
if(!is.logical(novelSplicing_requireOneAnnotatedSJ))
novelSplicing_requireOneAnnotatedSJ <- TRUE
if(!is.logical(novelSplicing)) novelSplicing <- FALSE
N_steps <- 8
dash_progress("Validating Experiment; checking COV files...", N_steps)
.log("Validating Experiment; checking COV files...", "message")
norm_output_path <- .collateData_validate(Experiment,
reference_path, output_path)
coverage_files <- .make_path_relative(
.collateData_COV(Experiment), output_path)
df.internal <- .collateData_expr(Experiment)
if (!overwrite && file.exists(file.path(output_path, "seed.Rds"))) {
se <- .makeSE_load_NxtSE(output_path, N = 7)
if (all(colnames(se) %in% df.internal$sample) &
all(df.internal$sample %in% colnames(se))
) {
.log(paste("SpliceWiz already collated this experiment",
"in given directory"), "message")
return()
}
}
originalSWthreads <- .getSWthreads()
# tryCatch({
setSWthreads(1) # try this to prevent memory leak
jobs <- .split_vector(seq_len(nrow(df.internal)), BPPARAM_mod$workers)
dash_progress("Compiling Sample Stats", N_steps)
.log("Compiling Sample Stats", "message")
df.internal <- .collateData_stats(df.internal, jobs, BPPARAM_mod)
stranded <- !any(df.internal$strand == 0) & !forceStrandAgnostic
dash_progress("Compiling Junction List", N_steps)
.collateData_junc_merge(df.internal,
jobs, BPPARAM_mod, norm_output_path)
.collateData_junc_stats(df.internal, jobs,
BPPARAM_mod, norm_output_path,
juncThreshold = novelSplicing_countThreshold)
dash_progress("Compiling Intron Retention List", N_steps)
oldScipen <- options(scipen=999)
.collateData_sw_merge(df.internal, jobs, BPPARAM_mod,
norm_output_path, stranded)
options(oldScipen)
# Tandem junction compilation
if(novelSplicing & novelSplicing_useTJ) {
.collateData_tj_merge(df.internal, jobs,
BPPARAM_mod, norm_output_path)
}
# Reassign +/- based on junctions.fst annotation
# Annotate junctions
dash_progress("Tidying up splice junctions and intron retentions",
N_steps)
.log("Tidying up splice junctions and intron retentions...", "message")
if(BPPARAM_mod$workers == 1) {
.collateData_annotate(reference_path, norm_output_path,
stranded, novelSplicing, lowMemoryMode,
minSamplesWithJunc = novelSplicing_minSamples,
minSamplesAboveJuncThreshold =
novelSplicing_minSamplesAboveThreshold,
novelSplicing_requireOneAnnotatedSJ =
novelSplicing_requireOneAnnotatedSJ,
novelSplicing_useTJ =
novelSplicing_useTJ,
novelSplicing_extrapolateTJ =
novelSplicing_extrapolateTJ
)
} else {
# perform task inside child thread, so we can dump the memory later
.collateData_annotate_BPPARAM(reference_path, norm_output_path,
stranded, novelSplicing, lowMemoryMode,
minSamplesWithJunc = novelSplicing_minSamples,
minSamplesAboveJuncThreshold =
novelSplicing_minSamplesAboveThreshold,
novelSplicing_requireOneAnnotatedSJ =
novelSplicing_requireOneAnnotatedSJ,
novelSplicing_useTJ =
novelSplicing_useTJ,
novelSplicing_extrapolateTJ =
novelSplicing_extrapolateTJ
)
}
message("done\n")
dash_progress("Generating NxtSE assays", N_steps)
.log("Generating NxtSE assays", "message")
# Re-define threads
n_threads_collate_assays <- n_threads
# One job per chunk - more memory efficient
jobs_2 <- .split_vector(
seq_len(nrow(df.internal)),
nrow(df.internal)
)
BPPARAM_mod_progress <- .validate_threads(
n_threads_collate_assays,
progressbar = TRUE,
tasks = nrow(df.internal)
)
agg.list <- BiocParallel::bplapply(
seq_len(nrow(df.internal)),
.collateData_compile_agglist,
jobs = jobs_2, df.internal = df.internal,
norm_output_path = norm_output_path, IRMode = IRMode,
useProgressBar = FALSE,
BPPARAM = BPPARAM_mod_progress
)
gc()
dash_progress("Building Final NxtSE Object", N_steps)
.log("Building Final NxtSE Object", "message")
.log("...consolidating assays to H5 file", "message")
samples_per_block <- 16
# if(lowMemoryMode) samples_per_block <- 4
assays <- .collateData_compile_assays_from_fst(df.internal,
norm_output_path, samples_per_block)
.collateData_write_stats(df.internal, norm_output_path)
.collateData_write_colData(df.internal,
coverage_files, norm_output_path, packageCOVfiles)
.log("...packaging reference", "message")
if(novelSplicing) {
cov_data <- .prepare_covplot_data(reference_path,
file.path(norm_output_path, "Reference"))
} else {
cov_data <- .prepare_covplot_data(reference_path)
}
saveRDS(cov_data, file.path(norm_output_path, "cov_data.Rds"))
.log("...synthesising NxtSE", "message")
# NEW compile NxtSE:
colData.Rds <- readRDS(file.path(norm_output_path, "colData.Rds"))
colData <- .makeSE_colData_clean(colData.Rds$df.anno)
se <- .collateData_initialise_HDF5(norm_output_path, colData, assays)
.collateData_save_NxtSE(se, file.path(norm_output_path, "seed.Rds"))
.collateData_cleanup(norm_output_path)
# Test run of loading NxtSE
.log("...determining how overlapping introns should be removed",
"message")
tryCatch({
tmpse <- makeSE(norm_output_path, verbose = FALSE,
RemoveOverlapping = FALSE)
tmpse2 <- .makeSE_iterate_IR_new(tmpse, verbose = FALSE)
tmpFiltered <- (
rowData(tmpse)$EventName %in%
rowData(tmpse2)$EventName
)
saveRDS(tmpFiltered,
file.path(norm_output_path, "filteredIntrons.Rds"))
rm(tmpse, tmpse2)
gc()
}, error = function(err) {
.log("Test loading of NxtSE object failed.")
})
dash_progress("SpliceWiz (NxtSE) Collation Finished", N_steps)
.log("SpliceWiz (NxtSE) Collation Finished", "message")
# }, error = function(e) {
# stop("In collateData(...)", e, call. = FALSE)
# }, finally = {
.restore_threads(originalSWthreads)
# })
# Return invisible
invisible(NULL)
}
################################################################################
# collateData helper functions:
# Checks Experiment data.frame. Checks paths valid and have existing parent
# folders. Creates required folders.
.collateData_validate <- function(Experiment, reference_path, output_path) {
if (!is(Experiment, "data.frame")) {
.log(paste("In collateData(),",
"Experiment object needs to be a data frame"))
}
if (ncol(Experiment) < 2) {
.log(paste("In collateData(),",
"Experiment needs to contain at least two columns,",
"with the first 2 columns containing",
"(1) sample name and (2) processBAM output"))
}
.validate_reference(reference_path)
if (!dir.exists(dirname(output_path))) {
.log(paste("In collateData(),",
"Parent directory of output path:",
dirname(output_path), "needs to exist"))
}
base_output_path <- normalizePath(dirname(output_path))
norm_output_path <- file.path(base_output_path, basename(output_path))
if (!dir.exists(norm_output_path)) {
dir.create(norm_output_path)
}
temp_output_path <- file.path(norm_output_path, "temp")
if (!dir.exists(temp_output_path)) {
dir.create(temp_output_path)
}
if (!dir.exists(file.path(norm_output_path, "annotation"))) {
dir.create(file.path(norm_output_path, "annotation"))
}
return(norm_output_path)
}
# Check all given COV files are valid COV format
.collateData_COV <- function(Experiment) {
coverage_files <- ""
Experiment <- as.data.frame(Experiment)
if (ncol(Experiment) == 2) return(NULL)
if (ncol(Experiment) > 2 && all(file.exists(Experiment[, 3]))) {
coverage_files <- Experiment[, 3]
}
if (!isCOV(coverage_files)) {
message("Some coverage files do not exist or are corrupted")
coverage_files <- ""
}
if (length(coverage_files) != nrow(Experiment)) {
message("The number of coverage files must equal the number of samples")
coverage_files <- ""
}
return(coverage_files)
}
# Checks all processBAM output files exist. Checks repeat entries.
.collateData_expr <- function(Experiment) {
Experiment <- as.data.frame(Experiment)
colnames(Experiment)[c(1, 2)] <- c("sample", "path")
if (!all(vapply(Experiment$path, file.exists, logical(1)))) {
.log(paste("In collateData(),",
"Some files in Experiment do not exist"))
}
if (length(Experiment$sample) != length(unique(Experiment$sample))) {
.log(paste("In collateData(),",
"Repeated sample names are not allowed"))
}
df.internal <- as.data.table(Experiment[, c(1, 2)])
df.internal$paired <- FALSE
df.internal$strand <- 0
df.internal$depth <- 0
df.internal$mean_frag_size <- 0
df.internal$directionality_strength <- 0
df.internal$Intergenic_Fraction <- 0
df.internal$rRNA_Fraction <- 0
df.internal$NonPolyA_Fraction <- 0
df.internal$Mitochondrial_Fraction <- 0
df.internal$Unanno_Jn_Fraction <- 0
df.internal$NMD_Jn_Fraction <- 0
df.internal$Fraction_Splice_Reads <- 0
df.internal$Fraction_Span_Reads <- 0
df.internal$IRBurden_clean <- 0
df.internal$IRBurden_exitrons <- 0
df.internal$IRBurden_clean_unstranded <- 0
df.internal$IRBurden_exitrons_unstranded <- 0
df.internal$IRBurden_antisense <- 0
return(df.internal)
}
# Designates equal jobs per thread
.collateData_jobs <- function(n_expr, BPPARAM_mod, samples_per_block) {
n_jobs <- min(ceiling(n_expr / samples_per_block),
BPPARAM_mod$workers)
jobs <- .split_vector(seq_len(n_expr), n_jobs)
return(jobs)
}
################################################################################
# Sub
# Collates statistics of processBAM QC, returns a data frame of these QC stats
.collateData_stats <- function(df.internal, jobs, BPPARAM_mod) {
n_jobs <- length(jobs)
df.internal <- rbindlist(
BiocParallel::bplapply(seq_len(n_jobs),
function(x, jobs, df.internal) {
suppressPackageStartupMessages({
requireNamespace("data.table")
requireNamespace("stats")
})
work <- jobs[[x]]
block <- df.internal[work]
for (i in seq_len(length(work))) {
data.list <- get_multi_DT_from_gz(
normalizePath(block$path[i]),
c("BAM", "Directionality", "QC"))
stats <- data.list[["BAM"]]
direct <- data.list[["Directionality"]]
QC <- data.list[["QC"]]
block <- .collateData_stats_reads(block, i, stats, direct)
block <- .collateData_stats_QC(block, i, QC, direct)
}
return(block)
}, jobs = jobs, df.internal = df.internal, BPPARAM = BPPARAM_mod
)
)
return(df.internal)
}
################################################################################
.collateData_stats_reads <- function(block, i, stats, direct) {
statsVec <- unname(unlist(stats[,2]))
names(statsVec) <- unname(unlist(stats[,1]))
numNuc <- as.numeric(statsVec["Total nucleotides"])
numSingle <- as.numeric(statsVec["Total singles processed"])
numPairs <- as.numeric(statsVec["Total pairs processed"])
if (numSingle == 0 & numPairs > 0) {
block$paired[i] <- TRUE
block$depth[i] <- numPairs
block$mean_frag_size[i] <- numNuc / numPairs
} else if (numSingle > 0 && numPairs / numSingle > 10) {
block$paired[i] <- TRUE
block$depth[i] <- numPairs
block$mean_frag_size[i] <- numNuc / numPairs
} else {
block$paired[i] <- FALSE
block$depth[i] <- numSingle
block$mean_frag_size[i] <- numNuc / numSingle
}
block$strand[i] <- as.numeric(
direct$Value[direct[,1] == "Overall Directionality"])
return(block)
}
.collateData_stats_QC <- function(block, i, QC, direct) {
block$directionality_strength[i] <- as.numeric(
direct$Value[grepl("Dir score known junctions", unlist(direct[,1]))])
block$Intergenic_Fraction[i] <-
QC$Value[QC$QC == "Intergenic Reads"] / block$depth[i]
block$rRNA_Fraction[i] <-
QC$Value[QC$QC == "rRNA Reads"] / block$depth[i]
block$NonPolyA_Fraction[i] <-
QC$Value[QC$QC == "NonPolyA Reads"] / block$depth[i]
block$Mitochondrial_Fraction[i] <-
QC$Value[QC$QC == "Mitochondrial Reads"] / block$depth[i]
block$Unanno_Jn_Fraction[i] <-
QC$Value[QC$QC == "Unannotated Junctions"] /
(
QC$Value[QC$QC == "Unannotated Junctions"] +
QC$Value[QC$QC == "Annotated Junctions"]
)
block$NMD_Jn_Fraction[i] <-
QC$Value[QC$QC == "NMD Junctions"] /
QC$Value[QC$QC == "Annotated Junctions"]
block$Fraction_Splice_Reads[i] <-
QC$Value[QC$QC == "Annotated Junctions"] / block$depth[i]
block$Fraction_Span_Reads[i] <-
QC$Value[QC$QC == "Spans Reads"] / block$depth[i]
# IRBurden calculations
block$IRBurden_clean_unstranded[i] <-
QC$Value[QC$QC == "Non-Directional Clean IntronDepth Sum"] /
(
QC$Value[QC$QC == "Non-Directional Clean IntronDepth Sum"] +
QC$Value[QC$QC == "Annotated Junctions"]
)
block$IRBurden_exitrons_unstranded[i] <-
QC$Value[QC$QC == "Non-Directional Known-Exon IntronDepth Sum"] /
(
QC$Value[QC$QC == "Non-Directional Known-Exon IntronDepth Sum"] +
QC$Value[QC$QC == "Annotated Junctions"]
)
block$IRBurden_antisense[i] <-
QC$Value[QC$QC == "Non-Directional Anti-Sense IntronDepth Sum"] /
(
QC$Value[QC$QC == "Non-Directional Anti-Sense IntronDepth Sum"] +
QC$Value[QC$QC == "Annotated Junctions"]
)
if (block$strand[i] != 0) {
block$IRBurden_clean[i] <-
QC$Value[QC$QC == "Directional Clean IntronDepth Sum"] /
(
QC$Value[QC$QC == "Directional Clean IntronDepth Sum"] +
QC$Value[QC$QC == "Annotated Junctions"]
)
block$IRBurden_exitrons[i] <-
QC$Value[QC$QC == "Directional Known-Exon IntronDepth Sum"] /
(
QC$Value[QC$QC == "Directional Known-Exon IntronDepth Sum"] +
QC$Value[QC$QC == "Annotated Junctions"]
)
}
return(block)
}
################################################################################
# Sub
# Compiles a unified list of detected splice junctions
.collateData_junc_merge <- function(
df.internal, jobs, BPPARAM_mod,
output_path
) {
temp_output_path <- file.path(output_path, "temp")
n_jobs <- length(jobs)
.log("Compiling Junction List...", "message", appendLF = FALSE)
# Extract junctions from processBAM output, save temp FST file
junc.list <- BiocParallel::bplapply(
seq_len(n_jobs),
function(x, jobs, df.internal, temp_output_path) {
suppressPackageStartupMessages({
requireNamespace("data.table")
requireNamespace("stats")
})
work <- jobs[[x]]
block <- df.internal[work]
junc.segment <- NULL
for (i in seq_len(length(work))) {
data.list <- get_multi_DT_from_gz(
normalizePath(block$path[i]), c("JC_seqname"))
junc <- data.list[["JC_seqname"]]
setnames(junc, "JC_seqname", "seqnames")
junc[, c("seqnames") := as.character(get("seqnames"))]
if (is.null(junc.segment)) {
junc.segment <- junc[, seq_len(4), with = FALSE]
} else {
junc.segment <- merge(junc.segment,
junc[, seq_len(4), with = FALSE], all = TRUE)
}
# Write temp file
fst::write.fst(as.data.frame(junc),
file.path(temp_output_path, paste(block$sample[i],
"junc.fst.tmp", sep = ".")))
}
return(junc.segment)
}, jobs = jobs, df.internal = df.internal,
temp_output_path = temp_output_path, BPPARAM = BPPARAM_mod
)
# Combine list of individual junction dfs into unified list of junction
message("merging...", appendLF = FALSE)
junc.common <- NULL
for (i in seq_len(length(junc.list))) {
if (is.null(junc.common)) {
junc.common <- junc.list[[i]]
} else {
junc.common <- merge(junc.common, junc.list[[i]],
all = TRUE, by = colnames(junc.common))
}
}
junc.common[, c("start") := get("start") + 1]
message("done")
write.fst(as.data.frame(junc.common), file.path(output_path,
"annotation", "junc.common.fst"))
rm(junc.common)
gc()
invisible()
}
# Returns a junc_common with junction abundance statistics
.collateData_junc_stats <- function(
df.internal, jobs, BPPARAM_mod,
output_path,
juncThreshold = 10 # min junctions for counting samples
) {
temp_output_path <- file.path(output_path, "temp")
n_jobs <- length(jobs)
.log("Compiling Junction Stats...", "message", appendLF = FALSE)
# Extract junctions from processBAM output, save temp FST file
junc.list <- BiocParallel::bplapply(
seq_len(n_jobs),
function(x, jobs, df.internal, output_path, juncThreshold) {
suppressPackageStartupMessages({
requireNamespace("data.table")
requireNamespace("stats")
})
work <- jobs[[x]]
block <- df.internal[work]
junc.common <- as.data.table(read.fst(file.path(output_path,
"annotation", "junc.common.fst")))
junc.common[, c("seqnames") := as.character(get("seqnames"))]
final.df <- data.frame(
countJuncInSamples = rep(0, nrow(junc.common)),
countJuncThresholdSamples = rep(0, nrow(junc.common))
)
for (i in seq_len(length(work))) {
value <- .collateData_junc_count(block$sample[i],
junc.common, file.path(output_path, "temp"))
final.df$countJuncInSamples <- final.df$countJuncInSamples +
ifelse(value > 0, 1, 0)
final.df$countJuncThresholdSamples <-
final.df$countJuncThresholdSamples +
ifelse(value >= juncThreshold, 1, 0)
}
return(final.df)
}, jobs = jobs, df.internal = df.internal,
output_path = output_path,
juncThreshold = juncThreshold,
BPPARAM = BPPARAM_mod
)
# Combine list of individual junction dfs into unified list of junction
message("merging...", appendLF = FALSE)
final.df <- Reduce(.collateData_junc_stats_merge_fn, junc.list)
message("done")
write.fst(as.data.frame(final.df), file.path(output_path,
"annotation", "junc.common.counts.fst"))
rm(junc.list, final.df)
gc()
invisible()
}
.collateData_junc_stats_merge_fn <- function(x, y) {
as.data.frame(as.matrix(x) + as.matrix(y))
}
# Counts junctions for one sample
.collateData_junc_count <- function(
sample, # strand,
junc.common, temp_output_path
) {
junc <- as.data.table(
read.fst(file.path(temp_output_path,
paste(sample, "junc.fst.tmp", sep = ".")))
)
junc[, c("seqnames") := as.character(get("seqnames"))]
junc[, c("start") := get("start") + 1]
junc$strand <- NULL # Use strand from junc.common
junc <- junc[junc.common, on = colnames(junc.common)[c(1, 2, 3)]]
junc$count <- junc$total
junc[is.na(get("count")), c("count") := 0]
final <- junc$count
rm(junc)
gc()
return(final)
}
# Assume all processBAM outputs are created from same ref. Checks this
.collateData_sw_merge <- function(df.internal, jobs, BPPARAM_mod,
output_path, stranded) {
# Check names of all introns are the same in all samples
temp_output_path <- file.path(output_path, "temp")
n_jobs <- length(jobs)
.log("Compiling Intron Retention List...", "message", appendLF = FALSE)
sw.list <- BiocParallel::bplapply(seq_len(n_jobs),
function(x, jobs, df.internal, temp_output_path, stranded) {
suppressPackageStartupMessages({
requireNamespace("data.table")
requireNamespace("stats")
})
work <- jobs[[x]]
block <- df.internal[work]
sw.names <- c()
phrase <- ifelse(stranded, "Dir_Chr", "Nondir_Chr")
for (i in seq_len(length(work))) {
data.list <- get_multi_DT_from_gz(
normalizePath(block$path[i]), phrase)
sw <- data.list[[phrase]]
setnames(sw, c(phrase, "Start", "End", "Strand"),
c("seqnames", "start", "end", "strand"))
sw[, c("seqnames") := as.character(get("seqnames"))]
if (is.null(sw.names)) {
sw.names <- sw$Name
} else {
if (!identical(sw.names, sw$Name))
.collateData_stop_sw_mismatch()
}
fst::write.fst(as.data.frame(sw),
file.path(temp_output_path,
paste(block$sample[i], "sw.fst.tmp", sep = ".")))
}
return(sw.names)
}, jobs = jobs, df.internal = df.internal,
temp_output_path = temp_output_path,
stranded = stranded, BPPARAM = BPPARAM_mod
)
# Checks common columns are the same
if (length(sw.list) > 1) {
for (i in seq_len(length(sw.list))) {
if (!identical(sw.list[[i]], sw.list[[1]]))
.collateData_stop_sw_mismatch()
}
}
# Returns common columns
sw <- fst::read.fst(file.path(temp_output_path,
paste(df.internal$sample[1], "sw.fst.tmp", sep = ".")),
as.data.table = TRUE)
sw.common <- sw[, seq_len(6), with = FALSE]
sw.common[, c("start") := get("start") + 1]
message("done")
write.fst(as.data.frame(sw.common), file.path(output_path,
"annotation", "sw.common.fst"))
rm(sw.common)
gc()
invisible()
}
# Compiles a unified list of detected splice junctions
.collateData_tj_merge <- function(df.internal, jobs, BPPARAM_mod,
output_path) {
temp_output_path <- file.path(output_path, "temp")
n_jobs <- length(jobs)
.log("Compiling Tandem Junction List...", "message", appendLF = FALSE)
# Extract junctions from processBAM output, save temp FST file
tj.list <- BiocParallel::bplapply(
seq_len(n_jobs),
function(x, jobs, df.internal, temp_output_path) {
suppressPackageStartupMessages({
requireNamespace("data.table")
requireNamespace("stats")
})
work <- jobs[[x]]
block <- df.internal[work]
tj.segment <- NULL
for (i in seq_len(length(work))) {
data.list <- get_multi_DT_from_gz(
normalizePath(block$path[i]), c("TJ_seqname"))
tj <- data.list[["TJ_seqname"]]
setnames(tj, "TJ_seqname", "seqnames")
# Filter for novel tandem junctions only
tj <- tj[get("strand") == "."]
tj[, c("seqnames") := as.character(get("seqnames"))]
# 0-base to 1-base conversion
tj[, c("start1") := get("start1") + 1]
tj[, c("start2") := get("start2") + 1]
if (is.null(tj.segment)) {
tj.segment <- tj[, seq_len(6), with = FALSE]
} else {
tj.segment <- merge(tj.segment,
tj[, seq_len(6), with = FALSE], all = TRUE)
}
}
return(tj.segment)
}, jobs = jobs, df.internal = df.internal,
temp_output_path = temp_output_path, BPPARAM = BPPARAM_mod
)
# Combine list of individual junction dfs into unified list of junction
message("merging...", appendLF = FALSE)
tj.common <- NULL
for (i in seq_len(length(tj.list))) {
if (is.null(tj.common)) {
tj.common <- tj.list[[i]]
} else {
tj.common <- merge(tj.common, tj.list[[i]],
all = TRUE, by = colnames(tj.common))
}
}
message("done")
write.fst(as.data.frame(tj.common), file.path(output_path,
"annotation", "tj.common.fst"))
rm(tj.common)
gc()
invisible()
}
.collateData_stop_sw_mismatch <- function() {
stopmsg <- paste(
"Some processBAM outputs were generated",
"with a different SpliceWiz reference.",
"Suggest re-run processBAM on all sample files",
"using the same reference"
)
.log(stopmsg)
}
################################################################################
# Sub
# Annotate processBAM introns and junctions according to given reference
.collateData_annotate <- function(reference_path, norm_output_path,
stranded, novelSplicing, lowMemoryMode = TRUE,
minSamplesWithJunc = 3, minSamplesAboveJuncThreshold = 1,
novelSplicing_requireOneAnnotatedSJ = TRUE,
novelSplicing_useTJ = TRUE,
novelSplicing_extrapolateTJ = FALSE
) {
message("...annotating splice junctions")
.collateData_junc_annotate(2, reference_path, norm_output_path,
lowMemoryMode)
if(novelSplicing & novelSplicing_useTJ) {
message("...looking for novel exons")
.collateData_tj_annotate(2, reference_path, norm_output_path)
}
# Assemble intron_novel_transcript reference from novel junctions and
# novel tandem junctions
.collateData_assemble_reference(
2, reference_path,
norm_output_path, lowMemoryMode, novelSplicing,
minSamplesWithJunc, minSamplesAboveJuncThreshold,
novelSplicing_requireOneAnnotatedSJ,
novelSplicing_useTJ,
novelSplicing_extrapolateTJ,
verbose = TRUE
)
use_ref_path <- file.path(norm_output_path, "Reference")
if(novelSplicing) {
.log("Tidying up splice junctions and intron retentions (part 2)...",
"message")
}
message("...grouping splice junctions")
.collateData_junc_group(2, use_ref_path, norm_output_path)
message("...grouping introns")
.collateData_sw_group(2, use_ref_path, norm_output_path, stranded)
message("...loading splice events")
.collateData_splice_anno(2, use_ref_path, norm_output_path)
message("...compiling rowEvents")
.collateData_rowEvent(2, use_ref_path, norm_output_path)
}
.collateData_annotate_BPPARAM <- function(reference_path, norm_output_path,
stranded, novelSplicing, lowMemoryMode = TRUE,
minSamplesWithJunc = 3, minSamplesAboveJuncThreshold = 1,
novelSplicing_requireOneAnnotatedSJ = TRUE,
novelSplicing_useTJ = TRUE,
novelSplicing_extrapolateTJ = FALSE
) {
BPPARAM_annotate <- .validate_threads(2)
message("...annotating splice junctions")
tmp <- BiocParallel::bplapply(
seq_len(2),
.collateData_junc_annotate,
reference_path = reference_path,
norm_output_path = norm_output_path,
lowMemoryMode = lowMemoryMode,
BPPARAM = BPPARAM_annotate
)
if(novelSplicing) {
message("...looking for novel exons")
if(novelSplicing_useTJ) {
tmp <- BiocParallel::bplapply(
seq_len(2),
.collateData_tj_annotate,
reference_path = reference_path,
norm_output_path = norm_output_path,
BPPARAM = BPPARAM_annotate
)
}
.log(paste("Assembling novel splicing reference in dedicated thread,",
"this may take up to 10 minutes..."), "message", appendLF = FALSE)
tmp <- BiocParallel::bplapply(
seq_len(2),
.collateData_assemble_reference,
reference_path = reference_path,
norm_output_path = norm_output_path,
lowMemoryMode = lowMemoryMode,
novelSplicing = novelSplicing,
minSamplesWithJunc = minSamplesWithJunc,
minSamplesAboveJuncThreshold = minSamplesAboveJuncThreshold,
novelSplicing_requireOneAnnotatedSJ = novelSplicing_requireOneAnnotatedSJ,
novelSplicing_useTJ = novelSplicing_useTJ,
novelSplicing_extrapolateTJ = novelSplicing_extrapolateTJ,
BPPARAM = BPPARAM_annotate
)
message("done")
.log("Tidying up splice junctions and intron retentions (part 2)...",
"message")
} else {
# No need to thread a simply file copy function
.collateData_assemble_reference(
2, reference_path,
norm_output_path, lowMemoryMode, novelSplicing,
minSamplesWithJunc, minSamplesAboveJuncThreshold,
novelSplicing_requireOneAnnotatedSJ,
novelSplicing_useTJ,
novelSplicing_extrapolateTJ,
verbose = TRUE
)
}
use_ref_path <- file.path(norm_output_path, "Reference")
message("...grouping splice junctions")
tmp <- BiocParallel::bplapply(
seq_len(2),
.collateData_junc_group,
reference_path = use_ref_path,
norm_output_path = norm_output_path,
BPPARAM = BPPARAM_annotate
)
message("...grouping introns")
tmp <- BiocParallel::bplapply(
seq_len(2),
.collateData_sw_group,
reference_path = use_ref_path,
norm_output_path = norm_output_path,
stranded = stranded,
BPPARAM = BPPARAM_annotate
)
message("...loading splice events")
tmp <- BiocParallel::bplapply(
seq_len(2),
.collateData_splice_anno,
reference_path = use_ref_path,
norm_output_path = norm_output_path,
BPPARAM = BPPARAM_annotate
)
message("...compiling rowEvents")
tmp <- BiocParallel::bplapply(
seq_len(2),
.collateData_rowEvent,
reference_path = use_ref_path,
norm_output_path = norm_output_path,
BPPARAM = BPPARAM_annotate
)
}
################################################################################
# Annotate junction splice motifs
.collateData_junc_annotate <- function(
threadID, reference_path, norm_output_path,
lowMemoryMode = TRUE
) {
if(threadID != 2) return()
junc.strand <- read.fst(
path = file.path(reference_path, "fst", "junctions.fst"),
columns = c("seqnames", "start", "end", "strand", "splice_motif"),
as.data.table = TRUE
)
junc.strand <- unique(junc.strand)
junc.strand[, c("seqnames") := as.character(get("seqnames"))]
junc.common <- as.data.table(read.fst(file.path(norm_output_path,
"annotation", "junc.common.fst")))
junc.stats <- as.data.table(read.fst(file.path(norm_output_path,
"annotation", "junc.common.counts.fst")))
junc.common[, c("seqnames") := as.character(get("seqnames"))]
junc.common <- cbind(junc.common, junc.stats)
# Determine strand of junc.common junctions
junc.common[, c("strand") := NULL]
junc.common <- merge(junc.common, junc.strand,
all = TRUE, by = c("seqnames", "start", "end"))
# Novel junctions are assigned *
junc.common[is.na(get("strand")), c("strand") := "*"]
junc.common.unanno <- junc.common[get("strand") == "*"]
junc.common.anno <- junc.common[get("strand") != "*"]
# make sure valid seqnames
junc.common.unanno <- junc.common.unanno[!is.na(get("seqnames"))]
junc.common.unanno <- junc.common.unanno[get("seqnames") != ""]
# Determine strandedness based on splice junction motif
if (nrow(junc.common.unanno) != 0) {
genome <- Get_Genome(reference_path, as_DNAStringSet = !lowMemoryMode)
seqinfo <- as.data.frame(seqinfo(genome))
# Test 2bit inefficiency
genome <- .check_2bit_performance(reference_path, genome)
# Filter unanno by available sequences
junc.common.unanno <- junc.common.unanno[
seqnames %in% rownames(seqinfo)]
# sanity check: remove unannotated junctions that lie outside genome
junc.common.unanno.gr <- .grDT(junc.common.unanno)
seqlevels(junc.common.unanno.gr, pruning.mode = "coarse") <-
seqlevels(genome)
seqinfo.gr <- as(seqinfo(genome), "GenomicRanges")
OL <- findOverlaps(junc.common.unanno.gr, seqinfo.gr, type = "within")
junc.common.unanno <- junc.common.unanno[
unique(from(OL)), which = FALSE]
if (nrow(junc.common.unanno) != 0) {
# Create left and right motif GRanges
left.gr <- with(junc.common.unanno,
GRanges(seqnames = as.character(seqnames),
ranges = IRanges(start = start, end = start + 1),
strand = "+"))
right.gr <- with(junc.common.unanno,
GRanges(seqnames = as.character(seqnames),
ranges = IRanges(start = end - 1, end = end),
strand = "+"))
junc.common.unanno[, c("splice_motif") := paste0(
as.character(getSeq(genome, left.gr)),
as.character(getSeq(genome, right.gr))
)]
splice_motifs <- data.frame(
seqs = c("GTAG", "GCAG", "ATAC", "ATAG"),
seqs_r = c("CTAC", "CTGC", "GTAT", "CTAT")
)
junc.common.unanno[get("splice_motif") %in% splice_motifs$seqs,
c("strand") := "+"]
junc.common.unanno[get("splice_motif") %in% splice_motifs$seqs_r,
c("strand") := "-"]
# Do not accept un-annotated GTACs - too confusing
# Exclude unannotated non-splice motifs
junc.common.unanno <- junc.common.unanno[
get("strand") != "*"]
junc.common.unanno[get("strand") == "-",
c("splice_motif") := splice_motifs$seqs[match(
get("splice_motif"), splice_motifs$seqs_r)]]
junc.final <- rbindlist(list(junc.common.anno, junc.common.unanno))
} else {
junc.final <- junc.common.anno
}
# Cleanup memory:
rm(genome)
gc()
} else {
junc.final <- junc.common.anno
}
# Assign region names to junctions:
junc.final[, c("Event") := paste0(get("seqnames"), ":",
get("start"), "-", get("end"), "/", get("strand"))]
junc.final <- junc.final[,
c("seqnames", "start", "end", "strand",
"splice_motif", "Event",
"countJuncInSamples", "countJuncThresholdSamples")
]
write.fst(as.data.frame(junc.final), file.path(norm_output_path,
"annotation", "junc.common.annotated.fst"))
if(exists("junc.common.unanno")) rm(junc.common.unanno)
rm(junc.final, junc.common.anno, junc.common, junc.strand)
gc()
}
# Annotate junction splice motifs
.collateData_tj_annotate <- function(
threadID, reference_path, norm_output_path,
lowMemoryMode = TRUE
) {
if(threadID != 2) return()
# candidate.introns <- as.data.table(
# read.fst(file.path(reference_path, "fst", "junctions.fst"))
# )
# candidate.introns[, c("seqnames") := as.character(get("seqnames"))]
junc.common <- as.data.table(read.fst(file.path(norm_output_path,
"annotation", "junc.common.annotated.fst")))
tj.common <- as.data.table(read.fst(file.path(norm_output_path,
"annotation", "tj.common.fst")))
junc.common[, c("seqnames") := as.character(get("seqnames"))]
tj.common[, c("seqnames") := as.character(get("seqnames"))]
### - Strand assignment based on annotated junctions - ###
# First remove any tandem junctions not in junc.common
jc_events_unstranded <- paste0(junc.common$seqnames, ":",
junc.common$start, "-", junc.common$end)
tj_event1 <- paste0(tj.common$seqnames, ":",
tj.common$start1, "-", tj.common$end1)
tj_event2 <- paste0(tj.common$seqnames, ":",
tj.common$start2, "-", tj.common$end2)
tj.common <- tj.common[tj_event1 %in% jc_events_unstranded &
tj_event2 %in% jc_events_unstranded]
# Then, assemble a strand table
jc.strand <- data.frame(
event = jc_events_unstranded,
strand = junc.common$strand
)
# Match strands based on event1 / 2
tj_event1 <- paste0(tj.common$seqnames, ":",
tj.common$start1, "-", tj.common$end1)
tj_event2 <- paste0(tj.common$seqnames, ":",
tj.common$start2, "-", tj.common$end2)
tj.strand <- as.data.table(data.frame(
strand1 = jc.strand$strand[match(tj_event1, jc.strand$event)],
strand2 = jc.strand$strand[match(tj_event2, jc.strand$event)],
stringsAsFactors = FALSE
))
tj.strand$status <- ""
tj.strand[get("strand1") == "+" & get("strand2") == "+",
c("status") := "+"]
tj.strand[get("strand1") == "-" & get("strand2") == "-",
c("status") := "-"]
tj.strand[get("strand1") == "*",
c("status") := get("strand2")]
tj.strand[get("strand2") == "*",
c("status") := get("strand1")]
# Filter TJ based on matching strand
tj.common$strand <- tj.strand$status
tj.common <- tj.common[strand != ""]
### - Not sure we need to filter out non-novel exons - ###
# - tandem junctions can also help find novel A5/3SS and skipped exons
# exons_brief <- read.fst(
# path = file.path(reference_path, "fst", "Exons.fst"),
# columns = c("seqnames", "start", "end", "strand"),
# as.data.table = TRUE
# )
# exons_brief <- unique(exons_brief)
# exons_brief[, c("seqnames") := as.character(get("seqnames"))]
# exons_left <- paste0(exons_brief$seqnames, ":", exons_brief$start)
# tj_exon_left <- paste0(tj.common$seqnames, ":", tj.common$end1 + 1)
# exons_right <- paste0(exons_brief$seqnames, ":", exons_brief$end)
# tj_exon_right <- paste0(tj.common$seqnames, ":", tj.common$start2 - 1)
# tj.common.new <- tj.common[!(tj_exon_left %in% exons_left) &
# !(tj_exon_right %in% exons_right)]
write.fst(as.data.frame(tj.common), file.path(norm_output_path,
"annotation", "tj.common.annotated.fst"))
# Cleanup
rm(junc.common, tj.common, jc.strand, tj.strand)
gc()
}
################################################################################
# Assemble novel transcripts and generate new reference
.collateData_assemble_reference <- function(
threadID, reference_path, norm_output_path, lowMemoryMode, novelSplicing,
minSamplesWithJunc, minSamplesAboveJuncThreshold,
novelSplicing_requireOneAnnotatedSJ,
novelSplicing_useTJ = TRUE,
novelSplicing_extrapolateTJ = FALSE,
verbose = FALSE
) {
if(threadID != 2) return()
if(novelSplicing) {
.log(paste("Assembling novel splicing reference:"), "message")
fst2Copy <- c(
"Ontology.fst",
"Introns.Dir.fst", "Introns.ND.fst", "IR.NMD.fst"
)
} else {
message("...copying splicing reference")
fst2Copy <- c(
"Exons.fst", "Exons.Group.fst", "Genes.fst", "gtf_fixed.fst",
"junctions.fst", "Transcripts.fst",
"Misc.fst", "Ontology.fst", "Proteins.fst",
"Splice.Extended.fst", "Splice.fst", "Splice.options.fst",
"Introns.Dir.fst", "Introns.ND.fst", "IR.NMD.fst"
)
}
novel_ref_path <- file.path(norm_output_path, "Reference")
.validate_path(novel_ref_path, subdirs = "resource")
.validate_path(novel_ref_path, subdirs = "fst")
# Copy over files that don't need to be generated
file.copy(file.path(reference_path, "SpliceWiz.ref.gz"),
file.path(novel_ref_path, "SpliceWiz.ref.gz"), overwrite = TRUE)
for(fstFile in fst2Copy) {
if(file.exists(file.path(reference_path, "fst", fstFile))) {
file.copy(file.path(reference_path, "fst", fstFile),
file.path(novel_ref_path, "fst", fstFile), overwrite = TRUE)
}
}
if(novelSplicing) {
if(verbose) message("...loading reference FASTA/GTF")
if(file.exists(file.path(reference_path, "fst/gtf_fixed.fst"))) {
reference_data <- list(
genome = Get_Genome(reference_path, validate = FALSE,
as_DNAStringSet = !lowMemoryMode),
gtf_gr = .grDT(
read.fst(file.path(reference_path, "fst/gtf_fixed.fst")),
keep.extra.columns = TRUE
)
)
} else {
reference_data <- .get_reference_data(
reference_path = reference_path,
fasta = "", gtf = "", verbose = FALSE,
overwrite = FALSE, force_download = FALSE,
pseudo_fetch_fasta = lowMemoryMode, pseudo_fetch_gtf = FALSE)
reference_data <- .validate_gtf_chromosomes(
reference_data, reference_path, pseudo_fetch = lowMemoryMode)
reference_data$gtf_gr <- .fix_gtf(reference_data$gtf_gr)
}
if(verbose) message("...injecting novel transcripts to GTF")
# Insert novel gtf here
# reference_data$gtf_gr is a GRanges object
novel_gtf <- .collateData_novel_assemble_transcripts(
reference_path, norm_output_path, reference_data$gtf_gr,
minSamplesWithJunc, minSamplesAboveJuncThreshold,
novelSplicing_requireOneAnnotatedSJ,
novelSplicing_useTJ
)
# Finish inserting novel gtf
unique_seqlevels <- unique(c(
seqlevels(reference_data$gtf_gr), seqlevels(novel_gtf)
))
seqlevels(reference_data$gtf_gr) <- unique_seqlevels
seqlevels(novel_gtf) <- unique_seqlevels
# Putative TJ injection
if(novelSplicing_extrapolateTJ) {
putTJ_gtf <- .collateData_novel_injectPutativeTJ(
reference_path, norm_output_path,
reference_data$gtf_gr, novel_gtf
)
unique_seqlevels <- unique(c(
seqlevels(reference_data$gtf_gr), seqlevels(novel_gtf),
seqlevels(putTJ_gtf)
))
seqlevels(reference_data$gtf_gr) <- unique_seqlevels
seqlevels(novel_gtf) <- unique_seqlevels
seqlevels(putTJ_gtf) <- unique_seqlevels
novel_gtf <- c(novel_gtf, putTJ_gtf)
}
if(verbose) message("...processing GTF")
.process_gtf(c(reference_data$gtf_gr, novel_gtf),
novel_ref_path, verbose = FALSE)
# extra_files$genome_style <- .gtf_get_genome_style(reference_data$gtf_gr)
reference_data$gtf_gr <- NULL # To save memory, remove original gtf
novelGTFfile <- file.path(norm_output_path, "novelTranscripts.gtf")
rtracklayer::export(novel_gtf, novelGTFfile, "gtf")
gzip(filename = novelGTFfile, destname = paste0(novelGTFfile, ".gz"),
overwrite = TRUE)
if (file.exists(novelGTFfile) & file.exists(paste0(novelGTFfile, ".gz"))
) {
file.remove(novelGTFfile)
}
rm(novel_gtf)
gc()
if(verbose) message("...processing introns from GTF")
reference_data$genome <- .check_2bit_performance(reference_path,
reference_data$genome, verbose = FALSE)
.process_introns(novel_ref_path, reference_data$genome,
useExtendedTranscripts = TRUE, verbose = FALSE)
rm(reference_data)
gc()
if(verbose) message("...annotating alternative splicing events")
.gen_splice(novel_ref_path, verbose = FALSE)
message("done")
}
settings.list <- readRDS(file.path(reference_path, "settings.Rds"))
# (TODO) - modify settings.Rds
saveRDS(settings.list, file.path(novel_ref_path, "settings.Rds"))
}
.collateData_novel_assemble_transcripts <- function(
reference_path, norm_output_path, gtf,
minSamplesWithJunc, minSamplesAboveJuncThreshold,
novelSplicing_requireOneAnnotatedSJ = TRUE,
novelSplicing_useTJ = TRUE
) {
# Load junc.common
junc.common <- as.data.table(read.fst(file.path(norm_output_path,
"annotation", "junc.common.annotated.fst")))
# Filter out known junctions
known.junctions <- unique(as.data.table(read.fst(
file.path(reference_path, "fst", "junctions.fst"),
columns = c("seqnames", "start", "end", "strand", "Event"))))
junc.common[, c("seqnames") := as.character(get("seqnames"))]
known.junctions[, c("seqnames") := as.character(get("seqnames"))]
junc.novel <- junc.common[!(get("Event") %in% known.junctions$Event)]
if(nrow(junc.novel) == 0) {
rm(junc.common, known.junctions)
gc()
return(NULL)
}
# filter novel junctions
junc.novel <- junc.novel[
get("countJuncInSamples") >= minSamplesWithJunc |
get("countJuncThresholdSamples") >= minSamplesAboveJuncThreshold
]
if(nrow(junc.novel) == 0) {
rm(junc.common, known.junctions)
gc()
return(NULL)
}
# Filter for when there is at least 1 common splice site with knowns
if(novelSplicing_requireOneAnnotatedSJ) {
kj.left <- with(known.junctions, GRanges(seqnames = seqnames,
ranges = IRanges(start, start), strand = strand))
nj.left <- with(junc.novel, GRanges(seqnames = seqnames,
ranges = IRanges(start, start), strand = strand))
kj.right <- with(known.junctions, GRanges(seqnames = seqnames,
ranges = IRanges(end, end), strand = strand))
nj.right <- with(junc.novel, GRanges(seqnames = seqnames,
ranges = IRanges(end, end), strand = strand))
OL_left <- .findOverlaps_merge(nj.left, kj.left)
OL_right <- .findOverlaps_merge(nj.right, kj.right)
at_least_one_end <- unique(c(from(OL_left), from(OL_right)))
junc.novel <- junc.novel[at_least_one_end]
if(nrow(junc.novel) == 0) {
rm(junc.common, known.junctions)
gc()
return(NULL)
}
}
rm(junc.common, known.junctions)
gc()
if(novelSplicing_useTJ) {
# No need to filter tandem junctions as this was done in earlier step
tj.novel <- as.data.table(read.fst(file.path(norm_output_path,
"annotation", "tj.common.annotated.fst")))
if(nrow(tj.novel) > 0) {
tj.novel[, c("seqnames") := as.character(get("seqnames"))]
# Filter tj.novel by junc.novel
setnames(tj.novel, c("start1", "end1"), c("start", "end"))
tmp <- tj.novel[!junc.novel,
on = c("seqnames", "start", "end", "strand")]
tj.novel <- tj.novel[!tmp, on = colnames(tj.novel)]
setnames(tj.novel, c("start", "end"), c("start1", "end1"))
setnames(tj.novel, c("start2", "end2"), c("start", "end"))
tmp <- tj.novel[!junc.novel,
on = c("seqnames", "start", "end", "strand")]
tj.novel <- tj.novel[!tmp, on = colnames(tj.novel)]
setnames(tj.novel, c("start", "end"), c("start2", "end2"))
# Remove events in junc.novel that are also featured in tj.novel
# tmp_gr <- GRanges(tj.novel$seqnames,
# IRanges(tj.novel$start1, tj.novel$end1), tj.novel$strand)
# OL <- .findOverlaps_merge(.grDT(junc.novel), tmp_gr, type = "equal")
# if(length(OL@from) > 0) junc.novel <- junc.novel[-unique(OL@from)]
# tmp_gr <- GRanges(tj.novel$seqnames,
# IRanges(tj.novel$start2, tj.novel$end2), tj.novel$strand)
# OL <- .findOverlaps_merge(.grDT(junc.novel), tmp_gr, type = "equal")
# if(length(OL@from) > 0) junc.novel <- junc.novel[-unique(OL@from)]
}
} else {
tj.novel <- data.table()
}
n_trans <- nrow(junc.novel) + nrow(tj.novel)
if(n_trans < 1) return(NULL)
new_gtf <- .collateData_construct_novelGTF(
gtf, junc.novel, tj.novel,
novelTr_source = "novel",
novelGeneID_header = "novelGene",
novelGeneBT = "novel_gene",
novelTrID_header = "novelTrID",
novelTrName_suffix = "-novelTr",
novelTrBT = "novel_transcript"
)
return(new_gtf)
}
.collateData_novel_injectPutativeTJ <- function(
reference_path, norm_output_path, ref_gtf, novel_gtf
) {
# Generate a table of all junctions (known + novel)
exons <- c(
ref_gtf[ref_gtf$type == "exon"],
novel_gtf[novel_gtf$type == "exon"]
)
allJunctions <- .grlGaps(split(
exons, exons$transcript_id
))
allJunctions <- as.data.table(allJunctions)
allJunctions[, c("group", "group_name") := list(NULL,NULL)]
allJunctions <- unique(as.data.table(sort(.grDT(allJunctions))))
# Generate a table of all annotated + novel TJ's
# Known TJ's from SpliceWiz reference
suppressWarnings({
knownTJ <- fread(file.path(reference_path, "SpliceWiz.ref.gz"),
skip = "ref-tj")
})
colnames(knownTJ) <-
c("seqnames", "start1", "end1", "start2", "end2", "strand")
knownTJ[, c("start1", "start2") := list(
get("start1") + 1, get("start2") + 1)]
tandem_gtf <- novel_gtf[novel_gtf$type == "exon"]
tandem_gtf <- tandem_gtf[tandem_gtf$exon_number == 3]
tandemTrID <- unique(tandem_gtf$transcript_id)
tandemTr <- novel_gtf[novel_gtf$transcript_id %in% tandemTrID &
novel_gtf$type == "transcript"]
# Retrieve novel TJ's
novelTJ <- data.table(
seqnames = as.character(seqnames(tandemTr)),
start1 = 0,
end1 = 0,
start2 = 0,
end2 = 0,
strand = as.character(strand(tandemTr))
)
novelTandemExons <- novel_gtf[novel_gtf$type == "exon" &
novel_gtf$transcript_id %in% tandemTrID]
exon1 <- novelTandemExons[
(novelTandemExons$exon_number == 1 &
as.character(strand(novelTandemExons)) == "+") |
(novelTandemExons$exon_number == 3 &
as.character(strand(novelTandemExons)) == "-")
]
exon2 <- novelTandemExons[
(novelTandemExons$exon_number == 2)
]
exon3 <- novelTandemExons[
(novelTandemExons$exon_number == 1 &
as.character(strand(novelTandemExons)) == "-") |
(novelTandemExons$exon_number == 3 &
as.character(strand(novelTandemExons)) == "+")
]
names(exon1) <- exon1$transcript_id
names(exon2) <- exon2$transcript_id
names(exon3) <- exon3$transcript_id
exon1 <- exon1[tandemTr$transcript_id]
exon2 <- exon2[tandemTr$transcript_id]
exon3 <- exon3[tandemTr$transcript_id]
novelTJ$start1 <- BiocGenerics::end(exon1) + 1
novelTJ$end1 <- BiocGenerics::start(exon2) - 1
novelTJ$start2 <- BiocGenerics::end(exon2) + 1
novelTJ$end2 <- BiocGenerics::start(exon3) - 1
# Create putative novel TJ's
# junction1 - exon - junction2
leftJn <- copy(allJunctions[, c("seqnames", "start", "end", "strand")])
rightJn <- copy(allJunctions[, c("seqnames", "start", "end", "strand")])
putExons <- copy(as.data.table(exons)[,
c("seqnames", "start", "end", "strand")])
leftJn <- unique(leftJn)
rightJn <- unique(rightJn)
putExons <- unique(putExons)
setnames(leftJn, c("start", "end"), c("start1", "end1"))
setnames(rightJn, c("start", "end"), c("start2", "end2"))
setnames(putExons, c("start", "end"), c("end1", "start2"))
putExons[, c("end1", "start2") := list(get("end1") - 1, get("start2") + 1)]
putTJ <- putExons[leftJn, on = c("seqnames", "end1", "strand")]
putTJ <- putTJ[rightJn, on = c("seqnames", "start2", "strand")]
putTJ <- putTJ[, c("seqnames", "start1", "end1", "start2", "end2", "strand")]
putTJ <- putTJ[!is.na(get("end2"))]
putTJ <- putTJ[complete.cases(putTJ)]
putTJ <- unique(putTJ)
# anti-join against knowns
novelPutTJ <- fsetdiff(putTJ, knownTJ)
nontandemTr <- novel_gtf[
!(novel_gtf$transcript_id %in% tandemTrID) & novel_gtf$type == "transcript"
]
nontandemExons <- novel_gtf[
novel_gtf$type == "exon" & novel_gtf$transcript_id %in% nontandemTr$transcript_id
]
novelJunctions <- .grlGaps(split(
nontandemExons,
nontandemExons$transcript_id
))
novelJunctions <- as.data.table(novelJunctions)
novelJunctions[, c("group", "group_name") := list(NULL,NULL)]
novelJunctions <- .grDT(novelJunctions)
novelJunctions <- sort(novelJunctions)
novelJunctions <- as.data.table(novelJunctions)
novelJunctions <- unique(novelJunctions)
novelJunctions <- novelJunctions[, c("seqnames", "start", "end", "strand"), with = FALSE]
knownJunctions <- as.data.table(fst::read.fst(file.path(
reference_path, "fst/junctions.fst"
), columns = c("seqnames", "start", "end", "strand")))
knownJunctions <- unique(knownJunctions)
novelJunctions <- fsetdiff(novelJunctions, knownJunctions)
filterByJunctions <- rbind(knownJunctions, novelJunctions)
# Now package novelPutTJ into novel_gtf
if(nrow(novelPutTJ) == 0) return(NULL)
new_gtf <- .collateData_construct_novelGTF(
c(ref_gtf, novel_gtf), NULL, novelPutTJ,
novelTr_source = "novelPutative",
novelGeneID_header = "novelGenePutative",
novelGeneBT = "novel_genePutative",
novelTrID_header = "novelPutTrID",
novelTrName_suffix = "-novelPutTr",
novelTrBT = "novel_transcript_putative"
)
return(new_gtf)
}
.collateData_construct_novelGTF <- function(
gtf, junc.novel = NULL, tj.novel = NULL,
novelTr_source = "novel",
novelGeneID_header = "novelGene",
novelGeneBT = "novel_gene",
novelTrID_header = "novelTrID",
novelTrName_suffix = "-novelTr",
novelTrBT = "novel_transcript"
) {
n_junc <- ifelse(is(junc.novel, "data.frame"), nrow(junc.novel), 0)
n_tj <- ifelse(is(tj.novel, "data.frame"), nrow(tj.novel), 0)
n_trans <- n_junc + n_tj
if(n_trans < 1) return(NULL)
new_gtf <- GRanges()
gr_transcript <- GRanges()
# Initialize novel transcripts and determine genes
suppressWarnings({
# gr_transcript <- c(
# GRanges(
# junc.novel$seqnames,
# IRanges(junc.novel$start - 50, junc.novel$end + 50),
# junc.novel$strand
# ),
# GRanges(
# tj.novel$seqnames,
# IRanges(tj.novel$start1 - 50, tj.novel$end2 + 50),
# tj.novel$strand
# )
# )
if(n_junc > 0) {
gr_transcript <- c(gr_transcript,
GRanges(
junc.novel$seqnames,
IRanges(junc.novel$start - 50, junc.novel$end + 50),
junc.novel$strand
)
)
}
if(n_tj > 0) {
gr_transcript <- c(gr_transcript,
GRanges(
tj.novel$seqnames,
IRanges(tj.novel$start1 - 50, tj.novel$end2 + 50),
tj.novel$strand
)
)
}
})
Genes <- as.data.table(gtf[gtf$type == "gene"])
if(any(grepl(novelGeneID_header, Genes$gene_id))) {
.log("Issue with given gene_id header", "message")
novelGeneID_header <- paste0(novelGeneID_header, "2")
}
if(any(grepl(novelGeneBT, Genes$gene_biotype))) {
.log("Issue with given gene_id header", "message")
novelGeneBT <- paste0(novelGeneBT, "2")
}
Transcripts <- as.data.table(gtf[gtf$type == "transcript"])
if(any(grepl(novelTrID_header, Transcripts$transcript_id))) {
.log("Issue with given transcript_id header", "message")
novelTrID_header <- paste0(novelTrID_header, "2")
}
if(any(grepl(novelTrName_suffix, Transcripts$transcript_name))) {
.log("Issue with given transcript_name suffix", "message")
novelTrName_suffix <- paste0(novelTrName_suffix, "2")
}
if(any(grepl(novelTrBT, Transcripts$transcript_biotype))) {
.log("Issue with given transcript_biotype header", "message")
novelTrBT <- paste0(novelTrBT, "2")
}
# Annotate novel junctions as belonging to known genes if they overlap
OL <- .findOverlaps_merge(gr_transcript, .grDT(Genes))
OL.DT <- data.table(from = OL@from, to = OL@to)
OL.DT <- unique(OL.DT, by = "from")
junc_genes <- data.table(juncID = seq_len(length(gr_transcript)))
junc_genes$gene_id <- junc_genes$gene_name <- junc_genes$gene_biotype <- NA
junc_genes$gene_id[OL.DT$from] <- Genes$gene_id[OL.DT$to]
junc_genes$gene_name[OL.DT$from] <- Genes$gene_name[OL.DT$to]
junc_genes$gene_biotype[OL.DT$from] <- Genes$gene_biotype[OL.DT$to]
# Construct a list of novel genes
if(any(is.na(junc_genes$gene_id))) {
tr_novelGene <- sort(reduce(
gr_transcript[is.na(junc_genes$gene_id)]
))
empty_mCol_nG <- data.frame(matrix(ncol = ncol(mcols(gtf)),
nrow = length(tr_novelGene)))
colnames(empty_mCol_nG) <- colnames(mcols(gtf))
empty_mCol_nG$source <- novelTr_source
empty_mCol_nG$gene_id <- paste0(novelGeneID_header,
formatC(seq_len(length(tr_novelGene)),
width = 6, format = "d", flag = "0"))
empty_mCol_nG$gene_name <- empty_mCol_nG$gene_id
empty_mCol_nG$gene_biotype <- novelGeneBT
mcols(tr_novelGene) <- empty_mCol_nG
mcols(tr_novelGene)$type = "gene"
new_gtf <- c(new_gtf, tr_novelGene)
# Reannotate gene list
OL <- findOverlaps(gr_transcript, tr_novelGene)
OL.DT <- data.table(from = OL@from, to = OL@to)
OL.DT <- unique(OL.DT, by = "from")
junc_genes$gene_id[OL.DT$from] <- empty_mCol_nG$gene_id[OL.DT$to]
junc_genes$gene_name[OL.DT$from] <- empty_mCol_nG$gene_name[OL.DT$to]
junc_genes$gene_biotype[OL.DT$from] <- novelGeneBT
}
junc_genes[, c("transcript_number") := seq_len(.N), by = "gene_id"]
junc_genes[, c("transcript_number_str") := formatC(
get("transcript_number"), width = 3, format = "d", flag = "0")]
junc_genes[, c("transcript_name") := paste0(
get("gene_name"), novelTrName_suffix, get("transcript_number_str"))]
junc_genes[, c("transcript_id") := paste0(novelTrID_header, formatC(
seq_len(.N), width = 6, format = "d", flag = "0"))]
# Construct a list of novel transcripts and exons
empty_mCol <- data.frame(matrix(ncol = ncol(mcols(gtf)),
nrow = length(gr_transcript)))
colnames(empty_mCol) <- colnames(mcols(gtf))
empty_mCol$source <- novelTr_source
empty_mCol$gene_id <- junc_genes$gene_id
empty_mCol$gene_name <- junc_genes$gene_name
empty_mCol$transcript_id <- junc_genes$transcript_id
empty_mCol$transcript_name <- junc_genes$transcript_name
empty_mCol$gene_biotype <- junc_genes$gene_biotype
empty_mCol$transcript_biotype <- novelTrBT
mcols(gr_transcript) <- empty_mCol
mcols(gr_transcript)$type = "transcript"
new_gtf <- c(new_gtf, gr_transcript)
# Construct exons and transcripts
if(n_junc > 0) {
gr_novel_leftExon <- with(junc.novel, GRanges(seqnames = seqnames,
ranges = IRanges(start = start - 50, end = start - 1),
strand = strand))
gr_novel_rightExon <- with(junc.novel, GRanges(seqnames = seqnames,
ranges = IRanges(start = end + 1, end = end + 50),
strand = strand))
mcols(gr_novel_leftExon) <- empty_mCol[seq_len(n_junc),]
mcols(gr_novel_rightExon) <- empty_mCol[seq_len(n_junc),]
mcols(gr_novel_leftExon)$exon_number <- ifelse(
strand(gr_novel_leftExon) == "+", 1, 2)
mcols(gr_novel_rightExon)$exon_number <- ifelse(
strand(gr_novel_rightExon) == "-", 1, 2)
mcols(gr_novel_leftExon)$exon_id <- paste0(
mcols(gr_novel_leftExon)$transcript_id,
"-exon", mcols(gr_novel_leftExon)$exon_number
)
mcols(gr_novel_rightExon)$exon_id <- paste0(
mcols(gr_novel_rightExon)$transcript_id,
"-exon", mcols(gr_novel_rightExon)$exon_number
)
mcols(gr_novel_leftExon)$type <- "exon"
mcols(gr_novel_rightExon)$type <- "exon"
new_gtf <- c(new_gtf, gr_novel_leftExon, gr_novel_rightExon)
}
if(n_tj > 0) {
gr_tjnovel_leftExon <- GRanges(tj.novel$seqnames,
IRanges(tj.novel$start1 - 50, tj.novel$start1 - 1),
tj.novel$strand)
gr_tjnovel_middleExon <- GRanges(tj.novel$seqnames,
IRanges(tj.novel$end1 + 1, tj.novel$start2 - 1),
tj.novel$strand)
gr_tjnovel_rightExon <- GRanges(tj.novel$seqnames,
IRanges(tj.novel$end2 + 1, tj.novel$end2 + 50),
tj.novel$strand)
mcols(gr_tjnovel_leftExon) <- empty_mCol[
seq(n_junc + 1, nrow(empty_mCol)),]
mcols(gr_tjnovel_middleExon) <- empty_mCol[
seq(n_junc + 1, nrow(empty_mCol)),]
mcols(gr_tjnovel_rightExon) <- empty_mCol[
seq(n_junc + 1, nrow(empty_mCol)),]
mcols(gr_tjnovel_leftExon)$exon_number <- ifelse(
strand(gr_tjnovel_leftExon) == "+", 1, 3)
mcols(gr_tjnovel_middleExon)$exon_number <- 2
mcols(gr_tjnovel_rightExon)$exon_number <- ifelse(
strand(gr_tjnovel_rightExon) == "-", 1, 3)
mcols(gr_tjnovel_leftExon)$exon_id <- paste0(
mcols(gr_tjnovel_leftExon)$transcript_id,
"-exon", mcols(gr_tjnovel_leftExon)$exon_number
)
mcols(gr_tjnovel_middleExon)$exon_id <- paste0(
mcols(gr_tjnovel_middleExon)$transcript_id,
"-exon", mcols(gr_tjnovel_middleExon)$exon_number
)
mcols(gr_tjnovel_rightExon)$exon_id <- paste0(
mcols(gr_tjnovel_rightExon)$transcript_id,
"-exon", mcols(gr_tjnovel_rightExon)$exon_number
)
mcols(gr_tjnovel_leftExon)$type <- "exon"
mcols(gr_tjnovel_middleExon)$type <- "exon"
mcols(gr_tjnovel_rightExon)$type <- "exon"
new_gtf <- c(new_gtf, gr_tjnovel_leftExon,
gr_tjnovel_middleExon, gr_tjnovel_rightExon)
}
return(new_gtf)
}
################################################################################
# Use Exon Groups file to designate flanking exon islands to ALL junctions
.collateData_junc_group <- function(
threadID, reference_path, norm_output_path
) {
if(threadID != 2) return()
junc.common <- as.data.table(read.fst(file.path(norm_output_path,
"annotation", "junc.common.annotated.fst")))
Exon.Groups <- as.data.table(
read.fst(file.path(reference_path, "fst", "Exons.Group.fst"))
)
junc.common[, c("seqnames") := as.character(get("seqnames"))]
Exon.Groups[, c("seqnames") := as.character(get("seqnames"))]
# Always calculate stranded for junctions
Exon.Groups.S <- Exon.Groups[get("strand") != "*"]
# Same method as in BuildRef
junc.common <- .process_introns_group_overlap(
junc.common, Exon.Groups.S,
c("gene_group_up", "exon_group_up",
"gene_group_down", "exon_group_down"),
c("gene_group", "exon_group", "gene_group", "exon_group")
)
junc.common <- .process_introns_group_fix_RI(
junc.common, Exon.Groups.S,
c("gene_group_up", "exon_group_up",
"gene_group_down", "exon_group_down"),
c("gene_group", "intron_number", "gene_group", "intron_number")
)
junc.common[,
c("JG_up", "JG_down") := list(
paste(get("gene_group_up"), get("exon_group_up"), sep = "_"),
paste(get("gene_group_down"), get("exon_group_down"), sep = "_")
)
]
# Remove temp columns
junc.common$gene_group_up <- NULL
junc.common$gene_group_down <- NULL
junc.common$exon_group_up <- NULL
junc.common$exon_group_down <- NULL
write.fst(as.data.frame(junc.common), file.path(norm_output_path,
"annotation", "Junc.fst"))
junc_PSI <- junc.common[, c("seqnames", "start", "end", "strand")]
write.fst(as.data.frame(junc_PSI),
file.path(norm_output_path, "junc_PSI_index.fst"))
rm(junc.common, Exon.Groups, Exon.Groups.S, junc_PSI)
gc()
}
# Use Exon Groups file to designate flanking exon islands to ALL introns
.collateData_sw_group <- function(threadID,
reference_path, norm_output_path, stranded = TRUE
) {
if(threadID != 2) return()
sw.common <- as.data.table(read.fst(file.path(norm_output_path,
"annotation", "sw.common.fst")))
# Use Exon Groups file to designate exon groups to all junctions
Exon.Groups <- as.data.table(
read.fst(file.path(reference_path, "fst", "Exons.Group.fst")))
sw.common[, c("seqnames") := as.character(get("seqnames"))]
Exon.Groups[, c("seqnames") := as.character(get("seqnames"))]
# Always calculate stranded for junctions
Exon.Groups.S <- Exon.Groups[get("strand") != "*"]
sw.common <- .process_introns_group_overlap(
sw.common, Exon.Groups.S,
c("gene_group_up", "exon_group_up",
"gene_group_down", "exon_group_down"),
c("gene_group", "exon_group", "gene_group", "exon_group")
)
sw.common <- .process_introns_group_fix_RI(
sw.common, Exon.Groups.S,
c("gene_group_up", "exon_group_up",
"gene_group_down", "exon_group_down"),
c("gene_group", "intron_number", "gene_group", "intron_number")
)
sw.common[,
c("JG_up", "JG_down") := list(
paste(get("gene_group_up"), get("exon_group_up"), sep = "_"),
paste(get("gene_group_down"), get("exon_group_down"), sep = "_")
)
]
sw.common$gene_group_up <- NULL
sw.common$gene_group_down <- NULL
sw.common$exon_group_up <- NULL
sw.common$exon_group_down <- NULL
if (!stranded) {
Exon.Groups.S <- Exon.Groups[get("strand") == "*"]
} else {
Exon.Groups.S <- Exon.Groups[get("strand") != "*"]
}
sw.common <- .process_introns_group_overlap(
sw.common, Exon.Groups.S,
c("gene_group_up", "exon_group_up",
"gene_group_down", "exon_group_down"),
c("gene_group", "exon_group", "gene_group", "exon_group")
)
sw.common <- .process_introns_group_fix_RI(
sw.common, Exon.Groups.S,
c("gene_group_up", "exon_group_up",
"gene_group_down", "exon_group_down"),
c("gene_group", "intron_number", "gene_group", "intron_number")
)
sw.common[,
c("IRG_up", "IRG_down") := list(
paste(get("gene_group_up"), get("exon_group_up"), sep = "_"),
paste(get("gene_group_down"), get("exon_group_down"), sep = "_")
)
]
sw.common[, c("EventRegion") :=
paste0(get("seqnames"), ":", get("start"), "-",
get("end"), "/", get("strand"))]
write.fst(as.data.frame(sw.common),
file.path(norm_output_path, "annotation", "IR.fst"))
rm(sw.common, Exon.Groups, Exon.Groups.S)
gc()
}
# Annotates splice junctions with ID's of flanking exon islands
.collateData_splice_anno <- function(threadID,
reference_path, norm_output_path
) {
if(threadID != 2) return()
if(!file.exists(file.path(reference_path, "fst", "Splice.fst"))) return()
candidate.introns <- as.data.table(
read.fst(file.path(reference_path, "fst", "junctions.fst")))
sw.common <- as.data.table(read.fst(file.path(norm_output_path,
"annotation", "IR.fst")))
candidate.introns[, c("seqnames") := as.character(get("seqnames"))]
sw.common[, c("seqnames") := as.character(get("seqnames"))]
# Order introns by importance (WHY?)
candidate.introns[, c("transcript_biotype_2") := get("transcript_biotype")]
candidate.introns[
!(get("transcript_biotype") %in%
c("protein_coding", "processed_transcript",
"lincRNA", "antisense", "nonsense_mediated_decay")),
c("transcript_biotype_2") := "other"]
candidate.introns[, c("transcript_biotype_2") :=
factor(get("transcript_biotype_2"),
c("protein_coding", "processed_transcript",
"lincRNA", "antisense", "other", "nonsense_mediated_decay"),
ordered = TRUE)]
if ("transcript_support_level" %in% colnames(candidate.introns)) {
setorderv(candidate.introns,
c("transcript_biotype_2", "transcript_support_level"))
} else {
setorder(candidate.introns, "transcript_biotype_2")
}
candidate.introns[, c("Event1a") := get("Event")]
candidate.introns[, c("Event2a") := get("Event")]
# Annotate exon islands for splice events
Splice.Anno <- read.fst(file.path(reference_path, "fst", "Splice.fst"),
as.data.table = TRUE)
# Remove retained introns not assayed in sw.common
Splice.Anno <- Splice.Anno[
get("Event1a") %in% sw.common$EventRegion |
get("EventType") != "RI"
]
Splice.Anno[candidate.introns, on = "Event1a",
c("up_1a") := paste(get("i.gene_group_stranded"),
get("i.exon_group_stranded_upstream"), sep = "_")]
Splice.Anno[candidate.introns, on = "Event1a",
c("down_1a") := paste(get("i.gene_group_stranded"),
get("i.exon_group_stranded_downstream"), sep = "_")]
Splice.Anno[candidate.introns, on = "Event2a",
c("down_2a") := paste(get("i.gene_group_stranded"),
get("i.exon_group_stranded_downstream"), sep = "_")]
Splice.Anno[get("EventType") %in% c("MXE", "SE", "ALE", "A3SS", "RI"),
c("JG_up") := get("up_1a")]
Splice.Anno[get("EventType") %in% c("SE", "AFE", "A5SS", "RI"),
c("JG_down") := get("down_1a")]
Splice.Anno[get("EventType") %in% c("MXE"),
c("JG_down") := get("down_2a")]
Splice.Anno$up_1a <- NULL
Splice.Anno$down_1a <- NULL
Splice.Anno$down_2a <- NULL
Splice.Anno[, c("strand") :=
tstrsplit(get("Event1a"), split = "/")[[2]]]
write.fst(as.data.frame(Splice.Anno),
file.path(norm_output_path, "annotation", "Splice.fst"))
rm(Splice.Anno, candidate.introns, sw.common)
gc()
}
# Generate rowData annotations
.collateData_rowEvent <- function(threadID, reference_path, norm_output_path) {
if(threadID != 2) return()
.collateData_rowEvent_brief(norm_output_path)
.collateData_rowEvent_mapGenes(norm_output_path)
.collateData_rowEvent_splice_option(reference_path, norm_output_path)
.collateData_rowEvent_full(reference_path, norm_output_path)
}
# Truncated rowEvent, with EventName, EventType, EventRegion
.collateData_rowEvent_brief <- function(norm_output_path) {
# make rowEvent brief here
sw.common <- as.data.table(read.fst(
file.path(norm_output_path, "annotation", "IR.fst")))
sw.common[, c("seqnames") := as.character(get("seqnames"))]
sw.anno.brief <- sw.common[, c("Name", "EventRegion")]
setnames(sw.anno.brief, "Name", "EventName")
sw.anno.brief[, c("EventType") := "IR"]
sw.anno.brief <- sw.anno.brief[,
c("EventName", "EventType", "EventRegion")]
rowEvent <- sw.anno.brief
spliceFile <- file.path(norm_output_path, "annotation", "Splice.fst")
if(file.exists(spliceFile)) {
Splice.Anno <- as.data.table(read.fst(spliceFile))
splice.anno.brief <- Splice.Anno[,
c("EventName", "EventType", "EventRegion")]
rowEvent <- rbind(rowEvent, splice.anno.brief)
}
write.fst(as.data.frame(rowEvent),
file.path(norm_output_path, "rowEvent.brief.fst"))
rm(sw.common, Splice.Anno, sw.anno.brief, splice.anno.brief)
gc()
}
# Create mapping of EventNames to Genes (for Gene Ontology)
.collateData_rowEvent_mapGenes <- function(norm_output_path) {
reference_path <- file.path(norm_output_path, "Reference")
spliceFile <- file.path(reference_path, "fst/Splice.fst")
TrFile <- file.path(reference_path, "fst/Transcripts.fst")
IRdirFile <- file.path(reference_path, "fst/Introns.Dir.fst")
IRnondirFile <- file.path(reference_path, "fst/Introns.ND.fst")
# Create mappers
IR_trid <- rbind(
read.fst(
IRdirFile, columns = c("EventName", "transcript_id")
),
read.fst(
IRnondirFile, columns = c("EventName", "transcript_id")
)
)
Tr2Gene <- read.fst(TrFile, columns = c("transcript_id", "gene_id"))
IR_trid <- as.data.table(IR_trid)
Tr2Gene <- as.data.table(Tr2Gene)
IR_trid <- Tr2Gene[IR_trid, on = "transcript_id"]
IR_trid$gene_id_b <- IR_trid$gene_id
allEvents <- IR_trid[, c("EventName", "gene_id", "gene_id_b"), with = FALSE]
if(file.exists(spliceFile)) {
splice_geneid <- read.fst(
spliceFile, columns = c("EventName", "gene_id", "gene_id_b")
)
splice_geneid$gene_id <- as.character(splice_geneid$gene_id)
splice_geneid$gene_id_b <- as.character(splice_geneid$gene_id_b)
splice_geneid$gene_id <- as.character(splice_geneid$gene_id)
splice_geneid$gene_id_b <- as.character(splice_geneid$gene_id_b)
}
allEvents <- rbind(allEvents, splice_geneid)
allEvents <- as.data.table(allEvents)
# Order by events in rowEvent
rowEvent <- read.fst(
file.path(norm_output_path, "rowEvent.brief.fst")
)
rowEvent$gene_id <- allEvents$gene_id[match(
rowEvent$EventName, allEvents$EventName)]
rowEvent$gene_id_b <- allEvents$gene_id_b[match(
rowEvent$EventName, allEvents$EventName)]
write.fst(as.data.frame(rowEvent),
file.path(norm_output_path, "rowEvent.brief.fst"))
rm(rowEvent, splice_geneid, IR_trid, Tr2Gene, allEvents)
gc()
}
# Generate annotation based on importance of involved transcripts
.collateData_rowEvent_splice_option <- function(
reference_path, norm_output_path
) {
if(!file.exists(file.path(reference_path, "fst", "Splice.options.fst")))
return()
Splice.Options <- as.data.table(read.fst(
file.path(reference_path, "fst", "Splice.options.fst")))
Transcripts <- as.data.table(read.fst(
file.path(reference_path, "fst", "Transcripts.fst")))
Splice.Anno.Brief <- read.fst(
file.path(reference_path, "fst", "Splice.fst"),
as.data.table = TRUE, columns = c("EventName", "EventID"))
# Splice.Options[Splice.Anno.Brief, on = "EventID",
# c("EventName") := get("i.EventName")]
# Splice.Options[Transcripts, on = "transcript_id",
# c("transcript_biotype") := get("i.transcript_biotype")]
Splice.Options$EventName <- Splice.Anno.Brief$EventName[match(
Splice.Options$EventID, Splice.Anno.Brief$EventID)]
Splice.Options$transcript_biotype <- Transcripts$transcript_biotype[match(
Splice.Options$transcript_id, Transcripts$transcript_id)]
Splice.Options.Summary <- copy(Splice.Options)
# Splice.Options.Summary[,
# c("tsl_min") := min(get("transcript_support_level")),
# by = c("EventID", "isoform")]
# Splice.Options.Summary[,
# c("any_is_PC") := any(get("is_protein_coding")),
# by = c("EventID", "isoform")]
# Splice.Options.Summary[,
# c("all_is_NMD") := all(grepl("decay", get("transcript_biotype"))),
# by = c("EventID", "isoform")]
Splice.Options.Summary[,
c("tsl_min", "any_is_PC", "all_is_NMD") := list(
min(get("transcript_support_level")),
any(get("is_protein_coding")),
all(grepl("decay", get("transcript_biotype")))
), by = c("EventID", "isoform")]
write.fst(as.data.frame(Splice.Options.Summary),
file.path(norm_output_path, "annotation", "Splice.Options.Summary.fst"))
rm(Splice.Options.Summary, Splice.Options, Transcripts, Splice.Anno.Brief)
gc()
}
# Annotate full rowEvent. Includes:
# - whether Inc/Exc is a protein coding splicing event
# - whether Inc/Exc represent an event causing NMD
# - the highest-ranking TSL for each alternative (A/B) of event
.collateData_rowEvent_full <- function(reference_path, norm_output_path) {
Splice.Options.Summary <- as.data.table(read.fst(
file.path(norm_output_path, "annotation",
"Splice.Options.Summary.fst")))
Splice.Anno <- as.data.table(read.fst(
file.path(norm_output_path, "annotation", "Splice.fst")))
rowEvent.Extended <- read.fst(
file.path(norm_output_path, "rowEvent.brief.fst"),
as.data.table = TRUE)
# IR-NMD: not guaranteed to exist:
IRNMDfile <- file.path(reference_path, "fst", "IR.NMD.fst")
IR_NMD <- NULL
if(file.exists(IRNMDfile)) {
IR_NMD <- read.fst(IRNMDfile, as.data.table = TRUE)
}
candidate.introns <- as.data.table(
read.fst(file.path(reference_path, "fst", "junctions.fst")))
rowEvent.Extended[get("EventType") == "IR",
c("intron_id") := tstrsplit(get("EventName"), split = "/")[[2]]]
rowEvent.Extended.IR <- rowEvent.Extended[get("EventType") == "IR"]
rowEvent.Extended.splice <- rowEvent.Extended[get("EventType") != "IR"]
rowEvent.Extended.IR[,
c("Inc_Is_Protein_Coding", "Exc_Is_Protein_Coding") :=
list(FALSE,FALSE)
]
rowEvent.Extended.splice[,
c("Inc_Is_Protein_Coding", "Exc_Is_Protein_Coding") :=
list(FALSE,FALSE)
]
if(!is.null(IR_NMD)) {
rowEvent.Extended.IR[
get("intron_id") %in% IR_NMD$intron_id[IR_NMD$intron_type == "CDS"],
c("Inc_Is_Protein_Coding", "Exc_Is_Protein_Coding") := TRUE
]
}
rowEvent.Extended.splice[
get("EventName") %in% Splice.Options.Summary$EventName[
Splice.Options.Summary$isoform == "A" &
Splice.Options.Summary$any_is_PC == TRUE
],
c("Inc_Is_Protein_Coding") := TRUE
]
rowEvent.Extended.splice[
get("EventName") %in% Splice.Options.Summary$EventName[
Splice.Options.Summary$isoform == "B" &
Splice.Options.Summary$any_is_PC == TRUE
],
c("Exc_Is_Protein_Coding") := TRUE
]
# rowEvent.Extended[Splice.Options.Summary[get("isoform") == "A"],
# on = "EventName", c("Inc_Is_Protein_Coding") := get("i.any_is_PC")]
# rowEvent.Extended[Splice.Options.Summary[get("isoform") == "B"],
# on = "EventName", c("Exc_Is_Protein_Coding") := get("i.any_is_PC")]
rowEvent.Extended.IR[, c("Exc_Is_NMD", "Inc_Is_NMD") :=
list(FALSE, FALSE)]
rowEvent.Extended.splice[, c("Exc_Is_NMD", "Inc_Is_NMD") :=
list(FALSE, FALSE)]
tmpA <- Splice.Options.Summary[get("isoform") == "A"]
tmpB <- Splice.Options.Summary[get("isoform") == "B"]
## Important changes:
## - for IR, NMD is
## - reported for all introns calculated in reference (not just CDS)
## - NA if intron not tested (not FALSE, as in version <= 1.1.2)
## - only true for spliced transcripts if true for all transcripts
## containing the same intron
if(!is.null(IR_NMD)) {
# rowEvent.Extended[IR_NMD[!is.na(get("splice_is_NMD"))], on = "intron_id",
# c("Exc_Is_NMD") := get("i.splice_is_NMD")]
# rowEvent.Extended[IR_NMD, on = "intron_id",
# c("Inc_Is_NMD") := get("i.IRT_is_NMD")]
rowEvent.Extended.IR$Inc_Is_NMD <- IR_NMD$IRT_is_NMD[match(
rowEvent.Extended.IR$intron_id, IR_NMD$intron_id)]
rowEvent.Extended.IR$tmpExc_Is_NMD <- IR_NMD$splice_is_NMD[match(
rowEvent.Extended.IR$intron_id, IR_NMD$intron_id)]
rowEvent.Extended.IR[,
c("sumExc_Is_NMD") := sum(get("tmpExc_Is_NMD") == TRUE, na.rm = TRUE),
by = "EventRegion"
]
rowEvent.Extended.IR[, c("Exc_Is_NMD") := get("sumExc_Is_NMD") > 0]
rowEvent.Extended.IR$sumExc_Is_NMD <- NULL
rowEvent.Extended.IR$tmpExc_Is_NMD <- NULL
rowEvent.Extended.IR[!(get("intron_id") %in% IR_NMD$intron_id),
c("Exc_Is_NMD", "Inc_Is_NMD") := list(NA, NA)]
# rowEvent.Extended[Splice.Options.Summary[get("isoform") == "A"],
# on = "EventName", c("Inc_Is_NMD") := get("i.all_is_NMD")]
# rowEvent.Extended[Splice.Options.Summary[get("isoform") == "B"],
# on = "EventName", c("Exc_Is_NMD") := get("i.all_is_NMD")]
rowEvent.Extended.splice$Inc_Is_NMD <- tmpA$all_is_NMD[match(
rowEvent.Extended.splice$EventName, tmpA$EventName)]
rowEvent.Extended.splice$Exc_Is_NMD <- tmpB$all_is_NMD[match(
rowEvent.Extended.splice$EventName, tmpB$EventName)]
}
rowEvent.Extended.IR[, c("Inc_TSL", "Exc_TSL") := list(NA, NA)]
rowEvent.Extended.splice[, c("Inc_TSL", "Exc_TSL") := list(NA, NA)]
# rowEvent.Extended[candidate.introns, on = "intron_id",
# c("Inc_TSL") := get("i.transcript_support_level")]
# rowEvent.Extended[candidate.introns, on = "intron_id",
# c("Exc_TSL") := get("i.transcript_support_level")]
rowEvent.Extended.IR$Inc_TSL <-
candidate.introns$transcript_support_level[match(
rowEvent.Extended.IR$intron_id, candidate.introns$intron_id)]
rowEvent.Extended.IR$Exc_TSL <-
candidate.introns$transcript_support_level[match(
rowEvent.Extended.IR$intron_id, candidate.introns$intron_id)]
# rowEvent.Extended[Splice.Options.Summary[get("isoform") == "A"],
# on = "EventName", c("Inc_TSL") := get("i.tsl_min")]
# rowEvent.Extended[Splice.Options.Summary[get("isoform") == "B"],
# on = "EventName", c("Exc_TSL") := get("i.tsl_min")]
rowEvent.Extended.splice$Inc_TSL <- tmpA$tsl_min[match(
rowEvent.Extended.splice$EventName, tmpA$EventName)]
rowEvent.Extended.splice$Exc_TSL <- tmpB$tsl_min[match(
rowEvent.Extended.splice$EventName, tmpB$EventName)]
# Designate exclusive first intron / last intron
candidate.introns[, c("inverse_intron_number") :=
max(get("intron_number")) - get("intron_number") + 1,
by = "transcript_id"]
candidate.introns[,
c("max_intron_number", "max_inv_intron_number") := list(
max(get("intron_number")), max(get("inverse_intron_number"))),
by = "Event"
]
rowEvent.Extended <- rbind(rowEvent.Extended.IR, rowEvent.Extended.splice)
# define Event1 / Event2
# rowEvent.Extended[get("EventType") == "IR",
# c("Event1a") := get("EventRegion")]
# rowEvent.Extended[Splice.Anno, on = "EventName",
# c("Event1a", "Event2a", "Event1b", "Event2b") :=
# list(get("i.Event1a"), get("i.Event2a"),
# get("i.Event1b"), get("i.Event2b"))]
rowEvent.Extended$Event1a <- Splice.Anno$Event1a[match(
rowEvent.Extended$EventName, Splice.Anno$EventName)]
rowEvent.Extended$Event2a <- Splice.Anno$Event2a[match(
rowEvent.Extended$EventName, Splice.Anno$EventName)]
rowEvent.Extended$Event1b <- Splice.Anno$Event1b[match(
rowEvent.Extended$EventName, Splice.Anno$EventName)]
rowEvent.Extended$Event2b <- Splice.Anno$Event2b[match(
rowEvent.Extended$EventName, Splice.Anno$EventName)]
rowEvent.Extended[get("EventType") == "IR",
c("Event1a") := get("EventRegion")]
# rowEvent.Extended[candidate.introns, on = "Event1a",
# c("iafi_A") := (get("i.max_intron_number") == 1)]
# rowEvent.Extended[candidate.introns, on = "Event1b",
# c("iafi_B") := (get("i.max_intron_number") == 1)]
# rowEvent.Extended[, c("is_always_first_intron") :=
# get("iafi_A") & get("iafi_B")]
# rowEvent.Extended[, c("iafi_A", "iafi_B") := list(NULL, NULL)]
rowEvent.Extended$is_always_first_intron <-
rowEvent.Extended$Event1a %in% candidate.introns$Event[
candidate.introns$max_intron_number == 1] &
rowEvent.Extended$Event1b %in% candidate.introns$Event[
candidate.introns$max_intron_number == 1]
# rowEvent.Extended[candidate.introns, on = "Event1a",
# c("iali_A") := (get("i.max_inv_intron_number") == 1)]
# rowEvent.Extended[candidate.introns, on = "Event1b",
# c("iali_B") := (get("i.max_inv_intron_number") == 1)]
# rowEvent.Extended[, c("is_always_last_intron") :=
# get("iali_A") & get("iali_B")]
# rowEvent.Extended[, c("iali_A", "iali_B") := list(NULL, NULL)]
rowEvent.Extended$is_always_last_intron <-
rowEvent.Extended$Event1a %in% candidate.introns$Event[
candidate.introns$max_inv_intron_number == 1] &
rowEvent.Extended$Event1b %in% candidate.introns$Event[
candidate.introns$max_inv_intron_number == 1]
rowEvent.Extended[get("EventType") %in% c("MXE", "SE"),
c("is_always_first_intron", "is_always_last_intron") := list(NA,NA)]
# Convert IR to `annotated`
rowEvent.Extended[, c("is_annotated_IR") := FALSE]
Exons <- read.fst(file.path(reference_path, "fst", "Exons.fst"),
as.data.table = TRUE)
OL <- findOverlaps(
coord2GR(unlist(
rowEvent.Extended[get("EventType") == "IR", "EventRegion"]
)),
.grDT(Exons),
type = "within"
)
rowEvent.Extended[unique(OL@from), c("is_annotated_IR") := TRUE]
# Annotate NMD direction
rowEvent.Extended[, c("NMD_direction") := 0]
if(!is.null(IR_NMD)) {
rowEvent.Extended[get("Inc_Is_NMD") & !get("Exc_Is_NMD"),
c("NMD_direction") := 1]
rowEvent.Extended[!get("Inc_Is_NMD") & get("Exc_Is_NMD"),
c("NMD_direction") := -1]
}
write.fst(as.data.frame(rowEvent.Extended),
file.path(norm_output_path, "rowEvent.fst"))
rm(rowEvent.Extended, rowEvent.Extended.splice, rowEvent.Extended.IR,
candidate.introns, IR_NMD, Splice.Options.Summary, Splice.Anno)
gc()
}
################################################################################
# Sub
# Collates data from junction counts and processBAM coverage
.collateData_compile_agglist <- function(x, jobs, df.internal,
norm_output_path, IRMode,
useProgressBar = TRUE) {
# Load dataframe headers (left-most columns containing annotations)
rowEvent <- as.data.table(read.fst(
file.path(norm_output_path, "rowEvent.brief.fst")))
junc.common <- as.data.table(read.fst(
file.path(norm_output_path, "annotation", "Junc.fst"),
columns = c("seqnames", "start", "end", "strand",
"Event", "JG_up", "JG_down")))
sw.common <- as.data.table(read.fst(
file.path(norm_output_path, "annotation", "IR.fst"),
columns = c("seqnames", "start", "end", "Name", "Null", "strand",
"EventRegion")))
Splice.Anno <- as.data.table(read.fst(
file.path(norm_output_path, "annotation", "Splice.fst")))
junc_PSI <- as.data.table(read.fst(
file.path(norm_output_path, "junc_PSI_index.fst")))
junc.common[, c("seqnames") := as.character(get("seqnames"))]
sw.common[, c("seqnames") := as.character(get("seqnames"))]
work <- jobs[[x]]
block <- df.internal[work]
templates <- .collateData_seed_init(rowEvent, junc_PSI) # list of DT
pb <- NULL
if(useProgressBar)
pb <- progress_bar$new(
format = " generating arrays [:bar] :percent eta: :eta",
total = length(work), clear = FALSE, width= 100)
for (i in seq_len(length(work))) {
junc <- .collateData_process_junc(
block$sample[i], block$strand[i],
junc.common, norm_output_path)
junc_tmp <- junc[sw.common,
on = c("seqnames", "start", "end", "strand"),
c("seqnames", "start", "end", "strand", "SO_L", "SO_R")]
sw <- .collateData_process_sw(
block$sample[i], block$strand[i],
junc_tmp, sw.common, norm_output_path)
junc_tmp2 <- junc[get("Event") %in%
c(Splice.Anno$Event1a, Splice.Anno$Event1b,
Splice.Anno$Event2a, Splice.Anno$Event2b)]
splice <- .collateData_process_splice(
junc_tmp2, sw, Splice.Anno)
splice <- .collateData_process_splice_depth(
splice, sw)
# Dump unneeded columns to preserve memory
junc <- junc[, c("seqnames", "start", "end", "strand",
"count", "count_unstranded", "SO_L", "SO_R")]
sw_cols_keep <- c("EventRegion", "strand", "IntronDepth",
"ExonToIntronReadsLeft", "ExonToIntronReadsRight",
"TotalDepth", "Coverage")
if (IRMode == "SpliceOver") {
sw <- sw[, c(sw_cols_keep, "SpliceOver"), with = FALSE]
} else {
sw <- sw[, c(sw_cols_keep, "SpliceMax"), with = FALSE]
}
gc()
.collateData_process_assays_as_fst(rowEvent, junc_PSI, block$sample[i],
junc, sw, splice, IRMode, norm_output_path)
# remove temp files - raw extracted junc / SW output from processBAM
file.remove(file.path(norm_output_path, "temp",
paste(block$sample[i], "junc.fst.tmp", sep = ".")))
file.remove(file.path(norm_output_path, "temp",
paste(block$sample[i], "sw.fst.tmp", sep = ".")))
if(!is.null(pb)) pb$tick()
} # end FOR loop
return(NULL)
}
# INTERNAL: copies a list of data.table
.copy_DT <- function(templates) {
copy <- lapply(templates, copy)
names(copy) <- names(templates)
return(copy)
}
# Curate list of subset headers
.collateData_seed_init <- function(rowEvent, junc_PSI) {
templates <- list(
assay = copy(rowEvent),
inc = rowEvent[get("EventType") %in% c("IR", "MXE", "SE", "RI")],
exc = rowEvent[get("EventType") %in% c("MXE")],
junc = copy(junc_PSI)
)
return(templates)
}
# Collates junction counts, given sample strandedness and junction strand anno
# - calculates SpliceLeft, SpliceRight, SpliceOver sums
.collateData_process_junc <- function(sample, strand,
junc.common, norm_output_path) {
junc <- as.data.table(
read.fst(file.path(norm_output_path, "temp",
paste(sample, "junc.fst.tmp", sep = ".")))
)
junc[, c("seqnames") := as.character(get("seqnames"))]
junc[, c("start") := get("start") + 1]
junc$strand <- NULL # Use strand from junc.common
junc <- junc[junc.common, on = colnames(junc.common)[c(1, 2, 3)]]
junc$count_unstranded <- junc$total
if (strand == 0) {
junc$count <- junc$total
} else if (strand == -1) {
junc$count <- 0
junc[get("strand") == "+", c("count") := get("neg")]
junc[get("strand") == "-", c("count") := get("pos")]
junc[get("strand") == "*", c("count") := get("total")]
} else {
junc$count <- 0
junc[get("strand") == "+", c("count") := get("pos")]
junc[get("strand") == "-", c("count") := get("neg")]
junc[get("strand") == "*", c("count") := get("total")]
}
junc[is.na(get("count")), c("count") := 0]
junc <- junc[, c("seqnames", "start", "end", "strand",
"Event", "count", "count_unstranded")]
junc <- cbind(junc, junc.common[, c("JG_up", "JG_down")])
junc[, c("SO_L") := 0]
junc[, c("SO_R") := 0]
junc[, c("SO_I") := 0]
# SpliceLeft and SpliceRight calculations
junc[, c("SL") := sum(get("count")),
by = c("seqnames", "start", "strand")]
junc[, c("SR") := sum(get("count")),
by = c("seqnames", "end", "strand")]
# first overlap any junction that has non-same-island junctions
junc[get("JG_up") != get("JG_down") &
get("JG_up") != "" & get("strand") == "+",
c("SO_L") := sum(get("count")), by = "JG_up"]
junc[get("JG_up") != get("JG_down") &
get("JG_down") != "" & get("strand") == "+",
c("SO_R") := sum(get("count")), by = "JG_down"]
junc[get("JG_up") != get("JG_down") &
get("JG_up") != "" & get("strand") == "-",
c("SO_R") := sum(get("count")), by = "JG_up"]
junc[get("JG_up") != get("JG_down") &
get("JG_down") != "" & get("strand") == "-",
c("SO_L") := sum(get("count")), by = "JG_down"]
# Then use a simple overlap method to account for the remainder
# Filter for same-island junctions
junc.subset <- junc[get("JG_up") == get("JG_down") &
get("JG_up") != "" & get("JG_down") != ""]
junc.subset <- junc.subset[!(grepl("NA_NA", get("JG_up")) &
!grepl("NA_NA", get("JG_down")))]
if (nrow(junc.subset) > 0) {
subject <- junc[get("count") > 0]
OL <- .findOverlaps_merge_DT(junc.subset, subject)
splice.overlaps.DT <- data.table(from = from(OL), to = to(OL))
splice.overlaps.DT[,
c("count") := subject$count[to(OL)]]
splice.overlaps.DT[,
c("count_sum") := sum(get("count")), by = "from"]
splice.summa <- unique(
splice.overlaps.DT[, c("from", "count_sum")])
junc.subset[splice.summa$from,
c("SO_I") := splice.summa$count_sum]
junc[junc.subset, on = c("Event"), c("SO_I") := get("i.SO_I")]
# For annotated junctions, take SpliceOver as max of
# SpliceLeft, SpliceRight, or SpliceOver
junc[get("SO_L") < get("SO_I"), c("SO_L") := get("SO_I")]
junc[get("SO_R") < get("SO_I"), c("SO_R") := get("SO_I")]
# Finally, for extreme cases, make SO_L = SL if underestimates
junc[get("SO_L") < get("SL"), c("SO_L") := get("SL")]
junc[get("SO_R") < get("SR"), c("SO_R") := get("SR")]
junc[, c("SO_I") := NULL]
rm(OL, subject, junc.subset)
gc()
}
return(junc)
}
# Imports processBAM data, calculates SpliceOver from junction counts
.collateData_process_sw <- function(sample, strand, junc,
sw.common, norm_output_path) {
sw <- as.data.table(
read.fst(file.path(norm_output_path, "temp",
paste(sample, "sw.fst.tmp", sep = ".")))
)
sw[, c("seqnames") := as.character(get("seqnames"))]
sw[, c("start") := get("start") + 1]
sw <- sw[sw.common, on = colnames(sw.common)[seq_len(6)],
c("EventRegion") := get("i.EventRegion")]
# Extra statistics:
sw[, c("SpliceMax") := 0]
sw[get("SpliceLeft") >= get("SpliceRight"),
c("SpliceMax") := get("SpliceLeft")]
sw[get("SpliceLeft") < get("SpliceRight"),
c("SpliceMax") := get("SpliceRight")]
sw[junc, on = c("seqnames", "start", "end", "strand"),
c("SpliceOverLeft") := get("SO_L")]
sw[junc, on = c("seqnames", "start", "end", "strand"),
c("SpliceOverRight") := get("SO_R")]
sw[get("SpliceOverLeft") >= get("SpliceOverRight"),
c("SpliceOver") := get("SpliceOverLeft")]
sw[get("SpliceOverLeft") < get("SpliceOverRight"),
c("SpliceOver") := get("SpliceOverRight")]
sw[, c("IROratio") := 0]
sw[get("IntronDepth") < 1 & get("IntronDepth") > 0 &
(get("Coverage") + get("SpliceOver")) > 0,
c("IROratio") := get("Coverage") / (
get("Coverage") + get("SpliceOver"))]
sw[get("IntronDepth") >= 1,
c("IROratio") := get("IntronDepth") /
(get("IntronDepth") + get("SpliceOver"))]
sw[, c("TotalDepth") := get("IntronDepth") + get("SpliceOver")]
setnames(sw, "Name", "EventName")
return(sw)
}
# Calculates junction counts for each annotated splice event. Also calculates:
# - participation: The proportion of junctions at the region for each ASE
# - coverage: The total number of junctions covered at that ASE region
.collateData_process_splice <- function(junc, sw, Splice.Anno) {
splice <- copy(Splice.Anno)
# Summarises counts for all splice events as defined by Event(1/2)(a/b)
splice[, c("count_Event1a", "count_Event2a",
"count_Event1b", "count_Event2b") := list(0, 0, 0, 0)]
splice[!is.na(get("Event1a")),
c("count_Event1a") := junc$count[match(get("Event1a"), junc$Event)]]
splice[is.na(get("count_Event1a")),
c("count_Event1a") := 0]
splice[!is.na(get("Event1a")) & get("EventType") == "RI",
c("count_Event1a") :=
sw$IntronDepth[match(get("Event1a"), sw$EventRegion)]
]
splice[!is.na(get("Event2a")),
c("count_Event2a") := junc$count[match(get("Event2a"), junc$Event)]]
splice[is.na(get("count_Event2a")),
c("count_Event2a") := 0]
splice[!is.na(get("Event1b")),
c("count_Event1b") := junc$count[match(get("Event1b"), junc$Event)]]
splice[is.na(get("count_Event1b")),
c("count_Event1b") := 0]
splice[!is.na(get("Event2b")),
c("count_Event2b") := junc$count[match(get("Event2b"), junc$Event)]]
splice[is.na(get("count_Event2b")),
c("count_Event2b") := 0]
# Hack: for RI, ExonToIntronReadsLeft/Right is stored in count_Event2a/b
splice[get("EventType") == "RI" &
tstrsplit(get("Event1a"), split = "/", fixed = TRUE)[[2]] == "+",
c("count_Event2a", "count_Event2b") := list(
sw$ExonToIntronReadsLeft[match(get("Event1a"), sw$EventRegion)],
sw$ExonToIntronReadsRight[match(get("Event1a"), sw$EventRegion)]
)
]
splice[get("EventType") == "RI" &
tstrsplit(get("Event1a"), split = "/", fixed = TRUE)[[2]] == "-",
c("count_Event2a", "count_Event2b") := list(
sw$ExonToIntronReadsRight[match(get("Event1a"), sw$EventRegion)],
sw$ExonToIntronReadsLeft[match(get("Event1a"), sw$EventRegion)]
)
]
# Calculates total splice events that participates from the up/down
# stream exon islands
splice[, c("count_JG_up", "count_JG_down") := list(0, 0)]
# splice[!is.na(get("JG_up")) & get("strand") == "+",
# c("count_JG_up") := junc$SO_L[match(get("JG_up"), junc$JG_up)]]
# splice[!is.na(get("JG_up")) & get("strand") == "-",
# c("count_JG_up") := junc$SO_R[match(get("JG_up"), junc$JG_up)]]
# splice[is.na(get("count_JG_up")), c("count_JG_up") := 0]
# splice[!is.na(get("JG_down")) & get("strand") == "-",
# c("count_JG_down") := junc$SO_L[match(get("JG_down"), junc$JG_down)]]
# splice[!is.na(get("JG_down")) & get("strand") == "+",
# c("count_JG_down") := junc$SO_R[match(get("JG_down"), junc$JG_down)]]
# splice[is.na(get("count_JG_down")), c("count_JG_down") := 0]
# New method of determing depths
# Use Event1b's SO_L and SO_R for single events
# Use Event1a's SO_L and SO_R for IR/RI
# For SE, use Event1a and Event2a's SO
# For MXE, use Event1a and Event2b's SO
single_events <- c("AFE", "ALE", "A5SS", "A3SS")
splice[get("strand") == "+",
c("count_JG_up_1") := junc$SO_L[match(get("Event1a"), junc$Event)]]
splice[get("strand") == "-",
c("count_JG_up_1") := junc$SO_R[match(get("Event1a"), junc$Event)]]
splice[get("strand") == "+" & !is.na(get("Event1b")),
c("count_JG_up_2") := junc$SO_L[match(get("Event1b"), junc$Event)]]
splice[get("strand") == "-" & !is.na(get("Event1b")),
c("count_JG_up_2") := junc$SO_R[match(get("Event1b"), junc$Event)]]
splice[get("strand") == "+" & is.na(get("Event1b")),
c("count_JG_up_2") := junc$SO_L[match(get("Event1a"), junc$Event)]]
splice[get("strand") == "-" & is.na(get("Event1b")),
c("count_JG_up_2") := junc$SO_R[match(get("Event1a"), junc$Event)]]
splice[get("count_JG_up_1") >= get("count_JG_up_2"),
c("count_JG_up") := get("count_JG_up_1")]
splice[get("count_JG_up_1") < get("count_JG_up_2"),
c("count_JG_up") := get("count_JG_up_2")]
splice[get("strand") == "+" & is.na(get("Event2a")),
c("count_JG_down_1") := junc$SO_R[match(get("Event1a"), junc$Event)]]
splice[get("strand") == "-" & is.na(get("Event2a")),
c("count_JG_down_1") := junc$SO_L[match(get("Event1a"), junc$Event)]]
splice[get("strand") == "+" & !is.na(get("Event2a")),
c("count_JG_down_1") := junc$SO_R[match(get("Event2a"), junc$Event)]]
splice[get("strand") == "-" & !is.na(get("Event2a")),
c("count_JG_down_1") := junc$SO_L[match(get("Event2a"), junc$Event)]]
splice[get("strand") == "+" & is.na(get("Event2b")),
c("count_JG_down_2") := junc$SO_R[match(get("Event1b"), junc$Event)]]
splice[get("strand") == "-" & is.na(get("Event2b")),
c("count_JG_down_2") := junc$SO_L[match(get("Event1b"), junc$Event)]]
splice[get("strand") == "+" & !is.na(get("Event2b")),
c("count_JG_down_2") := junc$SO_R[match(get("Event2b"), junc$Event)]]
splice[get("strand") == "-" & !is.na(get("Event2b")),
c("count_JG_down_2") := junc$SO_L[match(get("Event2b"), junc$Event)]]
splice[get("count_JG_down_1") >= get("count_JG_down_2"),
c("count_JG_down") := get("count_JG_down_1")]
splice[get("count_JG_down_1") < get("count_JG_down_2"),
c("count_JG_down") := get("count_JG_down_2")]
splice$count_JG_up_1 <- splice$count_JG_up_2 <- NULL
splice$count_JG_down_1 <- splice$count_JG_down_2 <- NULL
splice[get("strand") == "+" & !(get("EventType") %in% single_events),
c("count_JG_up") := junc$SO_L[match(get("Event1a"), junc$Event)]]
splice[get("strand") == "-" & !(get("EventType") %in% single_events),
c("count_JG_up") := junc$SO_R[match(get("Event1a"), junc$Event)]]
splice[get("strand") == "+" & get("EventType") %in% single_events,
c("count_JG_up") := junc$SO_L[match(get("Event1b"), junc$Event)]]
splice[get("strand") == "-" & get("EventType") %in% single_events,
c("count_JG_up") := junc$SO_R[match(get("Event1b"), junc$Event)]]
splice[get("strand") == "+" & get("EventType") %in% c("IR", "RI"),
c("count_JG_down") := junc$SO_R[match(get("Event1a"), junc$Event)]]
splice[get("strand") == "-" & get("EventType") %in% c("IR", "RI"),
c("count_JG_down") := junc$SO_L[match(get("Event1a"), junc$Event)]]
splice[get("strand") == "+" & get("EventType") %in% single_events,
c("count_JG_down") := junc$SO_R[match(get("Event1b"), junc$Event)]]
splice[get("strand") == "-" & get("EventType") %in% single_events,
c("count_JG_down") := junc$SO_L[match(get("Event1b"), junc$Event)]]
splice[get("strand") == "+" & get("EventType") == "SE",
c("count_JG_down") := junc$SO_R[match(get("Event2a"), junc$Event)]]
splice[get("strand") == "-" & get("EventType") == "SE",
c("count_JG_down") := junc$SO_L[match(get("Event2a"), junc$Event)]]
splice[get("strand") == "+" & get("EventType") == "MXE",
c("count_JG_down") := junc$SO_R[match(get("Event2b"), junc$Event)]]
splice[get("strand") == "-" & get("EventType") == "MXE",
c("count_JG_down") := junc$SO_L[match(get("Event2b"), junc$Event)]]
# Splice participation
# - this calculates the number of events that participates in splicing
# across the upstream / downstream exon island that belong to
# either isoform A or B
splice[, c("partic_up", "partic_down") := list(0, 0)]
splice[get("EventType") %in% c("MXE", "SE", "ALE", "A3SS", "RI"),
c("partic_up") := get("count_Event1a") + get("count_Event1b")]
splice[get("EventType") %in% c("MXE"),
c("partic_down") := get("count_Event2a") + get("count_Event2b")]
splice[get("EventType") %in% c("SE"),
c("partic_down") := get("count_Event2a") + get("count_Event1b")]
splice[get("EventType") %in% c("AFE", "A5SS", "RI"),
c("partic_down") := get("count_Event1a") + get("count_Event1b")]
# Splice "coverage" = participation / max_JG
splice[, c("cov_up", "cov_down") := list(0, 0)]
splice[get("count_JG_up") > 0,
c("cov_up") := get("partic_up") / get("count_JG_up")]
splice[get("count_JG_down") > 0,
c("cov_down") := get("partic_down") / get("count_JG_down")]
splice[get("EventType") %in% c("MXE", "SE", "RI") &
get("cov_up") < get("cov_down"),
c("coverage") := get("cov_up")]
splice[get("EventType") %in% c("MXE", "SE", "RI") &
get("cov_up") >= get("cov_down"),
c("coverage") := get("cov_down")]
splice[get("EventType") %in% c("ALE", "A3SS"),
c("coverage") := get("cov_up")]
splice[get("EventType") %in% c("AFE", "A5SS"),
c("coverage") := get("cov_down")]
return(splice)
}
# Calculates TotalDepth which includes local IR levels.
# - Used to normalize Coverage plots
.collateData_process_splice_depth <- function(splice, sw) {
# Calculate depth for splicing where EventRegion doesn't cover a single
# intron or where processBAM has decided not to assess the intron
splice.no_region <- splice[!(get("EventRegion") %in% sw$EventRegion)]
splice.no_region[, c("Depth1a") :=
sw$TotalDepth[match(get("Event1a"), sw$EventRegion)]]
splice.no_region[, c("Depth2a") :=
sw$TotalDepth[match(get("Event2a"), sw$EventRegion)]]
splice.no_region[, c("Depth1b") :=
sw$TotalDepth[match(get("Event1b"), sw$EventRegion)]]
splice.no_region[, c("Depth2b") :=
sw$TotalDepth[match(get("Event2b"), sw$EventRegion)]]
splice.no_region[is.na(get("Depth1a")), c("Depth1a") := 0]
splice.no_region[is.na(get("Depth1b")), c("Depth1b") := 0]
splice.no_region[is.na(get("Depth2a")), c("Depth2a") := 0]
splice.no_region[is.na(get("Depth2b")), c("Depth2b") := 0]
# First guess depth based on depths of any junction that is assessed
# by processBAM
# - DepthA, DepthB: upstream and downstream IR+Splice depths
splice.no_region[, c("DepthUp", "DepthDown") := list(0,0)]
splice.no_region[get("Depth1a") > get("Depth1b"),
c("DepthUp") := get("Depth1a")]
splice.no_region[get("Depth1a") <= get("Depth1b"),
c("DepthUp") := get("Depth1b")]
# DepthDown only matters where EventType %in% c("MXE", "SE")
splice.no_region[!(get("EventType") %in% c("MXE", "SE")),
c("DepthDown") := get("DepthUp")]
splice.no_region[get("EventType") %in% c("SE") &
get("Depth1b") > get("Depth2a"),
c("DepthDown") := get("Depth1b")]
splice.no_region[get("EventType") %in% c("SE") &
get("Depth1b") <= get("Depth2a"),
c("DepthDown") := get("Depth2a")]
splice.no_region[get("EventType") %in% c("MXE") &
get("Depth2b") > get("Depth2a"),
c("DepthDown") := get("Depth2b")]
splice.no_region[get("EventType") %in% c("MXE") &
get("Depth2b") <= get("Depth2a"),
c("DepthDown") := get("Depth2a")]
splice.no_region[, c("Depth") := 0]
splice.no_region[get("DepthUp") < get("count_JG_up"),
c("DepthUp") := get("count_JG_up")]
splice.no_region[get("DepthDown") < get("count_JG_down"),
c("DepthDown") := get("count_JG_down")]
splice.no_region[get("DepthUp") > get("DepthDown"),
c("Depth") := get("DepthUp")]
splice.no_region[get("DepthUp") <= get("DepthDown"),
c("Depth") := get("DepthDown")]
splice[, c("TotalDepth") := 0]
splice[sw, on = "EventRegion",
c("TotalDepth") := get("i.TotalDepth")]
splice[splice.no_region, on = "EventName",
c("TotalDepth") := get("i.Depth")]
return(splice)
}
# Compiles all the data as assays, write as temp FST files
# - This acts as "on-disk memory" to avoid using too much memory
.collateData_process_assays_as_fst <- function(rowEvent, junc_PSI,
sample, junc, sw, splice, IRMode, norm_output_path) {
assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
inc.todo <- c("Up_Inc", "Down_Inc")
exc.todo <- c("Up_Exc", "Down_Exc")
junc.todo <- c("junc_PSI", "junc_counts", "junc_counts_uns")
templates <- .collateData_seed_init(rowEvent, junc_PSI)
# Included / Excluded counts for IR and splicing
templates$assay[, c("Included") := c(
sw$IntronDepth,
0.5 * (splice$count_Event1a[splice$EventType %in% c("SE", "MXE")] +
splice$count_Event2a[splice$EventType %in% c("SE", "MXE")]),
splice$count_Event1a[!splice$EventType %in% c("SE", "MXE")]
)]
if (IRMode == "SpliceOver") {
templates$assay[, c("Excluded") := c(
sw$SpliceOver,
0.5 * (splice$count_Event1b[splice$EventType %in% c("MXE")] +
splice$count_Event2b[splice$EventType %in% c("MXE")]),
splice$count_Event1b[!splice$EventType %in% c("MXE")]
)]
} else {
templates$assay[, c("Excluded") := c(
sw$SpliceMax,
0.5 * (splice$count_Event1b[splice$EventType %in% c("MXE")] +
splice$count_Event2b[splice$EventType %in% c("MXE")]),
splice$count_Event1b[!splice$EventType %in% c("MXE")]
)]
}
# Validity checking for IR, MXE, SE
sw[get("strand") == "+", c("Up_Inc") := get("ExonToIntronReadsLeft")]
sw[get("strand") == "-", c("Up_Inc") := get("ExonToIntronReadsRight")]
sw[get("strand") == "+", c("Down_Inc") := get("ExonToIntronReadsRight")]
sw[get("strand") == "-", c("Down_Inc") := get("ExonToIntronReadsLeft")]
templates$inc[, c("Up_Inc") := c(sw$Up_Inc,
splice$count_Event1a[splice$EventType %in% c("MXE", "SE")],
splice$count_Event2a[splice$EventType %in% c("RI")]
)]
templates$inc[, c("Down_Inc") := c(sw$Down_Inc,
splice$count_Event2a[splice$EventType %in% c("MXE", "SE")],
splice$count_Event2b[splice$EventType %in% c("RI")]
)]
templates$exc[, c("Up_Exc") :=
splice$count_Event1b[splice$EventType %in% c("MXE")]]
templates$exc[, c("Down_Exc") :=
splice$count_Event2b[splice$EventType %in% c("MXE")]]
templates$assay[, c("Depth") := c(sw$TotalDepth, splice$TotalDepth)]
templates$assay[, c("Coverage") := c(sw$Coverage, splice$coverage)]
# copy RI's coverage from corresponding IR's coverage
templates$assay[get("EventType") == "RI",
c("Coverage") := sw$Coverage[match(get("EventRegion"), sw$EventRegion)]]
splice[get("EventType") %in% c("MXE", "SE") &
get("cov_up") < get("cov_down"),
c("minDepth") := get("count_JG_up")]
splice[get("EventType") %in% c("MXE", "SE") &
get("cov_up") >= get("cov_down"),
c("minDepth") := get("count_JG_down")]
splice[get("EventType") %in% c("ALE", "A3SS", "RI"),
c("minDepth") := get("count_JG_up")]
splice[get("EventType") %in% c("AFE", "A5SS"),
c("minDepth") := get("count_JG_down")]
templates$assay[, c("minDepth") := c(sw$IntronDepth, splice$minDepth)]
junc[get("count") == 0, c("PSI") := 0]
junc[get("SO_L") > get("SO_R"),
c("PSI") := get("count") / get("SO_L")]
junc[get("SO_R") >= get("SO_L") & get("SO_R") > 0,
c("PSI") := get("count") / get("SO_R")]
templates$junc <- junc[, c("seqnames", "start", "end", "strand",
"PSI", "count", "count_unstranded")]
colnames(templates$junc)[c(5,6,7)] <-
c("junc_PSI", "junc_counts", "junc_counts_uns")
fst::write.fst(as.data.frame(templates$assay[, assay.todo, with = FALSE]),
file.path(norm_output_path, "temp",
paste("assays", sample, "fst.tmp", sep = ".")))
fst::write.fst(as.data.frame(templates$inc[, inc.todo, with = FALSE]),
file.path(norm_output_path, "temp",
paste("included", sample, "fst.tmp", sep = ".")))
fst::write.fst(as.data.frame(templates$exc[, exc.todo, with = FALSE]),
file.path(norm_output_path, "temp",
paste("excluded", sample, "fst.tmp", sep = ".")))
fst::write.fst(as.data.frame(templates$junc[, junc.todo, with = FALSE]),
file.path(norm_output_path, "temp",
paste("junc_psi", sample, "fst.tmp", sep = ".")))
}
################################################################################
# Reads the "on-disk memory" FST files and compile to H5
.collateData_compile_assays_from_fst <- function(df.internal,
norm_output_path, samples_per_block) {
rowname_lists <-
.collateData_H5_initialize(nrow(df.internal), norm_output_path)
.collateData_H5_write_assays(df.internal, norm_output_path,
samples_per_block)
# Retrieve assays:
assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
inc.todo <- c("Up_Inc", "Down_Inc")
exc.todo <- c("Up_Exc", "Down_Exc")
junc.todo <- c("junc_PSI", "junc_counts", "junc_counts_uns")
stuff.todo <- c(assay.todo, inc.todo, exc.todo, junc.todo)
h5filename <- file.path(norm_output_path, "data.h5")
assays <- list()
for (assay in assay.todo) {
# h5writeDimnames(list(rowname_lists$EventName, df.internal$sample),
# h5filename, assay)
assays[[assay]] <- HDF5Array(h5filename, assay)
}
for (inc in inc.todo) {
# h5writeDimnames(list(rowname_lists$Inc_Events, df.internal$sample),
# h5filename, inc)
assays[[inc]] <- HDF5Array(h5filename, inc)
}
for (exc in exc.todo) {
# h5writeDimnames(list(rowname_lists$Exc_Events, df.internal$sample),
# h5filename, exc)
assays[[exc]] <- HDF5Array(h5filename, exc)
}
# if(length(rowname_lists$junc_rownames) > 1000000)
# .log(paste(
# "Please ignore any messages below about compression and chunking;",
# "The H5 file chunk size is already optimized."
# ), "message")
for (junc in junc.todo) {
# h5writeDimnames(list(rowname_lists$junc_rownames, df.internal$sample),
# h5filename, junc)
assays[[junc]] <- HDF5Array(h5filename, junc)
}
return(assays)
}
# Initializes H5 arrays using rowData; return rownames as a list
.collateData_H5_initialize <- function(
num_samples, norm_output_path
) {
assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
inc.todo <- c("Up_Inc", "Down_Inc")
exc.todo <- c("Up_Exc", "Down_Exc")
junc.todo <- c("junc_PSI", "junc_counts", "junc_counts_uns")
stuff.todo <- c(assay.todo, inc.todo, exc.todo, junc.todo)
rowData <- as.data.table(
read.fst(file.path(norm_output_path, "rowEvent.fst")))
Inc_Events <- rowData$EventName[rowData$EventType %in%
c("IR", "MXE", "SE", "RI")]
Exc_Events <- rowData$EventName[rowData$EventType %in% "MXE"]
junc_index <- fst::read.fst(file.path(
norm_output_path, "junc_PSI_index.fst"
))
junc_rownames <- with(junc_index,
paste0(seqnames, ":", start, "-", end, "/", strand))
h5filename <- file.path(norm_output_path, "data.h5")
if (file.exists(h5filename)) file.remove(h5filename)
h5createFile(h5filename)
for (assay in assay.todo) {
chunk_row <- nrow(rowData)
# 100 kb 1-column chunks
if(chunk_row > 1024 * 100 / 8) chunk_row <- 1024 * 100 / 8
h5createDataset(file = h5filename,
dataset = assay,
dims = c(nrow(rowData), num_samples),
storage.mode = "double",
chunk = c(chunk_row, 1), level = 6
)
}
for (inc in inc.todo) {
chunk_row <- length(Inc_Events)
# 100 kb 1-column chunks
if(chunk_row > 1024 * 100 / 8) chunk_row <- 1024 * 100 / 8
h5createDataset(file = h5filename,
dataset = inc,
dims = c(length(Inc_Events), num_samples),
storage.mode = "double",
chunk = c(chunk_row, 1), level = 6
)
}
for (exc in exc.todo) {
# 100 kb 1-column chunks
if(chunk_row > 1024 * 100 / 8) chunk_row <- 1024 * 100 / 8
chunk_row <- length(Exc_Events)
h5createDataset(file = h5filename, dataset = exc,
dims = c(length(Exc_Events), num_samples),
storage.mode = "double",
chunk = c(chunk_row, 1), level = 6
)
}
for (junc in junc.todo) {
# 100 kb 1-column chunks
if(chunk_row > 1024 * 100 / 8) chunk_row <- 1024 * 100 / 8
chunk_row <- length(junc_rownames)
h5createDataset(file = h5filename, dataset = junc,
dims = c(length(junc_rownames), num_samples),
storage.mode = "double",
chunk = c(chunk_row, 1), level = 6
)
}
rowname_lists <- list(
EventName = rowData$EventName,
Inc_Events = Inc_Events, Exc_Events = Exc_Events,
junc_rownames = junc_rownames
)
return(rowname_lists)
}
# Read temp FST files; add to H5 assays
.collateData_H5_write_assays <- function(
df.internal, norm_output_path, samples_per_block
) {
assay.todo <- c("Included", "Excluded", "Depth", "Coverage", "minDepth")
inc.todo <- c("Up_Inc", "Down_Inc")
exc.todo <- c("Up_Exc", "Down_Exc")
junc.todo <- c("junc_PSI", "junc_counts", "junc_counts_uns")
stuff.todo <- c(assay.todo, inc.todo, exc.todo, junc.todo)
h5filename <- file.path(norm_output_path, "data.h5")
# Make a nested list
matrices <- list()
for (stuff in stuff.todo) {
matrices[[stuff]] <- list()
}
buf_i <- 0 # Counts the number of in-memory samples
pb <- progress_bar$new(
format = " finalising H5 database [:bar] :percent eta: :eta",
total = nrow(df.internal), clear = FALSE, width= 100)
for (i in seq_len(nrow(df.internal))) {
sample <- df.internal$sample[i]
buf_i <- buf_i + 1
mat <- fst::read.fst(file.path(norm_output_path, "temp",
paste("assays", sample, "fst.tmp", sep = ".")))
for (stuff in assay.todo) {
matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
}
mat <- fst::read.fst(file.path(norm_output_path, "temp",
paste("included", sample, "fst.tmp", sep = ".")))
for (stuff in inc.todo) {
matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
}
mat <- fst::read.fst(file.path(norm_output_path, "temp",
paste("excluded", sample, "fst.tmp", sep = ".")))
for (stuff in exc.todo) {
matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
}
mat <- fst::read.fst(file.path(norm_output_path, "temp",
paste("junc_psi", sample, "fst.tmp", sep = ".")))
for (stuff in junc.todo) {
matrices[[stuff]][[buf_i]] <- mat[, stuff, drop = FALSE]
}
if (buf_i >= samples_per_block | i == nrow(df.internal)) {
for (stuff in stuff.todo) {
# write
mat <- as.matrix(do.call(cbind, matrices[[stuff]]))
mat[is.na(mat)] <- 0
h5write(mat,
file = h5filename, name = stuff,
index = list(NULL, seq(i - buf_i + 1, i))
)
# reset
matrices[[stuff]] <- list()
}
buf_i <- 0
gc()
}
pb$tick()
}
}
################################################################################
# Writes sample stats to FST
.collateData_write_stats <- function(df.internal, norm_output_path) {
outfile <- file.path(norm_output_path, "stats.fst")
# Convert path to relative path, for reproducibility:
df.internal$path <- .make_path_relative(df.internal$path, norm_output_path)
write.fst(as.data.frame(df.internal), outfile)
}
# Writes default colData to RDS
.collateData_write_colData <- function(
df.internal, coverage_files,
norm_output_path, copyCOV = FALSE
) {
if(!is.null(coverage_files)) {
suppressWarnings({
covfiles_full <- normalizePath(file.path(
norm_output_path, coverage_files))
})
# Create barebones colData.Rds - save coverage files as well
if (
length(coverage_files) == nrow(df.internal) &
all(file.exists(covfiles_full)) &&
isCOV(covfiles_full)
) {
if(copyCOV) {
dirCOV <- file.path(norm_output_path, "COV")
if(!dir.exists(dirCOV)) dir.create(dirCOV)
for(i in seq_len(length(covfiles_full))) {
file.copy(covfiles_full[i], dirCOV, overwrite = TRUE)
covfiles_full[i] <- file.path(dirCOV,
basename(covfiles_full[i]))
}
}
if(!all(file.exists(covfiles_full)))
.log(paste("Some files failed to copy over to", dirCOV))
coverage_files <- .make_path_relative(
covfiles_full, norm_output_path)
df.files <- data.table(
sample = df.internal$sample,
bam_file = "",
sw_file = df.internal$path,
cov_file = coverage_files
)
} else {
df.files <- data.table(
sample = df.internal$sample,
bam_file = "",
sw_file = df.internal$path,
cov_file = ""
)
}
} else {
df.files <- data.table(
sample = df.internal$sample,
bam_file = "",
sw_file = df.internal$path,
cov_file = ""
)
}
#
colData <- list(
df.files = df.files,
df.anno = data.table(sample = df.internal$sample)
)
saveRDS(colData, file.path(norm_output_path, "colData.Rds"))
}
# Loads the newly created H5 database so we can save it as a
# SummarizedExperiment for quick recall later
.collateData_initialise_HDF5 <- function(collate_path, colData, assays) {
item.todo <- c("rowEvent", "Included", "Excluded", "Depth", "Coverage",
"minDepth", "Up_Inc", "Down_Inc", "Up_Exc", "Down_Exc")
rowData <- read.fst(file.path(collate_path, "rowEvent.fst"))
# Convert to rds to save file space at small expense of increased load time
saveRDS(rowData, file.path(collate_path, "rowEvent.Rds"))
# To save space, only use EventName and EventType for now
rowData <- rowData[, c("EventName", "EventType")]
colData <- as.data.frame(colData)
colData_use <- colData[, -1, drop = FALSE]
rownames(colData_use) <- colData$sample
se <- SummarizedExperiment(
assays = SimpleList(
Included = assays[["Included"]],
Excluded = assays[["Excluded"]],
Depth = assays[["Depth"]],
Coverage = assays[["Coverage"]],
minDepth = assays[["minDepth"]]
), rowData = rowData, colData = colData_use
)
rownames(se) <- rowData(se)$EventName
metadata(se)$Up_Inc <- assays[["Up_Inc"]]
metadata(se)$Down_Inc <- assays[["Down_Inc"]]
rownames(metadata(se)$Up_Inc) <- rowData(se)$EventName[
rowData(se)$EventType %in% c("IR", "MXE", "SE", "RI")]
rownames(metadata(se)$Down_Inc) <- rowData(se)$EventName[
rowData(se)$EventType %in% c("IR", "MXE", "SE", "RI")]
metadata(se)$Up_Exc <- assays[["Up_Exc"]]
metadata(se)$Down_Exc <- assays[["Down_Exc"]]
rownames(metadata(se)$Up_Exc) <- rowData(se)$EventName[
rowData(se)$EventType %in% c("MXE")]
rownames(metadata(se)$Down_Exc) <- rowData(se)$EventName[
rowData(se)$EventType %in% c("MXE")]
se@metadata[["junc_PSI"]] <- assays[["junc_PSI"]]
se@metadata[["junc_counts"]] <- assays[["junc_counts"]]
se@metadata[["junc_counts_uns"]] <- assays[["junc_counts_uns"]]
junc_gr_df <- read.fst(file.path(collate_path, "annotation/Junc.fst"))
rownames(se@metadata[["junc_PSI"]]) <- junc_gr_df$Event
rownames(se@metadata[["junc_counts"]]) <- junc_gr_df$Event
rownames(se@metadata[["junc_counts_uns"]]) <- junc_gr_df$Event
se@metadata[["junc_gr"]] <- .grDT(junc_gr_df)
colnames(metadata(se)$Up_Inc) <- colData$sample
colnames(metadata(se)$Down_Inc) <- colData$sample
colnames(metadata(se)$Up_Exc) <- colData$sample
colnames(metadata(se)$Down_Exc) <- colData$sample
colnames(metadata(se)$junc_PSI) <- colData$sample
colnames(metadata(se)$junc_counts) <- colData$sample
colnames(metadata(se)$junc_counts_uns) <- colData$sample
colData.Rds <- readRDS(file.path(collate_path, "colData.Rds"))
if ("df.files" %in% names(colData.Rds) &&
"cov_file" %in% colnames(colData.Rds$df.files)) {
metadata(se)$cov_file <- colData.Rds$df.files$cov_file
}
stats.df <- fst::read.fst(file.path(collate_path, "stats.fst"))
metadata(se)$sampleQC <- stats.df[, -1, drop = FALSE]
rownames(metadata(se)$sampleQC) <- colnames(se)
metadata(se)$BuildVersion <- collateData_version
# Remove remaining columns to save more space in Rds
rowData(se)$EventType <- NULL
rowData(se)$EventName <- NULL
return(se)
}
.collateData_save_NxtSE <- function(se, filepath) {
se@assays <- .collateData_simplify_assay_paths(se@assays)
se@metadata[["Up_Inc"]] <- .collateData_simplify_assay_path(
se@metadata[["Up_Inc"]])
se@metadata[["Down_Inc"]] <- .collateData_simplify_assay_path(
se@metadata[["Down_Inc"]])
se@metadata[["Up_Exc"]] <- .collateData_simplify_assay_path(
se@metadata[["Up_Exc"]])
se@metadata[["Down_Exc"]] <- .collateData_simplify_assay_path(
se@metadata[["Down_Exc"]])
se@metadata[["junc_PSI"]] <- .collateData_simplify_assay_path(
se@metadata[["junc_PSI"]])
se@metadata[["junc_counts"]] <- .collateData_simplify_assay_path(
se@metadata[["junc_counts"]])
se@metadata[["junc_counts_uns"]] <- .collateData_simplify_assay_path(
se@metadata[["junc_counts_uns"]])
saveRDS(se, filepath)
}
# Internals for save_NxtSE; adapted from HDF5Array saveHDF5SummarizedExperiment
# - These are needed if the collateData folder has been moved since last access
# - allows relative paths to be used
# Saving a NxtSE object
# Add single assay
.collateData_simplify_assay_path <- function(assay) {
DelayedArray::modify_seeds(assay,
function(x) {
x@filepath <- basename(x@filepath)
x
}
)
}
# Add list of assays
.collateData_simplify_assay_paths <- function(assays) {
nassay <- length(assays)
for (i in seq_len(nassay)) {
a <- .collateData_simplify_assay_path(getListElement(assays, i))
assays <- setListElement(assays, i, a)
}
return(assays)
}
# Loading a NxtSE object
# Fix a single assay
# .collateData_expand_assay_path <- function(assay, path) {
# DelayedArray::modify_seeds(assay,
# function(x) {
# x@filepath <- file.path(path, x@filepath)
# x
# }
# )
# }
# Fix a list of assays
# .collateData_expand_assay_paths <- function(assays, path) {
# nassay <- length(assays)
# for (i in seq_len(nassay)) {
# a <- .collateData_expand_assay_path(getListElement(assays, i), path)
# assays <- setListElement(assays, i, a)
# }
# return(assays)
# }
# Internals - compile reference data from genome, for quick access
.prepare_covplot_data <- function(
reference_path, use_ref_path = reference_path) {
.validate_reference(reference_path)
genome <- Get_Genome(reference_path)
data <- list(
seqInfo = seqinfo(genome),
geneList = .getGeneList(use_ref_path),
elements = .loadViewRef(use_ref_path),
transcripts = .loadTranscripts(use_ref_path),
spliceList = .loadSpliceEvents(use_ref_path),
IRList = .loadIREvents(use_ref_path),
ontology = .loadOntology(reference_path)
)
return(data)
}
.loadViewRef <- function(reference_path) {
.validate_reference(reference_path)
dir_path <- file.path(reference_path, "fst")
fetchCols <- c("seqnames", "start", "end", "strand", "type",
"transcript_id", "exon_id", "exon_number")
fetchColsProt <- c("seqnames", "start", "end", "strand", "type",
"transcript_id", "protein_id", "ccds_id", "exon_number")
fetchColsIntron_A <- c("seqnames", "start", "end", "strand")
fetchColsIntron_B <- c("transcript_id", "intron_id", "intron_number")
exons.DT <- as.data.table(read.fst(file.path(dir_path, "Exons.fst"),
columns = fetchCols))
# exons.DT <- exons.DT[get("transcript_biotype") != "protein_coding"]
introns.DT <- as.data.table(read.fst(file.path(dir_path, "junctions.fst"),
columns = fetchColsIntron_A))
introns.DT$type <- "intron"
introns.DT <- cbind(introns.DT,
as.data.table(read.fst(file.path(dir_path, "junctions.fst"),
columns = fetchColsIntron_B)
)
)
if(file.exists(file.path(dir_path, "Proteins.fst"))) {
# peek columns in Proteins.fst
protein_test <- read.fst(file.path(dir_path, "Proteins.fst"), to = 1)
protCols <- colnames(protein_test)
fetchColsProt <- intersect(fetchColsProt, protCols)
protein.DT <- as.data.table(
read.fst(file.path(dir_path, "Proteins.fst"),
columns = fetchColsProt))
if(any(c("protein_id", "ccds_id") %in% fetchColsProt)) {
if("ccds_id" %in% fetchColsProt) {
setnames(protein.DT, "ccds_id", "exon_id")
if("protein_id" %in% fetchColsProt)
protein.DT$protein_id <- NULL
} else {
setnames(protein.DT, "protein_id", "exon_id")
}
misc.DT <- as.data.table(read.fst(file.path(dir_path, "Misc.fst"),
columns = fetchCols))
total.DT <- rbindlist(list(
exons.DT[, fetchCols, with = FALSE],
protein.DT[, fetchCols, with = FALSE],
misc.DT[, fetchCols, with = FALSE]
))
} else {
total.DT <- exons.DT
}
} else {
total.DT <- exons.DT
}
setnames(total.DT, c("exon_id", "exon_number"), c("feature_id", "aux_id"))
setnames(introns.DT, c("intron_id", "intron_number"),
c("feature_id", "aux_id"))
return(rbind(total.DT,introns.DT))
}
.getGeneList <- function(reference_path) {
.validate_reference(reference_path)
file_path <- file.path(reference_path, "fst", "Genes.fst")
if (!file.exists(file_path)) return(NULL)
df <- as.data.table(read.fst(file_path))
df$gene_display_name <- paste0(df$gene_name, " (", df$gene_id, ")")
return(df)
}
.loadTranscripts <- function(reference_path) {
.validate_reference(reference_path)
file_path <- file.path(reference_path, "fst", "Transcripts.fst")
Transcripts.DT <- as.data.table(read.fst(file_path))
if ("transcript_support_level" %in% colnames(Transcripts.DT)) {
Transcripts.DT$transcript_support_level <-
tstrsplit(Transcripts.DT$transcript_support_level, split = " ")[[1]]
Transcripts.DT$transcript_support_level[
is.na(Transcripts.DT$transcript_support_level)] <- "NA"
} else {
Transcripts.DT$transcript_support_level <- 1
}
return(Transcripts.DT)
}
.loadSpliceEvents <- function(reference_path) {
.validate_reference(reference_path)
file_path <- file.path(reference_path, "fst", "Splice.fst")
splice.DT <- as.data.table(read.fst(file_path, columns = c(
"EventName", "EventType", "EventRegion",
"Event1a", "Event1b", "Event2a", "Event2b"
)))
return(splice.DT)
}
.loadIREvents <- function(reference_path) {
.validate_reference(reference_path)
Dir_path <- file.path(reference_path, "fst", "Introns.Dir.fst")
ND_path <- file.path(reference_path, "fst", "Introns.ND.fst")
IR.Dir.DT <- as.data.table(read.fst(Dir_path, columns = c(
"EventName", "intron_id", "IRname"
)))
IR.ND.DT <- as.data.table(read.fst(ND_path, columns = c(
"EventName", "intron_id", "IRname"
)))
return(rbind(
IR.Dir.DT,
IR.ND.DT
))
}
.loadOntology <- function(reference_path) {
.validate_reference(reference_path)
file_path <- file.path(reference_path, "fst", "Ontology.fst")
if(!file.exists(file_path)) return(NULL)
return(as.data.table(read.fst(file_path)))
}
################################################################################
.collateData_cleanup <- function(collate_data) {
if (dir.exists(file.path(collate_data, "temp")))
unlink(file.path(collate_data, "temp"), recursive = TRUE)
if (dir.exists(file.path(collate_data, "annotation")))
unlink(file.path(collate_data, "annotation"), recursive = TRUE)
if (dir.exists(file.path(collate_data, "Reference")))
unlink(file.path(collate_data, "Reference"), recursive = TRUE)
unlink(file.path(collate_data, "junc_PSI_index.fst"))
unlink(file.path(collate_data, "rowEvent.brief.fst"))
unlink(file.path(collate_data, "rowEvent.fst"))
unlink(file.path(collate_data, "stats.fst"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.