R/covDataObject-methods.R

Defines functions .gCD_retrieve_annotations .gCD_retrieve_norms .gCD_retrieve_junc .gCD_retrieve_raw_cov_helper .gCD_retrieve_raw_cov .gCD_retrieve_colData .gCD_determine_limits .gCD_determine_loci .gCD_validate_args_event .gCD_validate_args_loci .gCD_validate_args_coords .gCD_validate_args_se .gCD_validate_args getGenomeData getCoverageData covDataObject

Documented in getCoverageData getGenomeData

#' Container to hold raw data for SpliceWiz coverage plots
#'
#' This object is generated using getCoverageData or getGenomeData methods, and
#' is used as input for generating coverage plots.
#' @param se  A \linkS4class{NxtSE} object, created by [makeSE].
#'   COV files must be linked to the NxtSE object. To do this, see the example
#'   in [makeSE]. Required by `plotCoverage`. Not required by `plotGenome` if
#'   `reference_path` is supplied.
#' @param reference_path The path of the reference generated by
#'   [Build-Reference-methods]. Required by `plotGenome` if a 
#'   \linkS4class{NxtSE} object is not specified.
#' @param Event The `EventName` of the IR / alternative splicing event to be
#'   displayed. Use `rownames(se)` to display a list of valid events.
#' @param Gene Whether to use the range for the given `Gene`. If given,
#'   overrides `Event` (but `Event` or `norm_event` will be used to normalise by
#'  condition). Valid `Gene` entries include gene_id (Ensembl ID) or gene_name
#'  (Gene Symbol).
#' @param seqname,start,end The chromosome (string) and genomic `start/end`
#'   coordinates (numeric) of the region to display. If present, overrides both
#'   `Event` and `Gene`. E.g. for a given region of chr1:10000-11000,
#'   use the parameters: `seqname = "chr1", start = 10000, end = 11000`
#' @param coordinates A string specifying genomic coordinates can be given
#'   instead of `seqname,start,end`. Must be of the format "chr:start-end", e.g.
#'   "chr1:10000-11000"
#' @param strand Whether to show coverage of both strands "*" (default), or
#'   from the "+" or "-" strand only.
#' @param zoom_factor Zoom out from event. Each level of zoom zooms out by a
#'   factor of 3. E.g. for a query region of chr1:10000-11000, if a
#'   `zoom_factor` of 1.0 is given, chr1:99000-12000 will be displayed.
#' @param tracks The names of individual samples,
#'   or the names of the different conditions to be plotted. For the latter, set
#'   `condition` to the specified condition category.
#' @param condition To display normalised coverage per condition, set this to
#'   the condition category. If omitted, `tracks` are assumed to refer to the
#'   names of individual samples.
#' @param bases_flanking (Default = `100`) How many bases flanking the zoomed
#'    window. Useful when
#'    used in conjunction with zoom_factor == 0. E.g. for a given region of
#'    chr1:10000-11000, if `zoom_factor = 0` and `bases_flanking = 100`, the
#'    region chr1:9900-11100 will be displayed.
#' @param object For `plotAnnoTrack()`, the `covDataObject` created by
#'    `getCoverageData()` or `getGenomeData()`
#' @param view_start,view_end Start and end coordinates of plotting function.
#'    Note that plot coordinates may be different from retrieval coordinates
#'    and is useful for zooming in.
#' @param reverseGenomeCoords Whether the genomic axis should be reversed
#'    to make it more convenient to plot reverse-stranded genes
#' @param condensed (default `FALSE)
#'    Whether the genomic track should be condensed to plot whole
#'    genes, rather than transcripts. Preferred if multiple genes are plotted
#'    on a zoomed-out plot
#' @param selected_transcripts (default `""`) One or more transcript names or 
#'    ID's to be displayed on the annotation track.
#' @param plot_key_isoforms (default `FALSE`) If `TRUE`, plots only transcripts
#'    involved in the given splicing `Event`.
#' @param usePlotly (default `FALSE`)
#'    Whether to return a plotly or ggplot object.
#' @param ... Ignored / not used
#' @return 
#'   For getCoverageData(): A covDataObject containing required data used to
#'     generate downstream 
#'   For plotAnnoTrack(): A ggplot or plotly object
#' @examples
#' se <- SpliceWiz_example_NxtSE(novelSplicing = TRUE)
#'
#' # Assign annotation of the experimental conditions
#' colData(se)$treatment <- rep(c("A", "B"), each = 3)
#'
#' dataObj <- getCoverageData(
#'     se,
#'     Event = "SE:SRSF3-203-exon4;SRSF3-202-int3",
#'     tracks = colnames(se)
#' )
#'
#' # Show `EventName`s of supported splicing events 
#' # contained within covDataObject
#'
#' showEvents(dataObj)
#'
#' # A limited covDataObject containing only the reference can be generated
#' # from the SpliceWiz reference
#'
#' buildRef(
#'     reference_path = file.path(tempdir(), "Reference"),
#'     fasta = chrZ_genome(),
#'     gtf = chrZ_gtf()
#' )
#'
#' genomeObj <- getGenomeData(
#'     reference_path = file.path(tempdir(), "Reference"),
#'     Gene = "SRSF3"
#' )
#'
#' # Plot reference track directly from the covDataObject
#'
#' # NB: Event plotting is not supported for reference-derived `covDataObject`s
#' plotAnnoTrack(genomeObj) 
#'
#' plotAnnoTrack(dataObj, Event = "SE:SRSF3-203-exon4;SRSF3-202-int3") 
#'
#' @name covDataObject-class
#' @aliases
#' showEvents showEvents,covDataObject-method
#' @seealso [covPlotObject-class]
#' @md
NULL

covDataObject <- function(
        args = list(),
        annotation = list(),
        colData = data.frame(),
        covData = list(),
        juncData = list(),
        normData = list()        
) {
    newCovObj <- new("covDataObject",
        args = args,
        annotation = annotation,
        colData = colData,
        covData = covData, juncData = juncData,
        normData = normData
    )
    newCovObj
}

#' @describeIn covDataObject-class Returns the EventNames for which events can
#'   be normalized using the given covDataObject 
#' @export
setMethod("showEvents", c(object = "covDataObject"), function(
    object
) {
    return(object@normData$rowData$EventName)
})

#' @describeIn covDataObject-class Get coverage / genome data for plotting
#' coverage plots
#' @export
getCoverageData <- function(
        se,
# Info for determining genomic locus
        Event, Gene,
        seqname, start, end,
        coordinates,
        strand = c("*", "+", "-"),
# Amplifiers - for initial plot view start/end
        zoom_factor = 0.2, 
        bases_flanking = 100,
# Samples to retrieve
        tracks,
        condition,
        ...
) {
    # Validate mandatory arguments:
    if(!is(se, "NxtSE")) .log(paste(
        "In getCoverageData(),",
        "`se` must be a NxtSE object"
    ))
    
    args <- as.list(match.call())
    args[[1]] <- NULL
    args[["se"]] <- NULL
    for(argname in names(args)) {
        args[[argname]] <- eval(args[[argname]], parent.frame())
    }
    args[["se"]] <- se
    args[["cov_data"]] <- ref(se)
    
    args[["zoom_factor"]] <- zoom_factor
    args[["bases_flanking"]] <- bases_flanking
    
    # Interpret coordinates
    if (
        !all(c("seqname", "start", "end") %in% names(args)) &
        "coordinates" %in% names(args)
    ) {
        args[["seqname"]] <- tstrsplit(coordinates, 
            split = ":", fixed=TRUE)[[1]]
        rangetxt <- tstrsplit(coordinates, split = ":", fixed=TRUE)[[2]]
        rangetxt <- tstrsplit(rangetxt, split = "/", fixed=TRUE)[[1]]
        args[["start"]] <- as.numeric(tstrsplit(
            rangetxt, split = "-", fixed=TRUE)[[1]])
        args[["end"]] <- as.numeric(tstrsplit(
            rangetxt, split = "-", fixed=TRUE)[[2]])
    }
    strand <- match.arg(strand)
    if(!is_valid(strand)) strand <- "*"
    args[["strand"]] <- strand

    N_steps <- 5
    dash_progress("Validating arguments", N_steps)
    
    # Evaluate view_chr, view_start, view_end, and check arguments
    args <- .gCD_validate_args(args)

    # Retrieve colData
    colData <- .gCD_retrieve_colData(args)
    
    dash_progress("Retrieving ASE normalization values", N_steps)
    # Retrieve in-range events and corresponding normalization values
    normData <- .gCD_retrieve_norms(args, colData)

    dash_progress("Retrieving gene annotations", N_steps)
    # Retrieve genome annotations
    annotation <- .gCD_retrieve_annotations(args)
    
    dash_progress("Retrieving coverage data", N_steps)
    raw_cov <- .gCD_retrieve_raw_cov(args, colData)

    dash_progress("Retrieving junction counts", N_steps)
    juncData <- .gCD_retrieve_junc(args, colData)

    # Remove large objects prior to returning
    args[["se"]] <- args[["cov_data"]] <- NULL
    return(covDataObject(
        args = args,
        annotation = annotation,
        colData = colData,
        covData = raw_cov,
        juncData = juncData,
        normData = normData
    ))
}

#' @describeIn covDataObject-class Get coverage / genome data for plotting
#' coverage plots
#' @export
getGenomeData <- function(
        reference_path,
        Gene,
        seqname, start, end,
        coordinates,
        zoom_factor = 0.2, 
        bases_flanking = 100,
        ...
) {
    if (!file.exists(file.path(reference_path, "cov_data.Rds")))
        .log("Given reference_path is not a valid SpliceWiz reference")

    args <- as.list(match.call())
    args[[1]] <- NULL
    for(argname in names(args)) {
        args[[argname]] <- eval(args[[argname]], parent.frame())
    }
    args[["cov_data"]] <- readRDS(file.path(reference_path, "cov_data.Rds"))
    
    # Interpret coordinates
    args <- .gCD_validate_args_coords(args)

    args <- .gCD_validate_args(args)

    annotation <- .gCD_retrieve_annotations(args)
    
    args[["cov_data"]] <- NULL
    return(covDataObject(
        args = args,
        annotation = annotation
    ))
}

################################################################################

# Internals

.gCD_validate_args <- function(args) {
    if(!("reference_path" %in% names(args))) .gCD_validate_args_se(args)
    return(.gCD_validate_args_loci(args))
}

.gCD_validate_args_se <- function(args) {
    if (!("se" %in% names(args)) || !is(args[["se"]], "NxtSE"))
        .log("In getCoverageData, `se` must be a valid `NxtSE` object")

    # Warn if old NxtSE object
    if(!("row_gr" %in% names(metadata(args[["se"]]))))
        .log(paste(
            "In getCoverageData,", 
            "we recommend updating `se` to the latest version",
            "using update_NxtSE()"
        ), "warning")

    if (!isCOV(covfile(args[["se"]])))
        .log(paste("In getCoverageData,",
            "No valid COV files defined in se.",
            "Please supply the correct paths of the COV files",
            "using covfile(se) <- vector_of_correct_COVfile_paths"))

    # Check condition and tracks
    # - don't check number of tracks - irrelevant at this stage
    
    # - check condition is of length 1
    if("condition" %in% names(args)) {
        if(length(args[["condition"]]) != 1)
            .log(paste("In getCoverageData,", "condition must be of length 1"))
            
        if(!(args[["condition"]] %in% names(colData(args[["se"]]))))
            .log(paste("In getCoverageData,", 
                "condition must be a valid column name in colData(se)"))

        if(!("tracks" %in% names(args))) 
            .log(paste("In getCoverageData,", 
                "condition is defined but tracks is missing"))

        condition_options <- unique(
            colData(args[["se"]])[, args[["condition"]]]
        )
        if (!all(args[["tracks"]] %in% condition_options))
            .log(paste("In getCoverageData,",
                "some tracks do not match valid condition names in",
                args[["condition"]]))

    } else if("tracks" %in% names(args)) {
        if (!all(args[["tracks"]] %in% colnames(args[["se"]])))
            .log(paste("In getCoverageData,",
                "some tracks do not match valid sample names in se"))
    }
}

.gCD_validate_args_coords <- function(args) {
    # Interpret coordinates
    if (
        !all(c("seqname", "start", "end") %in% names(args)) &
        "coordinates" %in% names(args)
    ) {
        args[["seqname"]] <- tstrsplit(args[["coordinates"]], 
            split = ":", fixed=TRUE)[[1]]
        rangetxt <- tstrsplit(args[["coordinates"]], 
            split = ":", fixed=TRUE)[[2]]
        rangetxt <- tstrsplit(rangetxt, split = "/", fixed=TRUE)[[1]]
        args[["start"]] <- as.numeric(tstrsplit(
            rangetxt, split = "-", fixed=TRUE)[[1]])
        args[["end"]] <- as.numeric(tstrsplit(
            rangetxt, split = "-", fixed=TRUE)[[2]])
    }
    return(args)
}

# Checks Gene and loci. Only this is run if plotGenome is run
.gCD_validate_args_loci <- function(args) {
    if (!("se" %in% names(args)) || !is(args[["se"]], "NxtSE")) {
        if(!("cov_data" %in% names(args)))
            .log("In getGenomeData, `cov_data` is missing")
        cov_data <- args[["cov_data"]]
    } else {
        cov_data <- ref(args[["se"]])    
    }
    args[["seqInfo"]] <- cov_data$seqInfo
    # Check we know where to plot
    if (
        !any(c("Event", "Gene") %in% names(args)) &
        !all(c("seqname", "start", "end") %in% names(args))
    ) {
        .log(paste("In getCoverageData / getGenomeData,",
            "Event or Gene cannot be empty, unless coordinates are provided"))
    } else if (all(c("seqname", "start", "end") %in% names(args))) {
        args[["view_chr"]] <- as.character(args[["seqname"]])
        args[["view_start"]] <- args[["start"]]
        args[["view_end"]] <- args[["end"]]
    } else if ("Gene" %in% names(args)) {
        if (
            !(args[["Gene"]] %in% cov_data$geneList$gene_id) &
            !(args[["Gene"]] %in% cov_data$geneList$gene_name)
        ) {
            .log(paste("In getCoverageData / getGenomeData,",
                args[["Gene"]], "is not a valid gene symbol or gene_id"))
        }
        if (!(args[["Gene"]] %in% cov_data$geneList$gene_id)) {
            gene.df <- as.data.frame(
                cov_data$geneList[get("gene_name") == args[["Gene"]]])
            if (nrow(gene.df) != 1) {
                .log(paste("In getCoverageData / getGenomeData,",
                    args[["Gene"]],
                    "is an ambiguous name referring to 2 or more genes.",
                    "Please provide its gene_id instead"))
            }
        } else {
            gene.df <- as.data.frame(
                cov_data$geneList[get("gene_id") == args[["Gene"]]])
            if (nrow(gene.df) != 1) {
                .log(paste("In getCoverageData / getGenomeData,", 
                    args[["Gene"]],
                    "is an ambiguous gene_id referring to 2 or more genes."))
            }
        }
        args[["view_chr"]] <- as.character(gene.df$seqnames)
        args[["view_start"]] <- gene.df$start
        args[["view_end"]] <- gene.df$end
    } else if(!("Event" %in% names(args))) {
        .log(paste("In getCoverageData,",
            "Event must be given if no other loci information is given"))
    } else if(!("se" %in% names(args))){
        .log(paste("In getGenomeData,",
            "Event cannot be used to locate genomic loci"))
    } else {
        # Only "Event" given - check this at later step)
        return(.gCD_validate_args_event(args))
    }
    # determine params here - TODO
    return(.gCD_determine_loci(args))
}

# Checks whether Event given is valid.
.gCD_validate_args_event <- function(args) {
    cov_data <- ref(args[["se"]])
    if (!(args[["Event"]] %in% rownames(args[["se"]])))
        .log(paste("In getCoverageData,", args[["Event"]],
            "is not a valid IR or alternate splicing event in rowData(se)"))
    
    rowData <- rowData(args[["se"]])[args[["Event"]], ]
    if(nrow(rowData) > 1) rowData <- rowData[1,]
    
    args[["view_chr"]] <- tstrsplit(rowData$EventRegion, split = ":")[[1]]
    temp1 <- tstrsplit(rowData$EventRegion, split = "/")
    temp2 <- tstrsplit(temp1[[1]], split = ":")[[2]]
    args[["view_start"]] <- as.numeric(tstrsplit(temp2, split = "-")[[1]])
    args[["view_end"]] <- as.numeric(tstrsplit(temp2, split = "-")[[2]])
    
    return(.gCD_determine_loci(args))
}

.gCD_determine_loci <- function(args) {
    if (!(args[["view_chr"]] %in% names(args[["seqInfo"]])))
        .log(paste("In getCoverageData / getGenomeData,", 
            args[["view_chr"]],
            "is not a valid chromosome reference name in the given genome"))

    if ("bases_flanking" %in% names(args)) {
        if(
            !is.numeric(args[["bases_flanking"]]) || 
            args[["bases_flanking"]] < 0
        ) .log(paste("In getCoverageData / getGenomeData,",
            "bases_flanking must be a non-negative number"))
    } else {
        args[["bases_flanking"]] <- 0
    }

    view_length <- args[["view_end"]] - args[["view_start"]]
    if (!is.numeric(view_length) || view_length < 0)
        .log(paste("In getCoverageData / getGenomeData,",
            "view_length must be a non-negative number"))

    if (("zoom_factor" %in% names(args))) {
        tryCatch({
            args[["zoom_factor"]] <- as.numeric(args[["zoom_factor"]])
        }, error = function(e) {
            args[["zoom_factor"]] <- 0
        })
    } else {
        args[["zoom_factor"]] <- 0
    }
    
    view_center <- (args[["view_start"]] + args[["view_end"]]) / 2
    new_view_length <- view_length * 3^args[["zoom_factor"]] + 
        2 * args[["bases_flanking"]]
    
    view_start <- round(view_center - new_view_length / 2)
    view_end <- round(view_center + new_view_length / 2)
    # Validate genomic window and shift if invalid
    if (view_start < 1) view_start <- 1
    seqInfo <- args[["seqInfo"]][args[["view_chr"]]]
    seqmax <- GenomeInfoDb::seqlengths(seqInfo)
    if(seqmax < 3) .log(paste(
        "In getCoverageData / getGenomeData",
        "chromosome", args[["view_chr"]], "has sequence length of", seqmax,
        "which is not allowed"
    ))
    if (view_end > seqmax) {
        view_end <- seqmax - 1
        view_start <- view_end - new_view_length
        if(view_start < 1) view_start <- 1
    }
    args[["view_start"]] <- view_start
    args[["view_end"]] <- view_end
    
    args <- .gCD_determine_limits(args)
    
    return(args)    
}

.gCD_determine_limits <- function(args) {
    # Get fetch limits of coverage and annotations, based on a 5-fold window
    # of view start/end

    view_start <- args[["view_start"]]
    view_end <- args[["view_end"]]
    
    seqInfo <- args[["seqInfo"]][args[["view_chr"]]]
    seqmax <- GenomeInfoDb::seqlengths(seqInfo)
    
    limit_start <- view_start - 2 * (view_end - view_start)
    limit_end <- view_end + 2 * (view_end - view_start)
    
    limit_span <- limit_end - limit_start 
    
    if(limit_span > seqmax - 1) limit_span <- seqmax - 2
    
    if(limit_start < 1) {
        limit_start <- 1
        limit_end <- limit_start + limit_span
    }
    
    if (limit_end > seqmax) {
        limit_end <- seqmax - 1
        limit_start <- limit_end - limit_span
        if(limit_start < 1) limit_start <- 1
    }
    
    args[["limit_start"]] <- limit_start
    args[["limit_end"]] <- limit_end
    return(args)
}

################################################################################

# Retrieve relevant colData
.gCD_retrieve_colData <- function(args) {

    if(!("tracks" %in% names(args))) {
        return(data.frame())
    }
    tracks <- args[["tracks"]]
    colData <- as.data.frame(colData(args[["se"]]))
    sampleNames <- colnames(args[["se"]])
    
    if("condition" %in% names(args)) {
        cond <- args[["condition"]]
        condStates <- colData[, cond]
        samples <- sampleNames[condStates %in% tracks]
        colData <- colData[samples %in% sampleNames, cond, drop = FALSE]
    } else {
        colData <- colData[args[["tracks"]] %in% sampleNames, , drop = FALSE]
    }
    return(colData)
}

################################################################################

# Retrieve relevant raw coverage
.gCD_retrieve_raw_cov <- function(args, colData) {
    samples <- rownames(colData)
    covfiles <- covfile(args[["se"]])[samples]
    
    session <- shiny::getDefaultReactiveDomain()
    if(!is.null(session)) {
        shiny::withProgress(message = "Reading COV files", {
            raw_cov <- .gCD_retrieve_raw_cov_helper(args, covfiles, samples)
        })
    } else {
        raw_cov <- .gCD_retrieve_raw_cov_helper(args, covfiles, samples)
    }
    return(raw_cov)
}

.gCD_retrieve_raw_cov_helper <- function(args, covfiles, samples) {
    pos <- list()
    neg <- list()
    uns <- list()
    for(i in seq_len(length(samples))) {
        dash_progress("Reading COV files", length(samples))
        pos[[i]] <- getCoverage(
            file = covfiles[i], 
            seqname = args[["view_chr"]], 
            start = args[["limit_start"]] - 1, end = args[["limit_end"]], 
            strand = c("+")
        )
        neg[[i]] <- getCoverage(
            file = covfiles[i], 
            seqname = args[["view_chr"]], 
            start = args[["limit_start"]] - 1, end = args[["limit_end"]], 
            strand = c("-")
        )
        uns[[i]] <- getCoverage(
            file = covfiles[i], 
            seqname = args[["view_chr"]], 
            start = args[["limit_start"]] - 1, end = args[["limit_end"]], 
            strand = c("*")
        )
    }
    names(pos) <- names(neg) <- names(uns) <- samples
    return(list(
        pos = pos,
        neg = neg,
        uns = uns
    ))
}


# Retrieve relevant raw coverage
.gCD_retrieve_junc <- function(args, colData) {

    samples <- rownames(colData)
    
    # Retrieve a list of in-range events
    view_gr <- GRanges(
        seqnames = args[["view_chr"]],
        ranges = IRanges(
            start = args[["limit_start"]], end = args[["limit_end"]]
        ),
        strand = "*"
    )

    OL <- findOverlaps(junc_gr(args[["se"]]), view_gr)

    junc_counts <- junc_counts(args[["se"]])[
        unique(from(OL)),
        samples
    ]
    junc_counts_uns <- junc_counts_uns(args[["se"]])[
        unique(from(OL)),
        samples
    ]
    junc_PSI <- junc_PSI(args[["se"]])[
        unique(from(OL)),
        samples
    ]

    return(list(
        junc_gr = junc_gr(args[["se"]])[unique(from(OL))],
        junc_counts = as.matrix(junc_counts),
        junc_counts_uns = as.matrix(junc_counts_uns),
        junc_PSI = as.matrix(junc_PSI)
    ))
}


################################################################################

# Retrieve in-range events and corresponding normalization values

.gCD_retrieve_norms <- function(args, colData) {

    # Retrieve a list of in-range events
    limit_gr <- GRanges(
        seqnames = args[["view_chr"]],
        ranges = IRanges(
            start = args[["limit_start"]], end = args[["limit_end"]]
        ),
        strand = "*"
    )

    OL <- findOverlaps(row_gr(args[["se"]]), limit_gr, type = "within")
    
    event_gr <- row_gr(args[["se"]])[unique(from(OL))]
    names(event_gr) <- rownames(args[["se"]])[unique(from(OL))]
    
    norms <- as.matrix(assay(args[["se"]], "Depth")[
        names(event_gr),
        rownames(colData),
        drop = FALSE
    ])

    rowData <- rowData(args[["se"]])[unique(from(OL)), ]
    rowData <- as.data.frame(rowData)

    # Get strand data
    sampleStrand <- sampleQC(args[["se"]][, rownames(colData)])$strand
    names(sampleStrand) <- rownames(colData)
    
    return(list(
        event_gr = event_gr,
        norms = norms,
        rowData = rowData,
        sampleStrand = sampleStrand
    ))
}

################################################################################

# Retrieve relevant annotations from cov_data

.gCD_retrieve_annotations <- function(args) {
    view_chr <- args[["view_chr"]]
    view_start <- args[["limit_start"]]
    view_end <- args[["limit_end"]]

    transcripts.DT <- args[["cov_data"]]$transcripts[
        get("seqnames") == view_chr &
        get("start") <= view_end + (view_end - view_start) &
        get("end") >= view_start - (view_end - view_start)
    ]
    setorderv(transcripts.DT, c("transcript_support_level", "width"))

    reduced.DT <- args[["cov_data"]]$elements[
        get("transcript_id") %in% transcripts.DT$transcript_id]
    reduced.DT <- reduced.DT[get("type") %in% c("CDS", "exon", "intron")]

    # Transfer feature_id from exons -> CDS
    CDS.DT <- reduced.DT[get("type") == "CDS"]
    if(nrow(CDS.DT) > 0) {
        exons.DT <- reduced.DT[get("type") == "exon"]
        introns.DT <- reduced.DT[get("type") == "intron"]

        exons.gr <- .grDT(exons.DT)
        CDS.gr <- .grDT(CDS.DT)
        OL <- findOverlaps(exons.gr, CDS.gr)
        
        OL.DT <- data.table(
            from = OL@from, to = OL@to
        )
        OL.DT[, c("feature_id", "exon_trid", "cds_trid") := list(
            exons.DT$feature_id[get("from")],
            exons.DT$transcript_id[get("from")],
            CDS.DT$transcript_id[get("to")]
        )]
        OL.DT <- OL.DT[get("exon_trid") == get("cds_trid")]
        CDS.DT$feature_id[OL.DT$to] <- OL.DT$feature_id
        
        reduced.DT <- rbind(exons.DT, CDS.DT, introns.DT)
    }

    transcripts.DT <- transcripts.DT[
        get("transcript_id") %in% reduced.DT$transcript_id]

    return(list(
        transcripts.DT = transcripts.DT,
        reduced.DT = reduced.DT
    ))
}
alexchwong/SpliceWiz documentation built on March 17, 2024, 3:16 a.m.