factR: Functional Annotation of Custom Transcriptomes

Documented in subsetNewTranscripts

#' Shortlist GTF GRanges object for new transcripts
#'
#' @description
#' `subsetNewTranscripts()` will retain transcripts in `query` that are distinct
#' from those in `ref`
#'
#' @details
#' `subsetNewTranscripts()` will compare query and reference GTF GRanges and 
#' return query transcripts with different exon structures from reference 
#' transcripts. Transcriptome assemblers may sometime extend 5' and 3' ends 
#' of known transcripts based on experimental data. These annotated transcripts 
#' can be removed by inputting "intron" to the refine.by argument. This will 
#' further compare and remove transcripts of identical intron structures. 
#' Alternatively, transcripts with unique CDS coordinates can be selected by 
#' typing "cds" to the refine.by argument.
#'
#'
#' @param query
#' GRanges object containing query GTF data.
#' @param ref
#' GRanges object containing reference GTF data.
#' @param refine.by
#' Whether to refine the selection process by removing query transcripts with
#'  similar introns or CDS structure to reference. Default input is "none", and 
#'  can be changed to "intron" or "cds" respectively.
#'
#' @return
#' Filtered GRanges GTF object
#'
#' @export
#'
#' @author Fursham Hamid
#' @examples
#' # Load dataset
#' data(matched_query_gtf, ref_gtf)
#' 
#' # shortlist new transcripts
#' subsetNewTranscripts(matched_query_gtf, ref_gtf)
subsetNewTranscripts <- function(query, ref, 
                                 refine.by = c("none", "intron", "cds")) {

    # catch missing args
    mandargs <- c("query", "ref")
    passed <- names(as.list(match.call())[-1])
    if (any(!mandargs %in% passed)) {
        rlang::abort(paste(
            "missing values for",
            paste(setdiff(mandargs, passed), collapse = ", ")
        ))
    }

    # retrieve input object names
    argnames <- as.character(match.call())[-1]

    # carry out input checks
    .checkinputs(query, ref, argnames)

    # run subsetting and return new transcripts
    return(.subsetTranscripts(query, ref, refine.by[1]))
}

.checkinputs <- function(query, ref, argnames) {
    # check inputs are gtf
    if (any(!is_gtf(query, ref))) {
        rlang::abort(sprintf(
            "%s is/are not gtf GRanges",
            paste(argnames[seq_len(2)][!is_gtf(query, ref)], collapse = ",")
        ))
    }


    # catch unmatched seqlevels
    if (!has_consistentSeqlevels(query, ref, verbose = FALSE)) {
        rlang::abort(sprintf(
            "`%s` and `%s` has unmatched seqlevel styles. 
Try running: %s <- matchChromosomes(%s, %s)",
            argnames[1], argnames[2], argnames[1], argnames[1], argnames[2]
        ))
    } else {
        outmsg <- has_consistentSeqlevels(query, ref)
    }
}

.subsetTranscripts <- function(query, ref, by) {

    # Filter transcripts by identical exon structure:
    # create exons by transcripts
    rlang::inform("Removing transcripts with exact exon coordinates")
    query_exons <- S4Vectors::split(query[query$type == "exon"], ~transcript_id)
    ref_exons <- S4Vectors::split(ref[ref$type == "exon"], ~transcript_id)

    # search for exact query and ref matches
    fulloverlap <- GenomicRanges::findOverlaps(query_exons, ref_exons,
        type = "equal", select = "first"
    )

    # return transcripts that are not found in reference
    query <- query[!query$transcript_id %in% 
                       names(query_exons)[!is.na(fulloverlap)]]

    # Refine list By identical introns:
    if (by == "intron") {
        rlang::inform("Removing transcripts with exact intron coordinates")
        # create exons by transcripts
        query_exons <- S4Vectors::split(
            query[query$type == "exon"], ~transcript_id)
        ref_exons <- S4Vectors::split(ref[ref$type == "exon"], ~transcript_id)

        # convert exon coord to intron coord
        query_introns <- GenomicRanges::psetdiff(
            BiocGenerics::unlist(range(query_exons)), query_exons)
        ref_introns <- GenomicRanges::psetdiff(
            BiocGenerics::unlist(range(ref_exons)), ref_exons)

        # search for exact query and ref matches
        fulloverlap_intron <- GenomicRanges::findOverlaps(query_introns, 
            ref_introns, type = "equal", select = "first")
        # return transcripts that are not found in reference
        query <- query[!query$transcript_id %in% 
                           names(query_exons)[!is.na(fulloverlap_intron)]]
    }

    # Refine list By identical CDS:
    else if (by == "cds") {
        rlang::inform("Removing transcripts with exact CDS coordinates")
        # create CDS by transcripts
        query_CDS <- S4Vectors::split(query[query$type == "CDS"], 
                                      ~transcript_id)
        ref_CDS <- S4Vectors::split(ref[ref$type == "CDS"], ~transcript_id)

        # search for exact query and ref matches
        fullCDSoverlap <- GenomicRanges::findOverlaps(query_CDS, ref_CDS,
            type = "equal", select = "first"
        )

        # return coding transcripts that are not found in reference
        query <- query[query$transcript_id %in% names(query_CDS)]
        query <- query[!query$transcript_id %in% 
                           names(query_CDS)[!is.na(fullCDSoverlap)]]
    }
    return(query)
}