R/to_diff.R

Defines functions .get_diff to_diff

Documented in to_diff

#' Obtain the differences between transcript structure
#'
#' `to_diff()` obtains the difference between `exons` from a set of transcripts
#' to a reference transcript (`ref_exons`). This can be useful when visualizing
#' the differences between transcript structure. `to_diff()` expects two sets of
#' input exons; 1. `exons` - exons from any number of transcripts that will be
#' compared to `ref_exons` and 2. `ref_exons` - exons from a single transcript
#' which acts as the reference to compare against.
#'
#' @param exons `data.frame()` contains exons which can originate from multiple
#'   transcripts differentiated by `group_var`.
#' @param ref_exons `data.frame()` contains exons that originate from a single
#'   transcript, which `exons` will be compared against.
#' @param group_var `character()` if input data originates from more than 1
#'   transcript, `group_var` must specify the column that differentiates
#'   transcripts (e.g. "transcript_id").
#'
#' @return `data.frame()` details the differences between `exons` and
#'   `ref_exons`.
#'
#' @export
#' @examples
#'
#' library(magrittr)
#' library(ggplot2)
#'
#' # to illustrate the package's functionality
#' # ggtranscript includes example transcript annotation
#' sod1_annotation %>% head()
#'
#' # extract exons
#' sod1_exons <- sod1_annotation %>% dplyr::filter(type == "exon")
#' sod1_exons %>% head()
#'
#' # for this example, let's compare transcripts to the MANE-select transcript
#' sod1_mane <- sod1_exons %>% dplyr::filter(transcript_name == "SOD1-201")
#' sod1_not_mane <- sod1_exons %>% dplyr::filter(transcript_name != "SOD1-201")
#'
#' # to_diff() obtains the differences between the exons as ranges
#' sod1_diffs <- to_diff(
#'     exons = sod1_not_mane,
#'     ref_exons = sod1_mane,
#'     group_var = "transcript_name"
#' )
#'
#' sod1_diffs %>% head()
#'
#' # using geom_range(), it can be useful to visually overlay
#' # the differences on top of the transcript annotation
#' sod1_exons %>%
#'     ggplot(aes(
#'         xstart = start,
#'         xend = end,
#'         y = transcript_name
#'     )) +
#'     geom_range() +
#'     geom_intron(
#'         data = to_intron(sod1_exons, "transcript_name")
#'     ) +
#'     geom_range(
#'         data = sod1_diffs,
#'         ggplot2::aes(fill = diff_type),
#'         alpha = 0.2
#'     )
to_diff <- function(exons, ref_exons, group_var = NULL) {
    .check_coord_object(exons, check_seqnames = TRUE, check_strand = TRUE)
    .check_coord_object(ref_exons, check_seqnames = TRUE, check_strand = TRUE)
    .check_group_var(exons, group_var)

    # need to remember if group is NULL for downstream
    null_group <- is.null(group_var)

    # we have to create dummy group if there is no group for .get_diff
    if (null_group) {
        exons <- exons %>% dplyr::mutate(dummy_group = "A")
        group_var <- "dummy_group"
    }

    diffs <- .get_diff(exons, ref_exons, group_var)

    # remove the dummy_group if created
    if (null_group) diffs[[group_var]] <- NULL

    return(diffs)
}

#' The heavy lifting of `to_diff()` happens here.
#'
#' @keywords internal
#' @noRd
.get_diff <- function(exons, ref_exons, group_var) {
    groups <- exons[[group_var]] %>% unique()

    # needs to be a genomic range for downstream processing
    exons_gr <- GenomicRanges::GRanges(exons)
    ref_exons_gr <- GenomicRanges::GRanges(ref_exons)

    diffs <- vector("list", length = length(group_var))

    for (i in seq_along(groups)) {
        exons_gr_curr <- exons_gr %>%
            .[GenomicRanges::mcols(exons_gr)[[group_var]] == groups[i]]

        # get the disjoint pieces (flattening and breaking apart exons)
        disjoint_pieces <- GenomicRanges::disjoin(
            c(ref_exons_gr, exons_gr_curr)
        )

        # find whether the disjoint pieces overlap exons or ref_exons
        # those that only overlap 1 are the differences
        # TODO - perhaps allow modification of findOverlaps() via ... ?
        overlap_exons <- GenomicRanges::findOverlaps(
            disjoint_pieces, exons_gr_curr
        )
        overlap_ref_exons <- GenomicRanges::findOverlaps(
            disjoint_pieces, ref_exons_gr
        )

        # convert pieces back to data.frame and classify diffs
        # TODO - could improve efficiency by placing this step post-loop
        # i.e. manipulate the grs instead
        diff_curr <- disjoint_pieces %>%
            as.data.frame() %>%
            dplyr::mutate(
                index = dplyr::row_number(),
                type = "diff",
                in_exons = index %in% S4Vectors::queryHits(overlap_exons),
                in_ref_exons = index %in% S4Vectors::queryHits(overlap_ref_exons)
            ) %>%
            dplyr::mutate(
                diff_type = dplyr::case_when(
                    in_exons & in_ref_exons ~ "both",
                    in_exons & !in_ref_exons ~ "not_in_ref",
                    !in_exons & in_ref_exons ~ "in_ref"
                )
            )

        # add back in group info
        diff_curr[[group_var]] <- groups[i]

        # keep only diffs and necessary cols
        diffs[[i]] <-
            diff_curr %>%
            dplyr::filter(diff_type != "both") %>%
            dplyr::select(-in_exons, -in_ref_exons, -index)
    }

    diffs <- diffs %>% do.call(dplyr::bind_rows, .)

    return(diffs)
}
dzhang32/ggtranscript documentation built on Aug. 29, 2024, 2:43 a.m.