R/read_junction.R

Defines functions process_junction_table read_junction_table

Documented in process_junction_table read_junction_table

#' Read a junction TSV file created by Megadepth as a table
#'
#' Read an `*all_jxs.tsv` or `*jxs.tsv` file created by `bam_to_junctions()` or
#' manually by the user using Megadepth. The rows of a `*jxs.tsv` can have
#' either 7 or 14 columns, which can lead to warnings when reading in - these
#' are safe to ignore. For details on the format of the input TSV file, check
#' <https://github.com/ChristopherWilks/megadepth#junctions>.
#'
#' @param tsv_file A `character(1)` specifying the path to the tab-separated
#'   (TSV) file created manually using `megadepth_shell()` or on a previous
#'   `bam_to_junctions()` run.
#'
#' @importFrom readr read_delim
#' @return A `tibble::tibble()` with the junction data that follows the format
#'   specified at <https://github.com/ChristopherWilks/megadepth#junctions>.
#' @export
#'
#' @examples
#'
#' ## Install if necessary
#' install_megadepth()
#'
#' ## Find the example BAM file
#' example_bam <- system.file("tests", "test.bam",
#'     package = "megadepth", mustWork = TRUE
#' )
#'
#' ## Run bam_to_junctions()
#' example_jxs <- bam_to_junctions(example_bam, overwrite = TRUE)
#'
#' ## Read the junctions in as a tibble
#' all_jxs <- read_junction_table(example_jxs[["all_jxs.tsv"]])
#'
#' all_jxs
read_junction_table <- function(tsv_file) {
    # define the expected column names and types
    if (grepl("all_jxs.tsv", tsv_file)) {
        col_names <- c(
            "read_name",
            "chr",
            "start",
            "end",
            "mapping_strand",
            "cigar",
            "unique"
        )

        col_types <- c("ccddici")
    } else if (grepl("jxs.tsv", tsv_file)) {
        col_names <- c(
            "chr_id",
            "POS_field",
            "mapping_strand",
            "insert_length",
            "cigar",
            "junction_coords",
            "unique",
            "mate_ref_id",
            "mate_POS_field",
            "mate_mapping_strand",
            "mate_insert_length",
            "mate_cigar",
            "mate_junction_coords",
            "mate_unique"
        )

        col_types <- c("ciidcciciidcci")
    } else {
        stop('tsv_file must have the extension "all_jxs.tsv" or "jxs.tsv"')
    }

    # load in the junctions
    jxs <- readr::read_delim(
        tsv_file,
        delim = "\t",
        progress = FALSE,
        col_names = col_names,
        col_types = col_types
    )

    ## Translate the strand into the format used in Bioconductor
    for (i in seq_along(jxs)) {
        if (grepl("strand", colnames(jxs[i]))) {
            jxs[[i]] <- ifelse(jxs[[i]] == 0, "+", "-")
        }
    }

    return(jxs)
}

#' Process junctions into a STAR compatible format
#'
#' Parses the junctions outputted from `process_junction_table()` into an STAR
#' compatible format (SJ.out) for more convenient use in downstream analyses.
#' The columns strand, intron_motif and annotated will always be 0 (undefined)
#' but can be derived through extracting the dinucleotide motifs for the given
#' reference coordinates for canonical motifs. This function is an
#' R-implementation of the Megadepth helper script, on which further details of
#' column definitions can be found:
#' <https://github.com/ChristopherWilks/megadepth#junctions>.
#'
#' @param all_jxs A `tibble::tibble()` containing junction data ("all.jxs.tsv")
#'   generated by `bam_to_junctions(all_junctions = TRUE)` and imported through
#'   `megadepth::read_junction_table()`.
#'
#' @importFrom magrittr %>%
#' @importFrom dplyr arrange group_by summarise mutate select
#' @return Processed junctions in a STAR-compatible format.
#'
#' @export
#'
#' @examples
#'
#' ## Install if necessary
#' install_megadepth()
#'
#' ## Find the example BAM file
#' example_bam <- system.file("tests", "test.bam",
#'     package = "megadepth", mustWork = TRUE
#' )
#'
#' ## Run bam_to_junctions()
#' example_jxs <- bam_to_junctions(example_bam, overwrite = TRUE)
#'
#' ## Read the junctions in as a tibble
#' all_jxs <- read_junction_table(example_jxs[["all_jxs.tsv"]])
#'
#' ## Process junctions into a STAR-compatible format
#' processed_jxs <- process_junction_table(all_jxs)
#'
#' processed_jxs
process_junction_table <- function(all_jxs) {
    colnames_exp <- c(
        "read_name",
        "chr",
        "start",
        "end",
        "mapping_strand",
        "cigar",
        "unique"
    )

    colnames_act <- colnames(all_jxs)

    if (!identical(colnames_exp, colnames_act)) {
        stop(paste(
            "all_jxs argument should have colnames:",
            paste(colnames_exp, collapse = ", "),
            "\nHave junctions from a jxs.tsv been used instead of a all_jxs.tsv?"
        ))
    }

    processed_jxs <- all_jxs %>%
        dplyr::arrange(chr, start, end, read_name) %>%
        dplyr::distinct(chr, start, end, read_name, .keep_all = TRUE) %>%
        dplyr::group_by(chr, start, end) %>%
        dplyr::summarise(
            uniquely_mapping_reads = sum(unique),
            n_reads = dplyr::n(),
            .groups = "drop"
        ) %>%
        dplyr::mutate(
            multimapping_reads = n_reads - uniquely_mapping_reads,
            strand = 0,
            intron_motif = 0,
            annotated = 0
        ) %>%
        dplyr::select(
            chr,
            start,
            end,
            strand,
            intron_motif,
            annotated,
            uniquely_mapping_reads,
            multimapping_reads
        )

    return(processed_jxs)
}
LieberInstitute/megadepth documentation built on May 6, 2024, 7:12 p.m.