R/plot_agg_regions.R

Defines functions plot_agg_regions.avg_over_bins plot_agg_regions.unnest_with_anno plot_agg_regions.process_methy_data plot_agg_regions.get_methy_data plot_agg_regions

Documented in plot_agg_regions

#' Plot aggregate regions
#'
#' @param x the NanoMethResult object.
#' @param regions a table of regions containing at least columns chr, strand,
#'   start and end. Any additional columns can be used for grouping.
#' @param binary_threshold the modification probability such that calls with
#'   modification probability above the threshold are considered methylated, and
#'   those with probability equal or below are considered unmethylated.
#' @param group_col the column to group aggregated trends by. This column can
#'   be in from the regions table or samples(x).
#' @param flank the number of flanking bases to add to each side of each region.
#' @param stranded TRUE if negative strand features should have coordinates
#'   flipped to reflect features like transcription start sites.
#' @param span the span for loess smoothing.
#' @param palette the ggplot colour palette used for groups.
#'
#' @return a ggplot object containing the aggregate methylation trend.
#'
#' @examples
#' nmr <- load_example_nanomethresult()
#' gene_anno <- exons_to_genes(NanoMethViz::exons(nmr))
#' plot_agg_regions(nmr, gene_anno)
#' plot_agg_regions(nmr, gene_anno, group_col = "sample")
#' plot_agg_regions(nmr, gene_anno, group_col = "group")
#'
#' @export
plot_agg_regions <- function(
    x,
    regions,
    binary_threshold = 0.5,
    group_col = NULL,
    flank = 2000,
    stranded = TRUE,
    span = 0.05,
    palette = ggplot2::scale_colour_brewer(palette = "Set1")
) {
    if (!is.null(group_col)) {
        avail_columns <- c(colnames(samples(x)), colnames(regions))
        assertthat::assert_that(
            group_col %in% avail_columns,
            msg = glue::glue("'{group_col}' could not be found in columns of 'regions' or samples(x)")
        )
    }

    # grouped regions crashes downstream operations
    regions <- ungroup(regions)

    # query methylation data
    methy_data <- plot_agg_regions.get_methy_data(x, regions, flank)

    # process each methy_data list element to obtain methylation proportions at relative positions
    methy_data <- plot_agg_regions.process_methy_data(methy_data, stranded, flank, binary_threshold)

    # unnest methy_data and add sample annotation
    methy_data <- plot_agg_regions.unnest_with_anno(methy_data, NanoMethViz::samples(x))

    # take the average methylation proportion over equal sized positional bins
    methy_data <- plot_agg_regions.avg_over_bins(methy_data, group_col = group_col)

    # change aes spec depending on if group_col is available
    # hacky fix to the deprecation of aes_string which handled a NULL group_col
    # value
    if (!is.null(group_col)) {
        aes_spec <- ggplot2::aes(
                x = .data$binned_pos,
                y = .data$methy_prop,
                group = .data[[group_col]],
                col = .data[[group_col]])
    } else {
        aes_spec <- ggplot2::aes(
                x = .data$binned_pos,
                y = .data$methy_prop)
    }

    # set up plot
    p <- ggplot2::ggplot() +
        ggplot2::ylim(c(0, 1)) +
        ggplot2::theme_bw() +
        ggplot2::stat_smooth(
            aes_spec,
            method = "loess",
            formula = "y ~ x",
            span = span,
            na.rm = TRUE,
            se = FALSE,
            data = methy_data) +
        palette

    # if flank is 0, then then only annotated start and end, else annotate flanks as well
    if (flank == 0) {
        labels <- c("start", "end")
        breaks <- c(0, 1)
        limits <- c(0, 1)
    } else {
        breaks <- c(-.33, 0, 1, 1.33)
        limits <- c(-0.33, 1.33)
        kb_marker <- round(flank / 1000, 1)
        labels <- c(glue::glue("-{kb_marker}kb"), "start", "end", glue::glue("+{kb_marker}kb"))
        p <- p +
            ggplot2::geom_vline(xintercept = 0, linetype = "dashed", color = "grey80") +
            ggplot2::geom_vline(xintercept = 1, linetype = "dashed", color = "grey80")
    }

    region_widths <- regions$end - regions$start
    width_avg <- round(mean(region_widths))
    width_std_dev <- round(sd(region_widths), 2)

    p + ggplot2::coord_cartesian(clip = "off") +
        ggplot2::scale_x_continuous(
            name = glue::glue("Relative Position (Avg.Width = {width_avg}, Std.Dev = {width_std_dev})"),
            breaks = breaks,
            limits = limits,
            labels = labels) +
        ggplot2::ylab("Average Methylation Proportion")
}

plot_agg_regions.get_methy_data <- function(x, regions, flank) {
    query_row_methy <- function(i, methy, regions, flank) {
        # query a larger region such that smoothing doesn't
        # misbehave around ends
        flank <- flank * 1.1
        query_methy(
            methy,
            regions$chr[i],
            regions$start[i] - flank,
            regions$end[i] + flank,
            force = TRUE)
    }

    methy_data <- purrr::map(
        seq_len(nrow(regions)),
        query_row_methy,
        methy = x,
        regions = regions,
        flank = flank
    )

    regions %>%
        mutate(methy_data = methy_data)
}

plot_agg_regions.process_methy_data <- function(methy_data, stranded, flank, binary_threshold) {

    remove_empty_methy_data <- function(x) {
        x %>%
            dplyr::filter(purrr::map_lgl(.data$methy_data, ~nrow(.x) > 0))
    }

    rescale_positions <- function(x, stranded, flank) {
        # for each region, scale the position values to be between 0 and 1
        for (i in seq_len(nrow(x))) {
            # determine if the positions fall inside the region
            m_data <- x$methy_data[[i]]$pos
            within <- m_data >= x$start[i] & m_data <= x$end[i]
            upstream <- m_data < x$start[i]
            downstream <- m_data > x$end[i]

            # rescale the positions depending on the region
            rel_pos <- numeric(length(m_data))
            rel_pos[within] <- (m_data[within] - x$start[i]) / (x$end[i] - x$start[i])

            # flanks each take up 1/3 of the plot space
            rel_pos[upstream] <- (m_data[upstream] - x$start[i]) / flank / 3
            rel_pos[downstream] <- 1 + ((m_data[downstream] - x$end[i]) / flank / 3)

            # flip positions for negative strand if stranded is TRUE
            if (stranded && length(x$strand[i]) > 0 && x$strand[i] == "-") {
                rel_pos <- 1 - rel_pos
            }

            # store rescaled positions
            x$methy_data[[i]]$rel_pos <- rel_pos
        }

        x
    }

    binarise_statistics <- function(x, binary_threshold) {
        binarise_statistc_in_df <- function(x, binary_threshold) {
            x %>%
                dplyr::mutate(is_methylated = e1071::sigmoid(.data$statistic) > binary_threshold)
        }

        x %>%
            dplyr::mutate(
                methy_data = purrr::map(
                    .data$methy_data,
                    binarise_statistc_in_df, binary_threshold = binary_threshold
                )
            )
    }

    average_binarised_methy_per_pos <- function(x) {
        avg_methy_by_rel_pos <- function(x) {
            x %>%
                dplyr::group_by(.data$sample, .data$chr, .data$strand, .data$rel_pos) %>%
                dplyr::summarise(
                    methy_prop = mean(.data$is_methylated),
                    .groups = "drop")
        }

        x %>%
            dplyr::mutate(
                methy_data = purrr::map(.data$methy_data, avg_methy_by_rel_pos)
            )
    }

    methy_data %>%
        remove_empty_methy_data() %>%
        rescale_positions(stranded, flank) %>%
        binarise_statistics(binary_threshold) %>%
        average_binarised_methy_per_pos()
}


plot_agg_regions.unnest_with_anno <- function(x, samples_anno) {
    x %>%
        dplyr::select(!dplyr::any_of(c("chr", "strand", "start", "end"))) %>%
        tidyr::unnest("methy_data") %>%
        dplyr::inner_join(samples_anno, by = "sample", multiple = "all")
}

plot_agg_regions.avg_over_bins <- function(x, group_col, grid_size = 2^10) {
    min <- -1.1 / 3
    max <- 1 + 1.1 / 3
    binned_pos <- seq(min, max, length.out = grid_size + 1)
    binned_pos_df <- data.frame(
        binned_pos = binned_pos[-1],
        interval = cut(binned_pos, breaks = binned_pos)[-1]
    )

    x %>%
        dplyr::mutate(interval = cut(.data$rel_pos, breaks = binned_pos)) %>%
        dplyr::inner_join(binned_pos_df, by = "interval", multiple = "all") %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(group_col)), binned_pos) %>%
        dplyr::summarise(methy_prop = mean(.data$methy_prop), .groups = "drop")
}
Shians/NanoMethViz documentation built on June 8, 2024, 10:48 p.m.