R/plotting_mutation_patterns.R

Defines functions plot_mutations_in_flanked_sites get_mutations_in_flanked_sites

Documented in get_mutations_in_flanked_sites plot_mutations_in_flanked_sites

#' Mutation counts along relative positions
#'
#' Prepares data frame of mutations at relative positions along flanks and sites
#' @param mutations Data frame of mutations
#' @param this_sites Data frame of sites
#' @param window_size Integer indicating the half-width of sites and flanking regions (added to both sides of midpoint)
#' @param n_patients Integer indicating number of patients
#' 
#' @return Data frame containing the frequency of mutations across site and flank positions
#' \describe{
#'     \item{dist}{the relative position of each base to nearest site midpoint}
#'     \item{mut_count}{the number of overlapping mutations}
#'     \item{is_site}{boolean indicating if the position is within the site}
#'     \item{mut_freq}{frequency of mutations calculated as mut_count / n_patients / total bp * 1e6}
#' }
#' @export
get_mutations_in_flanked_sites = function(mutations, this_sites, window_size, n_patients) {
  
  sites_mid = floor((this_sites$start + this_sites$end) / 2)
  gr_sites_mid = GenomicRanges::GRanges(this_sites$chr, IRanges::IRanges(sites_mid, sites_mid))
  
  mutations_mid = floor((mutations$start + mutations$end) / 2)
  gr_maf_mid = GenomicRanges::GRanges(mutations$chr, IRanges::IRanges(mutations_mid, mutations_mid))
  
  ov_nearest = GenomicRanges::distanceToNearest(gr_maf_mid, gr_sites_mid)	
  
  nearest_muts_mid = mutations_mid[S4Vectors::queryHits(ov_nearest)]
  nearest_muts_chr = mutations[S4Vectors::queryHits(ov_nearest), "chr"]
  
  nearest_sites_mid = sites_mid[S4Vectors::subjectHits(ov_nearest)]
  nearest_sites_chr = this_sites[S4Vectors::subjectHits(ov_nearest), "chr"]
  
  dfr_dist = data.frame (
    muts_chr = nearest_muts_chr, muts_mid = nearest_muts_mid,
    sites_chr = nearest_sites_chr, sites_mid = nearest_sites_mid, 
    stringsAsFactors = FALSE	
  )
  
  dfr_dist$min_dist = dfr_dist$muts_mid - dfr_dist$sites_mid
  
  # sites on the same chromosome as muts; min distance has to be within windows and flanks
  dfr_dist = dfr_dist[
    dfr_dist$muts_chr == dfr_dist$sites_chr & 
      abs(dfr_dist$min_dist) <= 3 * window_size ,, drop = FALSE]
  
  dfr_freq = reshape2::dcast(dfr_dist, min_dist ~ . , value.var = "min_dist", 
                   fun.aggregate = length)
  colnames(dfr_freq) = c("dist", "mut_count")
  
  dfr_freq$is_site = FALSE
  dfr_freq[abs(dfr_freq$dist) <= window_size, "is_site"] = TRUE
  
  # normalising constant to get mutation frequency besides count
  gr_sites_flanks = GenomicRanges::GRanges(this_sites$chr, 
                                           IRanges::IRanges (sites_mid - 3 * window_size, sites_mid + 3 * window_size - 1))
  gr_sites_flanks = GenomicRanges::reduce(gr_sites_flanks)
  total_bp = sum(IRanges::width(gr_sites_flanks))
  total_bp_per_site_position = total_bp / (6 * window_size)
  
  dfr_freq$mut_freq = dfr_freq$mut_count / n_patients / (total_bp_per_site_position / 1e6)
  
  dfr_freq
}



#' Visualize mutations in sites and flanks
#'
#' Barplot of mutations along relative positions to site midpoints
#' @param dfr Data frame of mutation counts at relative positions along stacked sites/flanks (from get_mutations_in_flanked_sites)
#'\describe{
#'     \item{dist}{relative position of mutations}
#'     \item{mut_count}{number of mutations at relative position}
#'     \item{is_site}{boolean indicating whether the relative position is within the site}
#'     \item{mut_freq}{frequency of mutations, calculated as mut_count / n_patients / total bp * 1e6 by get_mutations_in_flanked_sites}
#' }
#' @param window_size Integer indicating the half-width of sites and flanking regions (added to both sides of midpoint)
#' @param what_plot Character indicating y-axis variable (default "mut_freq")
#' @param y_range Numeric y-axis limits (default NULL)
#'
#' @return ggplot object
#' @export
plot_mutations_in_flanked_sites = function(dfr, window_size=100, what_plot = "mut_freq", y_range=NULL) {
  
  dfr1 = dfr[, c(what_plot, "dist", "is_site")]
  colnames(dfr1) = c("value", "dist", "is_site")
  
  p = ggplot2::ggplot(dfr1, ggplot2::aes(dist, value, fill=is_site)) +
    ggplot2::geom_bar(stat="identity") +
    ggplot2::geom_smooth(formula = y ~ x, ggplot2::aes(dist, value), fill="blue", method="loess", span=1/3) +
    ggplot2::scale_x_continuous("Distance to site midpoint (bps)",
                       breaks=c(0, window_size, -window_size,
                                  2*window_size, -2*window_size,
                                  3*window_size, -3*window_size)) +
    ggplot2::scale_y_continuous(what_plot) +
    ggplot2::coord_cartesian(ylim = y_range) +
    ggplot2::scale_fill_manual("", labels=c("TRUE"="site", "FALSE"="flank"), values=c("TRUE"="darkred", "FALSE"="gray")) +
    ggplot2::theme_bw() +
    ggplot2::theme(panel.grid.minor = ggplot2::element_blank(), plot.title = ggplot2::element_text(size = 10),
          axis.text = ggplot2::element_text(colour = "black"))
  p
}
reimandlab/RM2 documentation built on Aug. 13, 2022, 12:22 p.m.