R/plot_coverage.R

Defines functions plot_coverage get_coverage .change_chr_format

Documented in get_coverage plot_coverage

#' Plots the coverage over a region of interest
#'
#' \code{plot_coverage}
#'
#' @param coverage_case Either bw path(s) or dataframe containing coverage
#'   returned from \code{get_coverage}
#' @param coverage_ctrl Either bw path(s) or dataframe containing coverage
#'   returned from \code{get_coverage}
#' @param region_to_plot gr. A genomic range of length 1 used to grab coverage (required if coverage is a bw_path) and to determine the xlimits to plot. If NULL, will use take
#'   the min and the max position from the coverage from case (and ctrls).
#' @param binwidth int.
#'
#' @return tibble with the columns detailing chromosome, position 1 for every base in region_of_interest and normalised coverage
#'
#' @export
plot_coverage <- function(coverage_case, coverage_ctrl = NULL, region_to_plot = NULL, binwidth = 50){

  # if you have inputted a bw, then load the coverage
  # these will load the coverage based on the region_to_plot
  if(!is.data.frame(coverage_case)){

    coverage_case <- get_coverage(bw_paths = coverage_case,
                                  region_of_interest = region_to_plot)

  }

  if(!is.null(coverage_ctrl)){

    if(!is.data.frame(coverage_ctrl)){

      coverage_ctrl <- get_coverage(bw_paths = coverage_ctrl,
                                    region_of_interest = region_to_plot)

    }

    coverage_to_plot <-
    coverage_case %>%
      dplyr::mutate(case_control = "case") %>%
      dplyr::bind_rows(coverage_ctrl %>% dplyr::mutate(case_control = "ctrl"))

  }else{

    coverage_to_plot <-
      coverage_case %>%
      dplyr::mutate(case_control = "case")

  }

  # take the start and end points from plotting from the region to plot or from coverage_to_plot
  if(!is.null(region_to_plot)){

    xmin <- region_to_plot %>% GenomicRanges::start() %>% min()
    xmax <- region_to_plot %>% GenomicRanges::end() %>% max()

  }else{

    xmin <- coverage_to_plot$position %>% min()
    xmax <- coverage_to_plot$position %>% max()

  }

  # if the chromosome has a "chr" remove this for plotting
  chromosome <- coverage_to_plot$chromosome %>%
    unique() %>%
    stringr::str_replace("chr", "")

  coverage_plot <-
    ggplot() +
    stat_summary_bin(data = coverage_to_plot,
                     aes(x = position,
                         y = coverage,
                         colour = case_control,
                         fill = case_control),
                     fun.y = mean,
                     geom = "area",
                     binwidth = binwidth, alpha = 0.2,
                     show.legend = ifelse(is.null(coverage_ctrl), F, T)) +
    scale_x_continuous(name = stringr::str_c("Chromosome ", coverage_to_plot$chromosome %>%
                                               unique() %>%
                                               stringr::str_replace("chr", "")),
                       limits = c(xmin, xmax)) +
    scale_y_continuous(name = "Coverage") +
    scale_colour_manual(name = "", values = ggpubr::get_palette("jco", 2)) +
    scale_fill_manual(name = "", values = ggpubr::get_palette("jco", 2)) +
    ggpubr::theme_pubclean(flip = T) +
    theme(axis.line = element_line(colour = "black"),
          axis.ticks.x = element_blank())

  return(coverage_plot)

}

#' Get coverage from bigwig file(s) across a region of interest
#'
#' \code{get_coverage}
#'
#' @param bw_paths chr. bigwig file path(s) from which to obtain coverage
#' @param region_of_interest gr. A genomic range of length 1. If NULL, will use
#'   the gene start and end.
#' @param normalise lgl. if TRUE will normalise using the coverage over reduced exons of the gene.
#' @param sum_func func. Any function to summarise numeric values. Defaults to
#'   mean.
#' @param gene_name chr. Gene symbol, will use to obtain region_of_interest and
#'   normalise to reduced exons definitions.
#' @param ref gr. gtf annotation imported through \code{rtracklayer}.
#' @param change_chr lgl. Add "chr" to start of seqlevels to match the bw format?
#' @param ... additional arguments to pass into the \code{sum_func}
#'
#' @return tibble with the columns detailing chromosome, position (1 for every
#'   base in region_of_interest) and normalised coverage
#'
#' @export
get_coverage <- function(bw_paths, region_of_interest = NULL, normalise = F, sum_func = mean, gene_name = NULL, ref = NULL, change_chr = F, ...){

  # if user has not put in region, take the genic bounderies
  if(is.null(region_of_interest)){

    if(is.null(gene_name) | is.null(ref)){

      stop("No region of interest inputted, so need both gene name and ref...")

    }

    all_exons <-
      ref[ref$type == "exon" & ref$gene_name == gene_name]

    chromosome_to_plot <- all_exons %>% GenomicRanges::seqnames() %>% as.vector() %>% unique()
    min_start_coord <- GenomicRanges::start(all_exons)%>% min()
    max_end_coord <- GenomicRanges::end(all_exons) %>% max()

    region_of_interest <- GenomicRanges::GRanges(str_c(chromosome_to_plot, ":", min_start_coord, "-", max_end_coord))

  }

  # add chr to seqlevels to match bigwig format
  if(change_chr){

    region_of_interest <- .change_chr_format(region_of_interest)

  }

  if(normalise){

    reduced_exons <-
      ref[ref$type == "exon" & ref$gene_name == gene_name] %>%
      GenomicRanges::reduce()

    if(change_chr) reduced_exons <- .change_chr_format(reduced_exons)

  }

  # generate matrix of columns as samples, rows as positions
  # containing the normalised coverage
  cov_matrix <- matrix(nrow = GenomicRanges::width(region_of_interest),
                       ncol = length(bw_paths))

  for(i in 1:length(bw_paths)){

    print(stringr::str_c(Sys.time(), " - getting",
                         ifelse(normalise, " normalised ", " "),
                         "coverage for sample ", i, "/", length(bw_paths)))

    bw_path <- bw_paths[i]

    region_of_interest_cov <-
      rtracklayer::import(con = bw_path,
                        which = region_of_interest,
                        as = "NumericList") %>%
      unlist()

    if(normalise){

      reduced_exons_sum_cov <-
        rtracklayer::import(con = bw_path,
                            which = reduced_exons,
                            as = "NumericList") %>%
        unlist() %>%
        sum()

      cov_matrix[, i] <- region_of_interest_cov/reduced_exons_sum_cov

    }else{

      cov_matrix[, i] <- region_of_interest_cov

    }

  }

  print(stringr::str_c(Sys.time(), " - summarising coverage..."))

  cov_summarised <-
    dplyr::tibble(chromosome = chromosome_to_plot,
                  position = GenomicRanges::start(region_of_interest):GenomicRanges::end(region_of_interest),
                  coverage = apply(cov_matrix, MARGIN = 1, FUN = sum_func, ...))

  return(cov_summarised)

}

.change_chr_format <- function(x){

  if(stringr::str_detect(GenomeInfoDb::seqlevels(x)[1], "chr")){

    GenomeInfoDb::seqlevels(x) <- GenomeInfoDb::seqlevels(x) %>%
      stringr::str_replace("chr", "")

  }else{

    GenomeInfoDb::seqlevels(x) <- GenomeInfoDb::seqlevels(x) %>%
      stringr::str_c("chr", .)

  }

  return(x)

}
dzhang32/RNAdiagnosR documentation built on Dec. 5, 2019, 2 a.m.