R/DSBs_per_gene_length.R

Defines functions DSBs_per_gene_length

Documented in DSBs_per_gene_length

#' A gene length bin frequency bar plot generator
#'
#' A function that allows you to plot a bar plot of the number of gene hits for each bin of gene length
#' @param samples a list of data frames of annotated peaks
#' @param plot_dir the directory in which you want the plots, default = "plots"
#' @keywords frequencies
#' @import tidyverse gridExtra
#' @export
#' @examples
#' DSBs_per_gene_length(samples, plot_dir)
DSBs_per_gene_length <- function(samples, plot_dir = "plots"){
  count <- 0
  dir.create(file.path(plot_dir), showWarnings = FALSE)
  samples_filtered_ordered <- lapply(samples, function(condition){
    condition %>%
      filter(str_detect(GenomicAnnotation, "Promoter.*|.*Exon.*|Downstream.*")) %>%
      arrange(desc(PeakPvalue))
  })
  pl_gene_length <- lapply(samples_filtered_ordered, function(condition){
    count <<- count + 1
    gene_length <- condition %>%
      select(c("GeneName", "GeneLength"))
    very_short_gene_dsbs <- gene_length %>% filter(GeneLength <= 100)
    short_gene_dsbs <- gene_length %>% filter(GeneLength <= 1000, GeneLength >= 100)
    medium_gene_dsbs <- gene_length %>% filter(GeneLength <= 10000, GeneLength >= 1000)
    long_gene_dsbs <- gene_length %>% filter(GeneLength <= 100000, GeneLength >= 10000)
    very_long_gene_dsbs <- gene_length %>% filter(GeneLength <= 500000, GeneLength >= 100000)
    huge_genes_dsbs <- gene_length %>% filter(GeneLength >= 500000)
    length_window_x_number_of_genes <- data.frame("length_window" = c("1bp-100bp", "100bp-1'000bp", "1'000bp-10'000bp", "10'000bp-100'000bp", "100'000bp-500'000bp", "more_than_500'000bp"),
                                                  "number_of_genes" = c(nrow(very_short_gene_dsbs), nrow(short_gene_dsbs), nrow(medium_gene_dsbs), nrow(long_gene_dsbs), nrow(very_long_gene_dsbs), nrow(huge_genes_dsbs)))
    length_window_x_number_of_genes$length_window <- factor(length_window_x_number_of_genes$length_window, levels = length_window_x_number_of_genes$length_window)
    ggplot(length_window_x_number_of_genes, aes(x=length_window, y=number_of_genes)) +
      geom_col(col = pal_npg()(length(samples))[count], fill = pal_npg("nrc", alpha = 0.8)(length(samples))[count]) +
      ggtitle(names(samples_filtered_ordered)[count]) +
      theme(axis.text = element_text( size = rel(0.4), angle = 25))
  })
  pdf(file.path(plot_dir, "peaks_x_length_window_plot.pdf"))
  do.call(grid.arrange, c(pl_gene_length, nrow = round(length(samples_filtered_ordered)/2)))
  dev.off()

}
riccardo-trozzo/BlissR documentation built on Aug. 1, 2020, 12:23 a.m.