TSRexploreR: TSS and TSR Analysis

Documented in plot_genomic_distribution

#' Genomic Distribution
#'
#' @description
#' Get genomic distributions of TSSs and TSRs.
#'
#' @inheritParams common_params
#' @param data_type Whether to get distribution of TSSs ('tss') or TSRs ('tsr').
#' @param ... Arguments passed to geom_col.
#'
#' @details
#' This plotting function will create a stacked barplot of the proportion of TSSs
#' or TSRs containing within several types of genomic feature: exons, introns, intergenic, 
#' downstream, antisense, and promoter regions. The promoter region is user-defined during annotation.
#'
#' A set of functions to control data structure for plotting are included. 'use_normalized' 
#' will use  normalized scores, which only matters if 'consider_score' is TRUE.
#' 'threshold' defines the minimum number of raw counts a TSS or TSR must have to be 
#' considered. dominant' specifies whether only the dominant TSS or TSR (determined
#' using the 'mark_dominant' function) is considered. For TSSs, this can be either 
#' dominant TSS per TSR or gene/transcript, and for TSRs it is the dominant TSR 
#' per gene/transcript. 'data_conditions' can be used to filter, quantile, order, 
#' and/or group data for plotting.
#'
#' If 'return_table' is TRUE, a data.frame containing the underlying data
#'   for the plot is returned.
#'
#' @return ggplot2 plot with TSS or TSR genomic distribution.
#'   If 'return_table' is TRUE returns a data.frame of underlying stats.
#'
#' @seealso \code{\link{annotate_features}} to annotate TSSs or TSRs.
#'
#' @examples
#' data(TSSs_reduced)
#' annotation <- system.file("extdata", "S288C_Annotation.gtf", package="TSRexploreR")
#'
#' exp <- TSSs_reduced %>%
#'   tsr_explorer(genome_annotation=annotation) %>%
#'   format_counts(data_type="tss") %>%
#'   annotate_features(data_type="tss")
#'
#' p <- plot_genomic_distribution(exp, data_type="tss")
#'
#' @export 

plot_genomic_distribution <- function(
  experiment,
  data_type=c("tss", "tsr", "shift"),
  samples="all",
  threshold=NULL,
  use_normalized=FALSE,
  dominant=FALSE,
  data_conditions=NULL,
  return_table=FALSE,
  ...
) {

  ## Check inputs.
  assert_that(is(experiment, "tsr_explorer"))
  data_type <- match.arg(str_to_lower(data_type), c("tss", "tsr", "shift"))
  assert_that(is.character(samples))
  assert_that(is.null(threshold) || (is.numeric(threshold) && threshold >= 0))
  assert_that(is.flag(dominant))
  assert_that(is.null(data_conditions) || is.list(data_conditions))
  assert_that(is.flag(return_table))

  ## Get samples.
  selected_samples <- experiment %>%
    extract_counts(data_type, samples, use_normalized) %>%
    preliminary_filter(dominant, threshold)

  walk(selected_samples, function(x) {
    x[, simple_annotations := factor(
      simple_annotations,
      levels=c("Promoter", "Exon", "Intron", "Downstream", "Intergenic", "Antisense")
    )]
  })

  ## Apply advanced grouping.
  selected_samples <- condition_data(selected_samples, data_conditions)

  ## Calculate distribution.
  selected_samples <- rbindlist(selected_samples, idcol="sample")

  grouping_status <- case_when(
    !is.null(data_conditions$quantiling) ~ "row_quantile",
    !is.null(data_conditions$grouping) ~ "row_groups",
    TRUE ~ "none"
  )

  genomic_dist <- .calculate_distribution(selected_samples, grouping_status)

  ## Order samples if required.
  if (!all(samples == "all")) {
    genomic_dist[, sample := factor(sample, levels=rev(samples))]
  }

  ## Return table is requested.
  if (return_table) return(as.data.frame(genomic_dist))

  ## Plot the genomic distribution.
  p <- ggplot(genomic_dist, aes(x=.data$sample, y=.data$count, fill=fct_rev(.data$simple_annotations))) +
    geom_col(position="fill", ...) +
    coord_flip() +
    ylab("Fraction") +
    theme_bw() +
    theme(
      axis.title.y=element_blank(),
      panel.grid=element_blank()
    )

  if (grouping_status != "none") {
    p <- p + facet_grid(grouping ~ .)
  }

  return(p)

}

## Calculate Genomic Distribution
##
## @param selected_samples Samples to analyze.
## @param grouping_status Whether data is grouped.

.calculate_distribution <- function(selected_samples, grouping_status) {

  if (grouping_status != "none") {
    setnames(selected_samples, old=grouping_status, new="grouping")
    genomic_distribution <- selected_samples[,
      .(count=.N),
      by=.(sample, simple_annotations, grouping)
    ][,
      .(simple_annotations, count, fraction=count / sum(count)),
      by=.(sample, grouping)
    ]
  } else {
    genomic_distribution <- selected_samples[,
      .(count=.N),
      by=.(sample, simple_annotations)
    ][,
      .(simple_annotations, count, fraction=count / sum(count)),
      by=sample
    ]
  }

  return(genomic_distribution)

}