R/percentage_regions.R

Defines functions region_psite

Documented in region_psite

#' Percentage of P-sites per transcript region.
#' 
#' This function computes the percentage of P-sites falling in the three
#' annotated regions of the transcripts (5' UTR, CDS and 3' UTR) and generates a
#' bar plot of the resulting values. Multiple samples and replicates can be
#' handled.
#' 
#' @param data Either list of data tables or GRangesList object from
#'   \code{\link{psite_info}}.
#' @param annotation Data table as generated by \code{\link{create_annotation}}.
#' @param sample Either character string, character string vector or named list
#'   of character string(s)/character string vector(s) specifying the name of
#'   the sample(s) and replicate(s) of interest. If a list is provided, each
#'   element of the list is considered as an independent sample associated with
#'   one ore multiple replicates. Multiple samples and replicates are handled
#'   and visualised according to \code{multisamples} and \code{plot_style}.
#' @param multisamples Either "average" or "independent". It specifies how to
#'   handle multiple samples and replicates stored in \code{sample}:
#'   * if \code{sample} is a character string vector and \code{multisamples} is
#'   set to "average" the elements of the vector are considered as replicates
#'   of one sample and a single bar plot is returned.
#'   * if \code{sample} is a character string vector and \code{multisamples} is
#'   set to "independent", each element of the vector is analysed independently
#'   of the others.
#'   * if \code{sample} is a list, \code{multisamples} must be set to "average".
#'   Each element of the list is analysed independently of the others, its
#'   replicates averaged and its name reported in the plot.
#'   Note: when this parameter is set to "average" the bar plot associated with
#'   each sample displays the region-specific mean signal computed across the
#'   replicates and, if \code{plot_style} is set to "dodge", the corresponding
#'   standard error is also reported. Default is "average".
#' @param plot_style Either "stack" or "dodge". It specifies how to organize the
#'   bars associated with the three regions of the transcript:
#'   * "stack": bars are placed one on top of the other.
#'   * "dodge": bars are placed one next to the other. In this case, the
#'   standard error obtained by merging multiple samples (if any, see
#'   \code{sample} and \code{multisamples}) is displayed. Default is "stack".
#' @param transcripts Character string vector listing the name of transcripts to
#'   be included in the analysis. Default is NULL, i.e. all transcripts are
#'   used. Please note: transcripts without annotated 5' UTR, CDS and 3' UTR are
#'   automatically discarded.
#' @param length_range Integer or integer vector for restricting the analysis to
#'   a chosen range of read lengths. Default is NULL, i.e. all read lengths are
#'   used. If specified, this parameter prevails over \code{cl}.
#' @param cl Integer value in [1,100] specifying a confidence level for
#'   restricting the plot to an automatically-defined range of read lengths. The
#'   new range is computed according to the most frequent read lengths, which
#'   accounts for the cl% of the sample and is defined by discarding the
#'   (100-cl)% of read lengths falling in the tails of the read lengths
#'   distribution. If multiple samples are analysed, a single range of read
#'   lengths is computed such that at least the cl% of all samples is
#'   represented. Default is 100.
#' @param colour Character string vector of three elements specifying the colour
#'   of the bar associated with the 5' UTR, CDS and 3' UTR, respectively.
#'   Default is a grayscale.
#' @details In the plot, "RNAs" reflects the expected read distribution from
#'   random fragmentation of all transcripts used in the analysis. It can be
#'   used as baseline to asses the enrichment of ribosomes (P-sites) mapping on
#'   the CDS with respect to the UTRs. The three bars are based on the
#'   cumulative nucleotide length of the 5' UTRs, CDSs and 3' UTRs,
#'   respectively, expressed as percentages.
#' @return List containing: one ggplot object(s) and the data table with the
#'   corresponding x-, y-axis values and the z-values, defining the color of the
#'   bars ("plot_dt"); an additional data table with raw and scaled number of
#'   P-sites per frame for each sample ("count_dt").
#' @examples
#' ## data(reads_list)
#' ## data(mm81cdna)
#' ##
#' ## ## Generate fake samples and replicates
#' ## for(i in 2:6){
#' ##   samp_name <- paste0("Samp", i)
#' ##   set.seed(i)
#' ##   reads_list[[samp_name]] <- reads_list[["Samp1"]][sample(.N, 5000)]
#' ## }
#' ##
#' ## ## Compute and add p-site details
#' ## psite_offset <- psite(reads_list, flanking = 6, extremity = "auto")
#' ## reads_psite_list <- psite_info(reads_list, psite_offset)
#' ##
#' ## ## Define the list of samples and replicate to use as input
#' ## input_samples <- list("S1" = c("Samp1", "Samp2"),
#' ##                       "S2" = c("Samp3", "Samp4", "Samp5"),
#' ##                       "S3" = c("Samp6"))
#' ##
#' ## Generate bar plot
#' ## example_psite_per_region <- region_psite(reads_psite_list, mm81cdna,
#' ##                                          sample = input_samples,
#' ##                                          multisamples = "average",
#' ##                                          plot_style = "stack",
#' ##                                          cl = 85,
#' ##                                          colour = c("#333f50", "gray70", "#39827c")) 
#' @import data.table
#' @import ggplot2
#' @export
region_psite <- function(data, annotation, sample, multisamples = "average",
                         plot_style = "stack", transcripts = NULL,
                         length_range = NULL, cl = 100, 
                         colour = c("gray70", "gray40", "gray10")) {
  
  if(class(data[[1]])[1] == "GRanges"){
    data_tmp <- list()
    for(i in names(data)){
      data_tmp[[i]] <- as.data.table(data[[i]])[, c("width", "strand") := NULL
                                                ][, seqnames := as.character(seqnames)]
      setnames(data_tmp[[i]], c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
    }
    data <- data_tmp
  }

  check_sample <- setdiff(unlist(sample), names(data))
  if(length(check_sample) != 0){
    cat("\n")
    stop(sprintf("incorrect sample name(s): \"%s\" not found\n\n",
                 paste(check_sample, collapse = ", ")))
  }

  if(length(sample) == 0){
    cat("\n")
    stop("at least one sample name must be spcified\n\n")
  }

  if(!(multisamples %in% c("average", "independent"))){
    cat("\n")
    warning("parameter multisamples must be either \"average\" or \"independent\".
            Set to default \"average\"\n", call. = FALSE)
    multisamples <- "average"
  }

  if(multisamples == "independent" & is.list(sample)) {
    cat("\n")
    warning("parameter multisamples is set to \"independent\" but parameter sample is a list:
            parameter multisamples will be coerced to default \"average\"\n", call. = FALSE)
    multisamples <- "average"
  }
  
  if(is.character(sample) & length(sample) == 1) {
    multisamples <- "independent"
  }

  if(is.character(sample) & length(sample) > 1 & multisamples == "average") {
    sample <- list("Average" = sample)
    cat("\n")
    warning("Default name of averaged samples is \"Average\":
            consider to use a named list of one element to provide a meaningful plot title\n", call. = FALSE)
  }
  
  if(!(plot_style %in% c("stack", "dodge"))){
    cat("\n")
    warning("parameter plot_style must be either \"stack\" or \"dodge\".
            Set to default \"stack\"\n", call. = FALSE)
    plot_style <- "stack"
  }
  
  if(length(length_range) != 0 & !inherits(length_range, "numeric") & !inherits(length_range, "integer")){
    cat("\n")
    warning("class of length_range is neither numeric nor integer. Set to default NULL\n", call. = FALSE)
    length_range = NULL
  }
  
  # select transcripts
  l_transcripts <- as.character(annotation[l_utr5 > 0 & l_cds > 0 &
                                             l_cds %% 3 == 0 & l_utr3 > 0,
                                           transcript])
  
  if (length(transcripts) == 0) {
    c_transcripts <- l_transcripts
  } else {
    c_transcripts <- intersect(l_transcripts, transcripts)
  }
  
  #define length range taking into account all (unlisted) samples
  if(length(length_range) == 0){
    for(samp in as.character(unlist(sample))){
      dt <- data[[samp]][transcript %in% c_transcripts]
      
      if(length(length_range) == 0){
        length_range <- seq(quantile(dt$length, (1 - cl/100)/2),
                            quantile(dt$length, 1 - (1 - cl/100)/2))
      } else {
        xmin <- min(min(length_range), quantile(dt$length, (1 - cl/100)/2))
        xmax <- max(max(length_range), quantile(dt$length, 1 - (1 - cl/100)/2))
        length_range <- seq(xmin, xmax)
      }
    }
  }
  
  xmin = min(length_range)
  xmax = max(length_range)

  # check if all samples have reads of the specified lengths
  # especially required if only one read length is passed
  if(length(length_range) != 0){
    if(is.list(sample)){
      samp_dt <- data.table(stack(sample))
      setnames(samp_dt, c("sample", "sample_l"))
    } else {
      samp_dt <- data.table("sample" = sample, "sample_l" = sample)
    }
    
    for(samp in samp_dt$sample){
      
      dt <- data[[samp]][transcript %in% c_transcripts]
      
      len_check <- unique(dt$length)
      if(sum(length_range %in% len_check) == 0) {
        cat("\n")
        warning(sprintf("\"%s\" doesn't contain any reads of the selected lengths: sample removed\n", samp), call. = FALSE)
        #select element of sample which include the sample to be removed (useful if sample is a list)
        sel_l_samp <- samp_dt[sample == samp, sample_l]
        #remove the sample from the list/vector
        if(is.list(samp)){
          sample[[sel_l_samp]] <- sample[[sel_l_samp]][sample[[sel_l_samp]] != samp]
        } else {
          sample <- sample[sample != samp]
        }
      }
    }
  }
  
  if(is.null(unlist(sample))){
    cat("\n")
    stop("none of the data tables listed in sample contains any reads of the specified lengths\n\n")
  }
  
  final_dt <- data.table()
  for(samp in as.character(unlist(sample))) {
    
    region_dt <- data[[samp]][as.character(transcript) %in% c_transcripts
                             ][, list(count = .N), by = .(region = psite_region)
                               ][, scaled_count := (count / sum(count)) * 100
                                 ][, tmp_samp := samp]
    
    final_dt <- rbind(final_dt, region_dt)
  }
  
  final_dt[, class := "mapped"]
  
  # compute length of RNAs for the reference bar
  sub_anno <- annotation[as.character(transcript) %in% c_transcripts]
  RNA_reg <- colSums(sub_anno[, list(l_utr5, l_cds, l_utr3)])
  RNA_reg_perc <- (RNA_reg / sum(RNA_reg)) * 100
  
  RNA_table <- data.table(region = c("5utr", "cds", "3utr"),
                          count = RNA_reg,
                          scaled_count = RNA_reg_perc,
                          tmp_samp = rep("RNAs", 3),
                          class = "rna")
  
  final_dt <- rbind(final_dt, RNA_table)
  
  final_dt[, region := factor(region,
                              levels = c("5utr", "cds", "3utr"),
                              labels = c("5' UTR", "CDS", "3' UTR"))]
  
  final_dt <- final_dt[order(match(tmp_samp, c(as.character(unlist(sample)), "RNAs")), region)]
  
  output <- list()
  output[["count_dt"]] <- copy(final_dt[, c("tmp_samp", "region", "count", "scaled_count")])
  setnames(output[["count_dt"]], "tmp_samp", "sample")
  
  # compute mean and se of samples if a list is provided
  if(is.list(sample)){
    
    samp_dt <- data.table(stack(sample))
    samp_dt <- rbind(samp_dt, data.table(values = "RNAs", ind = "RNAs"))
    setnames(samp_dt, c("tmp_samp", "sample"))
    
    # set names of samples as specified in parameter sample  
    final_dt <- merge.data.table(final_dt, samp_dt, sort = FALSE)[, tmp_samp := NULL]
    
    # compute mean and se
    plot_dt <- final_dt[, .(mean_scaled_count = mean(scaled_count),
                            se_scaled_count = sd(scaled_count/sqrt(.N))), by = .(region, sample, class)]
    
    if(identical(plot_style, "stack") | all(lengths(sample) == 1)){
      output[["plot_dt"]] <- copy(plot_dt[,  c("sample", "mean_scaled_count", "region")])
      setnames(output[["plot_dt"]], c("sample", "mean_scaled_count", "region"), c("x", "y", "z"))
    } else {
      output[["plot_dt"]] <- copy(plot_dt[,  c("sample", "mean_scaled_count", "se_scaled_count", "region")])
      setnames(output[["plot_dt"]], c("sample", "mean_scaled_count", "se_scaled_count", "region"), c("x", "y", "y_se", "z"))
    }
    
  } else {
    plot_dt <- final_dt[, sample := tmp_samp
                        ][, se_scaled_count := NA]
    setnames(plot_dt, "scaled_count", "mean_scaled_count")
    
    output[["plot_dt"]] <- copy(plot_dt[,  c("sample", "mean_scaled_count", "region")])
    setnames(output[["plot_dt"]], c("sample", "mean_scaled_count", "region"), c("x", "y", "z"))
  }
  
  plot_dt[, sample := factor(sample, levels = unique(sample))]

  oldw <- getOption("warn")
  options(warn=-1)
  
  bs <- 25
  plot <- ggplot(plot_dt, aes(x = sample, y = mean_scaled_count, fill = region))
  
  if(identical(plot_style, "stack")){
    plot <- plot + geom_bar(stat = "identity", color = "white", width = 0.65,
                            size = 0.025 * bs, position = position_stack(reverse = TRUE))
  } else {
    plot <- plot +
      geom_bar(stat = "identity", color = "white",
                            size = 0.025 * bs, position = position_dodge(0.9)) +
      geom_errorbar(aes(ymin = mean_scaled_count - se_scaled_count,
                        ymax = mean_scaled_count + se_scaled_count,
                        color = region),
                    width = 0.35, linewidth = 1.1, na.rm = T,
                    position = position_dodge(0.9), show.legend = F) +
      scale_color_manual(values = colour)
  }
  
  plot <- plot +
    scale_fill_manual(values = colour) +
    theme_bw(base_size = bs) +
    scale_x_discrete(breaks = unique(plot_dt$sample)) +
    facet_grid( . ~ class, scales = "free", space="free_x") +
    scale_y_continuous("% P-sites", sec.axis = sec_axis(~ . * 1 , name = "% nucleotides")) + 
    theme(axis.title.x = element_blank(), legend.title = element_blank(),
          panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(),
          strip.background = element_blank(), strip.text = element_blank(),
          legend.position = "top", legend.margin = margin(0,0,-5,0),
          legend.box.margin = margin(5,0,-10,0),
          legend.text = element_text(margin = margin(l = -10, unit = "pt")))
  plot
  
  output[["plot"]] <- plot
  
  options(warn = oldw)
  return(output)
}
LabTranslationalArchitectomics/riboWaltz documentation built on Jan. 17, 2024, 12:18 p.m.