R/frames.R

Defines functions frame_psite

Documented in frame_psite

#' Percentage of P-sites per reading frame.
#'
#' This function computes, for each transcript region (5' UTR, coding sequence
#' and 3' UTR), the percentage of P-sites falling in the three possible
#' translation reading frames. It takes into account only transcripts with
#' annotated regions of length > 0 and the resulting values are visualized as
#' bar plots.  Multiple samples and replicates can be handled.
#' 
#' @param data Either list of data tables or GRangesList object from
#'   \code{\link{psite_info}}.
#' @param annotation Data table as generated by \code{\link{create_annotation}}.
#' @param sample Either character string, character string vector or named list
#'   of character string(s)/character string vector(s) specifying the name of
#'   the sample(s) and replicate(s) of interest. If a list is provided, each
#'   element of the list is considered as an independent sample associated with
#'   one ore multiple replicates. Multiple samples and replicates are handled
#'   and visualised according to \code{multisamples} and \code{plot_style}.
#' @param multisamples Either "average" or "independent". It specifies how to
#'   handle multiple samples and replicates stored in \code{sample}:
#'   * if \code{sample} is a character string vector and \code{multisamples} is
#'   set to "average" the elements of the vector are considered as replicates
#'   of one sample and a single bar plot is returned.
#'   * if \code{sample} is a character string vector and \code{multisamples} is
#'   set to "independent", each element of the vector is analysed independently
#'   of the others. The number of plots returned and their organization is
#'   specified by \code{plot_style}.
#'   * if \code{sample} is a list, \code{multisamples} must be set to "average".
#'   Each element of the list is analysed independently of the others, its
#'   replicates averaged and its name reported in the plot. The number of plots
#'   returned and their organization is specified by \code{plot_style}.
#'   Note: when this parameter is set to "average" the bar plot associated with
#'   each sample displays the frame-specific mean signal computed across the
#'   replicates and the corresponding standard error is also reported. Default
#'   is "average".
#' @param plot_style Either "split", "facet", "dodge" or "mirror". It specifies
#'   how to organize and display multiple bar plots:
#'   * "split": one bar plot for each sample is returned as an independent
#'   ggplot object.
#'   * "facet": the bar plots are placed one next to the other, in independent
#'   boxes.
#'   * "dodge": all bar plots are displayed in one box and, for each frame,
#'   samples are placed side by side.
#'   * "mirror": \code{sample} must be either a character string vector or
#'   a list of exactly two elements and the resulting bar plots are mirrored
#'   along the x axis.
#'   Default is "split".
#' @param transcripts Character string vector listing the name of transcripts to
#'   be included in the analysis. Default is NULL, i.e. all transcripts are used.
#' @param region Character string specifying the region(s) of the transcripts to
#'   be analysed. It can be either "5utr", "cds", "3utr", respectively for 5'
#'   UTRs, CDSs and 3' UTRs, or "all", i.e. all regions are considered. Note:
#'   according to this parameter the bar plots are differently arranged to
#'   optimise the organization and the visualization of the data.
#' @param length_range Integer or integer vector for restricting the analysis to
#'   a chosen range of read lengths. Default is NULL, i.e. all read lengths are
#'   used.
#' @param cl Integer value in [1,100] specifying a confidence level for
#'   restricting the analysis to an automatically-defined range of read lengths.
#'   The new range is computed according to the most frequent read lengths,
#'   which accounts for the cl% of the sample and is defined by discarding the
#'   (100-cl)% of read lengths falling in the tails of the  read lengths
#'   distribution. If multiple sample names are provided, one range of read
#'   lengths is computed, such that at least the cl% of all samples is
#'   represented. Default is 100.
#' @param colour Character string or character string vector specifying the
#'   colour of the bar plot(s). If \code{plot_style} is set to either "dodge" or
#'   "mirror", a colour for each sample is required. Default is NULL, i.e. the
#'   default R colour palette is used.
#' @return List containing: one or more ggplot object(s) and the data table with
#'   the corresponding x- and y-axis values ("plot_dt"); an additional data
#'   table with raw and scaled number of P-sites per frame for each sample
#'   ("count_dt").
#' @seealso \code{\link{frame_psite_length}} for a similar plot stratified by
#'   read length.
#' @examples
#' ## data(reads_list)
#' ## data(mm81cdna)
#' ##
#' ## ## Generate fake samples and replicates
#' ## for(i in 2:6){
#' ##   samp_name <- paste0("Samp", i)
#' ##   set.seed(i)
#' ##   reads_list[[samp_name]] <- reads_list[["Samp1"]][sample(.N, 5000)]
#' ## }
#' ##
#' ## ## Compute and add p-site details
#' ## psite_offset <- psite(reads_list, flanking = 6, extremity = "auto")
#' ## reads_psite_list <- psite_info(reads_list, psite_offset)
#' ##
#' ## ## Define the list of samples and replicate to use as input
#' ## input_samples <- list("S1" = c("Samp1", "Samp2"),
#' ##                       "S2" = c("Samp3", "Samp4", "Samp5"),
#' ##                       "S3" = c("Samp6"))
#' ##
#' ## Generate bar plots
#' ## example_frames <- frame_psite(reads_psite_list, mm81cdna,
#' ##                               sample = input_samples,
#' ##                               multisamples = "average",
#' ##                               plot_style = "facet",
#' ##                               region = "all",
#' ##                               colour = c("#333f50", "#39827c", "gray70"))
#' @import data.table
#' @import ggplot2
#' @export
frame_psite <- function(data, annotation, sample, multisamples = "average",
                        plot_style = "split", transcripts = NULL, 
                        region = "all", length_range = NULL, cl = 100,
                        colour = NULL){
  
  if(class(data[[1]])[1] == "GRanges"){
    data_tmp <- list()
    for(i in names(data)){
      data_tmp[[i]] <- as.data.table(data[[i]])[, c("width", "strand") := NULL
                                                ][, seqnames := as.character(seqnames)]
      setnames(data_tmp[[i]], c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
    }
    data <- data_tmp
  }
  #-
  check_sample <- setdiff(unlist(sample), names(data))
  if(length(check_sample) != 0){
    cat("\n")
    stop(sprintf("incorrect sample name(s): \"%s\" not found\n\n",
                 paste(check_sample, collapse = ", ")))
  }
  #-
  if(length(sample) == 0){
    cat("\n")
    stop("at least one sample name must be spcified\n\n")
  }
  #-
  if(!(multisamples %in% c("average", "independent"))){
    cat("\n")
    warning("parameter multisamples must be either \"average\" or \"independent\".
            Set to default \"average\"\n", call. = FALSE)
    multisamples <- "average"
  }
  #-
  if(multisamples == "independent" & is.list(sample)) {
    cat("\n")
    warning("parameter multisamples is set to \"independent\" but parameter sample is a list:
            parameter multisamples will be coerced to default \"average\"\n", call. = FALSE)
    multisamples <- "average"
  }
  #--
  if(length(sample) != 2 & plot_style == "mirror") {
    cat("\n")
    warning("parameter plot_style is set to \"mirror\" but parameter sample is a list of dimension > 2:
            parameter plot_style will be coerced to default \"split\"\n", call. = FALSE)
    plot_style <- "split"
  }
  #- 
  if(is.character(sample) & length(sample) == 1) {
    multisamples <- "independent"
    plot_style <- "split"
  }
  #-
  if(is.character(sample) & length(sample) > 1 & multisamples == "average") {
    sample <- list("Average" = sample)
    plot_style <- "split"
    cat("\n")
    warning("Default name of averaged samples is \"Average\":
            consider to use a named list of one element to provide a meaningful plot title\n", call. = FALSE)
  }
  #-
  if(is.list(sample) & length(sample) == 1){
    plot_style <- "split"
  }
  #--modify
  if(!(plot_style %in% c("mirror", "dodge", "split", "facet"))){
    cat("\n")
    warning("parameter plot_style must be either \"split\", \"facet\", \"dodge\" or \"mirror\".
            Set to default \"split\"\n", call. = FALSE)
    plot_style <- "split"
  }
  
  if(length(length_range) != 0 & !inherits(length_range, "numeric") & !inherits(length_range, "integer")){
    cat("\n")
    warning("class of length_range is neither numeric nor integer. Set to default NULL\n", call. = FALSE)
    length_range = NULL
  }
  
  #define color vector
  if((length(colour) < length(sample)) & 
     ((plot_style %in% c("dodge", "mirror")) | 
      (plot_style %in% c("split", "facet") & length(colour) != 1))){
    
    if(length(colour) !=0){
      warning("Not enough colors specified:\n
            default ggplot color palette will be used\n", call. = FALSE)
    }
    
    default_gg_col <- function(n) {
      hues = seq(15, 375, length = n + 1)
      hcl(h = hues, l = 65, c = 100)[1:n]
    }
    colour <- default_gg_col(length(sample))
    
  } else {
    if(plot_style %in% c("split", "facet") & length(colour) == 1){
      colour <- rep(colour, length(sample))
    }
  }
  
  # select transcripts
  l_transcripts <- as.character(annotation[l_utr5 > 0 & l_cds > 0 &
                                           l_cds %% 3 == 0 & l_utr3 > 0,
                                           transcript])
  
  if (length(transcripts) == 0) {
    c_transcripts <- l_transcripts
  } else {
    c_transcripts <- intersect(l_transcripts, transcripts)
  }
  #-
  #define length range taking into account all (unlisted) samples
  if(length(length_range) == 0){
    for(samp in as.character(unlist(sample))){
      dt <- data[[samp]][transcript %in% c_transcripts]

      if(length(length_range) == 0){
        length_range <- seq(quantile(dt$length, (1 - cl/100)/2),
                            quantile(dt$length, 1 - (1 - cl/100)/2))
      } else {
        xmin <- min(min(length_range), quantile(dt$length, (1 - cl/100)/2))
        xmax <- max(max(length_range), quantile(dt$length, 1 - (1 - cl/100)/2))
        length_range <- seq(xmin, xmax)
      }
    }
  }

  xmin = min(length_range)
  xmax = max(length_range)
  
  # check if all samples have reads of the specified lengths
  # especially required if only one read length is passed
  if(length(length_range) != 0){
    if(is.list(sample)){
      samp_dt <- data.table(stack(sample))
      setnames(samp_dt, c("sample", "sample_l"))
    } else {
      samp_dt <- data.table("sample" = sample, "sample_l" = sample)
    }
    
    for(samp in samp_dt$sample){
      
      dt <- data[[samp]][transcript %in% c_transcripts]
      
      len_check <- unique(dt$length)
      if(sum(length_range %in% len_check) == 0) {
        cat("\n")
        warning(sprintf("\"%s\" doesn't contain any reads of the selected lengths: sample removed\n", samp), call. = FALSE)
        #select element of sample which include the sample to be removed (useful if sample is a list)
        sel_l_samp <- samp_dt[sample == samp, sample_l]
        #remove the sample from the list/vector
        if(is.list(samp)){
          sample[[sel_l_samp]] <- sample[[sel_l_samp]][sample[[sel_l_samp]] != samp]
        } else {
          sample <- sample[sample != samp]
        }
      }
    }
  }
  
  if(is.null(unlist(sample))){
    cat("\n")
    stop("none of the data tables listed in sample contains any reads of the specified lengths\n\n")
  }
  
  if(!region %in% c("all", "cds", "5utr", "3utr")){
    cat("\n")
    warning("parameter region is invalid. Set to default \"all\"\n", call. = FALSE)
    region = "all"
  }
  
  final_dt <- data.table()
  for(samp in as.character(unlist(sample))) {
    
    frame_dt <- data[[samp]][length %in% length_range & 
                               transcript %in% c_transcripts]
    
    if(!identical(region, "all")){
      frame_dt <- frame_dt[psite_region == region]
    }
    
    frame_dt[, frame := psite_from_start %% 3]
    
    setkey(frame_dt, psite_region, frame)
    frame_dt <- frame_dt[CJ(psite_region = unique(psite_region), frame = c(0, 1, 2)),
                         list(count = .N), by = .EACHI]
    frame_dt <- frame_dt[, scaled_count := (count / sum(count)) * 100, by = .(region = psite_region)
                           ][is.na(scaled_count), scaled_count := 0]

    frame_dt[, tmp_samp := samp]
    
    final_dt <- rbind(final_dt, frame_dt)
  }
  
  if(identical(region, "all")){
    final_dt[, psite_region := factor(psite_region,
                                levels = c("5utr", "cds", "3utr"),
                                labels = c("5' UTR", "CDS", "3' UTR"))]
  } else {
    final_dt[, psite_region := ifelse(psite_region == "5utr", "5' UTR",
                                ifelse(psite_region == "cds", "CDS", "3' UTR"))]
  }
  
  final_dt <- final_dt[order(match(tmp_samp, as.character(unlist(sample))), psite_region, frame)]
  setnames(final_dt, "psite_region", "region")
  
  output <- list()
  output[["count_dt"]] <- copy(final_dt[, c("tmp_samp", "region", "frame", "count", "scaled_count")])
  setnames(output[["count_dt"]], "tmp_samp", "sample")
  
  # compute mean and se of samples if a list is provided
  if(is.list(sample)){
    
    samp_dt <- data.table(stack(sample))
    setnames(samp_dt, c("tmp_samp", "sample"))
    
    # set names of samples as specified in parameter sample  
    final_dt <- merge.data.table(final_dt, samp_dt, sort = FALSE)[, tmp_samp := NULL]
    
    # compute mean and se
    plot_dt <- final_dt[, .(mean_scaled_count = mean(scaled_count),
                            se_scaled_count = sd(scaled_count/sqrt(.N))), by = .(region, frame, sample)]
    
    if(any(lengths(sample) != 1)){
      output[["plot_dt"]] <- copy(plot_dt[, c("sample", "region", "frame", "mean_scaled_count", "se_scaled_count")])
      setnames(output[["plot_dt"]], c("frame", "mean_scaled_count", "se_scaled_count"), c("x", "y", "y_se"))
    } else {
      output[["plot_dt"]] <- copy(final_dt[, c("sample", "region", "frame", "scaled_count")])
      setnames(output[["plot_dt"]], c("frame", "scaled_count"), c("x", "y"))
    }
    
  } else {
    plot_dt <- final_dt[, sample := tmp_samp
                        ][, se_scaled_count := NA]
    setnames(plot_dt, "scaled_count", "mean_scaled_count")
    
    output[["plot_dt"]] <- copy(plot_dt[, c("sample", "region", "frame", "mean_scaled_count")])
    setnames(output[["plot_dt"]], c("frame", "mean_scaled_count"), c("x", "y"))
  }
  
  plot_dt[, sample := factor(sample, levels = unique(sample))]
  
  oldw <- getOption("warn")
  options(warn=-1)
  
  if(plot_style == "split"){ 
    i <- 0
    for(samp in unique(plot_dt$sample)){ # generate a plot for each sample and store it
      i <- i + 1
      sel_col = colour[i]
      
      sub_plot_dt <- plot_dt[sample == samp]
      plot <- ggplot(sub_plot_dt, aes(x = as.factor(frame), y = mean_scaled_count)) +
        geom_bar(stat = "identity", fill = sel_col, width = 0.8) +
        geom_errorbar(aes(ymin = mean_scaled_count - se_scaled_count,
                          ymax = mean_scaled_count + se_scaled_count),
                      width = 0.35, linewidth = 1.1, na.rm = T, color = sel_col,
                      show.legend = F) +
        labs(title = samp, x = "Frame", y = "% P-sites") +
        theme_bw(base_size = 27.5) +
        theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(),
              plot.title = element_text(hjust = 0.5))
      
      if(identical(region, "all")) {
        plot <- plot + facet_grid(. ~ region) + 
          theme(strip.background = element_blank())
      }
      
      output[[paste0("plot_", samp)]] <- plot
    }
  } else {
    
    if(identical(plot_style, "mirror")) {
      plot_dt[sample == unique(plot_dt$sample)[2], mean_scaled_count := -mean_scaled_count]
    }
    
    plot <- ggplot(plot_dt, aes(x = as.factor(frame), y = mean_scaled_count, fill = sample))
      
      if(plot_style %in% c("facet", "mirror")) {
        plot <- plot + geom_bar(stat = "identity", width = 0.8) +
          geom_errorbar(aes(ymin = mean_scaled_count - se_scaled_count,
                            ymax = mean_scaled_count + se_scaled_count, color = sample),
                        width = 0.35, linewidth = 1.1, na.rm = T)
        if(identical(plot_style, "mirror")){
          plot <- plot + geom_hline(yintercept = 0, linetype = 2, color = "gray20")
        }
      } else {
        plot <- plot + geom_bar(stat = "identity", position = position_dodge(0.9)) +
          geom_errorbar(aes(ymin = mean_scaled_count - se_scaled_count,
                            ymax = mean_scaled_count + se_scaled_count, color = sample),
                        width = 0.35, linewidth = 1.1, na.rm = T, position = position_dodge(0.9))
      }
    
    plot <- plot + labs(x = "Frame", y = "% P-sites") +
      theme_bw(base_size = 27.5) +
      theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank()) +
      scale_fill_manual(values = colour) + 
      scale_color_manual(values = colour) +
      scale_y_continuous(labels = abs)
    
    if(uniqueN(colour) > 1 & plot_style != "facet"){
      plot <- plot + theme(legend.position = "right", legend.title = element_blank(),
                           legend.background = element_blank())
    } else {
      plot <- plot + theme(legend.position = "none")
    }
    
    if(identical(region, "all")) {
      if(identical(plot_style, "facet")){
        plot <- plot + facet_grid(sample ~ region)
      } else {
        plot <- plot + facet_grid(. ~ region)
      }
      plot <- plot + theme(strip.background = element_blank())
    } else {
      if(identical(plot_style, "facet")){
        plot <- plot + facet_wrap(sample ~ ., ncol = ceiling(sqrt(length(sample))))
      }
      plot <- plot + theme(strip.background = element_blank())
    }
    
    output[["plot"]] <- plot
  }
  
  options(warn = oldw)
  return(output)
}



#' Percentage of P-sites per reading frame stratified by read length.
#'
#' This function computes, for each transcript region (5' UTR, coding sequence
#' and 3' UTR), the percentage of P-sites falling in the three possible
#' translation reading frames, stratified by read length. It takes into account
#' only transcripts with annotated regions of length > 0 and the resulting
#' values are visualized as heatmaps. Multiple samples and replicates can be
#' handled in several ways.
#'
#' @param data Either list of data tables or GRangesList object from
#'   \code{\link{psite_info}}.
#' @param annotation Data table as generated by \code{\link{create_annotation}}.
#' @param sample Either character string, character string vector or named list
#'   of character string(s)/character string vector(s) specifying the name of
#'   the sample(s) and replicate(s) of interest. If a list is provided, each
#'   element of the list is considered as an independent sample associated with
#'   one ore multiple replicates. Multiple samples and replicates are handled
#'   and visualised according to \code{multisample} and \code{plot_style}.
#' @param multisamples Either "average" or "independent". It specifies how to
#'   handle multiple samples and replicates stored in \code{sample}:
#'   * if \code{sample} is a character string vector and \code{multisample} is
#'   set to "average" the elements of the vector are considered as replicates
#'   of one sample and a single heatmap is returned.
#'   * if \code{sample} is a character string vector and \code{multisample} is
#'   set to "independent", each element of the vector is analysed independently
#'   of the others. The number of plots returned and their organization is
#'   specified by \code{plot_style}.
#'   * if \code{sample} is list, \code{multisample} must be set to "average".
#'   Each element of the list is analysed independently of the others, its
#'   replicates averaged and its name reported in the plot. The number of plots
#'   returned and their organization is specified by \code{plot_style}.
#'   Note: when this parameter is set to "average" the heatmap associated with
#'   each sample displays the length- and frame- specific mean signal
#'   computed across the replicates. Default is "average".
#' @param plot_style Either "split" or "facet". It specifies how to organize and
#'   display multiple heatmaps:
#'   * "split": heatmap(s) for each sample are returned as an independent
#'   ggplot object.
#'   * "facet": the heatmap(s) for each sample are placed one below the other
#'   (when \code{region} is "all") or one next to the other (when \code{region}
#'   is set to either "5utr", "cds", "3utr") in independent boxes. Default is
#'   "split".
#' @param transcripts Character string vector listing the name of transcripts to
#'   be included in the analysis. Default is NULL, i.e. all transcripts are used.
#' @param region Character string specifying the region(s) of the transcripts to
#'   be analysed. It can be either "5utr", "cds", "3utr", respectively for 5'
#'   UTRs, CDSs and 3' UTRs, or "all", i.e. all regions are considered. Note:
#'   according to this parameter the heatmaps are differently arranged to
#'   optimise the organization and the visualization of the data.
#' @param length_range Integer or integer vector for restricting the plot to
#'   a chosen range of read lengths. Default is NULL, i.e. all read lengths are
#'   used.
#' @param cl Integer value in [1,100] specifying a confidence level for
#'   restricting the plot to an automatically-defined range of read lengths. The
#'   new range is computed according to the most frequent read lengths, which
#'   accounts for the cl% of the sample and is defined by discarding the
#'   (100-cl)% of read lengths falling in the tails of the read lengths
#'   distribution. If multiple samples are analysed, a single range of read
#'   lengths is computed such that at least the cl% of all sample are
#'   represented. Default is 100.
#' @param colour Character string specifying the colour of the plot. The colour
#'   scheme is as follow: tiles corresponding to the lowest signal are always
#'   white, tiles corresponding to the highest signal are of the specified
#'   colour and the progression between these two colours follows a linear
#'   gradient. Default is dark blue.
#' @return List containing: one or more ggplot object(s) and the data table with
#'   the corresponding x- and y-axis values and the z-values, defining the color
#'   of the tiles ("plot_dt"); an additional data table with raw and scaled
#'   number of read extremities mapping around the start and the stop codon, per
#'   length, for each sample ("count_dt").
#' @seealso \code{\link{frame_psite}} for a similar plot where read lengths are
#'   collapsed.
#' @examples
#' ## data(reads_list)
#' ## data(mm81cdna)
#' ##
#' ## ## Generate fake samples and replicates
#' ## for(i in 2:6){
#' ##   samp_name <- paste0("Samp", i)
#' ##   set.seed(i)
#' ##   reads_list[[samp_name]] <- reads_list[["Samp1"]][sample(.N, 5000)]
#' ## }
#' ##
#' ## ## Compute and add p-site details
#' ## psite_offset <- psite(reads_list, flanking = 6, extremity = "auto")
#' ## reads_psite_list <- psite_info(reads_list, psite_offset)
#' ##
#' ## ## Define the list of samples and replicate to use as input
#' ## input_samples <- list("S1" = c("Samp1", "Samp2"),
#' ##                       "S2" = c("Samp3", "Samp4", "Samp5"),
#' ##                       "S3" = c("Samp6"))
#' ##
#' ## Generate heatmaps
#' ## example_frames_stratified <- frame_psite_length(reads_psite_list, mm81cdna,
#' ##                                                 sample = input_samples,
#' ##                                                 multisamples = "average",
#' ##                                                 plot_style = "split",
#' ##                                                 region = "all",
#' ##                                                 colour = "#333f50")
#' @import data.table
#' @import ggplot2
#' @export
frame_psite_length <- function(data, annotation, sample,
                               multisamples = "average", plot_style = "split",
                               transcripts = NULL, region = "all",
                               length_range = NULL, cl = 100,
                               colour = "#172969"){
  
  if(class(data[[1]])[1] == "GRanges"){
    data_tmp <- list()
    for(i in names(data)){
      data_tmp[[i]] <- as.data.table(data[[i]])[, c("width", "strand") := NULL
                                                ][, seqnames := as.character(seqnames)]
      setnames(data_tmp[[i]], c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
    }
    data <- data_tmp
  }
  
  check_sample <- setdiff(unlist(sample), names(data))
  if(length(check_sample) != 0){
    cat("\n")
    stop(sprintf("incorrect sample name(s): \"%s\" not found\n\n",
                 paste(check_sample, collapse = ", ")))
  }
  
  if(length(sample) == 0){
    cat("\n")
    stop("at least one sample name must be spcified\n\n")
  }
  
  if(!(multisamples %in% c("average", "independent"))){
    cat("\n")
    warning("parameter multisamples must be either \"average\" or \"independent\".
            Set to default \"average\"\n", call. = FALSE)
    multisamples <- "average"
  }
  
  if(multisamples == "independent" & is.list(sample)) {
    cat("\n")
    warning("parameter multisamples is set to \"independent\" but parameter sample is a list:
            parameter multisamples will be coerced to default \"average\"\n", call. = FALSE)
    multisamples <- "average"
  }
  if(is.character(sample) & length(sample) == 1) {
    multisamples <- "independent"
    plot_style <- "split"
  }
  
  if(is.character(sample) & length(sample) > 1 & multisamples == "average") {
    sample <- list("Average" = sample)
    plot_style <- "split"
    cat("\n")
    warning("Default name of averaged samples is \"Average\":
            consider to use a named list of one element to provide a meaningful plot title\n", call. = FALSE)
  }
  
  if(is.list(sample) & length(sample) == 1){
    plot_style <- "split"
  }
  
  if(!(plot_style %in% c("split", "facet"))){
    cat("\n")
    warning("parameter plot_style must be either \"split\" or \"facet\".
            Set to default \"split\"\n", call. = FALSE)
    plot_style <- "split"
  }
  
  if(length(length_range) != 0 & !inherits(length_range, "numeric") & !inherits(length_range, "integer")){
    cat("\n")
    warning("class of length_range is neither numeric nor integer. Set to default NULL\n", call. = FALSE)
    length_range = NULL
  }
  
  # select transcripts
  l_transcripts <- as.character(annotation[l_utr5 > 0 & l_cds > 0 &
                                           l_cds %% 3 == 0 & l_utr3 > 0,
                                           transcript])
  
  if (length(transcripts) == 0) {
    c_transcripts <- l_transcripts
  } else {
    c_transcripts <- intersect(l_transcripts, transcripts)
  }
  
  #define length range taking into account all (unlisted) samples
  if(length(length_range) == 0){
    for(samp in as.character(unlist(sample))){
      dt <- data[[samp]][transcript %in% c_transcripts]

      if(length(length_range) == 0){
        length_range <- seq(quantile(dt$length, (1 - cl/100)/2),
                            quantile(dt$length, 1 - (1 - cl/100)/2))
      } else {
        xmin <- min(min(length_range), quantile(dt$length, (1 - cl/100)/2))
        xmax <- max(max(length_range), quantile(dt$length, 1 - (1 - cl/100)/2))
        length_range <- seq(xmin, xmax)
      }
    }
  }
  
  minlen = min(length_range)
  maxlen = max(length_range)
  
  # check if all samples have reads of the specified lengths
  # especially required if only one read length is passed
  if(length(length_range) != 0){
    if(is.list(sample)){
      samp_dt <- data.table(stack(sample))
      setnames(samp_dt, c("sample", "sample_l"))
    } else {
      samp_dt <- data.table("sample" = sample, "sample_l" = sample)
    }
    
    for(samp in samp_dt$sample){
      
      dt <- data[[samp]][transcript %in% c_transcripts]
      
      len_check <- unique(dt$length)
      if(sum(length_range %in% len_check) == 0) {
        cat("\n")
        warning(sprintf("\"%s\" doesn't contain any reads of the selected lengths: sample removed\n", samp), call. = FALSE)
        #select element of sample which include the sample to be removed (useful if sample is a list)
        sel_l_samp <- samp_dt[sample == samp, sample_l]
        #remove the sample from the list/vector
        if(is.list(samp)){
          sample[[sel_l_samp]] <- sample[[sel_l_samp]][sample[[sel_l_samp]] != samp]
        } else {
          sample <- sample[sample != samp]
        }
      }
    }
  }
  
  if(is.null(unlist(sample))){
    cat("\n")
    stop("none of the data tables listed in sample contains any reads of the specified lengths\n\n")
  }
  
  if(!region %in% c("all", "cds", "5utr", "3utr")){
    cat("\n")
    warning("parameter region is invalid. Set to default \"all\"\n", call. = FALSE)
    region = "all"
  }
  
  final_dt <- data.table()
  for(samp in as.character(unlist(sample))) {
    
    frame_dt <- data[[samp]][length %in% length_range
                       ][transcript %in% c_transcripts]

    if(!identical(region, "all")){
      frame_dt <- frame_dt[psite_region == region]
    }
    
    frame_dt[, frame := psite_from_start %% 3]
    
    setkey(frame_dt, psite_region, length, frame)
    frame_dt <- frame_dt[CJ(psite_region = unique(psite_region),
                            length = length_range, frame = c(0, 1, 2)),
                         list(count = .N), by = .EACHI]
    frame_dt <- frame_dt[, scaled_count := (count / sum(count)) * 100, by = .(length, psite_region)
                         ][is.na(scaled_count), scaled_count := 0]
    
    frame_dt[, tmp_samp := samp]
    
    final_dt <- rbind(final_dt, frame_dt)
  }
  
  if(identical(region, "all")){
    final_dt[, psite_region := factor(psite_region,
                                      levels = c("5utr", "cds", "3utr"),
                                      labels = c("5' UTR", "CDS", "3' UTR"))]
  } else {
    final_dt[, psite_region := ifelse(region == "5utr", "5' UTR",
                                      ifelse(region == "cds", "CDS", "3' UTR"))]
  }
  
  final_dt <- final_dt[order(match(tmp_samp, as.character(unlist(sample))), length, psite_region, frame)]
  setnames(final_dt, "psite_region", "region")
  
  output <- list()
  output[["count_dt"]] <- copy(final_dt[, c("tmp_samp", "length", "region", "frame", "count", "scaled_count")])
  setnames(output[["count_dt"]], "tmp_samp", "sample")
  
  # compute mean and se of samples if a list is provided
  if(is.list(sample)){
    
    samp_dt <- data.table(stack(sample))
    setnames(samp_dt, c("tmp_samp", "sample"))
    
    # set names of samples as specified in parameter sample  
    final_dt <- merge.data.table(final_dt, samp_dt, sort = FALSE)[, tmp_samp := NULL]
    
    # compute mean and se
    plot_dt <- final_dt[, .(mean_scaled_count = mean(scaled_count)), by = .(length, region, frame, sample)]
  } else {
    plot_dt <- final_dt[, sample := tmp_samp
                        ][, se_scaled_count := NA]
    setnames(plot_dt, "scaled_count", "mean_scaled_count")
  }
  
  output[["plot_dt"]] <- copy(plot_dt[, c("sample", "region", "frame", "length", "mean_scaled_count")])
  setnames(output[["plot_dt"]], c("frame", "length", "mean_scaled_count"), c("x", "y", "z"))
  
  plot_dt[, sample := factor(sample, levels = unique(sample))]
  zmin <- min(plot_dt$mean_scaled_count)
  zmax <- max(plot_dt$mean_scaled_count)
  
  oldw <- getOption("warn")
  options(warn=-1)
  
  if(plot_style == "split"){ 
    for(samp in unique(plot_dt$sample)){ # generate a plot for each sample and store it
      sub_plot_dt <- plot_dt[sample == samp]
      
      plot <- ggplot(sub_plot_dt, aes(as.factor(frame), length)) +
        geom_tile(aes(fill = mean_scaled_count)) +
        scale_fill_gradient("% P-sites", low = "white", high = colour,
                            limits = c(zmin, zmax),
                            breaks = c(zmin, zmin/2 + zmax/2, zmax),
                            labels = c(round(zmin), round(zmin/2 + zmax/2), round(zmax))) +
        labs(title = samp, x = "Frame", y = "Read length") +
        theme_bw(base_size = 20) +
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
              legend.position = "right", plot.title = element_text(hjust = 0.5)) +
        scale_y_continuous(limits = c(minlen - 0.5, maxlen + 0.5),
                           breaks = seq(minlen + ((minlen) %% 2),maxlen,by = max(2, floor((maxlen - minlen) / 7))))
      
      if(identical(region, "all")) {
        plot <- plot + facet_grid(. ~ region) + 
          theme(strip.background = element_blank())
      }
      
      output[[paste0("plot_", samp)]] <- plot
    }
  }  else {
    plot <- ggplot(plot_dt, aes(frame, length)) +
      geom_tile(aes(fill = mean_scaled_count)) +
      scale_fill_gradient("% P-sites", low = "white", high = colour,
                          limits = c(zmin, zmax),
                          breaks = c(zmin, zmin/2 + zmax/2, zmax),
                          labels = c(round(zmin), round(zmin/2 + zmax/2), round(zmax))) +
      labs(x = "Frame", y = "Read length") +
      theme_bw(base_size = 20) +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
            legend.position = "top", legend.margin=margin(0,0,0,0), legend.box.margin=margin(5,0,-5,0)) +
      scale_y_continuous(limits = c(minlen - 0.5, maxlen + 0.5),
                         breaks = seq(minlen + ((minlen) %% 2),maxlen,by = max(2, floor((maxlen - minlen) / 7))))
    
    if(identical(region, "all")) {
      if(identical(plot_style, "facet")){
        plot <- plot + facet_grid(sample ~ region)
      } else {
        plot <- plot + facet_grid(. ~ region)
      }
      plot <- plot + theme(strip.background = element_blank())
    } else {
      if(identical(plot_style, "facet")){
        plot <- plot + facet_wrap(sample ~ .)
      }
      plot <- plot + theme(strip.background = element_blank())
    }
    
    output[["plot"]] <- plot
  }
  
  options(warn = oldw)
  return(output)
}
LabTranslationalArchitectomics/riboWaltz documentation built on Jan. 17, 2024, 12:18 p.m.