R/codon_coverage.R

Defines functions codon_coverage

Documented in codon_coverage

#' Number of reads per codon.
#'
#' This function computes transcript-specific codon coverages, defined as the
#' number of either read footprints or P-sites mapping on each triplet of coding
#' sequences and UTRs (see \emph{Details}). The resulting data table contains,
#' for each triplet: i) the name of the corresponding reference sequence (i.e.
#' of the transcript to which it belongs); ii) its leftmost and rightmost
#' position with respect to the 1st nucleotide of the reference sequence; iii)
#' its position with respect to the 1st and the last codon of the annotated CDS
#' of the reference sequence; iv) the region of the transcript (5' UTR, CDS, 3'
#' UTR) it is in; v) the number of read footprints or P-sites falling in that
#' region for all samples.
#'
#' @param data Either list of data tables or GRangesList object from
#'   \code{\link{psite_info}}. Data tables and GRanges objects generated by
#'   \code{\link{bamtolist}} and \code{\link{bedtolist}} can be used if
#'   \code{psite} is FALSE (the default).
#' @param annotation Data table as generated by \code{\link{create_annotation}}.
#' @param sample Character string vector specifying the name of the sample(s) of
#'   interest. Default is NULL i.e. all samples in \code{data} are processed.
#' @param psite Logical value whether to return the number of P-sites per codon.
#'   Default is TRUE. If FALSE, the number of read footprints per codon is
#'   returned instead.
#' @param min_overlap Positive integer specifying the minimum number of
#'   overlapping positions (in nucleotides) between reads and codons to be
#'   considered overlapping. If \code{psite} is TRUE this parameter must be 1
#'   (the default).
#' @param output_class Either "datatable" or "granges". It specifies the format
#'   of the output i.e. a list of data tables or a GRangesList object. Default
#'   is "datatable".
#' @details The sequence of every transcript is divided in triplets starting
#'   from the annotated translation initiation site (if any) and proceeding
#'   towards the UTRs extremities, possibly discarding the exceeding 1 or 2
#'   nucleotides at the extremities of the transcript. Please note: transcripts
#'   not associated to any annotated 5' UTR, CDS and 3'UTR and transcripts whose
#'   coding sequence length is not divisible by 3 are automatically discarded.
#' @return A data table or GRanges object.
#' @examples
#' ## data(reads_list)
#' ## data(mm81cdna)
#' ##
#' ## ## compute and add p-site datails
#' ## psite_offset <- psite(reads_list, flanking = 6, extremity = "auto")
#' ## reads_psite_list <- psite_info(reads_list, psite_offset)
#' ##
#' ## Compute the codon coverage based on the number of ribosome footprint per
#' ## codon, setting the minimum overlap between reads and triplets to 3 nts:
#' ## coverage_dt <- codon_coverage(reads_psite_list, mm81cdna, min_overlap = 3)
#' ## 
#' ## Compute the coverage based on the number of P-sites per codon:
#' ## coverage_dt <- codon_coverage(reads_psite_list, mm81cdna, psite = TRUE)
#' @import data.table
#' @export
codon_coverage <- function(data, annotation, sample = NULL, psite = FALSE,
                           min_overlap = 1, output_class = "datatable") {
  
  if((psite == TRUE || psite == T) & min_overlap != 1){
    cat("\n")
    stop("invalid value for min_over when psite is TRUE\n\n")
  }
  
  if(length(sample) == 0){
    sample <- names(data)
  }
  
  bin <- 3
  
  cat("1. creating codon table and computing distance from start/stop codon\n")
  
  l_transcripts <- as.character(annotation[l_utr5 > 0 & 
                                             l_cds > 0 &
                                             l_cds %% 3 == 0 &
                                             l_utr3 > 0, transcript])
  
  bin_coverage_tab <- annotation[as.character(transcript) %in% l_transcripts
                                 ][order(transcript)
                                   ][, transcript := factor(transcript, levels = transcript)
                                     ][, start := l_utr5 %% bin
                                       ][, l_utr5 := l_utr5 - (l_utr5 %% bin)
                                         ][, l_cds := l_cds - (l_cds %% bin)
                                           ][, l_utr3 := l_utr3 - (l_utr3 %% bin)
                                             ][, len := l_utr5 + l_cds + l_utr3
                                               ][, list(start = seq(from = start, to = start + len - bin, by = bin),
                                                        end = seq(from = start, to = start + len - bin, by = bin) + bin,
                                                        from_cds_start = (seq(from = start, to = start + len - bin, by = bin) - (start + l_utr5)) / bin,
                                                        from_cds_stop = (seq(from = start, to = start + len - bin, by = bin) - (start + l_utr5 + l_cds - bin)) / bin),
                                                        by = transcript
                                                 ]
  
  gr_interval <- GenomicRanges::GRanges(seqnames = bin_coverage_tab$transcript,
                                        IRanges::IRanges(bin_coverage_tab$start + 1,  width = bin),
                                        strand = "+")
  
  cat("2. acquiring region information\n")
  bin_coverage_tab[, region := "5utr"
                   ][from_cds_start >= 0 & from_cds_stop <= 0, region := "cds"
                     ][from_cds_stop > 0, region := "3utr"]

  if(psite == T || psite == TRUE){
    cat("3. computing codon coverage based on P-sites\n")
  } else {
    cat("3. computing codon coverage based on read footprints\n")
  }
  
  for(samp in sample){
    cat(sprintf("sample : %s\n", samp))
    
    dt <- data[[samp]]
    
    if(class(dt)[1] == "GRanges"){
      dt <- as.data.table(dt)[, c("width", "strand") := NULL
                              ][, seqnames := as.character(seqnames)]
      setnames(dt, c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
    }
    
    dt <- dt[as.character(transcript) %in% levels(bin_coverage_tab$transcript)
             ][, transcript := factor(transcript, levels = levels(bin_coverage_tab$transcript))]
    
    if(psite == T || psite == TRUE){
      gr_read <- GenomicRanges::GRanges(seqnames = dt$transcript,
                                        IRanges::IRanges(dt$psite, dt$psite),
                                        strand="+")
    } else {
      gr_read <- GenomicRanges::GRanges(seqnames = dt$transcript,
                                        IRanges::IRanges(dt$end5, dt$end3),
                                        strand="+")
    }
    
    bin_coverage_tab[, (samp) := GenomicRanges::countOverlaps(gr_interval, gr_read, minoverlap = min_overlap)]
  }
  
  if(output_class == "granges"){
    bin_coverage_tab <- GenomicRanges::makeGRangesFromDataFrame(bin_coverage_tab,
                                                                keep.extra.columns = TRUE,
                                                                ignore.strand = TRUE,
                                                                seqnames.field = c("transcript"),
                                                                start.field = "end5",
                                                                end.field = "end3",
                                                                strand.field = "strand",
                                                                starts.in.df.are.0based = FALSE)
    GenomicRanges::strand(bin_coverage_tab) <- "+"
  }
  
  return(bin_coverage_tab)
}
LabTranslationalArchitectomics/riboWaltz documentation built on Jan. 17, 2024, 12:18 p.m.