R/cds_coverage.R

Defines functions cds_coverage

Documented in cds_coverage

#' Number of in-frame P-sites per coding sequence or transcript.
#'
#' This function generates a data table that for each transcript contains at
#' least i) its name; ii) its length or the length of its annotated coding
#' sequence (if any); iii) the number of P-sites falling in its annotated coding
#' sequence or in the whole transcript for all samples. When the coverage is
#' computed for coding sequences and not whole transcripts, a chosen number of
#' nucleotides at the beginning and/or at the end of the CDSs can be excluded
#' for restricting the analysis to a portion of the original coding sequence. In
#' this case the output data table includes an additional column reporting the
#' length of the selected region. Moreover, either all P-sites or only in-frame
#' P-sites can be considered. Finally, transcripts without annotated CDS are
#' automatically discarded.
#'
#' @param data Either list of data tables or GRangesList object from
#'   \code{\link{psite_info}}.
#' @param annotation Data table as generated by \code{\link{create_annotation}}.
#' @param start_nts Positive integer specifying the number of nucleotides at the
#'   beginning of the coding sequences to be excluded from the analisys. Default
#'   is 0. This parameter is considered only if \code{whole_transcript} is
#'   FALSE.
#' @param stop_nts Positive integer specifying the number of nucleotides at the
#'   end of the coding sequences to be excluded from the analisys. Default is 0.
#'   This parameter is considered only if \code{whole_transcript} is FALSE.
#' @param in_frame Logical value whether to consider only in-frame P-sites when
#'   computing the coverage. Default is TRUE. This parameter is considered only
#'   if \code{whole_transcript} is FALSE.
#' @param whole_transcript Logical value whether to compute the coverage based
#'   on P-sites falling in any region of the transcript, i.e. CDS and UTRs.
#'   Default is FALSE.
#' @return A data table.
#' @examples
#' ## data(reads_list)
#' ## data(mm81cdna)
#' ##
#' ## ## compute and add p-site datails
#' ## psite_offset <- psite(reads_list, flanking = 6, extremity = "auto")
#' ## reads_psite_list <- psite_info(reads_list, psite_offset)
#' ##
#' ## ## Compute the number of in-frame P-sites per whole coding sequences.
#' ## psite_cds <- cds_coverage(reads_psite_list, mm81cdna)
#' ## 
#' ## ## Compute the number of in-frame P-sites per coding sequences exluding
#' ## ## the first 15 nucleotides and the last 10 nucleotides.
#' ## psite_cds <- cds_coverage(reads_psite_list, mm81cdna,
#' ##                           start_nts = 15, stop_nts = 10)
#' ## 
#' ## ## Compute the number of P-sites per transcripts.
#' ## psite_cds <- cds_coverage(reads_psite_list, mm81cdna,
#' ##                           whole_transcript = TRUE)
#' @import data.table
#' @export
cds_coverage <- function(data, annotation, start_nts = 0, stop_nts = 0,
                         in_frame = TRUE, whole_transcript = FALSE) {

  if(whole_transcript == TRUE & (start_nts != 0 | stop_nts != 0)){
    cat("\n")
    warning("either start_nts or stop_nts are not 0 but parameter whole_transcript = TRUE: start_nts and stop_nts won't be considered\n", call. = FALSE)
  }
  
  if(whole_transcript == FALSE){
    psite_cds <- annotation[l_cds > start_nts + stop_nts,
                            list(transcript, length_cds = l_cds)]
    
    if(start_nts != 0 | stop_nts != 0){
      psite_cds[, length_selection := length_cds - start_nts - stop_nts]
    }
  } else {
    psite_cds <- annotation[, list(transcript, length_tr = l_tr)]
  }
  
  for (n in names(data)) {
    cat(sprintf("processing %s\n", n))
    
    dt <- data[[n]]
    
    if(class(dt)[1] == "GRanges"){
      dt <- as.data.table(dt)[, c("width", "strand") := NULL
                              ][, seqnames := as.character(seqnames)]
      setnames(dt, c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
    }

    if(whole_transcript == FALSE){
      dt <- dt[psite_from_start >= start_nts &
                 psite_from_stop <= -stop_nts]
      if(in_frame == TRUE){ 
        dt <- dt[psite_from_start %% 3 == 0]
      }
    }
    
    temp_count <- dt[, .N, by = transcript]
    psite_cds <- merge(psite_cds, temp_count, by = "transcript", all.x = TRUE)
    setnames(psite_cds, old = "N", new = n)
  }
  psite_cds[is.na(psite_cds)] <- 0
  return(psite_cds)
}
LabTranslationalArchitectomics/riboWaltz documentation built on Jan. 17, 2024, 12:18 p.m.