#' Number of reads per codon.
#'
#' This function computes transcript-specific codon coverages, defined as the
#' number of either read footprints or P-sites mapping on each triplet of coding
#' sequences and UTRs (see \emph{Details}). The resulting data table contains,
#' for each triplet: i) the name of the corresponding reference sequence (i.e.
#' of the transcript to which it belongs); ii) its leftmost and rightmost
#' position with respect to the 1st nucleotide of the reference sequence; iii)
#' its position with respect to the 1st and the last codon of the annotated CDS
#' of the reference sequence; iv) the region of the transcript (5' UTR, CDS, 3'
#' UTR) it is in; v) the number of read footprints or P-sites falling in that
#' region for all samples.
#'
#' @param data Either list of data tables or GRangesList object from
#' \code{\link{psite_info}}. Data tables and GRanges objects generated by
#' \code{\link{bamtolist}} and \code{\link{bedtolist}} can be used if
#' \code{psite} is FALSE (the default).
#' @param annotation Data table as generated by \code{\link{create_annotation}}.
#' @param sample Character string vector specifying the name of the sample(s) of
#' interest. Default is NULL i.e. all samples in \code{data} are processed.
#' @param psite Logical value whether to return the number of P-sites per codon.
#' Default is TRUE. If FALSE, the number of read footprints per codon is
#' returned instead.
#' @param min_overlap Positive integer specifying the minimum number of
#' overlapping positions (in nucleotides) between reads and codons to be
#' considered overlapping. If \code{psite} is TRUE this parameter must be 1
#' (the default).
#' @param output_class Either "datatable" or "granges". It specifies the format
#' of the output i.e. a list of data tables or a GRangesList object. Default
#' is "datatable".
#' @details The sequence of every transcript is divided in triplets starting
#' from the annotated translation initiation site (if any) and proceeding
#' towards the UTRs extremities, possibly discarding the exceeding 1 or 2
#' nucleotides at the extremities of the transcript. Please note: transcripts
#' not associated to any annotated 5' UTR, CDS and 3'UTR and transcripts whose
#' coding sequence length is not divisible by 3 are automatically discarded.
#' @return A data table or GRanges object.
#' @examples
#' ## data(reads_list)
#' ## data(mm81cdna)
#' ##
#' ## ## compute and add p-site datails
#' ## psite_offset <- psite(reads_list, flanking = 6, extremity = "auto")
#' ## reads_psite_list <- psite_info(reads_list, psite_offset)
#' ##
#' ## Compute the codon coverage based on the number of ribosome footprint per
#' ## codon, setting the minimum overlap between reads and triplets to 3 nts:
#' ## coverage_dt <- codon_coverage(reads_psite_list, mm81cdna, min_overlap = 3)
#' ##
#' ## Compute the coverage based on the number of P-sites per codon:
#' ## coverage_dt <- codon_coverage(reads_psite_list, mm81cdna, psite = TRUE)
#' @import data.table
#' @export
codon_coverage <- function(data, annotation, sample = NULL, psite = FALSE,
min_overlap = 1, output_class = "datatable") {
if((psite == TRUE || psite == T) & min_overlap != 1){
cat("\n")
stop("invalid value for min_over when psite is TRUE\n\n")
}
if(length(sample) == 0){
sample <- names(data)
}
bin <- 3
cat("1. creating codon table and computing distance from start/stop codon\n")
l_transcripts <- as.character(annotation[l_utr5 > 0 &
l_cds > 0 &
l_cds %% 3 == 0 &
l_utr3 > 0, transcript])
bin_coverage_tab <- annotation[as.character(transcript) %in% l_transcripts
][order(transcript)
][, transcript := factor(transcript, levels = transcript)
][, start := l_utr5 %% bin
][, l_utr5 := l_utr5 - (l_utr5 %% bin)
][, l_cds := l_cds - (l_cds %% bin)
][, l_utr3 := l_utr3 - (l_utr3 %% bin)
][, len := l_utr5 + l_cds + l_utr3
][, list(start = seq(from = start, to = start + len - bin, by = bin),
end = seq(from = start, to = start + len - bin, by = bin) + bin,
from_cds_start = (seq(from = start, to = start + len - bin, by = bin) - (start + l_utr5)) / bin,
from_cds_stop = (seq(from = start, to = start + len - bin, by = bin) - (start + l_utr5 + l_cds - bin)) / bin),
by = transcript
]
gr_interval <- GenomicRanges::GRanges(seqnames = bin_coverage_tab$transcript,
IRanges::IRanges(bin_coverage_tab$start + 1, width = bin),
strand = "+")
cat("2. acquiring region information\n")
bin_coverage_tab[, region := "5utr"
][from_cds_start >= 0 & from_cds_stop <= 0, region := "cds"
][from_cds_stop > 0, region := "3utr"]
if(psite == T || psite == TRUE){
cat("3. computing codon coverage based on P-sites\n")
} else {
cat("3. computing codon coverage based on read footprints\n")
}
for(samp in sample){
cat(sprintf("sample : %s\n", samp))
dt <- data[[samp]]
if(class(dt)[1] == "GRanges"){
dt <- as.data.table(dt)[, c("width", "strand") := NULL
][, seqnames := as.character(seqnames)]
setnames(dt, c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
}
dt <- dt[as.character(transcript) %in% levels(bin_coverage_tab$transcript)
][, transcript := factor(transcript, levels = levels(bin_coverage_tab$transcript))]
if(psite == T || psite == TRUE){
gr_read <- GenomicRanges::GRanges(seqnames = dt$transcript,
IRanges::IRanges(dt$psite, dt$psite),
strand="+")
} else {
gr_read <- GenomicRanges::GRanges(seqnames = dt$transcript,
IRanges::IRanges(dt$end5, dt$end3),
strand="+")
}
bin_coverage_tab[, (samp) := GenomicRanges::countOverlaps(gr_interval, gr_read, minoverlap = min_overlap)]
}
if(output_class == "granges"){
bin_coverage_tab <- GenomicRanges::makeGRangesFromDataFrame(bin_coverage_tab,
keep.extra.columns = TRUE,
ignore.strand = TRUE,
seqnames.field = c("transcript"),
start.field = "end5",
end.field = "end3",
strand.field = "strand",
starts.in.df.are.0based = FALSE)
GenomicRanges::strand(bin_coverage_tab) <- "+"
}
return(bin_coverage_tab)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.