#' Empirical codon usage indexes.
#'
#' This function computes empirical codon usage indexes based on either the
#' ribosome P-sites, A-site or E-site frequency associated with in-frame P-sites
#' falling in the coding sequence. For each sample it computes 64
#' triplet-specific codon usage indexes ranging from 0 to 1 and optionally
#' normalized for the frequency of the corresponding triplet within the CDSs.
#' This function also allows to compare either two sets of codon usage indexes
#' from different samples or the set of codon usage indexes computed for one
#' sample and 64 triplet-specific values provided by the user. Multiple samples
#' and replicates can be handled.
#'
#' @param data Either list of data tables or GRangesList object from
#' \code{\link{psite_info}}. Each data table may or may not include one or
#' more columns among \emph{p_site_codon}, \emph{a_site_codon} and
#' \emph{e_site_codon} reporting the three nucleotides covered by the P-site,
#' A-site and E-site, respectively. These columns can be previously generated
#' by the \code{\link{psite_info}} function. Otherwise, the column of interest
#' must be specified by \code{site} and it is automatically generated starting
#' from a FASTA file or a BSgenome data package.
#' @param annotation Data table as generated by \code{\link{create_annotation}}.
#' @param sample Either character string, character string vector or named list
#' of character string(s)/character string vector(s) specifying the name of
#' the sample(s) and replicate(s) of interest. If a list is provided, each
#' element of the list is considered as an independent sample associated with
#' one ore multiple replicates. Multiple samples and replicates are handled
#' and organized according to \code{multisamples} and \code{plot_style}.
#' @param multisamples Either "average" or "independent". It specifies how to
#' handle multiple samples and replicates stored in \code{sample}:
#' * if \code{sample} is a character string vector and \code{multisamples} is
#' set to "average", the elements of the vector are considered as replicates
#' of one sample.
#' * if \code{sample} is a character string vector and \code{multisamples} is
#' set to "independent", each element of the vector is analysed independently
#' of the others.
#' * if \code{sample} is a list, \code{multisamples} must be set to "average".
#' Each element of the list is analysed independently of the others, its
#' replicates averaged and its name reported in the plot.
#' Note: when this parameter is set to "average" the bar plot associated with
#' each sample displays the triplet-specific mean signal computed across the
#' replicates and the corresponding standard error is also reported. Scatter
#' plots (see \code{contrast_sample}) display the mean signal but not the
#' standard error to streamline the plot.
#' Default is "average".
#' @param plot_style Either "split" or "facet". It specifies how to organize and
#' display multiple bar plots:
#' * "split": one bar plot for each sample is returned as an independent
#' ggplot object. In each plot the bars are arranged in increasing order
#' according to the codon usage indexes.
#' * "facet": the bar plots are placed one below the other, in independent
#' boxes. The bars of all plots are arranged following the alphabetical order
#' of the codons.
#' Note: this parameter is considered only if \code{contrast_sample} is not
#' specified and thus bar plots are generated.
#' Default is "split".
#' @param site Either "psite, "asite", "esite". It specifies if the empirical
#' codon usage indexes should be based on ribosome P-sites ("psite"), A-sites
#' ("asite") or E-sites ("esite"). Default is "psite".
#' @param frequency_normalization Logical value whether to normalize the
#' 64 codon usage indexes for the corresponding codon frequencies in coding
#' sequences. Default is TRUE.
#' Note: the low frequency of the three stop codons often leads to codon usage
#' indexes higher for these triplets than for the others. To discard the stop
#' codons from the plots and avoid potential biases in the interpretation of
#' the data, see \code{include_stop_codons}.
#' @param contrast_sample Either character string or character string vector. It
#' specifies the sample(s) (if any) to be considered for the comparison of
#' codon usage indexes between:
# * two samples: \code{contrast_sample} must be a character string vector of
#' exactly two elements from those listed either in \code{sample} (if
#' \code{sample} is a character string or a character string vector) or in
#' names(\code{sample}) (if \code{sample} is a list).
#' * one sample and 64 triplet-specific values: \code{contrast_sample} must be
#' one character string from those listed either in \code{sample} (if
#' \code{sample} is a character string or a character string vector) or in
#' names(\code{sample}) (if \code{sample} is a list). Moreover,
#' \code{codon_values} must be provided.
#' Note: when this parameter is specified the function generates a scatter
#' plot instead of one or multiple bar plots, it performs a linear regression
#' and computes the Pearson correlation coefficient. See also
#' \code{label_scatter}, \code{label_number} and \code{label_aminoacid}.
#' Default is "NULL".
#' @param codon_values Data table containing 64 triplet-specific values to be
#' compared with the empirical codon usage indexes computed for the sample
#' specified in \code{contrast_sample}. The data table must contain the DNA or
#' RNA nucleotide sequence of the 64 codons and the corresponding values
#' arranged in two columns named \emph{codon} and \emph{value}, respectively.
#' Additional columns are ignored. This parameter is considered only if
#' \code{contrast_sample} is specified. Default is NULL.
#' @param fastapath Character string specifying the FASTA file used in the
#' alignment step, including its path, name and extension. This file can
#' contain reference nucleotide sequences either of genome asseblies
#' (chromosome sequences) or of transcripts (see \code{Details} and
#' \code{fasta_genome}). Please make sure the sequences derive from the same
#' release of the annotation file used in the \code{\link{create_annotation}}
#' function. Note: either \code{fastapath} or \code{bsgenome} is required to
#' compute the frequency of each codon along sequences, used as normalization
#' factors, even if \code{data} includes one or more columns among
#' \emph{p_site_codon}, \emph{a_site_codon} and \emph{e_site_codon}. Default
#' is NULL.
#' @param fasta_genome Logical value whether the FASTA file specified by
#' \code{fastapath} contains nucleotide sequences of genome asseblies
#' (chromosome sequences). If TRUE (the default), an annotation object is
#' required (see \code{gtfpath} and \code{txdb}). FALSE implies nucleotide
#' sequences of transcripts are provided instead.
#' @param refseq_sep Character specifying the separator between reference
#' sequences' name and additional information to discard, stored in the
#' headers of the FASTA file specified by \code{fastapath} (if any). It might
#' be required for matching the reference sequences' identifiers reported in
#' the input list of data tables. All characters before the first occurrence
#' of the specified separator are kept. Default is NULL i.e. no string
#' splitting is performed.
#' @param bsgenome Character string specifying the BSgenome data package with
#' the genome sequences to be loaded. If not already present in the system, it
#' is automatically installed through the biocLite.R script (check the list of
#' available BSgenome data packages by running the
#' \code{\link[BSgenome]{available.genomes}} function of the BSgenome
#' package). This parameter must be coupled with an annotation object (see
#' \code{gtfpath} and \code{txdb}). Please make sure the sequences included in
#' the specified BSgenome data pakage are in agreement with the sequences used
#' in the alignment step. Note: either \code{fastapath} or \code{bsgenome} is
#' required to compute the frequency of each codon along sequences, used as
#' normalization factors, even if \code{data} includes one or more columns
#' among \emph{p_site_codon}, \emph{a_site_codon} and \emph{e_site_codon}.
#' Default is NULL.
#' @param gtfpath Character string specifying the location of a GTF file,
#' including its path, name and extension. Please make sure the GTF file and
#' the sequences specified by \code{fastapath} or \code{bsgenome} derive from
#' the same release. Note that either \code{gtfpath} or \code{txdb} is
#' required if and only if nucleotide sequences of genome assemblies
#' (chromosome sequences) are provided (see \code{fastapath} or
#' \code{bsgenome}). Default is NULL.
#' @param txdb Character string specifying the TxDb annotation package to be
#' loaded. If not already present in the system, it is automatically installed
#' through the biocLite.R script (check
#' \href{http://bioconductor.org/packages/release/BiocViews.html#___TxDb}{here}
#' the list of available TxDb annotation packages). Please make sure the TxDb
#' annotation package and the sequences specified by \code{fastapath} or
#' \code{bsgenome} derive from the same release. Note that either
#' \code{gtfpath} or \code{txdb} is required if and only if nucleotide
#' sequences of genome assemblies (chromosome sequences) are provided (see
#' \code{fastapath} or \code{bsgenome}). Default is NULL.
#' @param dataSource Optional character string describing the origin of the GTF
#' data file. This parameter is considered only if \code{gtfpath} is
#' specified. For more information about this parameter please refer to the
#' description of \emph{dataSource} of the
#' \code{\link[GenomicFeatures]{makeTxDbFromGFF}} function included in the
#' \code{GenomicFeatures} package.
#' @param organism Optional character string reporting the genus and species of
#' the organism of the GTF data file. This parameter is considered only if
#' \code{gtfpath} is specified. For more information about this parameter
#' please refer to the description of \emph{organism} of the
#' \code{\link[GenomicFeatures]{makeTxDbFromGFF}} function included in the
#' \code{GenomicFeatures} package.
#' @param transcripts Character string vector listing the name of transcripts to
#' be included in the analysis. Default is NULL i.e. all transcripts are used.
#' Please note: transcripts without annotated CDS and transcripts whose coding
#' sequence length is not divisible by 3 are automatically discarded.
#' @param length_range Integer or integer vector for restricting the plot to a
#' chosen range of read lengths. Default is NULL, i.e. all read lengths are
#' used. If specified, this parameter prevails over \code{cl}.
#' @param cl Integer value in [1,100] specifying a confidence level for
#' restricting the plot to an automatically-defined range of read lengths. The
#' new range is computed according to the most frequent read lengths, which
#' accounts for the cl% of the sample and is defined by discarding the
#' (100-cl)% of read lengths falling in the tails of the read lengths
#' distribution. If multiple samples are analysed, a single range of read
#' lengths is computed such that at least the cl% of all samples is
#' represented. Default is 100.
#' @param include_stop_codons Logical value whether to include the three stop
#' codons in the plots. Default is TRUE.
#' @param label_scatter Logical value whether to label the dots in the scatter
#' plot. Each dot is labeled using either the nucleotide sequence of the codon
#' or the corresponding amino acid symbol (see \code{label_aminoacid}). This
#' parameter is considered only if \code{contrast_sample} is specified.
#' Default is FALSE.
#' @param label_number Integer value in [1,64] specifying how many dots in the
#' scatter plot should be labeled. Dots farthest from the confident interval
#' of the regression line are automatically identified and labeled. Default is
#' 64 i.e. all dots are labeled. This parameter is considered only if
#' \code{label_scatter} is TRUE.
#' @param label_aminoacid Logical value whether to use amino acid symbols to
#' label the dots of the scatter plot. Default is FALSE i.e. codon nucleotide
#' sequences are used instead. This parameter is considered only if
#' \code{label_scatter} is TRUE.
#' @return List containing: one or more ggplot object(s) and the data table with
#' the corresponding x- and y-axis values ("plot_dt"); an additional data
#' table with raw, normalized and scaled number of P-sites associated with the
#' 64 triplets for each sample contributing to the plot ("count_dt").
#' @details \strong{riboWaltz} only works for read alignments based on
#' transcript coordinates. This choice is due to the main purpose of RiboSeq
#' assays to study translational events through the isolation and sequencing
#' of ribosome protected fragments. Most reads from RiboSeq are supposed to
#' map on mRNAs and not on introns and intergenic regions. BAM based on
#' transcript coordinates can be generated in two ways: i) aligning directly
#' against transcript sequences; ii) aligning against sequences of genome
#' assemblies i.e. standard chromosome sequences, thus requiring the outputs
#' to be translated in transcript coordinates. The first option can be easily
#' handled by many aligners (e.g. Bowtie), given a reference FASTA file where
#' each sequence represents a transcript, from the beginning of the 5' UTR to
#' the end of the 3' UTR. The second procedure is based on reference FASTA
#' files where each sequence represents a chromosome, usually coupled with
#' comprehensive gene annotation files (GTF or GFF). The STAR aligner, with
#' its option --quantMode TranscriptomeSAM (see Chapter 6 of its
#' \href{http://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STAR.posix/doc/STARmanual.pdf}{manual}),
#' is an example of tool providing such a feature.
#' @examples
#' ## data(reads_list)
#' ## data(mm81cdna)
#' ##
#' ## ## Generate fake samples and replicates
#' ## for(i in 2:6){
#' ## samp_name <- paste0("Samp", i)
#' ## set.seed(i)
#' ## reads_list[[samp_name]] <- reads_list[["Samp1"]][sample(.N, 5000)]
#' ## }
#' ##
#' ## ## Compute and add p-site details
#' ## psite_offset <- psite(reads_list, flanking = 6, extremity = "auto")
#' ## reads_psite_list <- psite_info(reads_list, psite_offset)
#' ##
#' ## ## Define the list of samples and replicate to use as input
#' ## input_samples <- list("S1" = c("Samp1", "Samp2"),
#' ## "S2" = c("Samp3", "Samp4", "Samp5"),
#' ## "S3" = c("Samp6"))
#' ##
#' ## Generate bar plots, alignment on transcript sequences
#' ## example_cu_barplot <- codon_usage_psite(reads_psite_list, mm81cdna,
#' ## sample = input_samples,
#' ## multisamples = "average",
#' ## plot_style = "facet",
#' ## fastapath = "path/to/transcriptome/FASTA/file",
#' ## fasta_genome = FALSE)
#' ##
#' ## Generate bar plots, alignment on chromosome sequences
#' ## example_cu_barplot <- codon_usage_psite(reads_psite_list, mm81cdna,
#' ## sample = input_samples,
#' ## multisamples = "average",
#' ## plot_style = "facet",
#' ## fastapath = "path/to/chromosome/FASTA/file",
#' ## fasta_genome = TRUE)
#' ##
#' ## Generate scatterplot comparing two samples
#' ## example_cu_barplot <- codon_usage_psite(reads_psite_list, mm81cdna,
#' ## sample = input_samples,
#' ## contrast_sample = c("S1", "S2"),
#' ## fastapath = "path/to/transcriptome/FASTA/file",
#' ## fasta_genome = FALSE,
#' ## frequency_normalization = FALSE)
#' @import data.table
#' @import ggplot2
#' @export
codon_usage_psite <- function(data, annotation, sample, multisamples = "average",
plot_style = "split", site = "psite",
frequency_normalization = TRUE,
contrast_sample = NULL, codon_values = NULL,
fastapath = NULL, fasta_genome = TRUE,
refseq_sep = NULL, bsgenome = NULL, gtfpath = NULL,
txdb = NULL, dataSource = NA, organism = NA,
transcripts = NULL, length_range = NULL, cl = 100,
include_stop_codons = TRUE, label_scatter = FALSE,
label_number = 64, label_aminoacid = FALSE) {
if(class(data[[1]])[1] == "GRanges"){
data_tmp <- list()
for(i in names(data)){
data_tmp[[i]] <- as.data.table(data[[i]])[, c("width", "strand") := NULL
][, seqnames := as.character(seqnames)]
setnames(data_tmp[[i]], c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
}
data <- data_tmp
}
check_sample <- setdiff(unlist(sample), names(data))
if(length(check_sample) != 0){
cat("\n")
stop(sprintf("incorrect sample name(s): \"%s\" not found\n\n",
paste(check_sample, collapse = ", ")))
}
if(length(sample) == 0){
cat("\n")
stop("at least one sample name must be spcified\n\n")
}
if(!is.null(contrast_sample)){
if(is.list(sample) & !all(contrast_sample %in% names(sample))){
cat("\n")
stop("contrast_sample includes at least one character string not listed in names(sample)\n\n")
}
if(!is.list(sample) & !all(contrast_sample %in% sample)){
cat("\n")
stop("contrast_sample includes at least one character string not listed in parameter sample\n\n")
}
if(length(contrast_sample) >= 2 & length(sample) > 1 & is.character(sample) & multisamples == "average"){
cat("\n")
warning("contrast_sample includes samples which are averaged according to
multisamples = \"average\". Parameter multisamples will be coerced to
\"independent\"\n", call. = FALSE)
multisamples <- "independent"
}
if(length(contrast_sample) == 1 & is.null(codon_values)){
cat("\n")
warning("length of contrast_sample is 1 but codon_values is NULL:
contrast_sample will not be considered.\n", call. = FALSE)
contrast_sample <- NULL
}
if(length(contrast_sample) > 2){
cat("\n")
warning("number of samples specified in contrast_sample > 2:
the first two samples will be considered\n", call. = FALSE)
contrast_sample <- contrast_sample[1:2]
}
if(length(contrast_sample) == 2 & length(sample) == 1){
cat("\n")
stop("length of contrast_sample is 2 but only one element is listed in sample.\n\n")
}
if(!is.null(contrast_sample)){
if(is.list(sample)){
sample <- sample[contrast_sample]
} else {
sample <- contrast_sample
}
}
}
if(!is.null(codon_values)){
if(length(contrast_sample) == 2){
cat("\n")
warning("codon_values is specified but two samples are listed in contrast_sample:
codon_values will not be considered\n", call. = FALSE)
codon_values <- NULL
}
if(is.null(contrast_sample)){
cat("\n")
warning("codon_values is specified but contrast_sample is NULL:
codon_values will not be considered\n", call. = FALSE)
codon_values <- NULL
}
if (!is.null(codon_values) & uniqueN(as.character(codon_values$codon)) < 64){
cat("\n")
stop("number of different triplets in codon_values < 64. At least one codon is missing\n\n")
} else {
if (!is.null(codon_values) & uniqueN(as.character(codon_values$codon)) > 64){
cat("\n")
stop("number of different triplets in codon_values > 64. Too many codons\n\n")
}
}
}
if(!(site %in% c("psite", "asite", "esite"))){
cat("\n")
warning("parameter site must be either \"psite\", \"asite\" or \"esite\":
parameter site will be coerced to default \"psite\"\n", call. = FALSE)
site <- "psite"
}
if(length(length_range) != 0 & !inherits(length_range, "numeric") & !inherits(length_range, "integer")){
cat("\n")
warning("class of length_range is neither numeric nor integer. Set to default NULL\n", call. = FALSE)
length_range = NULL
}
if(!(multisamples %in% c("average", "independent"))){
cat("\n")
warning("parameter multisamples must be either \"average\" or \"independent\".
Set to default \"average\"\n", call. = FALSE)
multisamples <- "average"
}
if(multisamples == "independent" & is.list(sample)) {
cat("\n")
warning("parameter multisamples is set to \"independent\" but parameter sample is a list:
parameter multisamples will be coerced to default \"average\"\n", call. = FALSE)
multisamples <- "average"
}
if(is.character(sample) & length(sample) == 1) {
multisamples <- "independent"
plot_style <- "split"
}
if(is.character(sample) & length(sample) > 1 & multisamples == "average") {
sample <- list("Average" = sample)
plot_style <- "split"
cat("\n")
warning("Default name of averaged samples is \"Average\":
consider to use a named list of one element to provide a meaningful plot title\n", call. = FALSE)
}
if(is.list(sample) & length(sample) == 1){
plot_style <- "split"
}
if(!is.null(contrast_sample)){
plot_style <- "split"
}
if(!(plot_style %in% c("split", "facet"))){
cat("\n")
warning("parameter plot_style must be either \"split\" or \"facet\".
Set to default \"split\"\n", call. = FALSE)
plot_style <- "split"
}
# select transcripts
l_transcripts <- as.character(annotation[l_cds > 0 & l_cds %% 3 == 0, transcript])
if (length(transcripts) == 0) {
c_transcripts <- l_transcripts
} else {
c_transcripts <- intersect(l_transcripts, transcripts)
}
# define length range taking into account all (unlisted) samples
if(length(length_range) == 0){
for(samp in as.character(unlist(sample))){
dt <- data[[samp]][transcript %in% c_transcripts]
if(length(length_range) == 0){
length_range <- seq(quantile(dt$length, (1 - cl/100)/2),
quantile(dt$length, 1 - (1 - cl/100)/2))
} else {
xmin <- min(min(length_range), quantile(dt$length, (1 - cl/100)/2))
xmax <- max(max(length_range), quantile(dt$length, 1 - (1 - cl/100)/2))
length_range <- seq(xmin, xmax)
}
}
}
# check if all samples have reads of the specified lengths
# especially required if only one read length is passed
if(length(length_range) != 0){
if(is.list(sample)){
samp_dt <- data.table(stack(sample))
setnames(samp_dt, c("sample", "sample_l"))
} else {
samp_dt <- data.table("sample" = sample, "sample_l" = sample)
}
for(samp in samp_dt$sample){
dt <- data[[samp]][cds_start != 0 & cds_stop !=0]
if(length(c_transcripts) != 0) {
dt <- dt[transcript %in% c_transcripts]
}
len_check <- unique(dt$length)
if(sum(length_range %in% len_check) == 0) {
cat("\n")
warning(sprintf("\"%s\" doesn't contain any reads of the selected lengths: sample removed\n", samp), call. = FALSE)
#select element of sample which include the sample to be removed (useful if sample is a list)
sel_l_samp <- samp_dt[sample == samp, sample_l]
#remove the sample from the list/vector
if(is.list(samp)){
sample[[sel_l_samp]] <- sample[[sel_l_samp]][sample[[sel_l_samp]] != samp]
} else {
sample <- sample[sample != samp]
}
}
}
}
if(is.null(unlist(sample))){
cat("\n")
stop("none of the data tables listed in sample contains any reads of the specified lengths\n\n")
}
# if the "site column" is not present in all specified samples or frequency_normalization == T
# add the "site column"
if((sum(lapply(data[as.character(unlist(sample))], function(x)
(gsub("site", "_site_codon", site) %in% colnames(x))) == TRUE) < length(as.character(unlist(sample)))) |
((sum(lapply(data[as.character(unlist(sample))], function(x)
(gsub("site", "_site_codon", site) %in% colnames(x))) == TRUE) == length(as.character(unlist(sample)))) &
(frequency_normalization == TRUE | frequency_normalization == T))){
if(length(fastapath) == 0 & length(bsgenome) == 0){
cat("\n")
if(sum(lapply(data[as.character(unlist(sample))], function(x)
(gsub("site", "_site_codon", site) %in% colnames(x))) == TRUE) < length(as.character(unlist(sample)))){
stop(paste0("unable to add ", gsub("site", "_site_codon", site), " column. Either fastpath or bsgenome must be specified\n\n"))
} else {
cat("\n")
warning("no nucleotide sequences provided.
Either fastpath or bsgenome must be specified to normalize the data for triplet frequencies:
non-normalized codon usage indexes will be returned\n", .call = FALSE)
frequency_normalization = FALSE
}
}
if(((length(fastapath) != 0 & (fasta_genome == TRUE | fasta_genome == T)) |
length(bsgenome) != 0) &
length(gtfpath) == 0 & length(txdb) == 0){
cat("\n")
stop("genome annotation file not specified. Either gtfpath or txdb object must be specified\n\n")
}
if(length(fastapath) != 0 & length(bsgenome) != 0){
cat("\n")
warning("both fastapath and bsgenome are specified. Only fastapath will be considered\n")
bsgenome = NULL
}
if(length(gtfpath) != 0 & length(txdb) != 0){
cat("\n")
warning("both gtfpath and txdb are specified. Only gtfpath will be considered\n")
txdb = NULL
}
if((length(gtfpath) != 0 | length(txdb) != 0) &
((length(fastapath) == 0 & length(bsgenome) == 0) |
(length(fastapath) != 0 & (fasta_genome == FALSE | fasta_genome == F)))){
cat("\n")
stop("genome annotation file specified but no sequences from genome assembly provided\n\n")
}
# read the gtf
if(length(gtfpath) != 0 | length(txdb) != 0){
if(length(gtfpath) != 0){
path_to_gtf <- gtfpath
suppressWarnings(txdbanno <- GenomicFeatures::makeTxDbFromGFF(file = path_to_gtf,
format = "gtf",
dataSource = dataSource,
organism = organism))
} else {
if(txdb %in% rownames(installed.packages())){
library(txdb, character.only = TRUE)
} else {
source("https://bioconductor.org/biocLite.R")
biocLite(txdb, suppressUpdates = TRUE)
library(txdb, character.only = TRUE)
}
txdbanno <- get(txdb)
}
}
#read the FASTA and (if needed) move from chromosome sequences to transcript sequences
if(length(fastapath) != 0 | length(bsgenome) != 0){
if(length(fastapath) != 0) {
if(fasta_genome == TRUE | fasta_genome == T){
temp_sequences <- Biostrings::readDNAStringSet(fastapath, format = "fasta", use.names = TRUE)
if(length(refseq_sep) != 0){
names(temp_sequences) <- tstrsplit(names(temp_sequences), refseq_sep, fixed = TRUE, keep = 1)[[1]]
}
exon <- suppressWarnings(GenomicFeatures::exonsBy(txdbanno, by = "tx", use.names = TRUE))
exon <- as.data.table(exon[unique(names(exon))])
sub_exon_plus <- exon[as.character(seqnames) %in% names(temp_sequences) & strand == "+"]
sub_exon_minus <- exon[as.character(seqnames) %in% names(temp_sequences) & strand == "-"
][, new_end := Biostrings::width(temp_sequences[as.character(seqnames)]) - start + 1
][, new_start := Biostrings::width(temp_sequences[as.character(seqnames)]) - end + 1]
seq_dt_plus <- sub_exon_plus[, nt_seq := "emp"
][, nt_seq := as.character(Biostrings::subseq(temp_sequences[as.character(seqnames)],
start = start,
end = end))
][, list(seq = paste(nt_seq, collapse = "")), by = group_name]
revcompl_temp_sequences <- Biostrings::reverseComplement(temp_sequences)
seq_dt_minus <- sub_exon_minus[, nt_seq := "emp"
][, nt_seq := as.character(Biostrings::subseq(revcompl_temp_sequences[as.character(seqnames)],
start = new_start,
end = new_end))
][, list(seq = paste(nt_seq, collapse = "")), by = group_name]
sequences <- Biostrings::DNAStringSet(c(seq_dt_plus$seq, seq_dt_minus$seq))
names(sequences) <- c(unique(sub_exon_plus$group_name), unique(sub_exon_minus$group_name))
} else {
sequences <- Biostrings::readDNAStringSet(fastapath, format = "fasta", use.names = TRUE)
if(!is.null(refseq_sep)){
names(sequences) <- tstrsplit(names(sequences), refseq_sep, fixed = TRUE, keep = 1)[[1]]
}
}
} else {
if(bsgenome %in% installed.genomes()){
library(bsgenome, character.only = TRUE)
} else {
source("http://www.bioconductor.org/biocLite.R")
biocLite(bsgenome, suppressUpdates = TRUE)
library(bsgenome, character.only = TRUE)
}
sequences <- GenomicFeatures::extractTranscriptSeqs(get(bsgenome), txdbanno, use.names = T)
}
}
# add the codon sequence associated with each P-site
for(samp in as.character(unlist(sample))){
if(!(gsub("site", "_site_codon", site) %in% colnames(data[[samp]]))){
if(site == "psite" & !("p_site_codon" %in% colnames(data[[samp]]))){
data[[samp]] <- data[[samp]][, p_site_codon := as.character(Biostrings::subseq(sequences[as.character(data[[samp]]$transcript)],
start = data[[samp]]$psite,
end = data[[samp]]$psite + 2))]
}
if(site == "asite" & !("a_site_codon" %in% colnames(data[[samp]]))){
data[[samp]] <- data[[samp]][, a_site_codon := as.character(Biostrings::subseq(sequences[as.character(data[[samp]]$transcript)],
start = data[[samp]]$psite + 3,
end = data[[samp]]$psite + 5))]
}
if(site == "esite" & !("e_site_codon" %in% colnames(data[[samp]]))){
data[[samp]] <- data[[samp]][, e_site_codon := as.character(Biostrings::subseq(sequences[as.character(data[[samp]]$transcript)],
start = data[[samp]]$psite - 3,
end = data[[samp]]$psite - 1))]
}
}
}
}
# if frequency_normalization == TRUE compute the frequency of all
# triplets in the selected transcripts
if(frequency_normalization == TRUE | frequency_normalization == T){
if(setequal(names(sequences), as.character(annotation$transcript)) == FALSE){
exc_seq <- length(setdiff(names(sequences), as.character(annotation$transcript)))
exc_anno <- length(setdiff(as.character(annotation$transcript), names(sequences)))
if(exc_seq > 0){
cat("\n")
warning(sprintf("more reference sequences (from FASTA or BSgenome) than transcript IDs (from annotation data table):
%s reference sequences discarded", exc_seq), call. = FALSE)
}
if(exc_anno > 0){
cat("\n")
warning(sprintf("more transcript IDs (from annotation data table) than sequences (from FASTA or BSgenome):
%s transcript IDs discarded", exc_seq), call. = FALSE)
}
}
c_transcripts <- intersect(c_transcripts, names(sequences))
sub_sequences <- sequences[c_transcripts]
cds_biost <- Biostrings::subseq(sub_sequences,
start = annotation[transcript %in% names(sub_sequences), l_utr5] + 1,
end = annotation[transcript %in% names(sub_sequences), l_utr5] +
annotation[transcript %in% names(sub_sequences), l_cds])
seq_freq <- (Biostrings::trinucleotideFrequency(cds_biost, step = 3,
as.prob = TRUE,
with.labels = TRUE,
simplify.as = "collapsed")) * 1000
names(seq_freq) <- gsub("T", "U", names(seq_freq))
}
# data.table for conversion codon-aa
cod_aa <- data.table(codon=c("GCC", "GCG", "GCU", "GCA", "AGA", "CGG", "AGG", "CGA", "CGC", "CGU", "AAC", "AAU", "GAC", "GAU", "UGC", "UGU", "CAA", "CAG", "GAG", "GAA", "GGC", "GGU", "GGA", "GGG", "CAC", "CAU", "AUA", "AUC", "AUU", "CUG", "CUA", "UUA", "CUU", "UUG", "CUC", "AAA", "AAG", "AUG", "UUC", "UUU", "CCG", "CCC", "CCU", "CCA", "AGC", "UCG", "UCU", "UCA", "UCC", "AGU", "UAG", "UAA", "UGA", "ACA", "ACC", "ACG", "ACU", "UGG", "UAU", "UAC", "GUA", "GUG", "GUU", "GUC"),
aa=c("A", "A", "A", "A", "R", "R", "R", "R", "R", "R", "N", "N", "D", "D", "C", "C", "Q", "Q", "E", "E", "G", "G", "G", "G", "H", "H", "I", "I", "I", "L", "L", "L", "L", "L", "L", "K", "K", "M", "F", "F", "P", "P", "P", "P", "S", "S", "S", "S", "S", "S", "*", "*", "*", "T", "T", "T", "T", "W", "Y", "Y", "V", "V", "V", "V"))
cod_lev <- gsub("U", "T", cod_aa$codon)
# compute triplet coverage
final_dt <- data.table()
for(samp in as.character(unlist(sample))) {
dt <- data[[samp]][as.character(transcript) %in% c_transcripts & psite_from_start %% 3 == 0]
if(site == "psite"){
dt <- dt[psite_region == "cds"]
temp_table <- data.table(table(factor(dt$p_site_codon, levels = cod_lev)))
} else {
if(site == "asite"){
dt <- dt[psite_from_start >= -3 & psite_from_stop <= -3]
temp_table <- data.table(table(factor(dt$a_site_codon, levels = cod_lev)))
} else {
dt <- dt[psite_from_start >= 3 & psite_from_stop <= 3]
temp_table <- data.table(table(factor(dt$e_site_codon, levels = cod_lev)))
}
}
colnames(temp_table) <- c("codon", "count")
temp_table[, codon := gsub("T", "U", codon)]
if(frequency_normalization == TRUE | frequency_normalization == T){
temp_table[, norm_count := count / seq_freq[codon]]
} else {
temp_table[, norm_count := count]
}
temp_table[, scaled_count := norm_count - min(norm_count)
][, scaled_count := scaled_count / max(scaled_count)
][, tmp_samp := samp]
final_dt <- rbind(final_dt, temp_table)
}
output <- list()
output[["count_dt"]] <- copy(final_dt[, c("tmp_samp", "codon", "count", "norm_count", "scaled_count")])
if(!(frequency_normalization == TRUE | frequency_normalization == T)){
output[["count_dt"]][, norm_count := NULL]
}
setnames(output[["count_dt"]], "tmp_samp", "sample")
if(length(contrast_sample) == 1){
codon_values <- codon_values[, list(codon, value)]
codon_values[, scaled_user_count := value - min(value)
][, scaled_user_count := scaled_user_count / max(scaled_user_count)
][, codon := gsub("T", "U", codon)]
}
#add codon classes
final_dt <- final_dt[, class := "cds"
][codon == "AUG", class := "start"
][codon %in% c("UAA", "UGA", "UAG"), class := "stop"]
# compute mean samples if a list is provided
if(is.list(sample)){
samp_dt <- data.table(stack(sample))
setnames(samp_dt, c("tmp_samp", "sample"))
# set names of samples as specified in parameter sample
final_dt <- merge.data.table(final_dt, samp_dt, sort = FALSE)[, tmp_samp := NULL]
# compute mean and se
plot_dt <- final_dt[, .(mean_scaled_count = mean(scaled_count),
se_scaled_count = sd(scaled_count/sqrt(.N))), by = .(codon, class, sample)]
if(is.null(contrast_sample)){
if(any(lengths(sample) != 1)){
output[["plot_dt"]] <- copy(plot_dt[, c("sample", "codon", "class", "mean_scaled_count", "se_scaled_count")])
setnames(output[["plot_dt"]], c("codon", "mean_scaled_count", "se_scaled_count"), c("x", "y", "y_se"))
} else {
output[["plot_dt"]] <- copy(final_dt[, c("sample", "codon", "class", "scaled_count")])
setnames(output[["plot_dt"]], c("codon", "scaled_count"), c("x", "y"))
}
}
} else {
plot_dt <- final_dt[, sample := tmp_samp
][, se_scaled_count := NA]
setnames(plot_dt, "scaled_count", "mean_scaled_count")
if(is.null(contrast_sample)){
output[["plot_dt"]] <- copy(plot_dt[, c("sample", "codon", "class", "mean_scaled_count")])
setnames(output[["plot_dt"]], c("codon", "mean_scaled_count"), c("x", "y"))
}
}
plot_dt[, sample := factor(sample, levels = unique(sample))]
#define plot dt and output dt when a scatter plot is generated
if(!is.null(contrast_sample)) {
if(length(contrast_sample) == 1){
plot_scatter_dt <- merge.data.table(plot_dt, codon_values, by = "codon")
} else {
plot_scatter_dt <- plot_dt[sample == contrast_sample[1]]
plot_scatter_dt[, scaled_user_count := plot_dt[sample == contrast_sample[2], mean_scaled_count]]
}
plot_dt <- plot_scatter_dt
output[["plot_dt"]] <- copy(plot_dt[, c("codon", "class", "mean_scaled_count", "scaled_user_count")]
)[order(codon, class)]
setnames(output[["plot_dt"]], c("mean_scaled_count", "scaled_user_count"), c("x", "y"))
}
plot_dt <- merge.data.table(plot_dt, cod_aa, by = "codon", sort = FALSE)
if(include_stop_codons == TRUE){
plot_dt <- plot_dt[, class := factor(class, levels = c("start", "cds", "stop"),
labels = c("Start codon", "CDS", "Stop codon"))]
} else {
output[["plot_dt"]] <- output[["plot_dt"]][class != "stop"]
plot_dt <- plot_dt[class != "stop"
][, class := factor(class, levels = c("start", "cds"),
labels = c("Start codon", "CDS"))]
}
oldw <- getOption("warn")
options(warn=-1)
if(is.null(contrast_sample)){
if(plot_style == "split"){
for(samp in unique(plot_dt$sample)){
sub_plot_dt <- plot_dt[sample == samp
][order(mean_scaled_count)
][, codon_aa := paste0(codon, "-", aa)
][, codon_aa := factor(codon_aa, levels = codon_aa)]
colour_codon <- ifelse(sub_plot_dt$codon == "AUG", "#333f50",
ifelse(sub_plot_dt$codon %in% c("UAA", "UGA", "UAG"),
"#8f1115", "gray40"))
bs <- 30
plot <- ggplot(sub_plot_dt, aes(x = codon_aa, y = mean_scaled_count, fill = class)) +
geom_bar(stat = "identity", alpha = 1, color = "white") +
geom_errorbar(aes(ymin = mean_scaled_count - se_scaled_count,
ymax = mean_scaled_count + se_scaled_count, color = class),
width = 0.35, linewidth = 1.1, na.rm = T, show.legend = F)
if(include_stop_codons == TRUE){
plot <- plot + scale_fill_manual(breaks = c("Start codon","Stop codon"),
values = c("#333f50", "#8f1115"), limits = c("Start codon","Stop codon")) +
scale_color_manual(breaks = c("Start codon","Stop codon"),
values = c("#333f50", "#8f1115"), limits = c("Start codon","Stop codon"))
} else {
plot <- plot + scale_fill_manual(breaks = c("Start codon", "CDS"),
values = c("#333f50", "gray40"), limits = c("Start codon", "CDS")) +
scale_color_manual(breaks = c("Start codon", "CDS"),
values = c("#333f50", "gray40"), limits = c("Start codon", "CDS"))
}
plot <- plot + theme_bw(base_size = bs) +
theme(legend.position = "top", legend.margin=margin(0,0,0,0), legend.box.margin=margin(5,0,-15,0),
legend.text = element_text(margin = margin(l = -8, unit = "pt")),
legend.title = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = bs * 0.5, colour = colour_codon),
panel.grid.major.x = element_blank(), panel.grid.minor.y = element_blank(),
plot.title = element_text(hjust = 0.5)) +
labs(title = samp, x = "Codon") +
scale_y_continuous("Usage index", limits = c(0, 1.025), breaks = seq(0, 1, 0.25))
output[[paste0("plot_", samp)]] <- plot
}
} else {
plot_dt <- plot_dt[order(class, codon)
][, codon_aa := paste0(codon, "-", aa)
][, codon_aa := factor(codon_aa, levels = unique(codon_aa))]
colour_codon <- ifelse(unique(plot_dt$codon) == "AUG", "#333f50",
ifelse(unique(plot_dt$codon) %in% c("UAA", "UGA", "UAG"),
"#8f1115", "gray40"))
bs <- 30
plot <- ggplot(plot_dt, aes(x = codon_aa, y = mean_scaled_count, fill = class)) +
geom_bar(stat = "identity", alpha = 1, color = "white") +
geom_errorbar(aes(ymin = mean_scaled_count - se_scaled_count,
ymax = mean_scaled_count + se_scaled_count, color = class),
width = 0.35, linewidth = 1.1, na.rm = T, show.legend = F)
if(include_stop_codons == TRUE){
plot <- plot + scale_fill_manual(breaks = c("Start codon","Stop codon"),
values = c("#333f50", "#8f1115"), limits = c("Start codon","Stop codon")) +
scale_color_manual(breaks = c("Start codon","Stop codon"),
values = c("#333f50", "#8f1115"), limits = c("Start codon","Stop codon"))
} else {
plot <- plot + scale_fill_manual(breaks = c("Start codon", "CDS"),
values = c("#333f50", "gray40"), limits = c("Start codon", "CDS")) +
scale_color_manual(breaks = c("Start codon", "CDS"),
values = c("#333f50", "gray40"), limits = c("Start codon", "CDS"))
}
plot <- plot + theme_bw(base_size = bs) +
facet_grid(sample ~ ., switch = "x") +
theme(legend.position = "top", legend.margin=margin(0,0,0,0), legend.box.margin=margin(5,0,-15,0),
legend.text = element_text(margin = margin(l = -12, unit = "pt")),
legend.title = element_blank(), strip.background = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = bs * 0.5, colour = colour_codon),
panel.grid.major.x = element_blank(), panel.grid.minor.y = element_blank()) +
labs(x = "Codon") +
scale_y_continuous("Usage index", limits = c(0, 1.025), breaks = seq(0, 1, 0.25))
output[["plot"]] <- plot
}
} else {
correlation <- round(cor(plot_dt$mean_scaled_count, plot_dt$scaled_user_count), 3)
bs <- 25
plot <- ggplot(plot_dt, aes(x = mean_scaled_count, y = scaled_user_count, colour = class)) +
geom_smooth(method = "lm", se = T, color = "gray80", fill = "gray80", linetype = 1,
formula = y ~ x, level = 0.99, fullrange = TRUE) +
geom_point(alpha = 0.9, size = bs * 0.14)
if(include_stop_codons == TRUE){
plot <- plot + scale_colour_manual(breaks = c("Start codon","Stop codon"),
values = c("#333f50", "#8f1115"), limits = c("Start codon","Stop codon"))
} else {
plot <- plot + scale_color_manual(breaks = c("Start codon", "CDS"),
values = c("#333f50", "gray40"), limits = c("Start codon", "CDS"))
}
plot <- plot + theme_bw(base_size = bs) +
theme(legend.position = "top", legend.margin=margin(0,0,0,0), legend.box.margin=margin(5,0,-15,0),
legend.text = element_text(margin = margin(l = -12, unit = "pt")),
panel.grid.minor = element_blank(), legend.title = element_blank()) +
scale_x_continuous(limits = c(-0.3,1.3), breaks = seq(0, 1, 0.25), expand = c(0,0)) +
scale_y_continuous(limits = c(-0.3,1.3), breaks = seq(0, 1, 0.25), expand = c(0,0)) +
coord_cartesian(xlim = c(-0.05, 1.05), ylim = c(-0.05, 1.05)) +
annotate("text", x = 1, y = 0, label = paste0("R=",correlation),
vjust = -0.2, size = bs * 0.2, hjust = 1, color = "black")
if(length(contrast_sample) == 1){
plot <- plot + labs(x = contrast_sample, y = "User value")
} else {
plot <- plot + labs(x = contrast_sample[1], y = contrast_sample[2])
}
if (label_scatter == T || label_scatter == TRUE) {
if (label_number != 64){
fit <- lm(plot_dt$scaled_user_count ~ plot_dt$mean_scaled_count)
outlier_tab <- data.table(predict.lm(fit, interval = "confidence", level = 0.95)
)[, codon := plot_dt$codon
][, scaled_user_count := plot_dt$scaled_user_count
][scaled_user_count < lwr | scaled_user_count > upr]
outlier_cod <- as.character(outlier_tab[, max_dist := min(abs(scaled_user_count - lwr), abs(scaled_user_count - upr)),
by = 1:nrow(outlier_tab)
][order(-max_dist)
][1:label_number, codon])
lab_table <- plot_dt[, codon_out := ""
][as.character(codon) %in% outlier_cod, codon_out := codon
][, aa_out := ""
][codon_out != "", aa_out := aa]
if (label_aminoacid == T || label_aminoacid == TRUE) {
label_col <- "aa_out"
} else {
label_col <- "codon_out"
}
} else {
lab_table <- plot_dt
if (label_aminoacid == T || label_aminoacid == TRUE) {
label_col <- "aa"
} else {
label_col <- "codon"
}
}
plot <- plot +
ggrepel::geom_text_repel(data = lab_table,
aes_string("mean_scaled_count", "scaled_user_count",
label = as.character(label_col)), show.legend = F)
}
output[["plot"]] <- plot
}
options(warn = oldw)
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.