Nothing
##########################################################################
# This file contains a series of R functions for calculating the tRNA
# adaptation index (tAI), the S value (or correlation between tAI and the
# adjusted Nc) and other auxiliary functions.
#
# Copyright (c) 2016-2003 Mario dos Reis
#
# This file forms part of tAI, a package for the analysis of codon
# usage in DNA coding sequences.
#
# tAI is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Mario dos Reis
# mariodosreis@gmail.com
#
# version 0.21 (Dec 2016)
###########################################################################
##########################################################################
# Adjusted Nc
##########################################################################
#' Adjusted effective number of codons (Nc)
#'
#' The adjusted Nc is f(gc3s) - Nc
#'
#' @param nc a vector of length n with the effective number of codons for genes
#' @param gc3 a vector of length n with corresponding GC composition at third codon positions
#'
#' @details The adjusted Nc is calculated as described in dos Reis et al. (2004).
#'
#' @references
#' dos Reis M., Savva R., and Wernisch L. (2004) Solving the riddle of codon
#' usage preferences: a test for translational selection. \emph{Nucleic Acids Res.},
#' \bold{32:} 5036--44.
#'
#' @seealso \code{\link{nc.f}} for the function used to calculate f(gc3s)
#'
#' @return A vector of length n with adjusted Nc values
#' @author Mario dos Reis
#'
#' @examples
#'
#' eco.ncadj <- nc.adj(ecolik12$w$Nc, ecolik12$w$GC3s)
#' plot(eco.ncadj ~ ecolik12$w$Nc, xlab="Nc", ylab="Nc adjusted")
#'
#' @export
nc.adj <- function(nc, gc3) {
a = -6.0 # a, b, and c have already been optimised
b = 34.0
c = 1.025
x = a + gc3 + (b/(gc3^2 + (c - gc3)^2)) - nc
return(x)
}
##########################################################################
# Function to calculate relative adaptiveness values
##########################################################################
# non-optimised s-values:
# s <- c(0, 0, 0, 0, 0.5, 0.5, 0.75, 0.5, 0.5)
#' Relative adaptiveness values
#'
#' Calculates the relative adaptiveness values of codons based on the number
#' of tRNA genes.
#'
#' @param tRNA a vector of length 64 with tRNA gene copy numbers
#' @param s a vector of length 9 with selection penalties for codons
#' @param sking a vector of length 1 indicating the superkingdom
#'
#' @details The relative adaptiveness values are calculated as described in
#' dos Reis et al. (2003, 2004). If \code{s = NULL}, the s values are set to
#' the optimised values of dos Reis et al. (2004). \code{sking} indicates the
#' superkingdom, with 0 indicating Eukaryota, and 1 Prokaryota.
#'
#' @return A vector of length 60 of relative adaptiveness values.
#'
#' @author Mario dos Reis
#'
#' @examples
#' eco.ws <- get.ws(tRNA=ecolik12$trna, sking=1)
#'
#' @references
#' dos Reis M., Wernisch L., and Savva R. (2003) Unexpected correlations
#' between gene expression and codon usage bias from microarray data for the
#' whole \emph{Escherichia coli} K-12 genome. \emph{Nucleic Acids Res.},
#' \bold{31:} 6976--85.
#'
#' dos Reis M., Savva R., and Wernisch L. (2004) Solving the riddle of codon
#' usage preferences: a test for translational selection. \emph{Nucleic Acids Res.},
#' \bold{32:} 5036--44.
#'
#' @export
get.ws <- function(tRNA, # tRNA gene copy number
s = NULL, # selective constraints
sking) # super kingdom: 0-eukaryota, 1-prokaryota
{
# optimised s-values:
if(is.null(s)) s <- c(0.0, 0.0, 0.0, 0.0, 0.41, 0.28, 0.9999, 0.68, 0.89)
p = 1 - s
# initialise w vector
W = NULL # don't confuse w (lowercase) and W (uppercase)
# obtain absolute adaptiveness values (Ws)
for (i in seq(1, 61, by=4))
W = c(W,
p[1]*tRNA[i] + p[5]*tRNA[i+1], # INN -> NNT, NNC, NNA
p[2]*tRNA[i+1] + p[6]*tRNA[i], # GNN -> NNT, NNC
p[3]*tRNA[i+2] + p[7]*tRNA[i], # TNN -> NNA, NNG
p[4]*tRNA[i+3] + p[8]*tRNA[i+2]) # CNN -> NNG
# check methionine
W[36] = p[4]*tRNA[36]
# if bacteria, modify isoleucine ATA codon
if(sking == 1) W[35] = p[9]
# get rid of stop codons (11, 12, 15) and methionine (36)
W = W[-c(11,12,15,36)]
# get ws
w = W/max(W)
if(sum(w == 0) > 0) {
ws <- w[w != 0] # zero-less ws
gm <- exp(sum(log(ws))/length(ws)) # geometric mean
w[w == 0] = gm # substitute 0-ws by gm
}
return(w)
}
#############################################################################
# After calculating all w's, the tRNA adaptation index (tAI) can then be
# calculated. x is a matrix with all the codon frequencies per ORF in any
# analysed genome
#############################################################################
#' The tRNA adaptation index
#'
#' Calculates the tRNA adaptation index (tAI) of dos Reis et al. (2003, 2004).
#'
#' @param x an n by 60 matrix of codon frequencies for n open reading frames.
#' @param w a vector of length 60 of relative adaptiveness values for codons.
#'
#' @details The tRNA adaptation index (tAI) is a measure of the level of
#' co-adaptation between the set of tRNA genes and the codon usage bias of
#' protein-coding genes in a given genome. STOP and methionine codons are
#' ignored. The standard genetic code is assumed.
#'
#' @return A vector of length n of tAI values.
#'
#' @author Mario dos Reis
#'
#' @references
#' dos Reis M., Wernisch L., and Savva R. (2003) Unexpected correlations
#' between gene expression and codon usage bias from microarray data for the
#' whole \emph{Escherichia coli} K-12 genome. \emph{Nucleic Acids Res.},
#' \bold{31:} 6976--85.
#'
#' dos Reis M., Savva R., and Wernisch L. (2004) Solving the riddle of codon
#' usage preferences: a test for translational selection. \emph{Nucleic Acids Res.},
#' \bold{32:} 5036--44.
#'
#' @seealso \code{\link{get.ws}}
#'
#' @examples
#' # Calculate relative adaptiveness values (ws) for E. coli K-12
#' eco.ws <- get.ws(tRNA=ecolik12$trna, sking=1)
#'
#' # Calculate tAI for a set of 49 E. coli K-12 coding genes
#' eco.tai <- get.tai(ecolik12$m[,-33], eco.ws)
#'
#' # Plot tAI vs. effective number of codons (Nc)
#' plot(eco.tai, ecolik12$w$Nc, xlab="tAI", ylab="Nc")
#'
#' @export
get.tai <- function(x,w) {
w = log(w) #calculate log of w
n = apply(x,1,'*',w) #multiply each row of by the weights
n = t(n) #transpose
n = apply(n,1,sum) #sum rows
L = apply(x,1,sum) #get each ORF length
tAI = exp(n/L) #get tai
return(tAI)
}
############################################################################
# After calculating tAI, we can then calculate S, i.e. the correlation
# between tAI and g(GC3s) - Nc (aka nc.adj)
############################################################################
#' Correlation between tAI and Nc adjusted
#'
#' Calculates the correlation between tAI and Nc (adjusted for GC content at
#' third codon positions).
#'
#' @param tAI a vector of length n with tAI values for genes
#' @param nc a vector of length n with Nc values for genes
#' @param gc3 a vector of length n with GC content at third codon positions for genes
#'
#' @return Numeric of length one with the correlation between tAI and Nc adjusted
#' @author Mario dos Reis
#' @importFrom stats cor
#' @export
get.s <- function(tAI, nc, gc3) {
nc.ad <- nc.adj(nc, gc3)
ts <- cor(tAI, nc.ad, use='p')
return(ts)
}
#############################################################################
# Statistical test for tAI
#############################################################################
#' Monte Carlo test of correlation between tAI and Nc adjusted
#'
#' Calculates the p-value (using a Monte Carlo or randomisation test) that the
#' correlation (the S value) between tAI and the adjusted Nc for a set of
#' genes is different from zero.
#'
#' @param m a k by 60 matrix of codon frequencies for k genes
#' @param ws vector of length 60 of relative adaptiveness values of codons
#' @param nc vector of length k of Nc values for genes
#' @param gc3s vector of length k of GC content at third codon position for genes
#' @param ts.obs vector of length 1 with observed correlation between tAI and Nc adjusted for the k genes
#' @param samp.size a vector of length 1 with the number of genes to be sampled from m (see details)
#' @param n the number of permutations of ws in the randomisation test
#'
#' @details The Monte Carlo test is described in dos Reis et al. (2004). When
#' working with complete genomes, matrix \code{m} can have a very large number
#' of rows (large k). In this case it may be advisable to choose \code{samp.size}
#' < k to speed up the computation.
#'
#' @return A list with elements \code{p.value}, the p-value for the test, and
#' \code{ts.simulated}, a vector of length \code{n} with the simulated
#' correlations between tAI and adjusted Nc.
#'
#' @examples
#' eco.ws <- get.ws(tRNA=ecolik12$trna, sking=1)
#' eco.tai <- get.tai(ecolik12$m[,-33], eco.ws)
#' ts.obs <- get.s(eco.tai, ecolik12$w$Nc, ecolik12$w$GC3s)
#'
#' # The S-value (dos Reis et al. 2004):
#' ts.obs # [1] 0.9065442
#'
#' # There seems to be a high correlation between tAI and Nc adjusted for
#' # the 49 genes in ecolik12$m. Is the correlation statistically significant?
#' ts.mc <- ts.test(ecolik12$m[,-33], eco.ws, ecolik12$w$Nc, ecolik12$w$GC3s,
#' ts.obs, samp.size=dim(ecolik12$m)[1])
#' # The p-value is zero:
#' ts.mc$p.value # [1] 0
#'
#' # Histogram of simulated S-values:
#' hist(ts.mc$ts.simulated, n=50, xlab = "Simulated S values",
#' xlim=c(min(ts.mc$ts.simulated), ts.obs))
#' # Add the observed S-value as a red vertical line:
#' abline(v=ts.obs, col="red")
#'
#' @references
#' dos Reis M., Savva R., and Wernisch L. (2004) Solving the riddle of codon
#' usage preferences: a test for translational selection. \emph{Nucleic Acids Res.},
#' \bold{32:} 5036--44.
#'
#' @author Mario dos Reis
#' @export
ts.test <- function(m, ws, nc, gc3s, ts.obs, samp.size, n=1000) {
# create a matrix of randomly permuted w-values:
ws.permuted <- matrix(rep(ws, n), ncol = 60, byrow = T)
ws.permuted <- t(apply(ws.permuted, 1, sample))
# initialise 'translational selection' (ts) vector:
ts = numeric(length(ws.permuted[,1]))
# work with a smaller m matrix to reduce computation time:
samp <- sample(nrow(m), samp.size)
m.samp <- m[samp,]
nc <- nc[samp]
gc3s <- gc3s[samp]
nc.ad <- nc.adj(nc, gc3s)
# calculate simulated ts values:
for(i in 1:length(ws.permuted[,1])) {
tai <- get.tai(m.samp, ws.permuted[i,])
co <- cor(nc.ad, tai, use = 'p')
ts[i] <- co
}
# p-values is:
p.value <- sum(ts > ts.obs)
return(list(p.value=p.value, ts.simulated=ts))
}
#############################################################################
# Nc plotting function
#############################################################################
#' Nc vs. GC3s
#'
#' Calculates the expected Nc value of a gene for a given GC content at the
#' third codon positions.
#'
#' @param x a vector of GC contents at third codon positions
#'
#' @details Without selection on codon bias, the expected value of Nc as a
#' function of GC content at third positions, x, is given by
#' \deqn{f(x) = -6 + x + 34/(x^2 + (1.025 - x)^2).} This equation
#' follows dos Reis et al. (2004, see also Wright 1990 for the original).
#'
#' @return A vector of Nc values for the given GC contents.
#'
#' @author Mario dos Reis
#'
#' @references
#' Wright F. (1990) The 'effective number of codons' used in a gene. \emph{Gene},
#' \bold{87:} 23--9.
#'
#' dos Reis M., Savva R., and Wernisch L. (2004) Solving the riddle of codon
#' usage preferences: a test for translational selection. \emph{Nucleic Acids Res.},
#' \bold{32:} 5036--44.
#'
#' @examples
#' curve(nc.f(x), xlab="GC3s content", ylab="Nc")
#' points(ecolik12$w$GC3s, ecolik12$w$Nc, pch=19)
#'
#' @export
nc.f <- function(x) {
-6 + x + 34/(x^2 + (1.025 - x)^2)
}
# plot.nc <- function(nc, gc3s) {
# curve(nc.f, xlim = c(0, 1), ylim = c(20, 62),
# xlab = "GC3s", ylab = "Nc")
# points(nc ~ gc3s, pch = '+', cex = 0.5)
# }
#############################################################################
# Data documentation
#############################################################################
#' E. coli K-12 codon bias and tRNA numbers
#'
#' A list with elements \code{trna}, a vector of length 64 of tRNA gene copy numbers
#' in the Escherichia coli K-12 genome, \code{w}, a data frame with some codon bias
#' statistics for 49 E. coli K-12 coding genes, and \code{m}, a 49 by 61 matrix of
#' codon frequencies for the 49 genes in question.
#'
#' @author Mario dos Reis
#'
#' @examples
#' # 87 tRNA genes in the E. coli K-12 genome:
#' sum(ecolik12$trna)
#'
#' # Two copies are isoacceptors for Phe, with anticodon GAA (codon TTC)
#' ecolik12$trna[2]
#'
#' # ecolik12$w, a data frame with codon bias statistics
#' names(ecolik12$w)
#'
#' # Effective number of codons vs. gene length (in codons)
#' plot(ecolik12$w$Nc, ecolik12$w$L_aa, xlab="Nc", ylab="Gene length")
"ecolik12"
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.