Nothing
#' Optimize codon usage in coding sequences
#'
#' \code{codon_optimize} redesigns a coding sequence by replacing each codon
#' with its optimal synonymous alternative, while maintaining the same amino
#' acid sequence. This function supports multiple optimization methods and is
#' useful for improving protein expression in heterologous systems.
#'
#' @param seq A coding sequence as a DNAString object or any object that can
#' be coerced to DNAString. The sequence should not include stop codons.
#' @param optimal_codons A table of optimal codons as generated by
#' \code{est_optimal_codons()}, containing optimality information for each codon.
#' @param codon_table A codon table defining the genetic code, derived from
#' \code{get_codon_table()} or \code{create_codon_table()}.
#' @param level Character string specifying optimization level: "subfam" (default,
#' within codon subfamilies) or "amino_acid" (within amino acid groups).
#' Required for "naive" and "IDT" methods.
#' @param method Character string specifying the optimization algorithm:
#' \itemize{
#' \item \code{"naive"} (default): Simple replacement with optimal codons
#' \item \code{"IDT"}: Method from Integrated DNA Technologies tool
#' \item \code{"CodonTransformer"}: Neural network-based optimization
#' }
#' @param cf Matrix of codon frequencies from \code{count_codons()}. Required
#' for "IDT" method to determine codon frequency distributions.
#' @param envname Environment name for "CodonTransformer" method. Should match
#' your conda environment name (default: "cubar_env").
#' @param organism Organism identifier (integer ID or string name) for
#' "CodonTransformer" method. Must be from ORGANISM2ID in CodonUtils
#' (e.g., "Escherichia coli general").
#' @param attention_type Attention mechanism type for "CodonTransformer":
#' \itemize{
#' \item \code{"original_full"} (default): Standard attention
#' \item \code{"block_sparse"}: Memory-efficient sparse attention
#' }
#' @param deterministic Logical. For "CodonTransformer" method:
#' \itemize{
#' \item \code{TRUE} (default): Deterministic decoding (most likely tokens)
#' \item \code{FALSE}: Probabilistic sampling based on temperature
#' }
#' @param temperature Numeric. Controls randomness in non-deterministic mode
#' for "CodonTransformer". Lower values (0.2, default) are conservative;
#' higher values (0.8) increase diversity. Must be positive.
#' @param top_p Numeric. Nucleus sampling threshold (0-1) for "CodonTransformer".
#' Only tokens with cumulative probability up to this value are considered.
#' Default: 0.95.
#' @param num_sequences Integer. Number of different optimized sequences to
#' generate (default: 1). For "CodonTransformer" with deterministic=FALSE,
#' each sequence is independently sampled.
#' @param match_protein Logical. For "CodonTransformer", constrains predictions
#' to maintain exact amino acid sequence. Recommended for unusual proteins
#' or high temperature settings (default: FALSE).
#' @param spliceai Logical. Whether to predict splice sites using SpliceAI
#' (default: FALSE). Requires appropriate environment setup.
#' @return The return type depends on parameters:
#' \itemize{
#' \item Single DNAString: When num_sequences=1 and spliceai=FALSE
#' \item DNAStringSet: When num_sequences>1 and spliceai=FALSE
#' \item data.table: When spliceai=TRUE (includes sequences and splice predictions)
#' }
#' @importFrom data.table .SD
#' @references
#' Fallahpour A, Gureghian V, Filion GJ, Lindner AB, Pandi A. CodonTransformer:
#' a multispecies codon optimizer using context-aware neural networks. Nat Commun. 2025 Apr 3;16(1):3205.
#'
#' Jaganathan K, Panagiotopoulou S K, McRae J F, et al. Predicting splicing from primary sequence
#' with deep learning. Cell, 2019, 176(3): 535-548.e24.
#' @export
#' @export
#' @examples
#' cf_all <- count_codons(yeast_cds)
#' optimal_codons <- est_optimal_codons(cf_all)
#' seq <- 'ATGCTACGA'
#' # method "naive":
#' codon_optimize(seq, optimal_codons)
#' # method "IDT":
#' codon_optimize(seq, cf = cf_all, method = "IDT")
#' codon_optimize(seq, cf = cf_all, method = "IDT", num_sequences = 10)
#'
#' # # The following examples requires pre-installation of python package SpliceAI or Codon
#' # # Transformer. see the codon optimization vignette for further details.
#' # seq_opt <- codon_optimize(seq, method = "CodonTransformer",
#' # organism = "Saccharomyces cerevisiae")
#' # seqs_opt <- codon_optimize(seq, method = "CodonTransformer",
#' # organism = "Saccharomyces cerevisiae", num_sequences = 10,
#' # deterministic =FALSE, temperature = 0.4)
#' # seqs_opt <- codon_optimize(seq, cf = cf_all, method = "IDT",
#' # num_sequences = 10, spliceai = TRUE)
#' # seq_opt <- codon_optimize(seq, method = "CodonTransformer",
#' # organism = "Saccharomyces cerevisiae", spliceai = TRUE)
codon_optimize <- function(
seq, optimal_codons = optimal_codons, cf = NULL, codon_table = get_codon_table(),
level = 'subfam', method = 'naive', num_sequences = 1, organism = NULL, envname = "cubar_env",
attention_type = "original_full", deterministic = TRUE, temperature = 0.2, top_p = 0.95,
match_protein = FALSE, spliceai = FALSE){
. <- aa_code <- codon <- coef <- freq <- Possible_splice_junction <- NULL
Candidate_optimized_sequence <- spliceai_run <- NULL
if(!level %in% c('amino_acid', 'subfam')){
stop('Possible values for `level` are "amino_acid" and "subfam"')
}
if(!inherits(seq, 'DNAString')){
seq <- Biostrings::DNAString(seq)
}
tryCatch({
protein <- as.character(Biostrings::translate(seq))
}, warning = function(w){
num_ignored <- length(seq) %% 3
message(sprintf(
"Warning: due to incomplete codon, the last %d base%s ignored.",
num_ignored,
ifelse(num_ignored == 1, " was", "s were")))
})
stop_codon <- strsplit(protein, "")[[1]] == "*"
if(sum(stop_codon) != 0){
stop('Found termination codon at the following location: ', paste(3*which(stop_codon)-2, collapse = ", "))
}
c2l <- codon_table[[level]]
names(c2l) <- codon_table[['codon']]
codons_old <- seq_to_codons(seq)
seq_codons_fam <- c2l[codons_old]
if(method == "naive"){
optimal_codons <- data.table::as.data.table(optimal_codons)
# if there are multiple codons for the same amino acid or subfamily,
# cubar will first rank by the `coef` (if exists) column to keep codons
# with the largest `coef` value.
if('coef' %in% colnames(optimal_codons)){
optimal_codons <- optimal_codons[order(-optimal_codons$coef)]
}
optimal_codons_u <- unique(optimal_codons, by = level)
l2c <- optimal_codons_u$codon
names(l2c) <- optimal_codons_u[[level]]
codons_new <- l2c[seq_codons_fam]
seq_opt <- Biostrings::DNAString(paste0(codons_new, collapse = ''))
}
if(method == "IDT"){
if(is.null(cf)){
stop('cf is required when method is set to "IDT".')
}
cf_freq <- data.table::data.table(codon = colnames(cf), freq = colSums(cf))
codon_table <- merge(codon_table, cf_freq, by = "codon", all.x = TRUE)
codon_table <- codon_table[
, .SD[freq > 0.1 * sum(freq)], by = level
]
codon_fam_list <- split(codon_table, by = level)
sampled_codons_list <- replicate(num_sequences, {
sapply(seq_codons_fam, function(fam) {
dt <- codon_fam_list[[fam]]
dt[sample(.N, 1, prob = freq), codon]
})
}, simplify = FALSE)
seq_opt <- unique(Biostrings::DNAStringSet(
unlist(lapply(sampled_codons_list, paste0, collapse = ""))
))
# If only 1 sequence is requested, return as DNAString
if(num_sequences == 1) {
seq_opt <- seq_opt[[1]]
}
}
if(method == "CodonTransformer"){
reticulate::use_condaenv(envname)
if(is.null(organism)){
stop('Organism ID (integer) or name (string) (e.g., "Escherichia coli general")')
}
deterministic <- ifelse(deterministic, "True", "False")
match_protein <- ifelse(match_protein, "True", "False")
Sys.setenv(HF_ENDPOINT = "https://hf-mirror.com")
predict_opt_sequence <- function(protein, organism, attention_type, deterministic,
temperature, top_p, num_sequences, match_protein) {
reticulate::py_run_string(sprintf('
import torch
from transformers import AutoTokenizer, BigBirdForMaskedLM
from CodonTransformer.CodonPrediction import predict_dna_sequence
from CodonTransformer.CodonJupyter import format_model_output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer").to(device)
output = predict_dna_sequence(
protein="%s",
organism="%s",
device=device,
tokenizer=tokenizer,
model=model,
attention_type="%s",
deterministic=%s,
temperature=%s,
top_p=%s,
num_sequences=%s,
match_protein = %s
)
if type(output) is not list:
r_output = output.predicted_dna
else:
r_output = list()
for i in range(0, len(output)):
r_output.append(output[i].predicted_dna)
', protein, organism, attention_type, deterministic, temperature, top_p, num_sequences, match_protein))
return(reticulate::py$r_output)
}
predict_dna <- predict_opt_sequence(
protein = protein, organism = organism, attention_type = attention_type,
deterministic = deterministic, temperature = temperature, top_p = top_p,
num_sequences = num_sequences, match_protein = match_protein
)
if(num_sequences == 1){
seq_opt <- Biostrings::DNAString(predict_dna)
} else{
seq_opt <- unique(Biostrings::DNAStringSet(predict_dna))
}
}
if(spliceai == TRUE){
reticulate::use_condaenv(envname)
spliceai_run <- function(seq){
reticulate::py_run_string(sprintf('
from keras.models import load_model
from pkg_resources import resource_filename
from spliceai.utils import one_hot_encode
import numpy as np
input_sequence = "%s"
# Replace this with your custom sequence
context = 10000
paths = ("models/spliceai{}.h5".format(x) for x in range(1, 6))
models = [load_model(resource_filename("spliceai", x)) for x in paths]
x = one_hot_encode("N"*(context//2) + input_sequence + "N"*(context//2))[None, :]
y = np.mean([models[m].predict(x) for m in range(5)], axis=0)
flag = False if np.sum(y[0,:,0] <= 0.5) == 0 else True
', seq))
return(reticulate::py$flag)
}
seq_opt <- data.table::data.table(Candidate_optimized_sequence = as.character(seq_opt))
seq_opt[, Possible_splice_junction := vapply(Candidate_optimized_sequence, spliceai_run, logical(1))]
}
return(seq_opt)
}
#' Differential codon usage analysis
#'
#' \code{codon_diff} takes two set of coding sequences and
#' perform differential codon usage analysis.
#'
#' @param seqs1 DNAStringSet, or an object that can be coerced to a DNAStringSet
#' @param seqs2 DNAStringSet, or an object that can be coerced to a DNAStringSet
#' @param codon_table a table of genetic code derived from \code{get_codon_table} or
#' \code{create_codon_table}.
#' @return a data.table of the differential codon usage analysis. Global tests examine wthether a codon
#' is used differently relative to all the other codons. Family tests examine whether a codon is used
#' differently relative to other codons that encode the same amino acid. Subfamily tests examine whether
#' a codon is used differently relative to other synonymous codons that share the same first two nucleotides.
#' Odds ratio > 1 suggests a codon is used at higher frequency in \code{seqs1} than in \code{seqs2}.
#' @importFrom data.table ':='
#' @export
#' @examples
#' yeast_exp_sorted <- yeast_exp[order(yeast_exp$fpkm),]
#' seqs1 <- yeast_cds[names(yeast_cds) %in% head(yeast_exp_sorted$gene_id, 1000)]
#' seqs2 <- yeast_cds[names(yeast_cds) %in% tail(yeast_exp_sorted$gene_id, 1000)]
#' cudiff <- codon_diff(seqs1, seqs2)
#'
codon_diff <- function(seqs1, seqs2, codon_table = get_codon_table()){
. <- codon <- cnt1 <- cnt2 <- NULL
global_other1 <- global_other2 <- global_p <- global_padj <- NULL
amino_acid <- fam_other1 <- fam_other2 <- fam_p <- fam_padj <- NULL
subfam <- subfam_other1 <- subfam_other2 <- subfam_or <- subfam_p <- subfam_padj <- NULL
# count codons in the two sets of sequences
if(!inherits(seqs1, 'DNAStringSet')){
seqs1 <- Biostrings::DNAStringSet(seqs1)
}
if(!inherits(seqs2, 'DNAStringSet')){
seqs2 <- Biostrings::DNAStringSet(seqs2)
}
cf1 <- count_codons(seqs1)
cf2 <- count_codons(seqs2)
codon_table <- data.table::as.data.table(codon_table)
codon_table[, `:=`(cnt1 = colSums(cf1)[codon], cnt2 = colSums(cf2)[codon])]
# global test
codon_table[, `:=`(global_other1 = sum(cnt1) - cnt1,
global_other2 = sum(cnt2) - cnt2)]
codon_table[, c('global_or', 'global_p') := {
m <- matrix(c(cnt1, global_other1, cnt2, global_other2), ncol = 2)
ft <- stats::fisher.test(m)
list(global_or = ft$estimate, global_p = ft$p.value)
}, by = .(codon)]
codon_table[, global_padj := stats::p.adjust(global_p, method = 'BH')]
# family (encoding the same AA)-level test
codon_table[, `:=`(fam_other1 = sum(cnt1) - cnt1,
fam_other2 = sum(cnt2) - cnt2), by = .(amino_acid)]
codon_table[, c('fam_or', 'fam_p') := {
m <- matrix(c(cnt1, fam_other1, cnt2, fam_other2), ncol = 2)
ft <- stats::fisher.test(m)
list(ft$estimate, ft$p.value)
}, by = .(codon)]
codon_table[, fam_padj := stats::p.adjust(fam_p, method = 'BH')]
# subfamily (first two nucleotides are the same)-level test
codon_table[, `:=`(subfam_other1 = sum(cnt1) - cnt1,
subfam_other2 = sum(cnt2) - cnt2), by = .(subfam)]
codon_table[, c('subfam_or', 'subfam_p') := {
m <- matrix(c(cnt1, subfam_other1, cnt2, subfam_other2), ncol = 2)
ft <- stats::fisher.test(m)
list(ft$estimate, ft$p.value)
}, by = .(codon)]
codon_table[, subfam_padj := stats::p.adjust(subfam_p, method = 'BH')]
return(codon_table[])
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.