R/utils.R

Defines functions codon_diff codon_optimize

Documented in codon_diff codon_optimize

#' Optimize codon usage in coding sequences
#'
#' \code{codon_optimize} redesigns a coding sequence by replacing each codon 
#' with its optimal synonymous alternative, while maintaining the same amino 
#' acid sequence. This function supports multiple optimization methods and is 
#' useful for improving protein expression in heterologous systems.
#'
#' @param seq A coding sequence as a DNAString object or any object that can 
#'   be coerced to DNAString. The sequence should not include stop codons.
#' @param optimal_codons A table of optimal codons as generated by 
#'   \code{est_optimal_codons()}, containing optimality information for each codon.
#' @param codon_table A codon table defining the genetic code, derived from 
#'   \code{get_codon_table()} or \code{create_codon_table()}.
#' @param level Character string specifying optimization level: "subfam" (default, 
#'   within codon subfamilies) or "amino_acid" (within amino acid groups). 
#'   Required for "naive" and "IDT" methods.
#' @param method Character string specifying the optimization algorithm:
#'   \itemize{
#'     \item \code{"naive"} (default): Simple replacement with optimal codons
#'     \item \code{"IDT"}: Method from Integrated DNA Technologies tool
#'     \item \code{"CodonTransformer"}: Neural network-based optimization
#'   }
#' @param cf Matrix of codon frequencies from \code{count_codons()}. Required 
#'   for "IDT" method to determine codon frequency distributions.
#' @param envname Environment name for "CodonTransformer" method. Should match 
#'   your conda environment name (default: "cubar_env").
#' @param organism Organism identifier (integer ID or string name) for 
#'   "CodonTransformer" method. Must be from ORGANISM2ID in CodonUtils 
#'   (e.g., "Escherichia coli general").
#' @param attention_type Attention mechanism type for "CodonTransformer":
#'   \itemize{
#'     \item \code{"original_full"} (default): Standard attention
#'     \item \code{"block_sparse"}: Memory-efficient sparse attention
#'   }
#' @param deterministic Logical. For "CodonTransformer" method:
#'   \itemize{
#'     \item \code{TRUE} (default): Deterministic decoding (most likely tokens)
#'     \item \code{FALSE}: Probabilistic sampling based on temperature
#'   }
#' @param temperature Numeric. Controls randomness in non-deterministic mode 
#'   for "CodonTransformer". Lower values (0.2, default) are conservative; 
#'   higher values (0.8) increase diversity. Must be positive.
#' @param top_p Numeric. Nucleus sampling threshold (0-1) for "CodonTransformer". 
#'   Only tokens with cumulative probability up to this value are considered. 
#'   Default: 0.95.
#' @param num_sequences Integer. Number of different optimized sequences to 
#'   generate (default: 1). For "CodonTransformer" with deterministic=FALSE, 
#'   each sequence is independently sampled.
#' @param match_protein Logical. For "CodonTransformer", constrains predictions 
#'   to maintain exact amino acid sequence. Recommended for unusual proteins 
#'   or high temperature settings (default: FALSE).
#' @param spliceai Logical. Whether to predict splice sites using SpliceAI 
#'   (default: FALSE). Requires appropriate environment setup.
#' @return The return type depends on parameters:
#'   \itemize{
#'     \item Single DNAString: When num_sequences=1 and spliceai=FALSE
#'     \item DNAStringSet: When num_sequences>1 and spliceai=FALSE  
#'     \item data.table: When spliceai=TRUE (includes sequences and splice predictions)
#'   }
#' @importFrom data.table .SD
#' @references 
#' Fallahpour A, Gureghian V, Filion GJ, Lindner AB, Pandi A. CodonTransformer:
#' a multispecies codon optimizer using context-aware neural networks. Nat Commun. 2025 Apr 3;16(1):3205.
#'
#' Jaganathan K, Panagiotopoulou S K, McRae J F, et al. Predicting splicing from primary sequence
#'  with deep learning. Cell, 2019, 176(3): 535-548.e24.
#' @export
#' @export
#' @examples
#' cf_all <- count_codons(yeast_cds)
#' optimal_codons <- est_optimal_codons(cf_all)
#' seq <- 'ATGCTACGA'
#' # method "naive":
#' codon_optimize(seq, optimal_codons)
#' # method "IDT":
#' codon_optimize(seq, cf = cf_all, method = "IDT")
#' codon_optimize(seq, cf = cf_all, method = "IDT", num_sequences = 10)
#'
#' # # The following examples requires pre-installation of python package SpliceAI or Codon
#' #  # Transformer. see the codon optimization vignette for further details.
#' # seq_opt <- codon_optimize(seq, method = "CodonTransformer",
#' #     organism = "Saccharomyces cerevisiae")
#' # seqs_opt <- codon_optimize(seq, method = "CodonTransformer",
#' #     organism = "Saccharomyces cerevisiae", num_sequences = 10,
#' #     deterministic =FALSE, temperature = 0.4)
#' # seqs_opt <- codon_optimize(seq, cf = cf_all, method = "IDT",
#' #     num_sequences = 10, spliceai = TRUE)
#' # seq_opt <- codon_optimize(seq, method = "CodonTransformer",
#' #     organism = "Saccharomyces cerevisiae", spliceai = TRUE)

codon_optimize <- function(
    seq, optimal_codons = optimal_codons, cf = NULL, codon_table = get_codon_table(),
    level = 'subfam', method = 'naive', num_sequences = 1, organism = NULL, envname = "cubar_env",
    attention_type = "original_full", deterministic = TRUE, temperature = 0.2, top_p = 0.95,
    match_protein = FALSE, spliceai = FALSE){
  . <- aa_code <- codon <- coef <- freq <- Possible_splice_junction <- NULL
  Candidate_optimized_sequence <- spliceai_run <- NULL
  if(!level %in% c('amino_acid', 'subfam')){
    stop('Possible values for `level` are "amino_acid" and "subfam"')
  }
  if(!inherits(seq, 'DNAString')){
    seq <- Biostrings::DNAString(seq)
  }
  tryCatch({
    protein <- as.character(Biostrings::translate(seq))
  }, warning = function(w){
    num_ignored <- length(seq) %% 3
    message(sprintf(
      "Warning: due to incomplete codon, the last %d base%s ignored.",
      num_ignored,
      ifelse(num_ignored == 1, " was", "s were")))
  })
  stop_codon <- strsplit(protein, "")[[1]] == "*"
  if(sum(stop_codon) != 0){
    stop('Found termination codon at the following location: ', paste(3*which(stop_codon)-2, collapse = ", "))
  }
  c2l <- codon_table[[level]]
  names(c2l) <- codon_table[['codon']]
  codons_old <- seq_to_codons(seq)
  seq_codons_fam <- c2l[codons_old]

  if(method == "naive"){
    optimal_codons <- data.table::as.data.table(optimal_codons)
    # if there are multiple codons for the same amino acid or subfamily,
    # cubar will first rank by the `coef` (if exists) column to keep codons
    # with the largest `coef` value.
    if('coef' %in% colnames(optimal_codons)){
      optimal_codons <- optimal_codons[order(-optimal_codons$coef)]
    }
    optimal_codons_u <- unique(optimal_codons, by = level)
    l2c <- optimal_codons_u$codon
    names(l2c) <- optimal_codons_u[[level]]
    codons_new <- l2c[seq_codons_fam]
    seq_opt <- Biostrings::DNAString(paste0(codons_new, collapse = ''))
  }

  if(method == "IDT"){
    if(is.null(cf)){
      stop('cf is required when method is set to "IDT".')
    }
    cf_freq <- data.table::data.table(codon = colnames(cf), freq = colSums(cf))
    codon_table <- merge(codon_table, cf_freq, by = "codon", all.x = TRUE)
    codon_table <- codon_table[
      , .SD[freq > 0.1 * sum(freq)], by = level
    ]
    codon_fam_list <- split(codon_table, by = level)
    sampled_codons_list <- replicate(num_sequences, {
      sapply(seq_codons_fam, function(fam) {
        dt <- codon_fam_list[[fam]]
        dt[sample(.N, 1, prob = freq), codon]
      })
    }, simplify = FALSE)
    seq_opt <- unique(Biostrings::DNAStringSet(
      unlist(lapply(sampled_codons_list, paste0, collapse = ""))
    ))
    # If only 1 sequence is requested, return as DNAString
    if(num_sequences == 1) {
      seq_opt <- seq_opt[[1]]
    }
  }

  if(method == "CodonTransformer"){
    reticulate::use_condaenv(envname)
    if(is.null(organism)){
      stop('Organism ID (integer) or name (string) (e.g., "Escherichia coli general")')
    }
    deterministic <- ifelse(deterministic, "True", "False")
    match_protein <- ifelse(match_protein, "True", "False")
    Sys.setenv(HF_ENDPOINT = "https://hf-mirror.com")
    predict_opt_sequence <- function(protein, organism, attention_type, deterministic,
                                       temperature, top_p, num_sequences, match_protein) {
      reticulate::py_run_string(sprintf('
import torch
from transformers import AutoTokenizer, BigBirdForMaskedLM
from CodonTransformer.CodonPrediction import predict_dna_sequence
from CodonTransformer.CodonJupyter import format_model_output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer").to(device)
output = predict_dna_sequence(
    protein="%s",
    organism="%s",
    device=device,
    tokenizer=tokenizer,
    model=model,
    attention_type="%s",
    deterministic=%s,
    temperature=%s,
    top_p=%s,
    num_sequences=%s,
    match_protein = %s
)
if type(output) is not list:
    r_output = output.predicted_dna
else:
    r_output = list()
    for i in range(0, len(output)):
        r_output.append(output[i].predicted_dna)
  ', protein, organism, attention_type, deterministic, temperature, top_p, num_sequences, match_protein))
return(reticulate::py$r_output)
    }
    predict_dna <- predict_opt_sequence(
      protein = protein, organism = organism, attention_type = attention_type,
      deterministic = deterministic, temperature = temperature, top_p = top_p,
      num_sequences = num_sequences, match_protein = match_protein
    )
    if(num_sequences == 1){
      seq_opt <- Biostrings::DNAString(predict_dna)
    } else{
      seq_opt <- unique(Biostrings::DNAStringSet(predict_dna))
    }
  }

  if(spliceai == TRUE){
    reticulate::use_condaenv(envname)
    spliceai_run <- function(seq){
      reticulate::py_run_string(sprintf('
from keras.models import load_model
from pkg_resources import resource_filename
from spliceai.utils import one_hot_encode
import numpy as np

input_sequence = "%s"
# Replace this with your custom sequence

context = 10000
paths = ("models/spliceai{}.h5".format(x) for x in range(1, 6))
models = [load_model(resource_filename("spliceai", x)) for x in paths]
x = one_hot_encode("N"*(context//2) + input_sequence + "N"*(context//2))[None, :]
y = np.mean([models[m].predict(x) for m in range(5)], axis=0)

flag = False if np.sum(y[0,:,0] <= 0.5) == 0 else True
  ', seq))
      return(reticulate::py$flag)
    }
    seq_opt <- data.table::data.table(Candidate_optimized_sequence = as.character(seq_opt))
    seq_opt[, Possible_splice_junction := vapply(Candidate_optimized_sequence, spliceai_run, logical(1))]
  }
  return(seq_opt)
}

#' Differential codon usage analysis
#'
#' \code{codon_diff} takes two set of coding sequences and
#' perform differential codon usage analysis.
#'
#' @param seqs1 DNAStringSet, or an object that can be coerced to a DNAStringSet
#' @param seqs2 DNAStringSet, or an object that can be coerced to a DNAStringSet
#' @param codon_table a table of genetic code derived from \code{get_codon_table} or
#'   \code{create_codon_table}.
#' @return a data.table of the differential codon usage analysis. Global tests examine wthether a codon
#'   is used differently relative to all the other codons. Family tests examine whether a codon is used
#'   differently relative to other codons that encode the same amino acid. Subfamily tests examine whether
#'   a codon is used differently relative to other synonymous codons that share the same first two nucleotides.
#'   Odds ratio > 1 suggests a codon is used at higher frequency in \code{seqs1} than in \code{seqs2}.
#' @importFrom data.table ':='
#' @export
#' @examples
#' yeast_exp_sorted <- yeast_exp[order(yeast_exp$fpkm),]
#' seqs1 <- yeast_cds[names(yeast_cds) %in% head(yeast_exp_sorted$gene_id, 1000)]
#' seqs2 <- yeast_cds[names(yeast_cds) %in% tail(yeast_exp_sorted$gene_id, 1000)]
#' cudiff <- codon_diff(seqs1, seqs2)
#'
codon_diff <- function(seqs1, seqs2, codon_table = get_codon_table()){
    . <- codon <- cnt1 <- cnt2 <- NULL
    global_other1 <- global_other2 <- global_p <- global_padj <- NULL
    amino_acid <- fam_other1 <- fam_other2 <- fam_p <- fam_padj <- NULL
    subfam <- subfam_other1 <- subfam_other2 <- subfam_or <- subfam_p <- subfam_padj <- NULL
    # count codons in the two sets of sequences
    if(!inherits(seqs1, 'DNAStringSet')){
        seqs1 <- Biostrings::DNAStringSet(seqs1)
    }
    if(!inherits(seqs2, 'DNAStringSet')){
        seqs2 <- Biostrings::DNAStringSet(seqs2)
    }
    cf1 <- count_codons(seqs1)
    cf2 <- count_codons(seqs2)
    codon_table <- data.table::as.data.table(codon_table)
    codon_table[, `:=`(cnt1 = colSums(cf1)[codon], cnt2 = colSums(cf2)[codon])]

    # global test
    codon_table[, `:=`(global_other1 = sum(cnt1) - cnt1,
                       global_other2 = sum(cnt2) - cnt2)]
    codon_table[, c('global_or', 'global_p') := {
        m <- matrix(c(cnt1, global_other1, cnt2, global_other2), ncol = 2)
        ft <- stats::fisher.test(m)
        list(global_or = ft$estimate, global_p = ft$p.value)
    }, by = .(codon)]
    codon_table[, global_padj := stats::p.adjust(global_p, method = 'BH')]

    # family (encoding the same AA)-level test
    codon_table[, `:=`(fam_other1 = sum(cnt1) - cnt1,
                       fam_other2 = sum(cnt2) - cnt2), by = .(amino_acid)]
    codon_table[, c('fam_or', 'fam_p') := {
        m <- matrix(c(cnt1, fam_other1, cnt2, fam_other2), ncol = 2)
        ft <- stats::fisher.test(m)
        list(ft$estimate, ft$p.value)
    }, by = .(codon)]
    codon_table[, fam_padj := stats::p.adjust(fam_p, method = 'BH')]

    # subfamily (first two nucleotides are the same)-level test
    codon_table[, `:=`(subfam_other1 = sum(cnt1) - cnt1,
                       subfam_other2 = sum(cnt2) - cnt2), by = .(subfam)]
    codon_table[, c('subfam_or', 'subfam_p') := {
        m <- matrix(c(cnt1, subfam_other1, cnt2, subfam_other2), ncol = 2)
        ft <- stats::fisher.test(m)
        list(ft$estimate, ft$p.value)
    }, by = .(codon)]
    codon_table[, subfam_padj := stats::p.adjust(subfam_p, method = 'BH')]

    return(codon_table[])
}

Try the cubar package in your browser

Any scripts or data that you put into this service are public.

cubar documentation built on Aug. 21, 2025, 5:40 p.m.