R/utils.R
In cubar: Codon Usage Bias Analysis

Documented in codon_diff codon_optimize

#' Optimize codons
#'
#' \code{codon_optimize} takes a coding sequence (without stop codon) and replace
#' each codon to the corresponding synonymous optimal codon.
#'
#' @param seq DNAString, or an object that can be coerced to a DNAString.
#' @param optimal_codons table optimze codons as generated by \code{est_optimal_codons}.
#' @param codon_table a table of genetic code derived from \code{get_codon_table} or
#'   \code{create_codon_table}.
#' @param level "subfam" (default) or "amino_acid". Optimize codon usage at which level.
#' @return a DNAString of the optimized coding sequence.
#' @export
#' @examples
#' cf_all <- count_codons(yeast_cds)
#' optimal_codons <- est_optimal_codons(cf_all)
#' seq <- 'ATGCTACGA'
#' codon_optimize(seq, optimal_codons)
#'
codon_optimize <- function(seq, optimal_codons,
                           codon_table = get_codon_table(), level = 'subfam'){
    . <- aa_code <- codon <- coef <- NULL
    if(!level %in% c('amino_acid', 'subfam')){
        stop('Possible values for `level` are "amino_acid" and "subfam"')
    }
    if(!inherits(seq, 'DNAString')){
        seq <- Biostrings::DNAString(seq)
    }
    optimal_codons <- data.table::as.data.table(optimal_codons)
    # if there are multiple codons for the same amino acid or subfamily,
    # cubar will first rank by the `coef` (if exists) column to keep codons
    # with the largest `coef` value.
    if('coef' %in% colnames(optimal_codons)){
        optimal_codons <- optimal_codons[order(-optimal_codons$coef)]
    }
    optimal_codons <- unique(optimal_codons, by = level)
    l2c <- optimal_codons$codon
    names(l2c) <- optimal_codons[[level]]

    c2l <- codon_table[[level]]
    names(c2l) <- codon_table[['codon']]
    codons_old <- seq_to_codons(seq)
    codons_new <- l2c[c2l[codons_old]]

    return(Biostrings::DNAString(paste0(codons_new, collapse = '')))
}

#' Differential codon usage analysis
#'
#' \code{codon_diff} takes two set of coding sequences and
#' perform differential codon usage analysis.
#'
#' @param seqs1 DNAStringSet, or an object that can be coerced to a DNAStringSet
#' @param seqs2 DNAStringSet, or an object that can be coerced to a DNAStringSet
#' @param codon_table a table of genetic code derived from \code{get_codon_table} or
#'   \code{create_codon_table}.
#' @return a data.table of the differential codon usage analysis. Global tests examine wthether a codon
#'   is used differently relative to all the other codons. Family tests examine whether a codon is used
#'   differently relative to other codons that encode the same amino acid. Subfamily tests examine whether
#'   a codon is used differently relative to other synonymous codons that share the same first two nucleotides.
#'   Odds ratio > 1 suggests a codon is used at higher frequency in \code{seqs1} than in \code{seqs2}.
#' @importFrom data.table ':='
#' @export
#' @examples
#' yeast_exp_sorted <- yeast_exp[order(yeast_exp$fpkm),]
#' seqs1 <- yeast_cds[names(yeast_cds) %in% head(yeast_exp_sorted$gene_id, 1000)]
#' seqs2 <- yeast_cds[names(yeast_cds) %in% tail(yeast_exp_sorted$gene_id, 1000)]
#' cudiff <- codon_diff(seqs1, seqs2)
#'
codon_diff <- function(seqs1, seqs2, codon_table = get_codon_table()){
    . <- codon <- cnt1 <- cnt2 <- NULL
    global_other1 <- global_other2 <- global_p <- global_padj <- NULL
    amino_acid <- fam_other1 <- fam_other2 <- fam_p <- fam_padj <- NULL
    subfam <- subfam_other1 <- subfam_other2 <- subfam_or <- subfam_p <- subfam_padj <- NULL
    # count codons in the two sets of sequences
    if(!inherits(seqs1, 'DNAStringSet')){
        seqs1 <- Biostrings::DNAStringSet(seqs1)
    }
    if(!inherits(seqs2, 'DNAStringSet')){
        seqs2 <- Biostrings::DNAStringSet(seqs2)
    }
    cf1 <- count_codons(seqs1)
    cf2 <- count_codons(seqs2)
    codon_table <- data.table::as.data.table(codon_table)
    codon_table[, `:=`(cnt1 = colSums(cf1)[codon], cnt2 = colSums(cf2)[codon])]

    # global test
    codon_table[, `:=`(global_other1 = sum(cnt1) - cnt1,
                       global_other2 = sum(cnt2) - cnt2)]
    codon_table[, c('global_or', 'global_p') := {
        m <- matrix(c(cnt1, global_other1, cnt2, global_other2), ncol = 2)
        ft <- stats::fisher.test(m)
        list(global_or = ft$estimate, global_p = ft$p.value)
    }, by = .(codon)]
    codon_table[, global_padj := stats::p.adjust(global_p, method = 'BH')]

    # family (encoding the same AA)-level test
    codon_table[, `:=`(fam_other1 = sum(cnt1) - cnt1,
                       fam_other2 = sum(cnt2) - cnt2), by = .(amino_acid)]
    codon_table[, c('fam_or', 'fam_p') := {
        m <- matrix(c(cnt1, fam_other1, cnt2, fam_other2), ncol = 2)
        ft <- stats::fisher.test(m)
        list(ft$estimate, ft$p.value)
    }, by = .(codon)]
    codon_table[, fam_padj := stats::p.adjust(fam_p, method = 'BH')]

    # subfamily (first two nucleotides are the same)-level test
    codon_table[, `:=`(subfam_other1 = sum(cnt1) - cnt1,
                       subfam_other2 = sum(cnt2) - cnt2), by = .(subfam)]
    codon_table[, c('subfam_or', 'subfam_p') := {
        m <- matrix(c(cnt1, subfam_other1, cnt2, subfam_other2), ncol = 2)
        ft <- stats::fisher.test(m)
        list(ft$estimate, ft$p.value)
    }, by = .(codon)]
    codon_table[, subfam_padj := stats::p.adjust(subfam_p, method = 'BH')]

    return(codon_table[])
}