R/tuples.R

Defines functions amino_acids.character amino_acids.default amino_acids tuples_freq normalize as.char.vector all_tuples nuc_bases

Documented in all_tuples amino_acids amino_acids.character amino_acids.default as.char.vector normalize nuc_bases tuples_freq

# Copyright 2021 by the authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' The four DNA nucleotide bases.
#'
#' @return Character vector of the single-letter base codes, in the order
#' `"A"`, `"T"`, `"C"`, `"G"`.
#' @export
nuc_bases = function() {
  c("A", "T", "C", "G")
}

#' Create all tuples of size `tsize` over an alphabet.
#'
#' Thin wrapper: the work is delegated to `r_all_tuples()`, which is defined
#' outside this file.
#' @param tsize Tuple size (e.g. 3 for codons)
#' @param alphabet The alphabet the tuples are built from. Defaults to the DNA
#' bases returned by [nuc_bases()].
#' @examples codons = all_tuples(3)
#' @export
all_tuples = function(tsize, alphabet = nuc_bases()) {
  # TODO see also seqinr::words
  # NOTE(review): result type/order is whatever r_all_tuples() yields -- confirm
  # against its definition.
  r_all_tuples(tsize, alphabet)
}

#' Split a string into its individual characters.
#'
#' Only the first element of `s` is considered.
#' @param s A single string, e.g. a sequence
#' @return Character vector holding one character of `s` per element
as.char.vector = function(s) {
  # An empty split pattern breaks the string between every character.
  pieces = strsplit(s, split = "")
  pieces[[1L]]
}

#' Normalize nucleotide sequences. Default is DNA bases and upper-case letters.
#'
#' Every sequence is first converted to a single letter case and then the
#' bases T/U are unified to either the DNA or the RNA alphabet.
#' Note that unknown bases (e.g. letter K) do not raise any exceptions; they
#' are only case-converted.
#' @param seqv A vector of sequences (strings) to be processed.
#' @param RNA If true, RNA bases are used, i.e. T becomes U.
#' If false (default), DNA bases are used, i.e. U becomes T.
#' @param lowercase If true, all letters are converted to lowercase (ATG -> atg).
#' If false (default), all letters will be uppercase (atg -> ATG)
#' @return Character vector of normalized sequences, without names and of the
#' same length as `seqv` (a zero-length character vector for empty input).
#' @examples normalize("auggcc") # will yield ATGGCC.
#' @export
normalize = function(seqv, RNA = FALSE, lowercase = FALSE) {
  # TODO impl. in Rust
  # tolower/toupper and gsub are vectorized, so no per-element loop is needed.
  # Avoiding sapply() also keeps the result a character vector for zero-length
  # input, where sapply() would have produced an empty list.
  cased = if (lowercase) tolower(seqv) else toupper(seqv)

  # Base to replace and its replacement, in the letter case applied above.
  swap = if (RNA) c("T", "U") else c("U", "T")
  if (lowercase) swap = tolower(swap)

  # fixed = TRUE: plain single-letter substitution, no regex needed.
  # as.vector() drops names so the result is a bare character vector.
  as.vector(gsub(swap[1], swap[2], cased, fixed = TRUE))
}

#' Frequencies of tuples (codon).
#'
#' Calculates the absolute frequencies of tuples in a vector. The tuple size
#' is taken from the first element of `tuples`, and every tuple of that size
#' generated by [all_tuples()] appears in the result, including those with a
#' count of zero.
#'
#' @param tuples Vector of tuples (codons)
#' @return Data frame with two columns:
#' 1. `tuple`, 2. `freq` (number of occurrences in the vector). The class
#' `gcat.codon.usage` is prepended to the data frame's classes.
#' @export
tuples_freq = function(tuples) {
  # The first element determines the tuple size for the whole vector.
  tsize = nchar(tuples[1])
  # Fixing the levels makes table() report unobserved tuples with count 0.
  ftuples = factor(tuples, levels = all_tuples(tsize))

  df = as.data.frame(table(ftuples))
  colnames(df) = c("tuple", "freq")
  # The new class must come first so its S3 methods take precedence.
  class(df) = c("gcat.codon.usage", class(df))
  df
}

# Register new generic function:

#' S3 generic for obtaining amino acids.
#'
#' Dispatches on the class of `x`.
#' @param x Object to dispatch on
#' @param ... Not used
#' @export
amino_acids = function(x, ...) {
  # Dispatch happens on the first argument, i.e. `x`.
  UseMethod("amino_acids")
}

#' Fallback method of the amino acids generic.
#'
#' Always signals an error, since no translation exists for the given type.
#' @param x Generic parameter
#' @param value Generic parameter
#' @inheritDotParams amino_acids
#' @export
amino_acids.default = function(x, value, ...) {
  unsupported = "Implementation for this type not supported."
  stop(unsupported)
}

#' Translate codons into amino acids.
#'
#' Each codon is normalized to upper-case DNA letters before it is translated
#' with `seqinr::translate()`.
#' @param x Vector of codons.
#' @param numcode The ncbi genetic code number for translation.
#' By default the standard genetic code (1) is used.
#' See https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
#' @inheritDotParams amino_acids
#' @export
amino_acids.character = function(x, numcode = 1, ...) {
  # Translates one codon given as a string.
  translate_codon = function(codon) {
    dna = normalize(codon, RNA = FALSE, lowercase = FALSE)
    # seqinr works on vectors of single characters, hence s2c().
    seqinr::translate(seqinr::s2c(dna), numcode = numcode)
  }
  sapply(x, translate_codon)
}
informatik-mannheim/gcat-base documentation built on Nov. 7, 2023, 7:18 a.m.