Nothing
#' Degenerate protein sequence
#'
#' 'Degenerates' amino acid or nucleic sequence by aggregating
#' elements to bigger groups.
#'
#' @param seq \code{character} vector or matrix representing single sequence.
#' @param element_groups encoding of elements: list of groups to which elements
#' of sequence should be aggregated. Must have unique names.
#' @keywords manip
#' @return A \code{character} vector or matrix (if input is a matrix)
#' containing aggregated elements.
#' @note
#' Characters not present in the \code{element_groups} will be converted to NA with a
#' warning.
#' @export
#' @seealso \code{\link{l2n}} to easily convert information stored in biological sequences from
#' letters to numbers.
#' \code{\link{calc_ed}} to calculate distance between encodings.
#' @keywords manip
#' @examples
#' sample_seq <- c(1, 3, 1, 3, 4, 4, 3, 1, 2)
#' table(sample_seq)
#'
#' # aggregate sequence to purins and pyrimidines
#' deg_seq <- degenerate(sample_seq, list(w = c(1, 4), s = c(2, 3)))
#' table(deg_seq)
degenerate <- function(seq, element_groups) {
tmp_seq <- seq
if (!all(unique(tmp_seq) %in% unlist(element_groups))) {
warning("'seq' contains elements not present in 'element_groups'.
Element(s): ",
paste0(setdiff(unique(tmp_seq), unlist(element_groups)), collapse = ", "),
" will be replaced by NA.")
tmp_seq[!(tmp_seq %in% unlist(element_groups))] <- NA
}
if(is.null(names(element_groups))) {
warning("'element_groups' is unnamed. Assumed names of groups are their ordinal numbers.")
names(element_groups) <- 1L:length(element_groups)
}
if(length(unique(names(element_groups))) != length(names(element_groups))) {
stop("'element_groups' must have unique names.")
}
for (i in 1L:length(element_groups)) {
tmp_seq[tmp_seq %in% element_groups[[i]]] <- names(element_groups)[i]
}
if(is.matrix(seq))
dim(tmp_seq) <- dim(seq)
tmp_seq
}
#' Degenerate n-grams
#'
#' 'Degenerates' n-grams by aggregating amino acid or nucleotide elements
#' into bigger groups.
#' @param x object containing n-grams.
#' @param element_groups encoding of elements: list of groups to which elements
#' of n-grams should be aggregated. Must have unique names.
#' @param binarize logical indicating if n-grams should be binarized
#' @return A \code{character} vector or matrix (if input is a matrix)
#' containing degenerated n-grams.
#' @export
degenerate_ngrams <- function(x, element_groups, binarize = FALSE) {
if ('_' %in% unlist(element_groups)) {
stop("'element_groups' cannot contain '_'.")
}
decoded <- strsplit(decode_ngrams(colnames(x)), "")
degenerated <- lapply(decoded, degenerate, element_groups = c(element_groups, c("_" = "_")))
deg_ngrams <- lapply(lapply(degenerated, paste0, collapse = ""), code_ngrams)
res <- do.call(cbind, lapply(unique(deg_ngrams), function(ith_ngram) {
row_sums(x[, ith_ngram == deg_ngrams, drop = FALSE])
}))
colnames(res) <- unique(deg_ngrams)
if(binarize)
res <- binarize(res)
if(inherits(x, "simple_triplet_matrix"))
res <- as.simple_triplet_matrix(res)
res
}
#' Convert letters to numbers
#'
#' Converts biological sequence from letter to number notation.
#' @inheritParams degenerate
#' @param seq_type the type of sequence. Can be \code{rna}, \code{dna} or \code{prot}.
#' @keywords manip
#' @return a \code{numeric} vector or matrix containing converted elements.
#' @export
#' @keywords manip
#' @seealso
#' \code{l2n} is a wrapper around \code{\link{degenerate}}.
#'
#' Inverse function: \code{\link{n2l}}.
#' @examples
#' sample_seq <- c("a", "d", "d", "g", "a", "g", "n", "a", "l")
#' l2n(sample_seq, "prot")
l2n <- function(seq, seq_type) {
elements_list <- return_elements(seq_type)
names(elements_list) <- 1L:length(elements_list)
seq <- tolower(seq)
deg_seq <- as.numeric(degenerate(seq, elements_list))
if(is.matrix(seq))
deg_seq <- matrix(deg_seq, ncol = ncol(seq))
deg_seq
}
#' Convert numbers to letters
#'
#' Converts biological sequence from number to letter notation.
#' @param seq \code{integer} vector or matrix representing single sequence.
#' @param seq_type the type of sequence. Can be \code{rna}, \code{dna} or \code{prot}.
#' @keywords manip
#' @return a \code{character} vector or matrix containing converted elements.
#' @export
#' @keywords manip
#' @seealso
#' \code{n2l} is a wrapper around \code{\link{degenerate}}.
#'
#' Inverse function: \code{\link{l2n}}.
#' @examples
#' sample_seq <- c(1, 3, 3, 6, 1, 6, 12, 1, 10)
#' n2l(sample_seq, "prot")
n2l <- function(seq, seq_type) {
names_list <- return_elements(seq_type)
elements_list <- 1L:length(names_list)
names(elements_list) <- names_list
degenerate(seq, elements_list)
}
#' Convert encoding from full to simple format
#'
#' Converts an encoding from the full format to the simple format.
#' @param x encoding.
#' @export
#' @examples
#' aa1 = list(`1` = c("g", "a", "p", "v", "m", "l", "i"),
#' `2` = c("k", "h"),
#' `3` = c("d", "e"),
#' `4` = c("f", "r", "w", "y", "s", "t", "c", "n", "q"))
#' full2simple(aa1)
#'
full2simple <- function(x) {
single_enc <- x
element_df <- do.call(rbind, lapply(1L:length(single_enc), function(i) {
data.frame(gr = rep(names(single_enc[i]), length(single_enc[[i]])),
element = single_enc[[i]], stringsAsFactors = FALSE)
}))
element_df <- element_df[order(element_df[["element"]]), ]
res <- element_df[["gr"]]
names(res) <- element_df[["element"]]
res
}
#' Convert encoding from simple to full format
#'
#' Converts an encoding from the simple format to the full format.
#' @param x encoding (see Details).
#' @details The encoding should be named. Each name should correspond to a different
#' amino acid or nucleotide.
#' @export
#' @examples
#' aa1 = structure(c("1", "4", "3", "3", "4", "1", "2", "1", "2", "1",
#' "1", "4", "1", "4", "4", "4", "4", "1", "4", "4"),
#' .Names = c("a", "c", "d", "e", "f", "g", "h", "i",
#' "k", "l", "m", "n", "p", "q",
#' "r", "s", "t", "v", "w", "y"))
#' simple2full(aa1)
#'
simple2full <- function(x) {
if(is.null(names(x)))
stop("'x' must be named.")
single_enc <- x
gr <- unique(sort(single_enc))
res <- lapply(gr, function(i)
names(x[x == i]))
names(res) <- gr
res
}
# an internal function returning elements for a specific sequence type: aa, dna, rna
return_elements <- function(seq_type) {
if (!(seq_type %in% c("prot", "dna", "rna")))
stop("The value of 'what' must be: 'dna', 'rna' or 'prot'.")
switch(seq_type,
rna = c("a", "c", "g", "u"),
dna = c("a", "c", "g", "t"),
prot = c("a", "c", "d", "e", "f",
"g", "h", "i", "k", "l",
"m", "n", "p", "q", "r",
"s", "t", "v", "w", "y"))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.