Nothing
############################# By Pattern #############################
#' Detect the presence of patterns in sequences
#'
#' @param x a DNA, RNA or AA vector.
#' @param pattern a DNA, RNA or AA vectors (but same as \code{x})
#' or a character vector of regular expressions, or a list.
#' See section Patterns.
#' @param max_error numeric value ranging from 0 to 1 and giving the
#' maximum error rate allowed between the target sequence and the pattern.
#' Error rate is relative to the length of the pattern.
#'
#'
#' @section Patterns:
#' It is important to understand how patterns are treated in \pkg{bioseq}.
#'
#' Patterns are recycled along the sequences (usually the \code{x} argument).
#' This means that if a pattern (vector or list) is of length > 1, it will be
#' replicated until it is the same length as \code{x}.
#' The reverse is not true and a vector of patterns longer than
#' a vector of sequences will raise a warning.
#'
#' Patterns can be DNA, RNA or AA vectors
#' (but they must be from the same class as
#' the sequences they are matched against).
#' If patterns are DNA, RNA or AA vectors,
#' they are disambiguated prior to matching.
#' For example pattern dna("ARG") will match AAG or AGG.
#'
#' Alternatively, patterns can be a simple character vector
#' containing regular expressions.
#'
#' Vectors of patterns (DNA, RNA, AA or regex) can also be provided in a list.
#' In that case, each vector of the list will be collapsed prior matching,
#' which means that each vector element will be used as an alternative pattern.
#' For example pattern list(c("AAA", "CCC"), "GG")
#' will match AAA or CCC in the first sequence,
#' GG in the second sequence, AAA or CCC in the third,
#' and so on following the recycling rule.
#'
#' @section Fuzzy matching:
#' When \code{max_error} is greater than zero, the function perform
#' fuzzy matching. Fuzzy matching does not support regular expression.
#'
#' @seealso
#' \code{\link[stringi]{stri_detect}} from \pkg{stringi},
#' \code{\link[stringr]{str_detect}} from \pkg{stringr} and
#' \code{\link[stringdist]{afind}} from \pkg{stringdist}
#' for the underlying implementation.
#'
#' @return A logical vector.
#' @family string operations
#' @export
#'
#' @examples
#'
#' x <- dna(c("ACGTTAGTGTAGCCGT", "CTCGAAATGA"))
#' seq_detect_pattern(x, dna(c("CCG", "AAA")))
#'
#' # Regular expression
#' seq_detect_pattern(x, "^A.{2}T")
#'
#' # Fuzzy matching
#' seq_detect_pattern(x, dna("AGG"), max_error = 0.2)
#' # No match. The pattern has three character, the max_error
#' # has to be > 1/3 to allow one character difference.
#'
#' seq_detect_pattern(x, dna("AGG"), max_error = 0.4)
#' # Match
#'
seq_detect_pattern <- function(x, pattern, max_error = 0) {
check_dna_rna_aa(x)
if(any(max_error > 0)) {
res <- seq_detect_fuzzypattern(x = x,
pattern = pattern,
max_dist = max_error)
return(res)
}
pattern <- check_and_prepare_pattern(x, pattern)
res <- stringr::str_detect(string = x, pattern = pattern)
return(res)
}
#' Crop sequences using delimiting patterns
#'
#' @param x a DNA, RNA or AA vector to be cropped.
#' @param pattern_in patterns defining the beginning (left-side).
#' @param pattern_out patterns defining the end (right-side).
#' @param max_error_in,max_error_out numeric values ranging from
#' 0 to 1 and giving the maximum error rate allowed between the
#' target sequence and \code{pattern_in}/\code{pattern_out}.
#' Error rate is relative to the length of the pattern.
#' @param include_patterns logical. Should the matched pattern
#' sequence included in the returned sequences?
#'
#' @inheritSection seq_detect_pattern Patterns
#'
#' @section Fuzzy matching:
#' When \code{max_error_in} or \code{max_error_out} are greater
#' than zero, the function perform fuzzy matching.
#' Fuzzy matching does not support regular expression.
#'
#' @return A cropped DNA, RNA or AA vector.
#' Sequences where patterns are not detected returns \code{NA}.
#'
#' @family string operations
#' @seealso
#' \code{\link[stringi]{stri_extract}} from \pkg{stringi},
#' \code{\link[stringr]{str_extract}} from \pkg{stringr} and
#' \code{\link[stringdist]{afind}} from \pkg{stringdist}
#' for the underlying implementation.
#'
#' @export
#'
#' @examples
#'
#' x <- dna("ACGTTAAAAAGTGTAGCCCCCGT", "CTCGAAATGA")
#' seq_crop_pattern(x, pattern_in = "AAAA", pattern_out = "CCCC")
seq_crop_pattern <- function(x, pattern_in, pattern_out,
max_error_in = 0, max_error_out = 0,
include_patterns = TRUE) {
check_dna_rna_aa(x)
if(any(max_error_in > 0) | any(max_error_out > 0)) {
res <- seq_crop_fuzzypattern(x = x,
pattern_in = pattern_in,
pattern_out = pattern_out,
max_dist_in = max_error_in,
max_dist_out = max_error_out,
include_patterns = include_patterns)
return(res)
}
pattern_in <- check_and_prepare_pattern(x, pattern_in)
pattern_out <- check_and_prepare_pattern(x, pattern_out)
if(include_patterns) {
rgx <- paste0("(?=(?:", pattern_in, ")).*(?:", pattern_out, ")")
} else {
rgx <- paste0("(?<=", pattern_in, ").*(?=", pattern_out, ")")
}
res <- stringr::str_extract(string = x, pattern = rgx)
res <- coerce_seq_as_input(res, x)
return(res)
}
#' Extract matching patterns from sequences
#'
#' @inheritParams seq_detect_pattern
#' @inheritSection seq_detect_pattern Patterns
#' @return A list of vectors of same class as \code{x}.
#' @family string operations
#' @seealso
#' \code{\link[stringi]{stri_extract}} from \pkg{stringi} and
#' \code{\link[stringr]{str_extract}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @export
#'
#' @examples
#'
#' x <- dna("ACGTTAGTGTAGCCGT", "CTCGAAATGA")
#' seq_extract_pattern(x, dna("AAA"))
#' seq_extract_pattern(x, "T.G")
#'
seq_extract_pattern <- function(x, pattern) {
check_dna_rna_aa(x)
pattern <- check_and_prepare_pattern(x, pattern)
res <- stringr::str_extract_all(string = x, pattern = pattern)
res <- lapply(res, coerce_seq_as_input, input = x, keep_names = FALSE)
names(res) <- names(x)
return(res)
}
#' Remove matched patterns in sequences
#'
#' @inheritParams seq_detect_pattern
#' @inheritSection seq_detect_pattern Patterns
#' @return A vector of same class as \code{x}.
#' @export
#' @family string operations
#' @seealso
#' \code{\link[stringr]{str_remove}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @examples
#'
#' x <- dna("ACGTTAGTGTAGCCGT", "CTCGAAATGA")
#' seq_remove_pattern(x, dna("AAA"))
#' seq_remove_pattern(x, "^A.{2}T")
#'
seq_remove_pattern <- function(x, pattern) {
check_dna_rna_aa(x)
pattern <- check_and_prepare_pattern(x, pattern)
res <- stringr::str_remove_all(string = x, pattern = pattern)
res <- coerce_seq_as_input(res, input = x)
return(res)
}
#' Replace matched patterns in sequences
#'
#' @inheritParams seq_detect_pattern
#' @param replacement a vector of replacements.
#' @inheritSection seq_detect_pattern Patterns
#' @return A vector of same class as \code{x}.
#' @family string operations
#' @seealso
#' \code{\link[stringi]{stri_replace}} from \pkg{stringi} and
#' \code{\link[stringr]{str_replace}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @export
#'
#' @name seq-replace
#'
#' @examples
#' x <- dna("ACGTTAGTGTAGCCGT", "CTCGAAATGA")
#' seq_replace_pattern(x, dna("AAA"), dna("GGGGGG"))
#' seq_replace_pattern(x, "^A.{2}T", "TTTTTT")
#'
seq_replace_pattern <- function(x, pattern, replacement) {
check_dna_rna_aa(x)
pattern <- check_and_prepare_pattern(x, pattern)
res <- stringr::str_replace_all(string = x, pattern = pattern,
replacement = replacement)
res <- coerce_seq_as_input(res, input = x)
return(res)
}
#' Split sequences
#'
#' @inheritParams seq_detect_pattern
#' @inheritSection seq_detect_pattern Patterns
#' @return A list of vectors of same class as \code{x}.
#' @family string operations
#' @seealso
#' \code{\link[stringi]{stri_split}} from \pkg{stringi} and
#' \code{\link[stringr]{str_split}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @export
#'
#' @examples
#'
#' x <- dna(a = "ACGTTAGTGTAGCCGT", b = "CTCGAAATGA")
#' seq_split_pattern(x, dna("AAA"))
#' seq_split_pattern(x, "T.G")
#'
seq_split_pattern <- function(x, pattern) {
check_dna_rna_aa(x)
pattern <- check_and_prepare_pattern(x, pattern)
res <- stringr::str_split(string = x, pattern = pattern,
n = Inf, simplify = FALSE)
res <- lapply(res, coerce_seq_as_input, input = x, keep_names = FALSE)
names(res) <- names(x)
return(res)
}
#' Count the number of matches in sequences
#'
#' @inheritParams seq_detect_pattern
#' @inheritSection seq_detect_pattern Patterns
#' @return An integer vector.
#' @family string operations
#' @seealso
#' \code{\link[stringi]{stri_count}} from \pkg{stringi} and
#' \code{\link[stringr]{str_count}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @export
#'
#' @examples
#'
#' x <- dna("ACGTTAGTGTAGCCGT", "CTCGAAATGA")
#' seq_count_pattern(x, dna("AAA"))
#' seq_count_pattern(x, "T.G")
#'
seq_count_pattern <- function(x, pattern) {
check_dna_rna_aa(x)
pattern <- check_and_prepare_pattern(x, pattern)
res <- stringr::str_count(string = x, pattern = pattern)
return(res)
}
############################# By Position #############################
#' Crop sequences between two positions
#'
#'
#' @param x a DNA, RNA or AA vector.
#' @param position_in an integer giving the position where to start cropping.
#' @param position_out an integer giving the position where to stop cropping.
#'
#' @return A cropped DNA, RNA or AA vector.
#' @family string operations
#' @seealso
#' \code{\link[stringi]{stri_sub}} from \pkg{stringi} and
#' \code{\link[stringr]{str_sub}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @export
#'
#' @examples
#'
#' x <- dna("ACGTTAGTGTAGCCGT")
#'
#' # Drop the first 3 nucleotides (ACG)
#' seq_crop_position(x, position_in = 4)
#'
#' # Crop codon between position 4 and 6
#' seq_crop_position(x, position_in = 4, position_out = 6)
#'
seq_crop_position <- function(x, position_in = 1, position_out = -1) {
check_dna_rna_aa(x)
out <- stringr::str_sub(x, start = position_in, end = position_out)
out <- coerce_seq_as_input(out, x)
return(out)
}
#' Remove a region between two positions in sequences.
#'
#' @param x a DNA, RNA or AA vector.
#' @param position_in an integer giving the position where to start to remove.
#' @param position_out an integer giving the position where to stop to remove.
#'
#' @return A vector of same class as \code{x}.
#' @family string operations
#' @seealso
#' \code{\link[stringr]{str_remove}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @export
#'
#' @examples
#'
#' x <- dna("ACGTTAGTGTAGCCGT", "CTCGAAATGA")
#' seq_remove_position(x, 2, 6)
#' seq_remove_position(x, 1:2, 3:4)
#'
seq_remove_position <- function(x, position_in, position_out) {
check_dna_rna_aa(x)
x_nchar <- nchar(x)
position_out <- ifelse(position_out > x_nchar, x_nchar, position_out)
d_pos <- position_out - position_in
rgx <- paste0("(?<=.{", position_in - 1, "}).{", d_pos + 1, "}")
res <- stringr::str_remove(x, pattern = rgx)
res <- coerce_seq_as_input(res, input = x)
return(res)
}
#' Replace a region between two positions in sequences
#'
#' @param x a DNA, RNA or AA vector.
#' @param position_in an integer giving the position where to start to replace.
#' @param position_out an integer giving the position where to stop to replace.
#' @param replacement a vector of replacements.
#'
#' @return A vector of same class as \code{x}.
#' @family string operations
#' @seealso
#' \code{\link[stringi]{stri_replace}} from \pkg{stringi} and
#' \code{\link[stringr]{str_replace}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @export
#' @examples
#'
#' x <- dna("ACGTTAGTGTAGCCGT", "CTCGAAATGA")
#' seq_replace_position(x, c(5, 2), 6, "-------")
seq_replace_position <- function(x, position_in, position_out, replacement){
check_dna_rna_aa(x)
x_nchar <- nchar(x)
position_out <- ifelse(position_out > x_nchar, x_nchar, position_out)
d_pos <- position_out - position_in
rgx <- paste0("(?<=.{", position_in - 1, "}).{", d_pos + 1, "}")
res <- stringr::str_replace(x, pattern = rgx, replacement = replacement)
res <- coerce_seq_as_input(res, input = x)
return(res)
}
#' Extract a region between two positions in sequences
#'
#' @param x a DNA, RNA or AA vector.
#' @param position_in an integer giving the position where to start to extract.
#' @param position_out an integer giving the position where to stop to extract.
#'
#' @return A vector of same class as \code{x}.
#' @family string operations
#' @seealso
#' \code{\link[stringi]{stri_extract}} from \pkg{stringi} and
#' \code{\link[stringr]{str_extract}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @export
#' @examples
#'
#' x <- dna("ACGTTAGTGTAGCCGT", "CTCGAAATGA")
#' seq_extract_position(x, 3, 8)
#'
seq_extract_position <- function(x, position_in, position_out){
check_dna_rna_aa(x)
x_nchar <- nchar(x)
position_out <- ifelse(position_out > x_nchar, x_nchar, position_out)
d_pos <- position_out - position_in
rgx <- paste0("(?<=.{", position_in - 1, "}).{", d_pos + 1, "}")
res <- stringr::str_extract(x, pattern = rgx)
res <- coerce_seq_as_input(res, input = x)
return(res)
}
############################# Assemble #############################
#' Combine multiple sequences
#'
#' @param ... One or more vectors of sequences (DNA, RNA, AA).
#' They must all be of the same type. Short vectors are recycled.
#' @param sep String to insert between input vectors.
#' @param collapse If not \code{NULL}, combine everything with this
#' string as separator.
#'
#' @details
#' The strings \code{sep} and \code{collapse}w ill be coerced to
#' the type of input vectors with a warning if some character have to replaced.
#'
#' @return A vector of sequences (if collapse is \code{NULL}).
#' A vector with a single sequence, otherwise.
#' @family string operations
#' @seealso
#' \code{\link[stringi]{stri_join}} from \pkg{stringi} and
#' \code{\link[stringr]{str_c}} from \pkg{stringr}
#' for the underlying implementation.
#'
#' @export
#'
#' @examples
#'
#' x <- dna("ACGTTAGTGTAGCCGT", "CTCGAAATGA")
#' y <- dna("TTTTTTTT", "AAAAAAAAA")
#' seq_combine(x, y)
#' seq_combine(y, x, sep = "CCCCC")
#' seq_combine(y, x, sep = "CCCCC", collapse = "GGGGG")
#'
seq_combine <- function(..., sep = "", collapse = NULL) {
x <- list(...)
if(!(all(vapply(x, is_dna, vector("logical", 1))) |
all(vapply(x, is_rna, vector("logical", 1))) |
all(vapply(x, is_aa, vector("logical", 1)))
)) {
stop("Vectors must be of same class to be combined.")
}
res <- stringr::str_c(..., sep = sep, collapse = collapse)
res <- coerce_seq_as_input(res, input = x[[1]])
return(res)
}
############################# Extra #############################
seq_remove_gap <- function(x) {
seq_remove_pattern(x, "-")
}
#' Split sequences into k-mers
#'
#' @param x A DNA, RNA or AA vector.
#' @param k an integer giving the size of the k-mer.
#'
#' @return a list of k-mer vectors of same class as \code{x}.
#' @family string operations
#' @seealso
#' \code{\link{seq_split_pattern}}.
#'
#' @export
#'
#' @examples
#'
#' x <- dna(a ="ACGTTAGTGTAGCCGT", b = "CTCGAAATGA")
#' seq_split_kmer(x, k = 5)
seq_split_kmer <- function(x, k) {
check_dna_rna_aa(x)
res <- lapply(x, function(x) {
x_len <- nchar(x)
if(k > x_len) {
warning("k was larger than the sequence: return NA.")
return(NA)
}
km_start <- seq(1, x_len - k + 1)
km_stop <- km_start + k - 1
out <- mapply(stringr::str_sub, x, km_start, km_stop, USE.NAMES = FALSE)
coerce_seq_as_input(out, input = x, keep_names = FALSE)
})
names(res) <- names(x)
return(res)
}
# ### Assemble
# - seq_duplicate
# - seq_insert
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.