################################################################################
#' Resolve conflict in a vector of taxonomy values
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Internally used in the function [assign_blastn()] with method="vote"
#'
#' @param vec (required) A vector of (taxonomic) values
#' @param method One of "consensus", "rel_majority", "abs_majority",
#' "preference" or "unanimity". See details.
#'
#' @param strict (logical, default FALSE). If TRUE, NA are considered as
#' informative in resolving conflict (i.e. NA are taking into account in vote).
#' See details for more informations.
#' @param second_method One of "consensus", "rel_majority", "abs_majority",
#' or "unanimity". Only used if method = "preference". See details.
#' @param nb_agree_threshold (Int, default 1) The minimum number of times a
#' value must arise to be selected using vote. If 2, we only kept
#' taxonomic value present at least 2 times in the vector.
#' @param preference_index (Int. default NULL). Required if method="preference".
#' Useless for other method. The preference index is the index of the value in
#' vec for which we have a preference.
#' @param collapse_string (default '/'). The character to collapse taxonomic names
#' when multiple assignment is done.
#' @param replace_collapsed_rank_by_NA (logical, default FALSE). If set to TRUE,
#' all multiple assignments (all taxonomic rank including the 'collapse_string'
#' parameter) are replaced by NA.
#'
#' @details
#'
#' - `unanimity`: Only keep taxonomic value when all methods are agree
#' - By default, the value with NA are not taking into account (strict=FALSE)
#' - If `strict` , one NA in the row is sufficient to return a NA
#'
#' - `consensus`: Keep all taxonomic values separated by a '/' (separation can be modify using param `collapse_string`)
#' - If `strict` is TRUE, NA are also added to taxonomic vector such as 'Tiger/Cat/NA' instead of 'Tiger/Cat'
#'
#' - `abs_majority`: Keep the most found taxonomic value if it represent at least half of all taxonomic values.
#' - If `strict` is TRUE, NA values are also count to determine the majority. For example, a vector of taxonomic rank c("A", "A", "A", "B", NA, NA) will give a value of 'A' if `strict` is FALSE (default) but a value of NA if `strict` is TRUE.
#'
#' - `rel_majority`: Keep the most found taxonomic value. In case of equality, apply a consensus strategy (collapse values separated by a '/') across the most found taxonomic values.
#' - If `strict` is TRUE, NA are considered as a rank and can win the relative majority vote. If `strict` is FALSE (default), NA are removed before ranking the taxonomic values.
#' - `nb_agree_threshold`: Only keep return value when at least x methods agreed with x is set by parameter `nb_agree_threshold`. By default, (`nb_agree_threshold` = 1): a majority of one is enough.
#'
#' - `preference`: Keep the value from a preferred column.
#' - when the value is NA in the preferred column, apply a second strategy (by default `consensus`) to the other column (parameter `second_method`). Note that the parameters `strict` and `nb_agree_threshold` are used for the second_method consensus.
#' @returns a vector of length 1 (one character value)
#'
#' @export
#' @author Adrien Taudière
#'
#' @examples
#'
#' resolve_vector_ranks(c("A"))
#' resolve_vector_ranks(c("A"),
#' method = "preference",
#' preference_index = 1
#' )
#' resolve_vector_ranks(c("A"), method = "abs_majority")
#' resolve_vector_ranks(c("A"), method = "rel_majority")
#' resolve_vector_ranks(c("A"),
#' method = "rel_majority",
#' nb_agree_threshold = 2
#' )
#' resolve_vector_ranks(c("A"), method = "unanimity")
#'
#' resolve_vector_ranks(c("A", "A", "A"))
#' resolve_vector_ranks(c("A", "A", "A"),
#' method = "preference",
#' preference_index = 1
#' )
#' resolve_vector_ranks(c("A", "A", "A"), method = "abs_majority")
#' resolve_vector_ranks(c("A", "A", "A"), method = "rel_majority")
#' resolve_vector_ranks(c("A", "A", "A"), method = "unanimity")
#'
#' resolve_vector_ranks(c(NA, NA, NA))
#' resolve_vector_ranks(c(NA, NA, NA),
#' method = "preference",
#' preference_index = 1
#' )
#' resolve_vector_ranks(c(NA, NA, NA), method = "abs_majority")
#' resolve_vector_ranks(c(NA, NA, NA), method = "rel_majority")
#' resolve_vector_ranks(c(NA, NA, NA), method = "unanimity")
#'
#' resolve_vector_ranks(c("A", "A", NA))
#' resolve_vector_ranks(c("A", "A", NA),
#' method = "preference",
#' preference_index = 1
#' )
#' resolve_vector_ranks(c("A", "A", NA), method = "rel_majority")
#' resolve_vector_ranks(c("A", "A", NA), method = "abs_majority")
#' resolve_vector_ranks(c("A", "A", NA, NA),
#' method = "abs_majority",
#' strict = FALSE
#' )
#' resolve_vector_ranks(c("A", "A", NA, NA),
#' method = "abs_majority",
#' strict = TRUE
#' )
#' resolve_vector_ranks(c("A", "A", NA), method = "unanimity")
#' resolve_vector_ranks(c("A", "A", NA),
#' method = "unanimity",
#' strict = TRUE
#' )
#'
#' resolve_vector_ranks(c("A", "B", NA))
#' resolve_vector_ranks(c("A", "B", NA), strict = TRUE)
#' resolve_vector_ranks(c("A", "B", NA),
#' method = "preference",
#' preference_index = 1
#' )
#' resolve_vector_ranks(c("A", "B", NA), method = "abs_majority")
#' resolve_vector_ranks(c("A", "B", NA), method = "rel_majority")
#' resolve_vector_ranks(c("A", "B", NA),
#' method = "rel_majority",
#' strict = TRUE
#' )
#' resolve_vector_ranks(c("A", "B", NA),
#' method = "rel_majority",
#' nb_agree_threshold = 2
#' )
#' resolve_vector_ranks(c("A", "B", NA), method = "unanimity")
#'
#' resolve_vector_ranks(c("A", NA, NA))
#' resolve_vector_ranks(c("A", NA, NA), method = "rel_majority")
#' resolve_vector_ranks(c("A", NA, NA), method = "unanimity")
#' resolve_vector_ranks(c("A", NA, NA),
#' method = "preference",
#' preference_index = 1
#' )
#' resolve_vector_ranks(c("A", NA, NA),
#' method = "preference",
#' preference_index = 2
#' )
#' resolve_vector_ranks(c("A", NA, "B"),
#' method = "preference",
#' preference_index = 2
#' )
#' resolve_vector_ranks(c("A", NA, "B"),
#' method = "preference",
#' preference_index = 2, second_method = "abs_majority"
#' )
#'
#' resolve_vector_ranks(c("A", "B", "B"))
#' resolve_vector_ranks(c("A", "B", "B"),
#' method = "preference",
#' preference_index = 1
#' )
#' resolve_vector_ranks(c("A", "B", "B"), method = "abs_majority")
#' resolve_vector_ranks(c("A", "B", "B"), method = "rel_majority")
#' resolve_vector_ranks(c("A", "B", "B"), method = "unanimity")
#'
#'
#' resolve_vector_ranks(c("A", "A", "A", "B", NA, NA))
#' resolve_vector_ranks(c("A", "A", "A", "B", NA, NA),
#' strict = TRUE
#' )
#' resolve_vector_ranks(c("A", "A", "A", "B", NA, NA),
#' method = "abs_majority"
#' )
#' resolve_vector_ranks(c("A", "A", "A", "B", NA, NA),
#' method = "abs_majority",
#' strict = TRUE
#' )
#' resolve_vector_ranks(c("A", "A", "A", "B", NA, NA),
#' method = "preference", preference_index = 6, second_method = "abs_majority"
#' )
#' resolve_vector_ranks(c("A", "A", "A", "B", NA, NA, NA),
#' method = "preference", preference_index = 6, second_method = "abs_majority"
#' )
#' resolve_vector_ranks(c("A", "A", "A", "B", NA, NA, NA),
#' method = "preference", preference_index = 6, second_method = "abs_majority",
#' strict = TRUE
#' )
resolve_vector_ranks <- function(vec,
method = c(
"consensus",
"rel_majority",
"abs_majority",
"preference",
"unanimity"
),
strict = FALSE,
second_method = c(
"consensus",
"rel_majority",
"abs_majority",
"unanimity"
),
nb_agree_threshold = 1,
preference_index = NULL,
collapse_string = "/",
replace_collapsed_rank_by_NA = FALSE) {
method <- match.arg(method)
second_method <- match.arg(second_method)
if (sum(is.na(vec)) == length(vec)) {
res <- NA
return(res)
}
if (method == "consensus") {
if (!strict) {
vec <- as.vector(na.omit(vec))
}
res <- paste0(unique(vec), collapse = collapse_string)
} else if (method == "preference") {
if (is.null(preference_index)) {
stop("You must specify a preference_index if method = 'preference'")
}
res <- vec[preference_index]
if (is.na(res)) {
res <- resolve_vector_ranks(
vec[-preference_index],
method = second_method,
nb_agree_threshold = nb_agree_threshold,
strict = strict,
collapse_string = collapse_string,
replace_collapsed_rank_by_NA = replace_collapsed_rank_by_NA
)
}
} else if (method == "abs_majority") {
if (!strict) {
vec <- as.vector(na.omit(vec))
}
nval <- length(vec)
higher_val <- sort(table(vec), decreasing = T)[1]
if (higher_val / nval > 0.5) {
res <- names(higher_val)
} else {
res <- NA
}
} else if (method == "rel_majority") {
if (!strict) {
vec <- as.vector(na.omit(vec))
}
nval <- sum(table(vec, useNA = "ifany") == max(table(vec, useNA = "ifany")))
if (sum(sort(table(vec, useNA = "ifany"), decreasing = T)[1:nval]) >= nb_agree_threshold) {
res <- paste0(names(sort(table(vec, useNA = "ifany"), decreasing = T)[1:nval]), collapse = collapse_string)
} else {
res <- NA
}
} else if (method == "unanimity") {
if (!strict) {
vec <- as.vector(na.omit(vec))
}
if (length(unique(vec)) == 1) {
res <- unique(vec)
} else {
res <- NA
}
}
if (replace_collapsed_rank_by_NA &&
sum(grepl(collapse_string, res)) > 0) {
res <- NA
}
return(res)
}
################################################################################
################################################################################
#' Format a fasta database in sintax format
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Only tested with Unite and Eukaryome fasta file for the moment. Rely on the presence of the pattern
#' pattern_tax default "k__" to format the header.
#'
#' A reference database in sintax format
#' contain taxonomic information in the header of
#' each sequence in the form of a string starting with ";tax=" and followed
#' by a comma-separated list of up to nine taxonomic identifiers. Each taxonomic
#' identifier must start with an indication of the rank by one of the letters d
#' (for domain) k (kingdom), p (phylum), c (class), o (order), f (family),
#' g (genus), s (species), or t (strain). The letter is followed by a colon
#' (:) and the name of that rank. Commas and semicolons are not allowed in
#' the name of the rank. Non-ascii characters should be avoided in the names.
#'
#' Example:
#'
#' \>X80725_S000004313;tax=d:Bacteria,p:Proteobacteria,c:Gammaproteobacteria,o:Enterobacteriales,f:Enterobacteriaceae,g:Escherichia/Shigella,s:Escherichia_coli,t:str._K-12_substr._MG1655
#'
#' @param fasta_db A link to a fasta files
#' @param taxnames A list of names to format. You must specify either fasta_db OR taxnames, not both.
#' @param pattern_tax (default "k__") The pattern to replace by pattern_sintax.
#' @param pattern_sintax (default "tax=k:") Useless for most users. Sometimes you may want to
#' replacte by "tax=d:" (d for domain instead of kingdom).
#' @param output_path (optional) A path to an output fasta files. Only used if fasta_db is set.
#' @export
#' @author Adrien Taudière
#' @seealso [format2dada2_species()], [format2dada2()]
#' @return Either an object of class DNAStringSet or a vector of reformated names
format2sintax <- function(fasta_db = NULL,
taxnames = NULL,
pattern_tax = "k__",
pattern_sintax = "tax=k:",
output_path = NULL) {
if (is.null(taxnames) && is.null(fasta_db)) {
stop("You must specify taxnames or fasta_db parameter.")
} else if (!is.null(taxnames) && !is.null(fasta_db)) {
stop("You must specify either taxnames or fasta_db, not both.")
} else if (!is.null(taxnames)) {
new_names <- taxnames %>%
{
gsub(";", ",", .)
} %>%
{
gsub(pattern_k, paste0(";", pattern_sintax), .)
} %>%
{
gsub("__", ":", .)
} %>%
{
gsub(";;", ";", .)
} %>%
{
gsub(paste0(",", pattern_sintax), paste0(";", pattern_sintax), .)
}
return(new_names)
} else if (!is.null(fasta_db)) {
dna <- Biostrings::readDNAStringSet(fasta_db)
new_names <- names(dna) %>%
{
gsub(";", ",", .)
} %>%
{
gsub(pattern_tax, paste0(";", pattern_sintax), .)
} %>%
{
gsub("__", ":", .)
} %>%
{
gsub(";;", ";", .)
} %>%
{
gsub(paste0(",", pattern_sintax), paste0(";", pattern_sintax), .)
}
names(dna) <- new_names
if (!is.null(output_path)) {
Biostrings::writeXStringSet(dna, filepath = output_path)
}
return(dna)
}
}
################################################################################
################################################################################
#' Format a fasta database in dada2 format
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' First format in sintax format and then in dada2 format
#'
#' @param fasta_db A link to a fasta files
#' @param taxnames A list of names to format. You must specify either fasta_db OR taxnames, not both.
#' @param output_path (optional) A path to an output fasta files. Only used if fasta_db is set.
#' @param from_sintax (logical, default FALSE) Is the original fasta file in sintax format?
#' @param ... Additional arguments passed on to [format2sintax()] function
#' @export
#' @author Adrien Taudière
#' @return Either an object of class DNAStringSet or a vector of reformated names
#' @seealso [format2dada2_species()], [format2sintax()]
format2dada2 <- function(fasta_db = NULL,
taxnames = NULL,
output_path = NULL,
from_sintax = TRUE,
...) {
if (is.null(taxnames) && is.null(fasta_db)) {
stop("You must specify taxnames or fasta_db parameter.")
} else if (!is.null(taxnames) && !is.null(fasta_db)) {
stop("You must specify either names or fasta_db, not both.")
} else if (!is.null(taxnames)) {
if (from_sintax) {
new_names <- taxnames
} else {
new_names <- format2sintax(taxnames = taxnames, ...)
}
new_names <- new_names |>
stringr::str_split_fixed(";tax=", n = 2) |>
as_tibble() |>
tidyr::unite(taxnames, c(V2, V1), sep = ";") |>
pull(taxnames) %>%
{
gsub(":", "__", .)
} %>%
{
gsub(",", ";", .)
}
return(new_names)
} else if (!is.null(fasta_db)) {
dna <- Biostrings::readDNAStringSet(fasta_db)
if (from_sintax) {
new_names <- names(dna)
} else {
new_names <- format2sintax(names(dna), ...)
}
new_names <- new_names |>
stringr::str_split_fixed(";tax=", n = 2) |>
as_tibble() |>
tidyr::unite(taxnames, c(V2, V1), sep = ";") |>
pull(taxnames) %>%
{
gsub(":", "__", .)
} %>%
{
gsub(",", ";", .)
}
names(dna) <- new_names
if (!is.null(output_path)) {
Biostrings::writeXStringSet(dna, filepath = output_path)
}
return(dna)
}
}
################################################################################
################################################################################
#' Format a fasta database in dada2 format for Species assignment
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' First format in sintax format and then in dada2 format
#'
#' @param fasta_db A link to a fasta files
#' @param taxnames A list of names to format. You must specify either fasta_db OR taxnames, not both.
#' @param output_path (optional) A path to an output fasta files. Only used if fasta_db is set.
#' @param from_sintax (logical, default FALSE) Is the original fasta file in sintax format?
#' @param ... Additional arguments passed on to [format2sintax()] function
#' @export
#' @author Adrien Taudière
#' @return Either an object of class DNAStringSet or a vector of reformated names
#' @seealso [format2dada2_species()], [format2sintax()]
#'
format2dada2_species <- function(
fasta_db = NULL,
taxnames = NULL,
from_sintax = FALSE,
output_path = NULL,
...) {
if (is.null(taxnames) && is.null(fasta_db)) {
stop("You must specify taxnames or fasta_db parameter.")
} else if (!is.null(taxnames) && !is.null(fasta_db)) {
stop("You must specify either taxnames or fasta_db, not both.")
} else if (!is.null(taxnames)) {
if (from_sintax) {
new_names <- paste(
stringr::str_extract(taxnames, "^(.*?);tax=", group = T),
stringr::str_extract(taxnames, "g:(.*?),", group = T),
stringr::str_extract(taxnames, "s:(.*?)$", group = T),
sep = " "
)
} else {
new_names <- paste(
stringr::str_extract(taxnames, "^(.*?)k__", group = T),
stringr::str_extract(taxnames, "g__(.*?);", group = T),
stringr::str_extract(taxnames, "s__(.*?)$", group = T),
sep = " "
)
}
return(new_names)
} else if (!is.null(fasta_db)) {
dna <- Biostrings::readDNAStringSet(fasta_db)
taxnames <- names(dna)
if (from_sintax) {
id <- stringr::str_extract(taxnames, "^(.*?);tax=", group = T)
id[is.na(id)] <- taxnames[is.na(id)]
genus <- stringr::str_extract(taxnames, "g:(.*?),", group = T)
species <- stringr::str_extract(taxnames, "s:(.*?)$", group = T)
} else {
id <- stringr::str_extract(taxnames, "^(.*?)k__", group = T)
id[is.na(id)] <- taxnames[is.na(id)]
genus <- stringr::str_extract(taxnames, "g__(.*?);", group = T)
species <- stringr::str_extract(taxnames, "s__(.*?)$", group = T)
}
new_names <- paste(id, genus, species, sep = " ")
names(dna) <- new_names
names(dna)[is.na(names(dna))] <- taxnames[names(dna)]
if (!is.null(output_path)) {
Biostrings::writeXStringSet(dna, filepath = output_path)
}
return(dna)
}
}
################################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.