Nothing
#' Export reference database for DADA2
#'
#' Write reference database in formats which can be used with the
#' functions of the package \pkg{dada2}.
#'
#' @param x a reference database.
#' @param file a path to the file to be written.
#' @param mode character string to determine the type of file to produce.
#' Use \code{"taxonomy"} to produce a file for function \code{assignTaxonomy}
#' or \code{"species"} to produce a file for function \code{assignSpecies}.
#'
#' @return
#' No return value, called for side effects.
#'
#' @examples
#' lib <- read.csv(system.file("extdata", "baetidae_bold.csv", package = "refdb"))
#' lib <- refdb_set_fields_BOLD(lib)
#' refdb_export_dada2(lib, tempfile())
#'
#' @export
#'
refdb_export_dada2 <- function(x, file, mode = "taxonomy") {
if(mode == "taxonomy") {
check_fields(x, what = c("sequence", "taxonomy"))
col_tax <- attributes(x)$refdb_fields$taxonomy
labs <- apply(x[, col_tax], 1, paste, collapse = ";")
labs <- paste0(labs, ";")
}
if(mode == "species") {
check_fields(x, what = c("sequence", "taxonomy", "id"))
col_tax <- attributes(x)$refdb_fields$taxonomy["species"]
col_id <- attributes(x)$refdb_fields$id
labs <- apply(x[, c(col_id, col_tax)], 1, paste, collapse = " ")
labs <- stringr::str_replace_all(labs, "(?<=[^ ]{1,1000} [^ ]{1,1000} [^ ]{1,1000}) ", "_")
# Worst regex ever
}
seqs <- x[[attributes(x)$refdb_fields$sequence]]
names(seqs) <- labs
bioseq::write_fasta(seqs, file = file)
}
#' Export reference database for Mothur
#'
#' Write a reference database in formats which can be used
#' with \code{Mothur}.
#'
#' @param x a reference database.
#' @param file a file path. This will be used to create
#' a .fasta file and a .txt file.
#'
#' @return
#' No return value, called for side effects.
#'
#' @examples
#' lib <- read.csv(system.file("extdata", "baetidae_bold.csv", package = "refdb"))
#' lib <- refdb_set_fields_BOLD(lib)
#' refdb_export_mothur(lib, tempfile())
#'
#' @export
#'
refdb_export_mothur <- function(x, file) {
check_fields(x, what = c("sequence", "taxonomy", "id"))
col_tax <- attributes(x)$refdb_fields$taxonomy
col_id <- attributes(x)$refdb_fields$id
col_seq <- attributes(x)$refdb_fields$sequence
labs <- x[[col_id]]
labs <- stringr::str_replace_all(labs, "[:blank:]", "_")
seqs <- x[[col_seq]]
names(seqs) <- labs
tax <- apply(x[, col_tax], 1, function(x) stringr::str_replace_all(paste(x, collapse = ";"), "[:blank:]", "_"))
tax <- paste0(labs, "\t", tax, ";")
file_fas <- paste0(file, ".fasta")
file_txt <- paste0(file, ".txt")
bioseq::write_fasta(seqs, file = file_fas)
readr::write_lines(tax, file = file_txt)
}
#' Export reference database for DECIPHER (IDTAXA)
#'
#' Write a reference database in file formats which can be used
#' to train the \code{IDTAXA} classifier implemented in \code{DECIPHER}.
#'
#' @param x a reference database.
#' @param file a file path without extension. This will be used to create
#' a .fasta file and two .txt files.
#' @param taxid should the taxid file be generated
#' (can be very slow with large databases)
#'
#' @details
#' The functions generates three files.
#'
#' - A fasta files containing the sequences with their IDs.
#' This file must be imported as a \code{DNAStringSet}
#' to be used with \code{DECIPHER}, using eg:\cr
#' \code{Biostrings::readDNAStringSet("ex_seqs.fasta")}
#'
#' - A text files containing the sequence taxonomic assignment.
#' This file must be imported as a character vector
#' to be used with \code{DECIPHER}, using eg:\cr
#' \code{readr::read_lines("ex_taxo.txt")}
#'
#' - A text file ("taxid") containing the taxonomic ranks
#' associated with each taxon. This is an asterisk delimited file
#' which must be imported as a dataframe (see LearnTaxa), using eg:\cr
#' \code{readr::read_delim("ex_ranks.txt",
#' col_names = c('Index', 'Name', 'Parent', 'Level', 'Rank'),
#' delim = "*", quote = "")}
#'
#' The taxid file can be very slow to write for large datasets.
#' Therefore it is not generated by default.
#'
#' @return
#' No return value, called for side effects.
#'
#' @examples
#' lib <- read.csv(system.file("extdata", "baetidae_bold.csv", package = "refdb"))
#' lib <- refdb_set_fields_BOLD(lib)
#' refdb_export_idtaxa(lib, tempfile())
#'
#' @export
#'
refdb_export_idtaxa <- function(x, file, taxid = FALSE) {
check_fields(x, what = c("sequence", "taxonomy", "id"))
col_tax <- attributes(x)$refdb_fields$taxonomy
col_id <- attributes(x)$refdb_fields$id
col_seq <- attributes(x)$refdb_fields$sequence
labs <- x[[col_id]]
labs <- stringr::str_replace_all(labs, "[:blank:]", "_")
seqs <- x[[col_seq]]
names(seqs) <- labs
tax <- x[, col_tax]
if(all(tax[, 1] != "Root")) {
tax <- cbind(rep("Root", nrow(tax)), tax)
}
tax <- apply(tax, 1, function(x) {
x <- x[!is.na(x)]
paste(x, collapse = ";")
})
file_fas <- paste0(file, "_seqs.fasta")
file_tax <- paste0(file, "_taxo.txt")
bioseq::write_fasta(seqs, file = file_fas)
readr::write_lines(tax, file = file_tax)
if(taxid) {
ranks <- taxid_file(x)
file_rank <- paste0(file, "_ranks.txt")
readr::write_lines(ranks, file = file_rank)
}
}
# Internal function to produce taxid file for DECIPHER
# Code is adapted from the vignette "Classify Sequences"
# from DECIPHER: 10.18129/B9.bioc.DECIPHER
taxid_file <- function(x) {
check_fields(x, what = "taxonomy")
col_tax <- attributes(x)$refdb_fields$taxonomy
tax_levels <- names(col_tax)
tax_letters <- letters[seq_along(tax_levels)]
taxa <- stats::setNames(tax_levels, tax_letters)
tax_mat <- x[, col_tax]
tax_mat <- tax_mat[!duplicated.data.frame(tax_mat), ]
id_string <- mapply(function(x, y) {paste(x, y, sep = "__")}, tax_letters, tax_mat)
id_string[is.na(tax_mat)] <- ""
id_string <- apply(id_string, 1, paste, collapse = ";")
id_string <- stringr::str_replace_all(id_string, ";{2,}", ";")
id_string <- stringr::str_remove(id_string, ";$")
ranks <- strsplit(id_string, ";", fixed = TRUE)
count <- 1L
groups <- "Root"
index <- -1L
level <- 0L
rank <- "rootrank"
pBar <- utils::txtProgressBar(style = 3)
for (i in seq_along(ranks)) {
for (j in seq_along(ranks[[i]])) {
rank_level <- taxa[substring(ranks[[i]][j], 1, 3)]
group <- substring(ranks[[i]][j], 4)
w <- which(groups == group & rank == rank_level)
if (length(w) > 0) {
parent <- match(substring(ranks[[i]][j - 1], 4),
groups)
if (j == 1 || any((parent - 1L) == index[w]))
next
}
count <- count + 1L
groups <- c(groups, group)
if (j==1) {
index <- c(index, 0)
} else {
parent <- match(substring(ranks[[i]][j - 1], 4),
groups)
index <- c(index,
parent - 1L)
}
level <- c(level, j)
rank <- c(rank, taxa[j])
}
utils::setTxtProgressBar(pBar, i/length(ranks))
}
groups <- gsub("^[ ]+", "", groups)
groups <- gsub("[ ]+$", "", groups)
taxid <- paste(0:(length(index) - 1L), groups, index, level, rank, sep = "*")
return(taxid)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.