R/refdb_export.R
In refdb: A DNA Reference Library Manager

Documented in refdb_export_dada2 refdb_export_idtaxa refdb_export_mothur

#' Export reference database for DADA2
#'
#' Write reference database in formats which can be used with the
#' functions of the package \pkg{dada2}.
#'
#' @param x a reference database.
#' @param file a path to the file to be written.
#' @param mode character string to determine the type of file to produce.
#' Use \code{"taxonomy"} to produce a file for function \code{assignTaxonomy}
#' or \code{"species"} to produce a file for function \code{assignSpecies}.
#'
#' @return
#' No return value, called for side effects.
#'
#' @examples
#' lib <- read.csv(system.file("extdata", "baetidae_bold.csv", package = "refdb"))
#' lib <- refdb_set_fields_BOLD(lib)
#' refdb_export_dada2(lib, tempfile())
#'
#' @export
#'
refdb_export_dada2 <- function(x, file, mode = "taxonomy") {

  if(mode == "taxonomy") {
    check_fields(x, what = c("sequence", "taxonomy"))
    col_tax <- attributes(x)$refdb_fields$taxonomy
    labs <- apply(x[, col_tax], 1, paste, collapse = ";")
    labs <- paste0(labs, ";")
  }

  if(mode == "species") {
    check_fields(x, what = c("sequence", "taxonomy", "id"))
    col_tax <- attributes(x)$refdb_fields$taxonomy["species"]
    col_id <- attributes(x)$refdb_fields$id

    labs <- apply(x[, c(col_id, col_tax)], 1, paste, collapse = " ")
    labs <- stringr::str_replace_all(labs, "(?<=[^ ]{1,1000} [^ ]{1,1000} [^ ]{1,1000}) ", "_")
    # Worst regex ever
  }

  seqs <- x[[attributes(x)$refdb_fields$sequence]]
  names(seqs) <- labs

  bioseq::write_fasta(seqs, file = file)

}





#' Export reference database for Mothur
#'
#' Write a reference database in formats which can be used
#' with \code{Mothur}.
#'
#' @param x a reference database.
#' @param file a file path. This will be used to create
#'  a .fasta file and a .txt file.
#'
#' @return
#' No return value, called for side effects.
#'
#' @examples
#' lib <- read.csv(system.file("extdata", "baetidae_bold.csv", package = "refdb"))
#' lib <- refdb_set_fields_BOLD(lib)
#' refdb_export_mothur(lib, tempfile())
#'
#' @export
#'
refdb_export_mothur <- function(x, file) {
  check_fields(x, what = c("sequence", "taxonomy", "id"))

  col_tax <- attributes(x)$refdb_fields$taxonomy
  col_id <- attributes(x)$refdb_fields$id
  col_seq <- attributes(x)$refdb_fields$sequence

  labs <- x[[col_id]]
  labs <- stringr::str_replace_all(labs, "[:blank:]", "_")

  seqs <- x[[col_seq]]
  names(seqs) <- labs

  tax <- apply(x[, col_tax], 1, function(x) stringr::str_replace_all(paste(x, collapse = ";"), "[:blank:]", "_"))
  tax <- paste0(labs, "\t", tax, ";")


  file_fas <- paste0(file, ".fasta")
  file_txt <- paste0(file, ".txt")

  bioseq::write_fasta(seqs, file = file_fas)
  readr::write_lines(tax, file = file_txt)
}



#' Export reference database for DECIPHER (IDTAXA)
#'
#' Write a reference database in file formats which can be used
#' to train the \code{IDTAXA} classifier implemented in \code{DECIPHER}.
#'
#' @param x a reference database.
#' @param file a file path without extension. This will be used to create
#'  a .fasta file and two .txt files.
#' @param taxid should the taxid file be generated
#' (can be very slow with large databases)
#'
#' @details
#' The functions generates three files.
#'
#' - A fasta files containing the sequences with their IDs.
#' This file must be imported as a \code{DNAStringSet}
#' to be used with \code{DECIPHER}, using eg:\cr
#' \code{Biostrings::readDNAStringSet("ex_seqs.fasta")}
#'
#' - A text files containing the sequence taxonomic assignment.
#' This file must be imported as a character vector
#' to be used with \code{DECIPHER}, using eg:\cr
#' \code{readr::read_lines("ex_taxo.txt")}
#'
#' - A text file ("taxid") containing the taxonomic ranks
#' associated with each taxon. This is an asterisk delimited file
#' which must be imported as a dataframe (see LearnTaxa), using eg:\cr
#' \code{readr::read_delim("ex_ranks.txt",
#'                    col_names = c('Index', 'Name', 'Parent', 'Level', 'Rank'),
#'                    delim = "*", quote = "")}
#'
#' The taxid file can be very slow to write for large datasets.
#' Therefore it is not generated by default.
#'
#' @return
#' No return value, called for side effects.
#'
#' @examples
#' lib <- read.csv(system.file("extdata", "baetidae_bold.csv", package = "refdb"))
#' lib <- refdb_set_fields_BOLD(lib)
#' refdb_export_idtaxa(lib, tempfile())
#'
#' @export
#'
refdb_export_idtaxa <- function(x, file, taxid = FALSE) {
  check_fields(x, what = c("sequence", "taxonomy", "id"))

  col_tax <- attributes(x)$refdb_fields$taxonomy
  col_id <- attributes(x)$refdb_fields$id
  col_seq <- attributes(x)$refdb_fields$sequence

  labs <- x[[col_id]]
  labs <- stringr::str_replace_all(labs, "[:blank:]", "_")

  seqs <- x[[col_seq]]
  names(seqs) <- labs

  tax <- x[, col_tax]
  if(all(tax[, 1] != "Root")) {
    tax <- cbind(rep("Root", nrow(tax)), tax)
  }

  tax <- apply(tax, 1, function(x) {
    x <- x[!is.na(x)]
    paste(x, collapse = ";")
  })


  file_fas <- paste0(file, "_seqs.fasta")
  file_tax <- paste0(file, "_taxo.txt")

  bioseq::write_fasta(seqs, file = file_fas)
  readr::write_lines(tax, file = file_tax)

  if(taxid) {
    ranks <- taxid_file(x)
    file_rank <- paste0(file, "_ranks.txt")
    readr::write_lines(ranks, file = file_rank)
  }
}



# Internal function to produce taxid file for DECIPHER
# Code is adapted from the vignette "Classify Sequences"
# from DECIPHER: 10.18129/B9.bioc.DECIPHER
taxid_file <- function(x) {
  check_fields(x, what = "taxonomy")
  col_tax <- attributes(x)$refdb_fields$taxonomy
  tax_levels <- names(col_tax)
  tax_letters <- letters[seq_along(tax_levels)]
  taxa <- stats::setNames(tax_levels, tax_letters)

  tax_mat <- x[, col_tax]
  tax_mat <- tax_mat[!duplicated.data.frame(tax_mat), ]

  id_string <- mapply(function(x, y) {paste(x, y, sep = "__")}, tax_letters, tax_mat)
  id_string[is.na(tax_mat)] <- ""
  id_string <- apply(id_string, 1, paste, collapse = ";")
  id_string <- stringr::str_replace_all(id_string, ";{2,}", ";")
  id_string <- stringr::str_remove(id_string, ";$")

  ranks <- strsplit(id_string, ";", fixed = TRUE)
  count <- 1L
  groups <- "Root"
  index <- -1L
  level <- 0L
  rank <- "rootrank"
  pBar <- utils::txtProgressBar(style = 3)
  for (i in seq_along(ranks)) {
  	for (j in seq_along(ranks[[i]])) {
  		rank_level <- taxa[substring(ranks[[i]][j], 1, 3)]
  		group <- substring(ranks[[i]][j], 4)
  		w <- which(groups == group & rank == rank_level)
  		if (length(w) > 0) {
  			parent <- match(substring(ranks[[i]][j - 1], 4),
  				groups)
  			if (j == 1 || any((parent - 1L) == index[w]))
  				next
  		}

  		count <- count + 1L
  		groups <- c(groups, group)
  		if (j==1) {
  			index <- c(index, 0)
  		} else {
  			parent <- match(substring(ranks[[i]][j - 1], 4),
  				groups)
  			index <- c(index,
  				parent - 1L)
  		}
  		level <- c(level, j)
  		rank <- c(rank, taxa[j])
  	}

  	utils::setTxtProgressBar(pBar, i/length(ranks))
  }
  groups <- gsub("^[ ]+", "", groups)
  groups <- gsub("[ ]+$", "", groups)

  taxid <- paste(0:(length(index) - 1L), groups, index, level, rank, sep = "*")
  return(taxid)
}