R/vsearch.R

Defines functions assign_vsearch_lca assign_sintax write_temp_fasta chimera_detection_vs chimera_removal_vs vsearch_clustering swarm_clustering vs_search_global

Documented in assign_sintax assign_vsearch_lca chimera_detection_vs chimera_removal_vs swarm_clustering vsearch_clustering vs_search_global

################################################################################
#' Search for a list of sequence in a fasta file against physeq reference
#'   sequences using [vsearch](https://github.com/torognes/vsearch)
#'
#' @description
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#' Use of VSEARCH software.
#'
#' @inheritParams clean_pq
#' @param path_to_fasta (required if seq2search is NULL) a path to fasta file if seq2search is est to NULL.
#' @param seq2search (required if path_to_fasta is NULL) Either (i) a DNAStringSet object
#'   or (ii) a character vector that will be convert to DNAStringSet using
#'   [Biostrings::DNAStringSet()]
#' @param vsearchpath (default: "vsearch") path to vsearch
#' @param id (default: 0.8) id for the option `--usearch_global` of the vsearch software
#' @param iddef (default: 0) iddef for the option `--usearch_global` of the vsearch software
#' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files
#'   - temp.fasta (refseq in fasta)
#'   - cluster.fasta (centroid)
#'   - temp.uc (clusters)
#' @examplesIf MiscMetabar::is_vsearch_installed()
#' \donttest{
#' if (requireNamespace("seqinr")) {
#'   file_dna <- tempfile("dna.fa")
#'   seqinr::write.fasta("GCCCATTAGTATTCTAGTGGGCATGCCTGTTCGAGCGTCATTTTCAACC",
#'     file = file_dna, names = "seq1"
#'   )
#'
#'   res <- vs_search_global(data_fungi, path_to_fasta = file_dna)
#'   unlink(file_dna)
#'
#'   res[res$identity != "*", ]
#'
#'   clean_pq(subset_taxa(data_fungi, res$identity != "*"))
#' }
#' }
#' @return A dataframe with uc results (invisible)
#' @export
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please cite [vsearch](https://github.com/torognes/vsearch).
#' @author Adrien Taudière

vs_search_global <- function(physeq,
                             path_to_fasta = NULL,
                             seq2search = NULL,
                             vsearchpath = "vsearch",
                             id = 0.8,
                             iddef = 0,
                             keep_temporary_files = FALSE) {
  verify_pq(physeq)
  dna <- Biostrings::DNAStringSet(physeq@refseq)
  Biostrings::writeXStringSet(dna, paste0(tempdir(), "/", "temp.fasta"))

  if (is.null(seq2search) && is.null(path_to_fasta)) {
    stop("You must fill either seq2search or path_to_fasta argument.")
  }

  if (!is.null(seq2search) && !is.null(path_to_fasta)) {
    stop("You must set either seq2search or path_to_fasta but not both.")
  }
  if (!is.null(seq2search)) {
    if (inherits(seq2search, "character")) {
      seq2search <- Biostrings::DNAStringSet(seq2search)
    }
    Biostrings::writeXStringSet(seq2search, paste0(tempdir(), "seq2search.fasta"))
    seq2search <- paste0(tempdir(), "seq2search.fasta")
  } else if (!is.null(path_to_fasta)) {
    dna <- Biostrings::readDNAStringSet(path_to_fasta)
    Biostrings::writeXStringSet(dna, paste0(tempdir(), "seq2search.fasta"))
    seq2search <- paste0(tempdir(), "seq2search.fasta")
  }

  system2(
    vsearchpath,
    paste(
      " --usearch_global ",
      paste0(tempdir(), "/", "temp.fasta"),
      " --db ",
      seq2search,
      " --uc ",
      paste0(tempdir(), "/", "temp.uc"),
      " --id ",
      id,
      " --uc_allhits",
      " --strand both",
      " --iddef ",
      iddef,
      sep = ""
    )
  )

  pack_clusts <-
    utils::read.table(paste0(tempdir(), "/", "temp.uc"), sep = "\t")
  colnames(pack_clusts) <- c(
    "type",
    "cluster",
    "width",
    "identity",
    "strand",
    "6",
    "7",
    "cigarAlignment",
    "query",
    "target"
  )

  if (keep_temporary_files) {
    message(paste0("Temporary files are located at ", tempdir()))
  } else {
    if (file.exists(paste0(tempdir(), "temp.fasta"))) {
      unlink(paste0(tempdir(), "temp.fasta"))
    }
    if (file.exists(paste0(tempdir(), "temp.uc"))) {
      unlink(paste0(tempdir(), "temp.uc"))
    }
    if (file.exists(paste0(tempdir(), "seq2search.fasta"))) {
      unlink(paste0(tempdir(), "seq2search.fasta"))
    }
  }

  return(invisible(pack_clusts))
}
################################################################################



###############################################################################
#' Re-cluster sequences of an object of class `physeq`
#'   or cluster a list of DNA sequences using SWARM
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#' A wrapper of SWARM software.
#'
#' @inheritParams clean_pq
#' @param dna_seq NOT WORKING FOR THE MOMENT
#'   You may directly use a character vector of DNA sequences
#'   in place of physeq args. When physeq is set, dna sequences take the value of
#'   `physeq@refseq`
#' @param d (default: 1) maximum number of differences allowed between two
#'   amplicons, meaning that two amplicons will be grouped if they have `d`
#'   (or less) differences
#' @param swarmpath (default: swarm) path to swarm
#' @param vsearch_path (default: vsearch) path to vsearch, used only if physeq
#'   is NULL and dna_seq is provided.
#' @param nproc (default: 1)
#'   Set to number of cpus/processors to use for the clustering
#' @param swarm_args (default : "--fastidious") a one length character
#'   element defining other parameters to  passed on to swarm See other possible
#'   methods in the [SWARM pdf manual](https://github.com/torognes/swarm/blob/master/man/swarm_manual.pdf)
#' @param tax_adjust (Default 0) See the man page
#'   of [merge_taxa_vec()] for more details.
#'   To conserved the taxonomic rank of the most abundant ASV,
#' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary
#'   files ?
#'   - temp.fasta (refseq in fasta or dna_seq sequences)
#'   - temp_output (classical output of SWARM)
#'   - temp_uclust (clusters output of SWARM)
#' @details This function use the `merge_taxa_vec` function to
#'   merge taxa into clusters. By default tax_adjust = 0. See the man page
#'   of [merge_taxa_vec()].
#' @return A new object of class `physeq` or a list of cluster if dna_seq
#'   args was used.
#'
#' @references
#'   SWARM can be downloaded from
#'   \url{https://github.com/torognes/swarm/}.
#'
#' @export
#' @examplesIf MiscMetabar::is_swarm_installed()
#' summary_plot_pq(data_fungi)
#' system2("swarm", "-h")
#'
#' data_fungi_swarm <- swarm_clustering(data_fungi)
#' summary_plot_pq(data_fungi_swarm)
#'
#' sequences_ex <- c(
#'   "TACCTATGTTGCCTTGGCGGCTAAACCTACCCGGGATTTGATGGGGCGAATTAATAACGAATTCATTGAATCA",
#'   "TACCTATGTTGCCTTGGCGGCTAAACCTACCCGGGATTTGATGGGGCGAATTACCTGGTAAGGCCCACTT",
#'   "TACCTATGTTGCCTTGGCGGCTAAACCTACCCGGGATTTGATGGGGCGAATTACCTGGTAGAGGTG",
#'   "TACCTATGTTGCCTTGGCGGCTAAACCTACC",
#'   "CGGGATTTGATGGCGAATTACCTGGTATTTTAGCCCACTTACCCGGTACCATGAGGTG",
#'   "GCGGCTAAACCTACCCGGGATTTGATGGCGAATTACCTGG",
#'   "GCGGCTAAACCTACCCGGGATTTGATGGCGAATTACAAAG",
#'   "GCGGCTAAACCTACCCGGGATTTGATGGCGAATTACAAAG",
#'   "GCGGCTAAACCTACCCGGGATTTGATGGCGAATTACAAAG"
#' )
#'
#' sequences_ex_swarm <- swarm_clustering(
#'   dna_seq = sequences_ex
#' )
#' @seealso [postcluster_pq()], [vsearch_clustering()]
#' @references
#'   SWARM can be downloaded from
#'   \url{https://github.com/torognes/swarm}.
#'   More information in the associated publications
#'   \doi{doi:10.1093/bioinformatics/btab493} and \doi{doi:10.7717/peerj.593}
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please cite [SWARM](https://github.com/torognes/swarm).

swarm_clustering <- function(physeq = NULL,
                             dna_seq = NULL,
                             d = 1,
                             swarmpath = "swarm",
                             vsearch_path = "vsearch",
                             nproc = 1,
                             swarm_args = "--fastidious",
                             tax_adjust = 0,
                             keep_temporary_files = FALSE) {
  dna <- physeq_or_string_to_dna(
    physeq = physeq,
    dna_seq = dna_seq
  )

  if (!is.null(physeq)) {
    nseq <- taxa_sums(physeq)
    nseq <- nseq[match(names(nseq), names(dna))]
    names(dna) <- paste0(names(dna), "_", nseq)
    Biostrings::writeXStringSet(dna, paste0(tempdir(), "/", "temp.fasta"))
    system2(
      swarmpath,
      paste0(
        paste0(tempdir(), "/", "temp.fasta"),
        " -o ",
        paste0(tempdir(), "/", "temp_output"),
        " ",
        " -u ",
        paste0(tempdir(), "/", "temp_uclust"),
        " -t ",
        nproc,
        " ",
        swarm_args
      ),
      stdout = TRUE,
      stderr = TRUE
    )
  } else {
    Biostrings::writeXStringSet(dna, paste0(tempdir(), "/", "amplicons.fasta"))
    system2(
      vsearch_path,
      paste0(
        " --derep_fulllength ",
        paste0(tempdir(), "/", "amplicons.fasta"),
        " --sizeout  --relabel_sha1 --fasta_width 0 --output ",
        paste0(tempdir(), "/", "temp.fasta")
      ),
      stdout = TRUE,
      stderr = TRUE
    )
    system2(
      swarmpath,
      paste0(
        paste0(tempdir(), "/", "temp.fasta"),
        " -o ",
        paste0(tempdir(), "/", "temp_output"),
        " ",
        " -u ",
        paste0(tempdir(), "/", "temp_uclust"),
        " -z ",
        " -t ",
        nproc,
        " ",
        swarm_args
      ),
      stdout = TRUE,
      stderr = TRUE
    )
  }

  pack_clusts <-
    utils::read.table(paste0(tempdir(), "/", "temp_uclust"), sep = "\t")
  colnames(pack_clusts) <-
    c(
      "type",
      "cluster",
      "width",
      "identity",
      "strand",
      "6",
      "7",
      "cigarAlignment",
      "query",
      "target"
    )

  if (inherits(physeq, "phyloseq")) {
    clusters <- pack_clusts$cluster[pack_clusts$type != "C"]
    names(clusters) <-
      sub("_.*$", "", pack_clusts$query[pack_clusts$type != "C"])

    clusters <- clusters[match(taxa_names(physeq), names(clusters))]

    new_obj <-
      merge_taxa_vec(physeq,
        clusters,
        tax_adjust = tax_adjust
      )
  } else if (inherits(dna_seq, "character")) {
    new_obj <- pack_clusts
  } else {
    stop(
      "You must set the args physeq (object of class phyloseq) or
      dna_seq (character vector)."
    )
  }

  if (file.exists(paste0(tempdir(), "/", "temp.fasta")) &&
    !keep_temporary_files) {
    unlink(paste0(tempdir(), "/", "temp.fasta"))
  }
  if (file.exists(paste0(tempdir(), "/", "temp_output")) &&
    !keep_temporary_files) {
    unlink(paste0(tempdir(), "/", "temp_output"))
  }
  if (file.exists(paste0(tempdir(), "/", "temp_uclust")) &&
    !keep_temporary_files) {
    unlink(paste0(tempdir(), "/", "temp_uclust"))
  }
  if (file.exists(paste0(tempdir(), "/", "amplicon.fasta")) &&
    !keep_temporary_files) {
    unlink(paste0(tempdir(), "/", "amplicon.fasta"))
  }
  return(new_obj)
}
###############################################################################



###############################################################################
#' Recluster sequences of an object of class `physeq`
#'   or cluster a list of DNA sequences using vsearch software
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#' A wrapper of VSEARCH software.
#'
#' @inheritParams clean_pq
#' @param dna_seq You may directly use a character vector of DNA sequences
#'   in place of physeq args. When physeq is set, dna sequences take the value of
#'   `physeq@refseq`
#' @param nproc (default: 1)
#'   Set to number of cpus/processors to use for the clustering
#' @param id (default: 0.97) level of identity to cluster
#' @param vsearchpath (default: "vsearch") path to vsearch
#' @param tax_adjust (Default 0) See the man page
#'   of [merge_taxa_vec()] for more details.
#'   To conserved the taxonomic rank of the most abundant ASV,
#'   set tax_adjust to 0 (default). For the moment only tax_adjust = 0 is
#'   robust
#' @param vsearch_cluster_method (default: "--cluster_size) See other possible
#'   methods in the [vsearch manual](https://github.com/torognes/vsearch) (e.g. `--cluster_size` or `--cluster_smallmem`)
#'   - `--cluster_fast` : Clusterize the fasta sequences in filename, automatically sort by decreasing sequence length beforehand.
#'   - `--cluster_size` : Clusterize the fasta sequences in filename, automatically sort by decreasing sequence abundance beforehand.
#'   - `--cluster_smallmem` : Clusterize the fasta sequences in filename without automatically modifying their order beforehand. Sequence are expected to be sorted by decreasing sequence length, unless *--usersort* is used.
#'     In that case you may set `vsearch_args` to vsearch_args = "--strand both --usersort"
#' @param vsearch_args (default : "--strand both") a one length character element defining other parameters to
#'   passed on to vsearch.
#' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files ?
#'   - temp.fasta (refseq in fasta or dna_seq sequences)
#'   - cluster.fasta (centroid if method = "vsearch")
#'   - temp.uc (clusters if method = "vsearch")
#'
#' @seealso [postcluster_pq()], [swarm_clustering()]
#' @details This function use the [merge_taxa_vec()] function to
#'   merge taxa into clusters. By default tax_adjust = 0. See the man page
#'   of [merge_taxa_vec()].
#'
#' @return A new object of class `physeq` or a list of cluster if dna_seq
#'   args was used.
#'
#' @references
#'   VSEARCH can be downloaded from
#'   \url{https://github.com/torognes/vsearch}.
#'   More information in the associated publication
#'   \url{https://pubmed.ncbi.nlm.nih.gov/27781170}.
#' @export
#' @author Adrien Taudière
#'
#' @examplesIf MiscMetabar::is_vsearch_installed()
#' \donttest{
#' summary_plot_pq(data_fungi)
#' d_vs <- vsearch_clustering(data_fungi)
#' summary_plot_pq(d_vs)
#' }
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please cite [vsearch](https://github.com/torognes/vsearch).
vsearch_clustering <- function(physeq = NULL,
                               dna_seq = NULL,
                               nproc = 1,
                               id = 0.97,
                               vsearchpath = "vsearch",
                               tax_adjust = 0,
                               vsearch_cluster_method = "--cluster_size",
                               vsearch_args = "--strand both",
                               keep_temporary_files = FALSE) {
  dna <- physeq_or_string_to_dna(physeq = physeq, dna_seq = dna_seq)

  Biostrings::writeXStringSet(dna, paste0(tempdir(), "/", "temp.fasta"))

  system2(
    vsearchpath,
    paste0(
      paste0(
        " ",
        vsearch_cluster_method,
        " ",
        paste0(tempdir(), "/", "temp.fasta"),
        " ",
        vsearch_args
      ),
      " -id ",
      id,
      " --centroids ",
      paste0(tempdir(), "/", "cluster.fasta"),
      " --uc ",
      paste0(tempdir(), "/", "temp.uc")
    ),
    stdout = TRUE,
    stderr = TRUE
  )

  pack_clusts <-
    utils::read.table(paste0(tempdir(), "/", "temp.uc"), sep = "\t")
  colnames(pack_clusts) <-
    c(
      "type",
      "cluster",
      "width",
      "identity",
      "strand",
      "6",
      "7",
      "cigarAlignment",
      "query",
      "target"
    )

  clusters <- pack_clusts$cluster[pack_clusts$type != "C"]
  names(clusters) <- pack_clusts$query[pack_clusts$type != "C"]
  clusters <- clusters[match(taxa_names(physeq), names(clusters))]

  if (inherits(physeq, "phyloseq")) {
    new_obj <-
      merge_taxa_vec(physeq,
        clusters,
        tax_adjust = tax_adjust
      )
  } else if (inherits(dna_seq, "character")) {
    new_obj <- pack_clusts
  } else {
    stop(
      "You must set the args physeq (object of class phyloseq) or dna_seq (character vector)."
    )
  }

  if (file.exists(paste0(tempdir(), "/", "temp.fasta")) &&
    !keep_temporary_files) {
    unlink(paste0(tempdir(), "/", "temp.fasta"))
  }
  if (file.exists(paste0(tempdir(), "/", "cluster.fasta")) &&
    !keep_temporary_files) {
    unlink(paste0(tempdir(), "/", "cluster.fasta"))
  }
  if (file.exists(paste0(tempdir(), "/", "temp.uc")) &&
    !keep_temporary_files) {
    unlink(paste0(tempdir(), "/", "temp.uc"))
  }
  return(new_obj)
}
###############################################################################


################################################################################
#' Search for a list of sequence in an object to remove chimera taxa
#'   using [vsearch](https://github.com/torognes/vsearch)
#'
#' @description
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Use the VSEARCH software.
#'
#' @param object (required) A phyloseq-class object or one of dada, derep,
#'   data.frame or list coercible to sequences table using the
#'   function [dada2::makeSequenceTable()]
#' @param type (default "Discard_only_chim"). The type define the type of
#'   filtering.
#'
#'  - "Discard_only_chim" will only discard taxa classify as chimera by vsearch
#'  - "Select_only_non_chim" will only select taxa classify as non-chimera by
#'    vsearch(after filtering taxa based on their sequence length by the
#'    parameter `min_seq_length` from the [chimera_detection_vs()] function)
#'  - "Select_only_chim" will only select taxa classify as chimera by
#'    vsearch (after filtering taxa based on their sequence length by the
#'    parameter `min_seq_length` from the [chimera_detection_vs()] function)
#' @param clean_pq (logical; default FALSE) If TRUE, return the phyloseq object
#'   after cleaning using the default parameter of [clean_pq()] function.
#'
#' @param ... Additional arguments passed on to [chimera_detection_vs()] function
#' @seealso [chimera_detection_vs()], [dada2::removeBimeraDenovo()]
#' @return
#'
#' - I/ a sequences tables if object is of class dada, derep, data.frame or
#'   list.
#' - II/ a phyloseq object without (or with if type = 'Select_only_chim')
#'   chimeric taxa
#'
#' @export
#'
#' @examplesIf MiscMetabar::is_vsearch_installed()
#' \donttest{
#' data_fungi_nochim <- chimera_removal_vs(data_fungi)
#' data_fungi_nochim_16 <- chimera_removal_vs(data_fungi,
#'   abskew = 16,
#'   min_seq_length = 10
#' )
#' data_fungi_nochim2 <-
#'   chimera_removal_vs(data_fungi, type = "Select_only_non_chim")
#' data_fungi_chimera <-
#'   chimera_removal_vs(data_fungi, type = "Select_only_chim")
#' }
#' @author Adrien Taudière
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please make [vsearch](https://github.com/torognes/vsearch).


chimera_removal_vs <-
  function(object,
           type = "Discard_only_chim",
           clean_pq = FALSE,
           ...) {
    if (inherits(object, "dada") ||
      inherits(object, "derep") ||
      inherits(object, "data.frame") ||
      inherits(object, "list")) {
      object <- makeSequenceTable(object)
    }

    if (inherits(object, "matrix")) {
      chim_detect <-
        chimera_detection_vs(
          seq2search = colnames(object),
          nb_seq = colSums(object),
          ...
        )
      if (type == "Discard_only_chim") {
        seq_tab_final <-
          object[, !colnames(object) %in% as.character(chim_detect$chimera)]
      } else if (type == "Select_only_non_chim") {
        seq_tab_final <- seq_tab_Pairs[, as.character(chim_rm$non_chimera)]
      } else if (type == "Select_only_chim") {
        seq_tab_final <- seq_tab_Pairs[, as.character(chim_rm$chimera)]
      } else {
        stop(
          "Type must be set to one of 'Discard_only_chim',
             'Select_only_non_chim', or 'Select_only_chim'"
        )
      }
      seq_tab_final <- seq_tab_final
      return(seq_tab_final)
    } else if (inherits(object, "phyloseq")) {
      verify_pq(object)
      if (sum(taxa_sums(object) == 0) > 0) {
        object <- clean_pq(object)
      }
      chim_detect <-
        chimera_detection_vs(
          seq2search = refseq(object),
          nb_seq = taxa_sums(object),
          ...
        )
      if (type == "Discard_only_chim") {
        cond <-
          !as.character(refseq(object)) %in% as.character(chim_detect$chimera)
        names(cond) <- names(refseq(object))
        new_physeq <-
          subset_taxa_pq(object, condition = cond)
      } else if (type == "Select_only_non_chim") {
        cond <-
          as.character(refseq(object)) %in% as.character(chim_detect$non_chimera)
        names(cond) <- names(refseq(object))
        new_physeq <-
          subset_taxa_pq(object, condition = cond)
      } else if (type == "Select_only_chim") {
        cond <-
          as.character(refseq(object)) %in% as.character(chim_detect$chimera)
        names(cond) <- names(refseq(object))
        new_physeq <-
          subset_taxa_pq(object, condition = cond)
      } else {
        stop(
          "Type must be set to one of 'Discard_only_chim',
             'Select_only_non_chim', or 'Select_only_chim'"
        )
      }
      if (clean_pq) {
        new_physeq <- clean_pq(physeq)
      }
      return(new_physeq)
    }
  }
################################################################################



################################################################################
#' Detect for chimera taxa using [vsearch](https://github.com/torognes/vsearch)
#'
#' @description
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Use the VSEARCH software.
#'
#' @param seq2search (required) a list of DNA sequences coercible by function
#'  [Biostrings::DNAStringSet()]
#' @param nb_seq (required) a numeric vector giving the number of sequences for
#'   each DNA sequences
#' @param vsearchpath (default: "vsearch") path to vsearch
#' @param abskew (int, default 2) The abundance skew is used to distinguish in a
#'   three way alignment which sequence is the chimera and which are the
#'   parents. The assumption is that chimeras appear later in the PCR
#'   amplification process and are therefore less abundant than their parents.
#'   The default value is 2.0, which means that the parents should be at least
#'   2 times more abundant than their chimera. Any positive value equal or
#'   greater than 1.0 can be used.
#' @param min_seq_length (int, default 100)) Minimum length of sequences to
#'   be part of the analysis
#' @param vsearch_args (default "--fasta_width 0") A list of other args for
#'   vsearch command
#' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary
#'  files ?
#'   - non_chimeras.fasta
#'   - chimeras.fasta
#'   - borderline.fasta
#'
#' @return A list of 3 including non-chimera taxa (`$non_chimera`), chimera taxa
#'   (`$chimera`) and bordeline taxa (`$borderline`)
#' @export
#' @seealso [chimera_removal_vs()], [dada2::removeBimeraDenovo()]
#'
#' @examplesIf MiscMetabar::is_vsearch_installed()
#' \donttest{
#' chimera_detection_vs(
#'   seq2search = data_fungi@refseq,
#'   nb_seq = taxa_sums(data_fungi)
#' )
#' }
#' @author Adrien Taudière
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please make [vsearch](https://github.com/torognes/vsearch).
chimera_detection_vs <- function(seq2search,
                                 nb_seq,
                                 vsearchpath = "vsearch",
                                 abskew = 2,
                                 min_seq_length = 100,
                                 vsearch_args = "--fasta_width 0",
                                 keep_temporary_files = FALSE) {
  dna_raw <- Biostrings::DNAStringSet(seq2search)
  names(dna_raw) <- paste0(
    "Taxa", seq(1, length(seq2search)),
    ";size=", nb_seq
  )

  dna <- dna_raw[Biostrings::width(dna_raw) >= min_seq_length]
  abun <- unlist(strsplit(names(dna), split = "="))
  abun_tot <-
    sum(as.numeric(abun[seq(2, 2 * length(abun), by = 2)]), na.rm = T)

  message(
    paste(
      "Filtering for sequences under",
      min_seq_length,
      "bp remove a total of",
      length(dna_raw) - length(dna),
      "(",
      round((length(dna_raw) - length(dna)) / length(dna_raw) * 100, 2),
      "%)",
      "unique sequences for a total of",
      sum(nb_seq) - abun_tot,
      "sequences removed",
      "(",
      round((sum(nb_seq) - abun_tot) / sum(nb_seq) * 100, 2),
      "%)"
    )
  )

  Biostrings::writeXStringSet(
    dna,
    paste0(tempdir(), "/", "temp.fasta")
  )

  system2(
    vsearchpath,
    paste0(
      " --uchime_denovo ",
      paste0(tempdir(), "/", "temp.fasta"),
      " --abskew ",
      abskew,
      " --nonchimeras ",
      paste0(tempdir(), "/", "non_chimeras.fasta"),
      " --chimeras ",
      paste0(tempdir(), "/", "chimeras.fasta"),
      " --borderline ",
      paste0(tempdir(), "/", "borderline.fasta"),
      " ",
      vsearch_args
    ),
    stdout = TRUE,
    stderr = TRUE
  )

  non_chimera_AAStringSet <-
    Biostrings::readAAStringSet(paste0(tempdir(), "/", "non_chimeras.fasta"))
  chimera_AAStringSet <-
    Biostrings::readAAStringSet(paste0(tempdir(), "/", "chimeras.fasta"))
  borderline_AAStringSet <-
    Biostrings::readAAStringSet(paste0(tempdir(), "/", "borderline.fasta"))

  if (keep_temporary_files) {
    message(paste0("Temporary files are located at ", tempdir()))
  } else {
    if (file.exists(paste0(tempdir(), "temp.fasta"))) {
      unlink(paste0(tempdir(), "temp.fasta"))
    }
    if (file.exists(paste0(tempdir(), "non_chimeras.fasta"))) {
      unlink(paste0(tempdir(), "non_chimeras.fasta"))
    }
    if (file.exists(paste0(tempdir(), "chimeras.fasta"))) {
      unlink(paste0(tempdir(), "chimeras.fasta"))
    }
  }

  return(
    list(
      "non_chimera" = non_chimera_AAStringSet,
      "chimera" = chimera_AAStringSet,
      "borderline" = borderline_AAStringSet
    )
  )
}
################################################################################

################################################################################
#' Write a temporary fasta file (internal use)
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Write a fasta file from either a biostring object seq2search or a
#' refseq slot from a phyloseq object
#'
#' @inheritParams assign_sintax
#' @param temporary_fasta_file The name of a temporary_fasta_file (default "temp.fasta")
#' @param return_DNAStringSet (Logical default FALSE). If true, the temporary fasta file
#'   is removed and a DNAStringSet is return
#' @seealso [assign_sintax()], [assign_vsearch_lca]
#' @return Nothing, produce a fasta file or return a DNAStringset if temporary_fasta_file
#' @keywords internal
#' @noRd
#'

write_temp_fasta <- function(physeq,
                             seq2search,
                             temporary_fasta_file = "temp.fasta",
                             behavior = NULL,
                             clean_pq = TRUE,
                             verbose = TRUE,
                             return_DNAStringSet = FALSE) {
  if (!is.null(physeq) && !is.null(seq2search)) {
    stop("You must enter a single parameter from physeq and seq2search.")
  } else if (is.null(seq2search)) {
    verify_pq(physeq)
    if (is.null(physeq@refseq)) {
      stop("The phyloseq object do not contain a @refseq slot")
    }
    if (clean_pq) {
      physeq <- clean_pq(physeq, silent = !verbose)
    }
    dna <- Biostrings::DNAStringSet(physeq@refseq)
    Biostrings::writeXStringSet(dna, temporary_fasta_file)
  } else if (is.null(physeq)) {
    Biostrings::writeXStringSet(seq2search, temporary_fasta_file)
    if (behavior == "add_to_phyloseq") {
      stop("You can't use behavior = 'add_to_phyloseq' with seq2search param.")
    }
  } else if (is.null(physeq) && is.null(seq2search)) {
    stop("You must specify either physeq or seq2search parameter.")
  }
  if (return_DNAStringSet) {
    res <- Biostrings::readDNAStringSet(temporary_fasta_file)
    unlink(temporary_fasta_file)
    return(res)
  }
}
################################################################################

################################################################################
#' Assign Taxonomy using Sintax algorithm of Vsearch
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Please cite [Vsearch](https://github.com/torognes/vsearch)
#'   if you use this function to assign taxonomy.
#'
#' @inheritParams clean_pq
#' @param ref_fasta (required) A link to a database in vsearch format
#'  The reference database must contain taxonomic information in the header of
#'  each sequence in the form of a string starting with ";tax=" and followed
#'  by a comma-separated list of up to nine taxonomic identifiers. Each taxonomic
#'  identifier must start with an indication of the rank by one of the letters d
#'  (for domain) k (kingdom), p (phylum), c (class), o (order), f (family),
#'   g (genus), s (species), or t (strain). The letter is followed by a colon
#'    (:) and the name of that rank. Commas and semicolons are not allowed in
#'    the name of the rank. Non-ascii characters should be avoided in the names.
#'
#'  Example:
#'
#'  \>X80725_S000004313;tax=d:Bacteria,p:Proteobacteria,c:Gammaproteobacteria,o:Enterobacteriales,f:Enterobacteriaceae,g:Escherichia/Shigella,s:Escherichia_coli,t:str._K-12_substr._MG1655
#' @param seq2search A DNAStringSet object of sequences to search for. Replace
#'   the physeq object.
#' @param behavior Either "return_matrix" (default), "return_cmd",
#' or "add_to_phyloseq":
#'
#'  - "return_matrix" return a list of two matrix with taxonomic value in the
#'    first element of the list and bootstrap value in the second one.
#'
#'  - "return_cmd" return the command to run without running it.
#'
#'  - "add_to_phyloseq" return a phyloseq object with amended slot `@taxtable`.
#'    Only available if using physeq input and not seq2search input.
#'
#' @param vsearchpath (default: "vsearch") path to vsearch
#' @param clean_pq (logical, default TRUE)
#'   If set to TRUE, empty samples and empty ASV are discarded
#'   before clustering.
#' @param nproc (default: 1)
#'   Set to number of cpus/processors to use
#' @param suffix (character) The suffix to name the new columns.
#'   If set to "" (the default), the taxa_ranks algorithm is used
#'   without suffix.
#' @param taxa_ranks A list with the name of the taxonomic rank present in
#'   ref_fasta
#' @param min_bootstrap (Float \[0:1\], default 0.5)
#'   Minimum bootstrap value to inform taxonomy. For each bootstrap
#'   below the min_bootstrap value, the taxonomy information is set to NA.
#' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files?
#'
#' - temporary_fasta_file (default "temp.fasta") : the fasta file from physeq
#'   or seq2search
#'
#' - "output_taxo_vs.txt" : see Vsearch Manual for parameter --tabbedout
#'
#' @param verbose (logical). If TRUE, print additional information.
#' @param temporary_fasta_file The name of a temporary_fasta_file (default "temp.fasta")
#' @param cmd_args Additional arguments passed on to vsearch sintax cmd.
#'   By default cmd_args is equal to "--sintax_random" as recommended by
#'   [Torognes](https://github.com/torognes/vsearch/issues/535).
#' @param too_few (default value "align_start") see [tidyr::separate_wider_delim()]
#' @param too_many (default value "drop") see [tidyr::separate_wider_delim()]
#' @return See param behavior
#' @examplesIf MiscMetabar::is_vsearch_installed()
#' \donttest{
#' assign_sintax(data_fungi_mini,
#'   ref_fasta = system.file("extdata", "mini_UNITE_fungi.fasta.gz", package = "MiscMetabar"),
#'   behavior = "return_cmd"
#' )
#'
#' data_fungi_mini_new <- assign_sintax(data_fungi_mini,
#'   ref_fasta = system.file("extdata", "mini_UNITE_fungi.fasta.gz", package = "MiscMetabar"),
#'   behavior = "add_to_phyloseq"
#' )
#'
#' assignation_results <- assign_sintax(data_fungi_mini,
#'   ref_fasta = system.file("extdata", "mini_UNITE_fungi.fasta.gz", package = "MiscMetabar")
#' )
#'
#' left_join(
#'   tidyr::pivot_longer(assignation_results$taxo_value, -taxa_names),
#'   tidyr::pivot_longer(assignation_results$taxo_bootstrap, -taxa_names),
#'   by = join_by(taxa_names, name),
#'   suffix = c("rank", "bootstrap")
#' ) |>
#'   mutate(name = factor(name,
#'     levels = c(
#'       "Kingdom", "Phylum", "Class",
#'       "Order", "Family", "Genus", "Species"
#'     )
#'   )) |>
#'   # mutate(valuerank = forcats::fct_reorder(valuerank,
#'   #   as.integer(name), .desc = TRUE)) |>
#'   ggplot(aes(valuebootstrap,
#'     valuerank,
#'     fill = name
#'   )) +
#'   geom_jitter(alpha = 0.8, aes(color = name)) +
#'   geom_boxplot(alpha = 0.3)
#' }
#' @export
#' @author Adrien Taudière
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please cite [vsearch](https://github.com/torognes/vsearch).
assign_sintax <- function(physeq = NULL,
                          ref_fasta = NULL,
                          seq2search = NULL,
                          behavior = c("return_matrix", "add_to_phyloseq", "return_cmd"),
                          vsearchpath = "vsearch",
                          clean_pq = TRUE,
                          nproc = 1,
                          suffix = "",
                          taxa_ranks = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
                          min_bootstrap = 0.5,
                          keep_temporary_files = FALSE,
                          verbose = FALSE,
                          temporary_fasta_file = "temp.fasta",
                          cmd_args = "--sintax_random",
                          too_few = "align_start",
                          too_many = "drop") {
  behavior <- match.arg(behavior)

  write_temp_fasta(
    physeq = physeq,
    seq2search = seq2search,
    temporary_fasta_file = temporary_fasta_file,
    behavior = behavior,
    clean_pq = clean_pq,
    verbose = verbose
  )

  if (verbose) {
    message("Start Vsearch sintax")
  }

  cmd_sintax <-
    paste0(
      " --sintax ",
      temporary_fasta_file,
      " --db ",
      ref_fasta,
      " --tabbedout output_taxo_vs.txt ",
      " --threads ",
      nproc,
      " ",
      cmd_args
    )

  if (behavior == "return_cmd") {
    if (!keep_temporary_files) {
      unlink(temporary_fasta_file)
    }
    return("sintax" = paste0(vsearchpath, " ", cmd_sintax))
  }

  system2(vsearchpath,
    args = cmd_sintax,
    stdout = TRUE,
    stderr = TRUE
  )

  if (!file.exists("output_taxo_vs.txt")) {
    warning("No taxonomic assignation were maded.")
    if (!keep_temporary_files) {
      unlink(temporary_fasta_file)
    }
    if (behavior == "add_to_phyloseq") {
      return(physeq)
    } else {
      return(NULL)
    }
  }
  res_sintax <- read.csv("output_taxo_vs.txt", sep = "\t", header = F)
  taxa_names <- res_sintax$V1
  res_sintax <- tibble(res_sintax$V2, taxa_names)
  res_sintax <- res_sintax |>
    tidyr::separate_wider_delim(-taxa_names, names = paste0(taxa_ranks, suffix), delim = ",", too_few = too_few) |>
    tidyr::pivot_longer(-taxa_names) |>
    tidyr::separate_wider_delim(
      value,
      names_sep = "",
      names = c("", "_bootstrap"),
      delim = "(",
      too_many = too_many
    ) |>
    mutate(across(value_bootstrap, ~ as.numeric(gsub(")", "", .x)))) |>
    mutate(across(value, ~ gsub(".:", "", .x)))

  res_sintax_wide_bootstrap <-
    res_sintax |>
    select(-value) |>
    tidyr::pivot_wider(names_from = name, values_from = value_bootstrap)

  res_sintax_wide_taxo <-
    res_sintax |>
    select(-value_bootstrap) |>
    tidyr::pivot_wider(names_from = name, values_from = value)

  if (!is.null(min_bootstrap)) {
    res_sintax_wide_taxo_filter <- res_sintax_wide_taxo
    res_sintax_wide_taxo_filter[res_sintax_wide_bootstrap < min_bootstrap] <- NA
  }

  if (!keep_temporary_files) {
    unlink(temporary_fasta_file)
    unlink("output_taxo_vs.txt")
  }

  if (behavior == "add_to_phyloseq") {
    tax_tab <- as.data.frame(as.matrix(physeq@tax_table))
    tax_tab$taxa_names <- taxa_names(physeq)

    new_physeq <- physeq
    new_tax_tab <- left_join(tax_tab, res_sintax_wide_taxo_filter,
      by = join_by(taxa_names)
    ) |>
      dplyr::select(-taxa_names) |>
      as.matrix()
    new_physeq@tax_table <- tax_table(new_tax_tab)
    taxa_names(new_physeq@tax_table) <- taxa_names(physeq)

    return(new_physeq)
  } else if (behavior == "return_matrix") {
    return(list(
      "taxo_value" = res_sintax_wide_taxo,
      "taxo_bootstrap" = res_sintax_wide_bootstrap
    ))
  }
}
################################################################################

################################################################################
#' Assign taxonomy using LCA
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#'  Please cite [Vsearch](https://github.com/torognes/vsearch) and
#'   [stampa](https://github.com/frederic-mahe/stampa) if you use this function
#'   to assign taxonomy.
#'
#'  1. If top_hits_only is TRUE, the algorithm is the one of [stampa](https://github.com/frederic-mahe/stampa).
#'
#'
#'  2. If top_hits_only is FALSE and vote_algorithm is NULL, you need to carefully define `maxaccept`, `id` and `lca_cutoff` parameters.
#'    The algorithm is internal to vsearch using the lcaout output.
#'
#'  3. If top_hits_only is FALSE and vote_algorithm is not NULL, conflict among the
#'   list of taxonomic assignations is resolve using the function [resolve_vector_ranks()].
#'  The possible values for vote_algorithm are "consensus", "rel_majority",
#'  "abs_majority" and "unanimity". See [resolve_vector_ranks()] for more details.
#'
#' @inheritParams clean_pq
#' @param ref_fasta (required) A link to a database in vsearch format
#'  The reference database must contain taxonomic information in the header of
#'  each sequence in the form of a string starting with ";tax=" and followed
#'  by a comma-separated list of up to nine taxonomic identifiers. Each taxonomic
#'  identifier must start with an indication of the rank by one of the letters d
#'  (for domain) k (kingdom), p (phylum), c (class), o (order), f (family),
#'   g (genus), s (species), or t (strain). The letter is followed by a colon
#'    (:) and the name of that rank. Commas and semicolons are not allowed in
#'    the name of the rank. Non-ascii characters should be avoided in the names.
#'
#'  Example:
#'
#'  \>X80725_S000004313;tax=d:Bacteria,p:Proteobacteria,c:Gammaproteobacteria,o:Enterobacteriales,f:Enterobacteriaceae,g:Escherichia/Shigella,s:Escherichia_coli,t:str._K-12_substr._MG1655
#' @param seq2search A DNAStringSet object of sequences to search for. Replace
#'   the physeq object.
#' @param behavior Either "return_matrix" (default), "return_cmd",
#' or "add_to_phyloseq":
#'
#'  - "return_matrix" return a list of two matrix with taxonomic value in the
#'    first element of the list and bootstrap value in the second one.
#'
#'  - "return_cmd" return the command to run without running it.
#'
#'  - "add_to_phyloseq" return a phyloseq object with amended slot `@taxtable`.
#'    Only available if using physeq input and not seq2search input.
#'
#' @param vsearchpath (default: "vsearch") path to vsearch
#' @param clean_pq (logical, default TRUE)
#'   If set to TRUE, empty samples and empty ASV are discarded
#'   before clustering.
#' @param taxa_ranks A list with the name of the taxonomic rank present in
#'   ref_fasta
#' @param nproc (int, default: 1)
#'   Set to number of cpus/processors to use
#' @param suffix (character) The suffix to name the new columns.
#'   If set to "" (the default), the taxa_ranks algorithm is used
#'   without suffix.
#' @param id (Float \[0:1\] default 0.5). Default value is based on
#'   [stampa](https://github.com/frederic-mahe/stampa).
#'   See Vsearch Manual for parameter `--id`
#' @param lca_cutoff (int, default 1). Fraction of matching hits
#'  required for the last common ancestor (LCA) output. For example, a value
#'  of 0.9 imply that if less than 10% of assigned species are not congruent
#'  the taxonomy is filled.
#'  Default value is based on [stampa](https://github.com/frederic-mahe/stampa).
#'  See Vsearch Manual for parameter `--lca_cutoff`
#'
#'  Text from vsearch manual :
#'  "Adjust the fraction of matching hits required for the last
#'  common ancestor (LCA) output with the --lcaout option during searches.
#'  The default value is 1.0 which requires all hits to match at each taxonomic
#'  rank for that rank to be included. If a lower cutoff value is used,
#'  e.g. 0.95, a small fraction of non-matching hits are allowed while that
#'  rank will still be reported. The argument to this option must be larger
#'  than 0.5, but not larger than 1.0"
#' @param maxrejects (int, default: 32)
#'   Maximum number of non-matching target sequences to consider before
#'   stopping the search for a given query.
#'   Default value is based on [stampa](https://github.com/frederic-mahe/stampa)
#'   See Vsearch Manual for parameter `--maxrejects`.

#' @param top_hits_only (Logical, default TRUE)
#'  Only the top hits with an equally high percentage of identity between the query and
#'  database sequence sets are written to the output. If you set top_hits_only
#'  you may need to set a lower `maxaccepts` and/or `lca_cutoff`.
#'   Default value is based on [stampa](https://github.com/frederic-mahe/stampa)
#'   See Vsearch Manual for parameter `--top_hits_only`
#' @param maxaccepts (int, default: 0)
#'   Default value is based on [stampa](https://github.com/frederic-mahe/stampa).
#'   Maximum number of matching target sequences to accept before stopping the search
#'   for a given query.
#'   See Vsearch Manual for parameter `--maxaccepts`
#' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files?
#'
#' - temporary_fasta_file (default "temp.fasta") : the fasta file from physeq or
#'   seq2search
#'
#' - "out_lca.txt" : see Vsearch Manual for parameter --lcaout
#'
#' - "userout.txt" : see Vsearch Manual for parameter --userout
#'
#' @param verbose (logical). If TRUE, print additional information.
#' @param temporary_fasta_file Name of the temporary fasta file. Only useful
#'   with keep_temporary_files = TRUE.
#' @param cmd_args Additional arguments passed on to vsearch usearch_global cmd.
#' @param too_few (default value "align_start") see [tidyr::separate_wider_delim()]
#' @param nb_voting (Int, default NULL). The number of taxa to keep before apply
#'   a vote to resolve conflict. If NULL all taxa passing the filters (min_id,
#'   min_bit_score, min_cover and min_e_value) are selected.
#' @param vote_algorithm (default NULL) the method to vote among "consensus", "rel_majority",
#'  "abs_majority" and "unanimity". See [resolve_vector_ranks()] for more details.
#' @param strict (Logical, default FALSE). See [resolve_vector_ranks()] for more details.
#' @param nb_agree_threshold  See [resolve_vector_ranks()] for more details.
#' @param preference_index  See [resolve_vector_ranks()] for more details.
#' @param collapse_string  See [resolve_vector_ranks()] for more details.
#' @param replace_collapsed_rank_by_NA (Logical, default TRUE) See [resolve_vector_ranks()] for more details.
#' @param simplify_taxo (logical default TRUE). Do we apply the
#'   function [simplify_taxo()] to the phyloseq object?
#' @param keep_vsearch_score (Logical, default FALSE). If TRUE, the mean and sd of id score
#'    are stored in the tax_table.
#' @return See param behavior
#' @seealso [assign_sintax()], [add_new_taxonomy_pq()]
#' @examplesIf MiscMetabar::is_vsearch_installed()
#' \donttest{
#' data_fungi_mini_new <- assign_vsearch_lca(data_fungi_mini,
#'   ref_fasta = system.file("extdata", "mini_UNITE_fungi.fasta.gz", package = "MiscMetabar"),
#'   lca_cutoff = 0.9, behavior = "add_to_phyloseq"
#' )
#'
#' data_fungi_mini_new2 <- assign_vsearch_lca(data_fungi_mini,
#'   ref_fasta = system.file("extdata", "mini_UNITE_fungi.fasta.gz", package = "MiscMetabar"),
#'   id = 0.8, behavior = "add_to_phyloseq", top_hits_only = FALSE
#' )
#'
#' data_fungi_mini_new3 <- assign_vsearch_lca(data_fungi_mini,
#'   ref_fasta = system.file("extdata", "mini_UNITE_fungi.fasta.gz", package = "MiscMetabar"),
#'   id = 0.5, behavior = "add_to_phyloseq", top_hits_only = FALSE, vote_algorithm = "rel_majority"
#' )
#' }
#' @export
#' @author Adrien Taudière
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please cite [vsearch](https://github.com/torognes/vsearch) and
#'   [stampa](https://github.com/frederic-mahe/stampa)
assign_vsearch_lca <- function(physeq = NULL,
                               ref_fasta = NULL,
                               seq2search = NULL,
                               behavior = c("return_matrix", "add_to_phyloseq", "return_cmd"),
                               vsearchpath = "vsearch",
                               clean_pq = TRUE,
                               taxa_ranks = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
                               nproc = 1,
                               suffix = "_sintax",
                               id = 0.5,
                               lca_cutoff = 1,
                               maxrejects = 32,
                               top_hits_only = TRUE,
                               maxaccepts = 0,
                               keep_temporary_files = FALSE,
                               verbose = TRUE,
                               temporary_fasta_file = "temp.fasta",
                               cmd_args = "",
                               too_few = "align_start",
                               vote_algorithm = NULL,
                               nb_voting = NULL,
                               strict = FALSE,
                               nb_agree_threshold = 1,
                               preference_index = NULL,
                               collapse_string = "/",
                               replace_collapsed_rank_by_NA = TRUE,
                               simplify_taxo = TRUE,
                               keep_vsearch_score = FALSE) {
  behavior <- match.arg(behavior)
  write_temp_fasta(
    physeq = physeq,
    seq2search = seq2search,
    temporary_fasta_file = temporary_fasta_file,
    behavior = behavior,
    clean_pq = clean_pq,
    verbose = verbose
  )

  cmd_usearch <-
    paste0(
      " --usearch_global temp.fasta --db ",
      ref_fasta,
      " --lcaout out_lca.txt -id ",
      id,
      " --threads ",
      nproc,
      " --userfields query+id+target",
      " --maxaccepts ",
      maxaccepts,
      " --maxrejects ",
      maxrejects,
      " --lca_cutoff  ",
      lca_cutoff,
      " --userout userout.txt ",
      cmd_args
    )

  if (top_hits_only) {
    cmd_usearch <-
      paste0(cmd_usearch, " --top_hits_only")
  }

  if (behavior == "return_cmd") {
    if (!keep_temporary_files) {
      unlink(temporary_fasta_file)
    }
    return("sintax" = paste0(vsearchpath, " ", cmd_usearch))
  }

  system2(vsearchpath,
    args = cmd_usearch,
    stdout = TRUE,
    stderr = TRUE
  )

  if (top_hits_only || is.null(vote_algorithm)) {
    res_usearch <- read.csv("out_lca.txt", sep = "\t", header = F)

    taxa_names <- res_usearch$V1
    res_usearch <- tibble(res_usearch$V2, taxa_names)
    if (sum(is.na(res_usearch)) == nrow(res_usearch)) {
      message("None match were found using usearch global and id=", id)
      if (behavior == "add_to_phyloseq") {
        return(physeq)
      } else {
        return(NULL)
      }
    }

    res_usearch <- res_usearch |>
      tidyr::separate_wider_delim(-taxa_names,
        names = paste0(taxa_ranks, suffix),
        delim = ",",
        too_few = too_few
      ) |>
      tidyr::pivot_longer(-taxa_names) |>
      mutate(across(value, ~ gsub(".:", "", .x)))

    res_usearch_wide_taxo <-
      res_usearch |>
      tidyr::pivot_wider(names_from = name, values_from = value)
  } else if (!is.null(vote_algorithm)) {
    res_usearch <- read.csv("userout.txt", sep = "\t", header = F)

    if (is.null(nb_voting)) {
      nb_voting <- max(table(res_usearch$V1))
    }

    res_usearch_wide_taxo <- res_usearch |>
      rename(taxa_names = V1) |>
      rename(score = V2) |>
      tidyr::separate(`V3`,
        into = c(paste0("Taxa_name_db", suffix), "Classification"),
        sep = ";tax="
      ) |>
      tidyr::separate("Classification",
        into = paste0(taxa_ranks, suffix),
        sep = ","
      ) |>
      group_by(taxa_names) |>
      slice_head(n = nb_voting) |>
      summarise(
        across(
          c(
            paste0(taxa_ranks, suffix)
          ),
          ~ resolve_vector_ranks(
            .x,
            method = vote_algorithm,
            strict = strict,
            nb_agree_threshold = nb_agree_threshold,
            collapse_string = collapse_string,
            replace_collapsed_rank_by_NA = replace_collapsed_rank_by_NA
          )
        ),
        across(
          c(
            "score"
          ),
          list(mean = mean, sd = sd),
          .names = paste0("{.fn}_", "{.col}", suffix)
        )
      )
  }

  if (!keep_temporary_files) {
    unlink(temporary_fasta_file)
    unlink("out_lca.txt")
    unlink("userout.txt")
  }

  if (!keep_vsearch_score) {
    res_usearch_wide_taxo <- res_usearch_wide_taxo |>
      select(c(
        taxa_names,
        paste0(taxa_ranks, suffix)
      ))
  }

  if (behavior == "add_to_phyloseq") {
    tax_tab <- as.data.frame(as.matrix(physeq@tax_table))
    tax_tab$taxa_names <- taxa_names(physeq)

    new_physeq <- physeq
    new_tax_tab <- left_join(tax_tab, res_usearch_wide_taxo,
      by = join_by(taxa_names)
    ) |>
      dplyr::select(-taxa_names) |>
      as.matrix()
    new_physeq@tax_table <- tax_table(new_tax_tab)
    taxa_names(new_physeq@tax_table) <- taxa_names(physeq)

    if (simplify_taxo) {
      new_physeq <- simplify_taxo(new_physeq)
    }

    return(new_physeq)
  } else if (behavior == "return_matrix") {
    return(res_usearch_wide_taxo)
  }
}
################################################################################
adrientaudiere/MiscMetabar documentation built on April 12, 2025, 6:39 p.m.