R/dada_phyloseq.R

Defines functions rarefy_sample_count_by_modality taxa_as_rows taxa_as_columns psmelt_samples_pq normalize_prop_pq taxa_only_in_one_level cutadapt_remove_primers physeq_or_string_to_dna add_info_to_sam_data reorder_taxa_pq are_modality_even_depth build_phytree_pq plot_guild_pq add_funguild_info tbl_sum_samdata add_new_taxonomy_pq select_one_sample subset_taxa_pq subset_samples_pq verify_pq mumu_pq lulu_pq read_pq save_pq write_pq asv2otu track_wkflow_samples track_wkflow clean_pq add_dna_to_phyloseq

Documented in add_dna_to_phyloseq add_funguild_info add_info_to_sam_data add_new_taxonomy_pq are_modality_even_depth asv2otu build_phytree_pq clean_pq cutadapt_remove_primers lulu_pq mumu_pq normalize_prop_pq physeq_or_string_to_dna plot_guild_pq psmelt_samples_pq rarefy_sample_count_by_modality read_pq reorder_taxa_pq save_pq select_one_sample subset_samples_pq subset_taxa_pq taxa_as_columns taxa_as_rows taxa_only_in_one_level tbl_sum_samdata track_wkflow track_wkflow_samples verify_pq write_pq

if (getRversion() >= "2.15.1") {
  utils::globalVariables(".")
}

################################################################################
#' Add dna in `refseq` slot of a `physeq` object using taxa names and renames taxa
#'   using ASV_1, ASV_2, …
#'
#' @description 
#' 
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-stable-green" alt="lifecycle-stable"></a>
#'
#' Useful in targets bioinformatic pipeline.
#' 
#' @inheritParams clean_pq
#'
#' @return A new \code{\link{phyloseq-class}} object with `refseq` slot and new
#'   taxa names
#' @export

add_dna_to_phyloseq <- function(physeq) {
  verify_pq(physeq)
  dna <- Biostrings::DNAStringSet(phyloseq::taxa_names(physeq))
  names(dna) <- phyloseq::taxa_names(physeq)
  physeq <- phyloseq::merge_phyloseq(physeq, dna)
  phyloseq::taxa_names(physeq) <-
    paste0("ASV_", seq(phyloseq::ntaxa(physeq)))
  return(physeq)
}
################################################################################



################################################################################
#'  Clean phyloseq object by removing empty samples and taxa
#'
#' @description 
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#' 
#'  In addition, this function check for discrepancy (and rename) between
#' (i) taxa names in refseq, taxonomy table and otu_table and between
#' (ii) sample names in sam_data and otu_table.
#'
#' @param physeq (required): a \code{\link{phyloseq-class}} object obtained
#'   using the `phyloseq` package.
#' @param remove_empty_samples (logical) Do you want to remove samples
#'   without sequences (this is done after removing empty taxa)
#' @param remove_empty_taxa (logical) Do you want to remove taxa
#'   without sequences (this is done before removing empty samples)
#' @param clean_samples_names (logical) Do you want to clean samples names?
#' @param silent (logical) If true, no message are printing.
#' @param verbose (logical) Additional informations in the message
#'   the verbose parameter overwrite the silent parameter.
#' @param force_taxa_as_columns (logical) If true, if the taxa are rows
#'   transpose the otu_table and set taxa_are_rows to false
#' @param force_taxa_as_rows (logical) If true, if the taxa are columns
#'   transpose the otu_table and set taxa_are_rows to true
#' @param reorder_asv (logical) if TRUE the otu_table is ordered by the number of
#'   sequences of ASV (descending order). Default to FALSE.
#' @param rename_asv (logical) if TRUE, ASV are renamed by their position
#'   in the OTU_table (asv_1, asv_2, ...). Default to FALSE. If rename ASV is true,
#'   the ASV names in verbose information can be misleading.
#' @param simplify_taxo (logical) if TRUE, correct the taxonomy_table using the
#'   `MiscMetabar::simplify_taxo()` function
#' @return A new \code{\link{phyloseq-class}} object
#' @export
clean_pq <- function(physeq,
                     remove_empty_samples = TRUE,
                     remove_empty_taxa = TRUE,
                     clean_samples_names = TRUE,
                     silent = FALSE,
                     verbose = FALSE,
                     force_taxa_as_columns = FALSE,
                     force_taxa_as_rows = FALSE,
                     reorder_asv = FALSE,
                     rename_asv = FALSE,
                     simplify_taxo = FALSE) {
  if (clean_samples_names) {
    if (!is.null(physeq@refseq)) {
      if (sum(!names(physeq@refseq) %in% taxa_names(physeq)) > 0) {
        names(physeq@refseq) <- taxa_names(physeq)
        if (!silent) {
          message("Change the samples names in refseq slot")
        }
      }
    }
    if (!is.null(physeq@tax_table)) {
      if (sum(!rownames(physeq@tax_table) %in% taxa_names(physeq)) > 0) {
        rownames(physeq@tax_table) <- taxa_names(physeq)
        if (!silent) {
          message("Change the taxa names in tax_table slot")
        }
      }
    }

    if (!is.null(physeq@sam_data)) {
      if (sum(!rownames(physeq@sam_data) %in% sample_names(physeq)) > 0) {
        rownames(physeq@sam_data) <- sample_names(physeq)
        if (!silent) {
          message("Change the samples names in sam_data slot")
        }
      }
    }
  }

  verify_pq(physeq)

  if (reorder_asv) {
    physeq <- reorder_taxa_pq(
      physeq,
      taxa_names(physeq)[order(taxa_sums(physeq), decreasing = TRUE)]
    )
  }

  if (rename_asv) {
    taxa_names(physeq) <- paste0("ASV_", seq(1, ntaxa(physeq)))
  }

  if (sum(grepl("^0", sample_names(physeq)) > 0) && !silent) {
    message(
      "At least one sample name start with a zero.
    That can be a problem for some phyloseq functions such as
    plot_bar and psmelt."
    )
  }

  if (force_taxa_as_columns && force_taxa_as_rows) {
    stop("You can't force taxa as column and taxa as row in the same time.")
  }

  if (force_taxa_as_columns && taxa_are_rows(physeq)) {
    otu_table(physeq) <-
      otu_table(
        t(as.matrix(unclass(
          physeq@otu_table
        ))),
        taxa_are_rows = FALSE
      )
    message("Taxa are now in columns.")
  }

  if (force_taxa_as_rows && !taxa_are_rows(physeq)) {
    otu_table(physeq) <-
      otu_table(
        t(as.matrix(unclass(
          physeq@otu_table
        ))),
        taxa_are_rows = TRUE
      )
    message("Taxa are now in rows.")
  }

  if (simplify_taxo) {
    physeq <- simplify_taxo(physeq)
  }

  new_physeq <- physeq

  if (remove_empty_taxa) {
    if (sum(taxa_sums(new_physeq) != 0) > 0) {
      new_physeq <- subset_taxa(physeq, taxa_sums(physeq) > 0)
    }
  }
  if (remove_empty_samples) {
    if (sum(sample_sums(new_physeq) != 0) > 0) {
      new_physeq <- subset_samples(new_physeq, sample_sums(physeq) > 0)
    }
  }

  if (verbose) {
    message(
      paste(
        "Cleaning suppress",
        ntaxa(physeq) - ntaxa(new_physeq),
        "taxa (",
        paste(taxa_names(physeq)[taxa_sums(physeq) == 0], collapse = " / "),
        ") and",
        nsamples(physeq) - nsamples(new_physeq),
        "sample(s) (",
        paste(sample_names(physeq)[sample_sums(physeq) == 0], collapse = " / "),
        ")."
      )
    )
  } else if (!silent) {
    message(
      paste(
        "Cleaning suppress",
        ntaxa(physeq) - ntaxa(new_physeq),
        "taxa and",
        nsamples(physeq) - nsamples(new_physeq),
        "samples."
      )
    )
  }

  verify_pq(new_physeq)
  return(new_physeq)
}






################################################################################
#' Track the number of reads (= sequences), samples and cluster (e.g. ASV)
#' from various objects including dada-class and derep-class.
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#'  * List of fastq and fastg.gz files -> nb of reads and samples
#'  * List of dada-class -> nb of reads, clusters (ASV) and samples
#'  * List of derep-class -> nb of reads, clusters (unique sequences)
#'    and samples
#'  * Matrix of samples x clusters (e.g. `otu_table`) -> nb of reads,
#'    clusters and samples
#'  * Phyloseq-class -> nb of reads, clusters and samples
#'
#' @param list_of_objects (required) a list of objects
#' @param obj_names
#'   A list of names corresponding to the list of objects
#' @param clean_pq (logical)
#'   If set to TRUE, empty samples and empty ASV are discarded
#'   before clustering.
#' @param taxonomy_rank A vector of int. Define the column number of
#'   taxonomic rank `in physeq@tax_table` to compute the number of unique value.
#'   Default is NULL and do not compute values for any taxonomic rank
#' @param ... Other arguments passed on to [clean_pq()] function.
#'
#' @return The number of sequences, clusters (e.g. OTUs, ASVs) and samples for
#'   each object.
#' @export
#' @seealso [track_wkflow_samples()]
#' @examplesIf tolower(Sys.info()[["sysname"]]) != "windows"
#' data(enterotype)
#' if (requireNamespace("pbapply")) {
#'   track_wkflow(list(data_fungi, enterotype), taxonomy_rank = c(3, 5))
#' }
track_wkflow <- function(list_of_objects,
                         obj_names = NULL,
                         clean_pq = FALSE,
                         taxonomy_rank = NULL,
                         ...) {
  message("Compute the number of sequences")
  if (!is.null(obj_names)) {
    names(list_of_objects) <- obj_names
  }

  if (clean_pq) {
    for (i in seq_along(list_of_objects)) {
      if (inherits(list_of_objects[[i]], "phyloseq")) {
        list_of_objects[[i]] <- clean_pq(list_of_objects[[i]], ...)
      }
    }
  }

  track_nb_seq_per_obj <-
    pbapply::pblapply(list_of_objects, function(object) {
      message(paste("Start object of class:", class(object), sep = " "))
      if (inherits(object, "phyloseq")) {
        sum(object@otu_table)
      } else if (inherits(object, "matrix")) {
        sum(object, na.rm = TRUE)
      } else if (is.character(object[1]) &&
        length(object[1]) == 1 &&
        file.exists(object[1])) {
        if (summary(file(object[[1]]))$class == "gzfile") {
          pbapply::pbsapply(object, function(x) {
            as.numeric(system(paste("zcat ", x, " | grep -c '^+$'", sep = ""),
              intern = TRUE
            ))
          })
        } else if (grepl("\\.fastq$", object[1])) {
          pbapply::pbsapply(object, function(x) {
            as.numeric(system(paste("cat ", x, " | grep -c '^+$'", sep = ""),
              intern = TRUE
            ))
          })
        } else {
          stop("Files must be either gzfile or .fastq")
        }
      } else if (inherits(object, "derep")) {
        sum(object$uniques)
      } else if (inherits(object, "dada")) {
        sum(dada2::getUniques(object))
      } else {
        pbapply::pbsapply(object, function(x) {
          sum(dada2::getUniques(x, silence = TRUE))
        })
      }
    })
  track_nb_seq_per_obj <-
    pbapply::pblapply(track_nb_seq_per_obj, sum)

  message("Compute the number of clusters")
  track_nb_cluster_per_obj <-
    pbapply::pblapply(list_of_objects, function(object) {
      message(paste("Start object of class:", class(object), sep = " "))
      if (inherits(object, "phyloseq")) {
        ntaxa(object)
      } else if (inherits(object, "matrix")) {
        ncol(object)
      } else if (inherits(object, "dada")) {
        length(object$sequence)
      } else if (inherits(object[[1]], "dada")) {
        dim(suppressMessages(dada2::makeSequenceTable(object)))[2]
      } else if (is.data.frame(object[[1]]) &&
        all(c("sequence", "abundance") %in% colnames(object[[1]]))) {
        dim(suppressMessages(dada2::makeSequenceTable(object)))[2]
      } else if (inherits(object, "derep")) {
        length(unique(names(object$uniques)))
      } else if (inherits(object[[1]], "derep")) {
        length(unique(unlist(lapply(object, function(x) {
          names(x$uniques)
        }))))
      } else {
        NA
      }
    })

  message("Compute the number of samples")
  track_nb_sam_per_obj <-
    pbapply::pblapply(list_of_objects, function(object) {
      message(paste("Start object of class:", class(object), sep = " "))
      if (inherits(object, "phyloseq")) {
        nsamples(object)
      } else if (inherits(object, "matrix")) {
        nrow(object)
      } else if (inherits(object, "dada")) {
        1
      } else if (inherits(object[[1]], "dada")) {
        dim(suppressMessages(dada2::makeSequenceTable(object)))[1]
      } else if (is.data.frame(object[[1]]) &&
        all(c("sequence", "abundance") %in% colnames(object[[1]]))) {
        dim(suppressMessages(dada2::makeSequenceTable(object)))[1]
      } else if (inherits(object, "derep")) {
        1
      } else if (inherits(object[[1]], "derep")) {
        length(object)
      } else if (is.character(object[1]) &&
        length(object[1]) == 1 &&
        file.exists(object[1])) {
        length(object)
      } else {
        NA
      }
    })

  if (!is.null(taxonomy_rank)) {
    message("Compute the number of values in taxonomic rank")
    track_nb_tax_value_per_obj <-
      pbapply::pblapply(list_of_objects, function(object) {
        message(paste("Start object of class:", class(object), sep = " "))
        if (inherits(object, "phyloseq")) {
          if (taxa_are_rows(object)) {
            apply(object@tax_table[taxonomy_rank, ], 1, function(x) {
              length(unique(stats::na.omit(x)))
            })
          } else {
            apply(object@tax_table[, taxonomy_rank], 2, function(x) {
              length(unique(stats::na.omit(x)))
            })
          }
        } else {
          rep(NA, length(taxonomy_rank))
        }
      })

    names_taxonomic_rank <-
      pbapply::pblapply(list_of_objects, function(object) {
        message(paste("Start object of class:", class(object), sep = " "))
        if (inherits(object, "phyloseq")) {
          if (taxa_are_rows(object)) {
            rownames(object@tax_table)[taxonomy_rank]
          } else {
            colnames(object@tax_table)[taxonomy_rank]
          }
        }
      })
    track <- plyr::rbind.fill.matrix(
      matrix(ncol = length(list_of_objects), unlist(track_nb_seq_per_obj)),
      matrix(
        ncol = length(list_of_objects),
        unlist(track_nb_cluster_per_obj)
      ),
      matrix(ncol = length(list_of_objects), unlist(track_nb_sam_per_obj)),
      matrix(
        ncol = length(list_of_objects),
        unlist(track_nb_tax_value_per_obj)
      )
    )

    rownames(track) <- c(
      "nb_sequences",
      "nb_clusters",
      "nb_samples",
      names_taxonomic_rank[[1]]
    )
  } else {
    track <- plyr::rbind.fill.matrix(
      matrix(ncol = length(list_of_objects), unlist(track_nb_seq_per_obj)),
      matrix(
        ncol = length(list_of_objects),
        unlist(track_nb_cluster_per_obj)
      ),
      matrix(ncol = length(list_of_objects), unlist(track_nb_sam_per_obj))
    )

    rownames(track) <- c(
      "nb_sequences",
      "nb_clusters",
      "nb_samples"
    )
  }


  track <- as.data.frame(t(track))
  if (!is.null(obj_names)) {
    rownames(track) <- obj_names
  } else {
    rownames(track) <- names(list_of_objects)
  }

  return(track)
}
################################################################################

################################################################################
#' Track the number of reads (= sequences), samples and cluster (e.g. ASV)
#' for each sample
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Contrary to [track_wkflow()], only phyloseq object are possible.
#' More information are available in the manual of the function [track_wkflow()]
#'
#' @param list_pq_obj (required): a list of object passed on to [track_wkflow()]
#'   Only phyloseq object will return value because information of sample is needed
#' @param ... Other args passed on to [track_wkflow()]
#'
#' @return A list of dataframe. cf [track_wkflow()] for more information
#'
#' @export
#' @author Adrien Taudière
#'
#' @examplesIf tolower(Sys.info()[["sysname"]]) != "windows"
#' tree_A10_005 <- subset_samples(data_fungi, Tree_name == "A10-005")
#' if (requireNamespace("pbapply")) {
#'   track_wkflow_samples(tree_A10_005)
#' }
track_wkflow_samples <- function(list_pq_obj, ...) {
  if (!inherits(list_pq_obj, "list")) {
    list_pq_obj <- list(list_pq_obj)
  }
  if (sum(!unlist(lapply(list_pq_obj, inherits, "phyloseq"))) != 0) {
    stop("At least one object in your list_pq_obj is not a phyloseq obj.")
  }
  res <- list()
  sam_names <- unique(unlist(lapply(list_pq_obj, sample_names)))
  for (s in sam_names) {
    list_pq_obj_samples <-
      lapply(list_pq_obj, function(physeq) {
        if (sum(sample_names(physeq) %in% s) == 1) {
          select_one_sample(physeq, sam_name = s)
        } else {
          matrix(0, nrow = 0, ncol = 0)
        }
      })
    res[[s]] <- track_wkflow(list_pq_obj_samples) # ,...)
  }
  return(res)
}
###########################################################################


################################################################################
#' Recluster sequences of an object of class `physeq`
#'   or a list of DNA sequences
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#' This function use the `merge_taxa_vec` function to  merge taxa into clusters.
#' 
#' @inheritParams clean_pq
#' @param dna_seq You may directly use a character vector of DNA sequences
#'   in place of physeq args. When physeq is set, dna sequences take the value of
#'   `physeq@refseq`
#' @param nproc (default: 1)
#'   Set to number of cpus/processors to use for the clustering
#' @param method (default: clusterize)
#'   Set the clustering method.
#'   - `clusterize` use the [DECIPHER::Clusterize()] fonction,
#'   - `vsearch` use the vsearch software (https://github.com/torognes/vsearch)
#'     with arguments `--cluster_size` by default (see args `vsearch_cluster_method`)
#'     and `-strand both` (see args `vsearch_args`)
#'   - `swarm` use the swarm
#' @param vsearchpath (default: vsearch) path to vsearch
#' @param id (default: 0.97) level of identity to cluster
#' @param tax_adjust (Default 0) See the man page
#'   of [merge_taxa_vec()] for more details.
#'   To conserved the taxonomic rank of the most abundant ASV,
#'   set tax_adjust to 0 (default). For the moment only tax_adjust = 0 is
#'   robust
#' @param vsearch_cluster_method (default: "--cluster_size) See other possible
#'   methods in the [vsearch manual](https://github.com/torognes/vsearch/) (e.g. `--cluster_size` or `--cluster_smallmem`)
#'   - `--cluster_fast` : Clusterize the fasta sequences in filename, automatically sort by decreasing sequence length beforehand.
#'   - `--cluster_size` : Clusterize the fasta sequences in filename, automatically sort by decreasing sequence abundance beforehand.
#'   - `--cluster_smallmem` : Clusterize the fasta sequences in filename without automatically modifying their order beforehand. Sequence are expected to be sorted by decreasing sequence length, unless *--usersort* is used.
#'     In that case you may set `vsearch_args` to vsearch_args = "--strand both --usersort"
#' @param vsearch_args (default : "--strand both") a one length character element defining other parameters to
#'   passed on to vsearch.
#' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files
#'   - temp.fasta (refseq in fasta or dna_seq sequences)
#'   - cluster.fasta (centroid if method = "vsearch")
#'   - temp.uc (clusters if method = "vsearch")
#' @param swarmpath (default: swarm) path to swarm
#' @param d (default: 1) maximum number of differences allowed between two
#'   amplicons, meaning that two amplicons will be grouped if they have `d`
#'   (or less) differences
#' @param swarm_args (default : "--fastidious") a one length character
#'   element defining other parameters to  passed on to swarm See other possible
#'   methods in the [SWARM pdf manual](https://github.com/torognes/swarm/blob/master/man/swarm_manual.pdf)
#' @param method_clusterize (default "overlap") the method for the [DECIPHER::Clusterize()] method
#' @param ... Other arguments passed on to [DECIPHER::Clusterize()]
#' @details This function use the `merge_taxa_vec` function to
#'   merge taxa into clusters. By default tax_adjust = 0. See the man page
#'   of [merge_taxa_vec()].
#'
#' @return A new object of class `physeq` or a list of cluster if dna_seq
#'   args was used.
#'
#' @examples
#' if (requireNamespace("DECIPHER")) {
#'   asv2otu(data_fungi_mini)
#' }
#' \donttest{
#' if (requireNamespace("DECIPHER")) {
#'   asv2otu(data_fungi_mini, method_clusterize = "longest")
#'
#'   if (MiscMetabar::is_swarm_installed()) {
#'     d_swarm <- asv2otu(data_fungi_mini, method = "swarm")
#'   }
#'   if (MiscMetabar::is_vsearch_installed()) {
#'     d_vs <- asv2otu(data_fungi_mini, method = "vsearch")
#'   }
#' }
#' }
#' @references
#'   VSEARCH can be downloaded from
#'   \url{https://github.com/torognes/vsearch}.
#'   More information in the associated publication
#'   \url{https://pubmed.ncbi.nlm.nih.gov/27781170}.
#' @seealso [vsearch_clustering()] and [swarm_clustering()]
#' @export
#' @author Adrien Taudière

asv2otu <- function(physeq = NULL,
                    dna_seq = NULL,
                    nproc = 1,
                    method = "clusterize",
                    id = 0.97,
                    vsearchpath = "vsearch",
                    tax_adjust = 0,
                    vsearch_cluster_method = "--cluster_size",
                    vsearch_args = "--strand both",
                    keep_temporary_files = FALSE,
                    swarmpath = "swarm",
                    d = 1,
                    swarm_args = "--fastidious",
                    method_clusterize = "overlap",
                    ...) {
  if (inherits(physeq, "phyloseq")) {
    verify_pq(physeq)
    if (is.null(physeq@refseq)) {
      stop("The phyloseq object do not contain a @refseq slot")
    }
    dna <- Biostrings::DNAStringSet(physeq@refseq)
    if (!is.null(dna_seq)) {
      stop("You must use either physeq or dna_seq args but not both")
    }
  } else if (inherits(dna_seq, "character")) {
    dna <- Biostrings::DNAStringSet(dna_seq)
  } else {
    stop(
      "You must set the args physeq (object of class phyloseq) or
    dna_seq (character vector)."
    )
  }

  if (method == "clusterize") {
    ## Find clusters of ASVs to form the new OTUs
    clusters <- DECIPHER::Clusterize(dna,
      cutoff = 1 - id,
      # e.g. `cutoff = 0.03` for a 97% OTU
      processors = nproc,
      method = method_clusterize,
      ...
    )

    if (inherits(physeq, "phyloseq")) {
      new_obj <-
        merge_taxa_vec(physeq,
          clusters$cluster,
          tax_adjust = tax_adjust
        )
    } else if (inherits(dna_seq, "character")) {
      new_obj <- clusters
    } else {
      stop(
        "You must set the args physeq (object of class phyloseq) or
    dna_seq (character vector)."
      )
    }
  } else if (method == "vsearch") {
    new_obj <- vsearch_clustering(
      physeq = physeq,
      dna_seq = dna_seq,
      nproc = nproc,
      id = id,
      vsearchpath = vsearchpath,
      tax_adjust = tax_adjust,
      vsearch_cluster_method = vsearch_cluster_method,
      vsearch_args = vsearch_args,
      keep_temporary_files = keep_temporary_files
    )
  } else if (method == "swarm") {
    new_obj <- swarm_clustering(
      physeq = physeq,
      dna_seq = dna_seq,
      d = d,
      swarmpath = swarmpath,
      vsearch_path = vsearchpath,
      nproc = nproc,
      swarm_args = swarm_args,
      tax_adjust = tax_adjust,
      keep_temporary_files = keep_temporary_files
    )
  } else {
    stop("Method allows 2 values only : `clusterize`, `vsearch` or `swarm`")
  }
  return(new_obj)
}
################################################################################


################################################################################
#' Save phyloseq object in the form of multiple csv tables.
#'
#' @description 
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#' This is the reverse function of [read_pq()].
#' @inheritParams clean_pq
#' @param path a path to the folder to save the phyloseq object
#' @param rdata (logical) does the phyloseq object is also saved in Rdata format?
#' @param one_file (logical) if TRUE, combine all data in one file only
#' @param write_sam_data (logical) does the samples data are add to
#'   the file. Only used if `one_file` is TRUE.
#'   Note that these option result in a lot of NA values.
#' @param sam_data_first (logical) if TRUE, put the sample data at the top of the table
#'   Only used if `one_file` and write_sam_data are both TRUE.
#' @param clean_pq (logical)
#'   If set to TRUE, empty samples are discarded after subsetting ASV
#' @param reorder_asv (logical) if TRUE the otu_table is ordered by the number of
#'   sequences of ASV (descending order). Default to TRUE. Only possible if clean_pq
#'   is set to TRUE.
#' @param rename_asv reorder_asv (logical) if TRUE, ASV are renamed by their position
#'   in the OTU_table (asv_1, asv_2, ...). Default to FALSE. Only possible if clean_pq
#'   is set to TRUE.
#' @param quote a logical value (default FALSE) or a numeric vector.
#'   If TRUE, any character or factor columns will be surrounded by
#'   double quotes.  If a numeric vector, its elements are taken
#'   as the indices of columns to quote.  In both cases, row and
#'   column names are quoted if they are written. If FALSE nothing is quoted.
#' @param sep_csv (default tabulation) separator for column
#' @param ... Other arguments passed on to [utils::write.table()] function.
#' @return Build a folder (path) containing one to four csv tables
#'   (refseq.csv, otu_table.csv, tax_table.csv, sam_data.csv)
#'   and if present a phy_tree in Newick format
#' @export
#' @author Adrien Taudière
#' @examples
#' write_pq(data_fungi, path = paste0(tempdir(), "/phyloseq"))
#' write_pq(data_fungi, path = paste0(tempdir(), "/phyloseq"), one_file = TRUE)
#' unlink(paste0(tempdir(), "/phyloseq"), recursive = TRUE)
#' @seealso [MiscMetabar::save_pq()]

write_pq <- function(physeq,
                     path = NULL,
                     rdata = FALSE,
                     one_file = FALSE,
                     write_sam_data = TRUE,
                     sam_data_first = FALSE,
                     clean_pq = TRUE,
                     reorder_asv = FALSE,
                     rename_asv = FALSE,
                     remove_empty_samples = TRUE,
                     remove_empty_taxa = TRUE,
                     clean_samples_names = TRUE,
                     silent = FALSE,
                     verbose = FALSE,
                     quote = FALSE,
                     sep_csv = "\t",
                     ...) {
  verify_pq(physeq)

  physeq <- clean_pq(
    physeq,
    reorder_asv = reorder_asv,
    rename_asv = rename_asv,
    remove_empty_samples = remove_empty_samples,
    remove_empty_taxa = remove_empty_taxa,
    clean_samples_names = clean_samples_names,
    silent = silent,
    verbose = verbose
  )

  if (!dir.exists(path)) {
    dir.create(file.path(path), recursive = TRUE)
  }
  if (one_file) {
    if (!is.null(physeq@refseq) &&
      !is.null(physeq@otu_table) && !is.null(physeq@tax_table)) {
      if (!taxa_are_rows(physeq)) {
        otu_table(physeq) <-
          otu_table(
            t(as.matrix(unclass(
              physeq@otu_table
            ))),
            taxa_are_rows = TRUE
          )
      }
      df_physeq_interm <- cbind(
        physeq@otu_table,
        physeq@tax_table,
        as.vector(physeq@refseq)
      )
      colnames(df_physeq_interm) <-
        c(
          sample_names(physeq),
          colnames(physeq@tax_table),
          "Reference Sequences"
        )

      df_physeq_interm <- as.data.frame(df_physeq_interm)

      if (write_sam_data) {
        sam_data <- data.frame(t(data.frame(unclass(
          physeq@sam_data
        ))))
        colnames(sam_data) <- sample_names(physeq)
        if (sam_data_first) {
          df_physeq <- dplyr::full_join(sam_data, df_physeq_interm)
          rownames(df_physeq) <-
            c(rownames(sam_data), rownames(df_physeq_interm))
        } else {
          df_physeq <- dplyr::full_join(df_physeq_interm, sam_data)
          rownames(df_physeq) <-
            c(rownames(df_physeq_interm), rownames(sam_data))
        }
      } else {
        df_physeq <- df_physeq_interm
      }
      utils::write.table(
        df_physeq,
        paste0(path, "/ASV_table_allInOne.csv"),
        quote = quote,
        sep = sep_csv,
        ...
      )
    } else if (!is.null(physeq@otu_table) &&
      !is.null(physeq@tax_table)) {
      if (!taxa_are_rows(physeq)) {
        otu_table(physeq) <-
          otu_table(
            t(as.matrix(unclass(
              physeq@otu_table
            ))),
            taxa_are_rows = TRUE
          )
      }
      df_physeq_interm <- cbind(
        physeq@otu_table,
        physeq@tax_table,
      )
      colnames(df_physeq_interm) <-
        c(
          sample_names(physeq),
          colnames(physeq@tax_table),
          "Reference Sequences"
        )

      df_physeq_interm <- as.data.frame(df_physeq_interm)

      if (write_sam_data) {
        sam_data <- data.frame(t(data.frame(unclass(
          physeq@sam_data
        ))))
        colnames(sam_data) <- sample_names(physeq)
        if (sam_data_first) {
          df_physeq <- dplyr::full_join(sam_data, df_physeq_interm)
          rownames(df_physeq) <-
            c(rownames(sam_data), rownames(df_physeq_interm))
        } else {
          df_physeq <- dplyr::full_join(df_physeq_interm, sam_data)
          rownames(df_physeq) <-
            c(rownames(df_physeq_interm), rownames(sam_data))
        }
      }
      utils::write.table(
        df_physeq,
        paste0(path, "/ASV_table_allInOne.csv"),
        quote = quote,
        sep = sep_csv,
        ...
      )
    }
  } else {
    if (!is.null(physeq@otu_table)) {
      utils::write.table(
        physeq@otu_table,
        paste0(path, "/otu_table.csv"),
        quote = quote,
        sep = sep_csv,
        ...
      )
    }
    if (!is.null(physeq@refseq)) {
      utils::write.table(
        physeq@refseq,
        paste0(path, "/refseq.csv"),
        quote = quote,
        sep = sep_csv,
        ...
      )
    }
    if (!is.null(physeq@tax_table)) {
      utils::write.table(
        physeq@tax_table,
        paste0(path, "/tax_table.csv"),
        quote = quote,
        sep = sep_csv,
        col.names = NA,
        ...
      )
    }
    if (!is.null(physeq@sam_data)) {
      utils::write.table(
        as.matrix(physeq@sam_data),
        paste0(path, "/sam_data.csv"),
        quote = quote,
        sep = sep_csv,
        col.names = NA,
        ...
      )
    }
  }
  if (!is.null(physeq@phy_tree)) {
    ape::write.tree(physeq@phy_tree, paste(path, "/phy_tree.txt", sep = ""))
  }
  if (rdata) {
    save(physeq, file = paste(path, "/physeq.RData", sep = ""))
  }
}
################################################################################


################################################################################
#' A wrapper of write_pq to save in all three possible formats
#'
#' @details
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#' Write :
#' - 4 separate tables
#' - 1 table version
#' - 1 RData file
#'
#' @inheritParams clean_pq
#' @param path a path to the folder to save the phyloseq object
#' @param ... Other arguments passed on to [write_pq()] or [utils::write.table()] function.
#' @return Build a folder (in path) with four csv tables (`refseq.csv`, `otu_table.csv`, `tax_table.csv`, `sam_data.csv`) + one
#'   table with all tables together + a rdata file (`physeq.RData`) that can be loaded using
#'   [base::load()] function + if present a phylogenetic tree in Newick format (`phy_tree.txt`)
#' @export
#' @author Adrien Taudière
#' @examplesIf tolower(Sys.info()[["sysname"]]) != "windows"
#' save_pq(data_fungi, path = paste0(tempdir(), "/phyloseq"))
#' unlink(paste0(tempdir(), "/phyloseq"), recursive = TRUE)
#' @seealso [MiscMetabar::write_pq()]
save_pq <- function(physeq, path = NULL, ...) {
  write_pq(physeq,
    path = path,
    rdata = TRUE,
    one_file = TRUE,
    ...
  )
  write_pq(physeq,
    path = path,
    rdata = FALSE,
    one_file = FALSE,
    ...
  )
}

################################################################################
#' Read phyloseq object from multiple csv tables and a phylogenetic tree
#' in Newick format.
#'
#' @description 
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#' 
#' This is the reverse function of [write_pq()].
#'
#' @param path (required) a path to the folder to read the phyloseq object
#' @param taxa_are_rows (default to FALSE) see ?phyloseq for details
#' @param sam_names The name of the variable (column) in sam_data.csv to rename
#'   samples. Note that if you use [write_phyloseq()] function to save your
#'   physeq object, you may use sam_names = "X" to rename the samples names
#'   as before.
#' @param sep_csv (default tabulation) separator for column
#' @param ... Other arguments passed on to [utils::write.table()] function.
#' @return One to four csv tables (refseq.csv, otu_table.csv, tax_table.csv, sam_data.csv)
#' and if present a phy_tree in Newick format. At least the otu_table.csv need to be present.
#' @export
#'
#' @examples
#' write_pq(data_fungi, path = paste0(tempdir(), "/phyloseq"))
#' read_pq(path = paste0(tempdir(), "/phyloseq"))
#' unlink(paste0(tempdir(), "/phyloseq"), recursive = TRUE)
read_pq <- function(path = NULL,
                    taxa_are_rows = FALSE,
                    sam_names = NULL,
                    sep_csv = "\t",
                    ...) {
  if (file.exists(paste0(path, "/otu_table.csv"))) {
    if (taxa_are_rows) {
      otu_table_csv <-
        as.matrix(utils::read.table(paste0(path, "/otu_table.csv"), sep = sep_csv))
      samp_names <- colnames(otu_table_csv)
      otu_table_csv <- apply(otu_table_csv, 2, as.numeric)
      table_otu <- otu_table(otu_table_csv, taxa_are_rows = TRUE)
      sample_names(table_otu) <- samp_names
      physeq <- phyloseq(table_otu)
    } else {
      otu_table_csv <-
        as.matrix(utils::read.table(paste0(path, "/otu_table.csv"), sep = sep_csv))
      samp_names <- rownames(otu_table_csv)
      otu_table_csv <- apply(otu_table_csv, 2, as.numeric)
      rownames(otu_table_csv) <- samp_names
      physeq <-
        phyloseq(otu_table(otu_table_csv, taxa_are_rows = FALSE))
    }
  }
  if (file.exists(paste0(path, "/refseq.csv"))) {
    dna <-
      Biostrings::DNAStringSet(utils::read.table(
        paste0(path, "/refseq.csv"),
        sep = sep_csv,
        row.names = NULL
      )[, 2])
    names(dna) <-
      utils::read.table(paste0(path, "/refseq.csv"),
        sep = sep_csv,
        row.names = NULL
      )[, 1]
    physeq <- phyloseq::merge_phyloseq(physeq, refseq(dna))
  }
  if (file.exists(paste0(path, "/tax_table.csv"))) {
    tax_table_csv <-
      utils::read.table(paste0(path, "/tax_table.csv"), sep = sep_csv)
    rownames(tax_table_csv) <- tax_table_csv[, 1]
    tax_table_csv <- as.matrix(tax_table_csv[, -1])
    physeq <-
      phyloseq::merge_phyloseq(physeq, tax_table(tax_table_csv))
  }
  if (file.exists(paste0(path, "/sam_data.csv"))) {
    sam_data_csv <-
      utils::read.table(paste0(path, "/sam_data.csv"), sep = sep_csv)
    rownames(sam_data_csv) <- sam_data_csv[, 1]
    physeq <-
      phyloseq::merge_phyloseq(physeq, sample_data(sam_data_csv))
  }

  if (!is.null(physeq@phy_tree)) {
    tree <- ape::read.tree(paste0(path, "/phy_tree.txt"))
    physeq <- phyloseq::merge_phyloseq(physeq, phy_tree(tree))
  }
  if (!is.null(sam_names)) {
    sample_names(physeq) <- unclass(physeq@sam_data[, sam_names])[[1]]
  }


  return(physeq)
}

################################################################################

################################################################################
#' Lulu reclustering of class `physeq`
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' See https://www.nature.com/articles/s41467-017-01312-x for more information
#'  on the method.
#'
#' @inheritParams clean_pq
#' @param nproc (default 1)
#'   Set to number of cpus/processors to use for the clustering
#' @param id (default: 0.84) id for --usearch_global.
#' @param vsearchpath (default: vsearch) path to vsearch.
#' @param verbose (logical) if true, print some additional messages.
#' @param clean_pq (logical) if true, empty samples and empty ASV are discarded
#'   before clustering.
#' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files
#' @param ... Others args for function [lulu()]
#' @return a list of for object
#' - "new_physeq": The new phyloseq object (class physeq)
#' - "discrepancy_vector": A vector of discrepancy showing for each taxonomic
#'   level the proportion of identic value before and after lulu reclustering.
#'   A value of 0.6 stands for 60% of ASV before re-clustering have
#'   identical value after re-clustering. In other word, 40% of ASV are assigned
#'   to a different taxonomic
#'   value. NA value are not counted as discrepancy.
#' - "res_lulu": A list of the result from the lulu function
#' - "merged_ASV": the data.frame used to merged ASV
#'
#' @export
#' @examplesIf MiscMetabar::is_vsearch_installed()
#' \donttest{
#' lulu_pq(data_fungi_sp_known)
#' }
#' @author Tobias Guldberg Frøslev \email{tobiasgf@snm.ku.dk}
#'   & Adrien Taudière \email{adrien.taudiere@@zaclys.net}
#' @details
#' The version of LULU is a fork of Adrien Taudière (\url{https://github.com/adrientaudiere/lulu})
#'  from \url{https://github.com/tobiasgf/lulu}
#' @references
#' - LULU : \url{https://github.com/adrientaudiere/lulu}
#'  forked from \url{https://github.com/tobiasgf/lulu}.
#' - VSEARCH can be downloaded from
#'  \url{https://github.com/torognes/vsearch}.

lulu_pq <- function(physeq,
                    nproc = 1,
                    id = 0.84,
                    vsearchpath = "vsearch",
                    verbose = FALSE,
                    clean_pq = FALSE,
                    keep_temporary_files = FALSE,
                    ...) {
  verify_pq(physeq)
  if (is.null(physeq@refseq)) {
    stop("The phyloseq object do not contain a @refseq slot")
  }
  if (clean_pq) {
    physeq <- clean_pq(physeq)
  }

  message("Start Vsearch usearch_global")
  dna <- Biostrings::DNAStringSet(physeq@refseq)
  Biostrings::writeXStringSet(dna, "temp.fasta")
  system2(
    vsearchpath,
    paste(
      " --usearch_global temp.fasta --db temp.fasta --self --iddef 1",
      " -userfields query+target+id --maxaccepts 0 --query_cov .9 --maxhits 10",
      " -id ",
      id,
      "  --userout match_list.txt",
      sep = ""
    ),
    stdout = TRUE,
    stderr = TRUE
  )

  match_list <- utils::read.csv(file = "match_list.txt", sep = "\t")

  message("Lulu algorithm")
  res_lulu <-
    lulu(data.frame(t(physeq@otu_table), ...), match_list)

  if (!keep_temporary_files) {
    if (file.exists("temp.fasta")) {
      unlink("temp.fasta")
    }
    if (file.exists("cluster.fasta")) {
      unlink("cluster.fasta")
    }
    if (file.exists("temp.uc")) {
      unlink("temp.uc")
    }
    if (file.exists("match_list.txt")) {
      unlink("match_list.txt")
    }
  }
  merged <- res_lulu$otu_map[res_lulu$otu_map$curated == "merged", ]
  merged <- merged[rownames(merged) != merged$parent_id, ]

  test_vector <- vector(mode = "logical")
  for (tax_rank in colnames(physeq@tax_table)) {
    test <-
      physeq@tax_table[rownames(merged), tax_rank] == physeq@tax_table[merged$parent_id, tax_rank]
    test_vector <-
      c(
        test_vector,
        sum(test, na.rm = TRUE) / length(stats::na.exclude(test))
      )
  }

  names(test_vector) <- colnames(physeq@tax_table)

  new_physeq <-
    prune_taxa(
      taxa_names(physeq) %in% rownames(res_lulu$curated_table),
      physeq
    )
  new_physeq@otu_table <-
    otu_table(t(res_lulu$curated_table), taxa_are_rows = FALSE)
  sample_names(new_physeq) <- sample_names(physeq)

  if (verbose) {
    message(paste(
      "The number of taxa decrease from ",
      ntaxa(physeq),
      " to ",
      ntaxa(new_physeq),
      ".",
      sep = ""
    ))
    message(
      "See the discrepancy_vector to verify the degree of discrepancy in taxonomy due to lulu re-clustering."
    )
  }
  return(
    list(
      "new_physeq" = new_physeq,
      "discrepancy_vector" = test_vector,
      "res_lulu" = res_lulu,
      "merged_ASV" = merged
    )
  )
}
################################################################################


################################################################################
#' MUMU reclustering of class `physeq`
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' See https://www.nature.com/articles/s41467-017-01312-x for more information
#'  on the original method LULU. This is a wrapper of
#'  [mumu](https://github.com/frederic-mahe/mumu) a C++ re-implementation
#'  of LULU by Frédéric Mahé
#'
#' @inheritParams clean_pq
#' @param nproc (default 1)
#'   Set to number of cpus/processors to use for the clustering
#' @param id (default: 0.84) id for --usearch_global.
#' @param vsearchpath (default: vsearch) path to vsearch.
#' @param mumupath path to mumu. See [mumu](https://github.com/frederic-mahe/mumu)
#'   for installation instruction
#' @param verbose (logical) if true, print some additional messages.
#' @param clean_pq (logical) if true, empty samples and empty ASV are discarded
#'   before clustering.
#' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files
#' @return a list of for object
#' - "new_physeq": The new phyloseq object (class physeq)
#' - "mumu_results": The log file of the mumu software. Run `man mumu` into
#'   bash to obtain details about columns' signification.
#'
#' @export
#' @examplesIf MiscMetabar::is_mumu_installed()
#' \dontrun{
#' mumu_pq(data_fungi_sp_known)
#' }
#' @author Frédéric Mahé
#'   & Adrien Taudière \email{adrien.taudiere@@zaclys.net}
#' @references
#' - MUMU: \url{https://github.com/frederic-mahe/mumu}
#' - VSEARCH can be downloaded from
#'  \url{https://github.com/torognes/vsearch}.
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please cite [mumu](https://github.com/frederic-mahe/mumu/blob/main/CITATION.cff) and
#'   [lulu](https://www.nature.com/articles/s41467-017-01312-x) if you use this function
#'   for your work.
#'
mumu_pq <- function(physeq,
                    nproc = 1,
                    id = 0.84,
                    vsearchpath = "vsearch",
                    mumupath = "mumu",
                    verbose = FALSE,
                    clean_pq = TRUE,
                    keep_temporary_files = FALSE) {
  verify_pq(physeq)
  if (is.null(physeq@refseq)) {
    stop("The phyloseq object do not contain a @refseq slot")
  }
  if (clean_pq) {
    physeq <- clean_pq(physeq)
  }

  message("Start Vsearch usearch_global")
  dna <- Biostrings::DNAStringSet(physeq@refseq)
  Biostrings::writeXStringSet(dna, "temp.fasta")
  system2(
    vsearchpath,
    paste(
      " --usearch_global temp.fasta --db temp.fasta --self --iddef 1",
      " -userfields query+target+id --maxaccepts 0 --query_cov 0.9 --maxhits 10",
      " -id ",
      id,
      "  --userout match_list.txt"
    ),
    stdout = TRUE,
    stderr = TRUE
  )
  otu_tab <-
    data.frame(unclass(taxa_as_rows(physeq)@otu_table))
  otu_tab <- cbind("ASV" = rownames(otu_tab), otu_tab)
  write.table(
    otu_tab,
    "otu_table.csv",
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
  )

  message("Mumu algorithm")
  system2(
    mumupath,
    paste(
      "--otu_table otu_table.csv",
      "--match_list match_list.txt",
      "--log log.txt",
      "--new_otu_table new_OTU.tablemumu"
    )
  )

  res_mumu <- read.delim("new_OTU.tablemumu")
  new_otu_tab <- otu_table(res_mumu[, -1], taxa_are_rows = TRUE)
  taxa_names(new_otu_tab) <- res_mumu[, 1]

  new_physeq <-
    prune_taxa(taxa_names(physeq) %in% taxa_names(new_otu_tab), physeq)
  new_physeq@otu_table <-
    otu_table(t(new_otu_tab), taxa_are_rows = FALSE)
  if (nsamples(new_physeq) != nsamples(physeq)) {
    stop(
      "There is a different number of samples before and after mumu algorithm.
         This may be due to empty samples. You may try to rerun mumu_pq()
         using clean_pq = TRUE."
    )
  }
  sample_names(new_physeq) <- sample_names(physeq)

  if (verbose) {
    message(paste(
      "The number of taxa decrease from ",
      ntaxa(physeq),
      " to ",
      ntaxa(new_physeq),
      ".",
      sep = ""
    ))
    message(
      "See the log slot to verify the degree of discrepancy in taxonomy due to mumu re-clustering."
    )
  }

  result_mumu <- read.delim("log.txt")
  colnames(result_mumu) <-
    c(
      "Query_ASV",
      "Potential parent",
      "Similarity_percent",
      "Ab_query",
      "Ab_parent",
      "Overlap_ab_query",
      "Overlap_ab_parent",
      "Incidence_query",
      "Incidence_parent",
      "Smallest_ab_ratio",
      "Sum_ab_ratio",
      "Average_ratio",
      "Average_non_null_ratio",
      "Relative_cooccurence_value",
      "Status"
    )

  if (!keep_temporary_files) {
    if (file.exists("temp.fasta")) {
      unlink("temp.fasta")
    }
    if (file.exists("cluster.fasta")) {
      unlink("cluster.fasta")
    }
    if (file.exists("temp.uc")) {
      unlink("temp.uc")
    }

    if (file.exists("log.txt")) {
      unlink("temp.uc")
    }
    if (file.exists("match_list.txt")) {
      unlink("match_list.txt")
    }
    if (file.exists("otu_table.csv")) {
      unlink("otu_table.csv")
    }
    if (file.exists("new_OTU.tablemumu")) {
      unlink("new_OTU.tablemumu")
    }
  }

  return(list(
    "new_physeq" = new_physeq,
    "mumu_results" = result_mumu
  ))
}
################################################################################




################################################################################
#' Verify the validity of a phyloseq object
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#' Mostly for internal use in MiscMetabar functions.
#'
#' @inheritParams clean_pq
#' @param verbose (logical, default FALSE) If TRUE, prompt some warnings.
#' @param min_nb_seq_sample (numeric) Only used if verbose = TRUE.
#'   Minimum number of sequences per samples to not show warning.
#' @param min_nb_seq_taxa (numeric) Only used if verbose = TRUE.
#'   Minimum number of sequences per taxa to not show warning.
#' @return Nothing if the phyloseq object is valid. An error in the other case.
#'  Warnings if verbose = TRUE
#' @export
#'
verify_pq <- function(
    physeq,
    verbose = FALSE,
    min_nb_seq_sample = 500,
    min_nb_seq_taxa = 1) {
  if (!methods::validObject(physeq) ||
    !inherits(physeq, "phyloseq")) {
    stop("The physeq argument is not a valid phyloseq object.")
  }
  if (verbose) {
    if (min(sample_sums(physeq)) < min_nb_seq_sample) {
      warning(paste0(
        "At least one of your sample contains less than ",
        min_nb_seq_sample,
        " sequences."
      ))
    }
    if (min(sample_sums(physeq)) < min_nb_seq_sample) {
      warning(paste0(
        "At least one of your taxa is represent by less than ",
        min_nb_seq_taxa,
        " sequences."
      ))
    }
    if (sum(is.na(physeq@sam_data)) > 0) {
      warning("At least one of your samples metadata columns contains NA.")
    }
    if (sum(grepl("^[0-9]", sample_names(physeq)) > 0) && !silent) {
      message(
        "At least one sample name start with a number.
      It may introduce bug in some function such
      as psmelt. You may replace sample_names using
      for example :
      sample_names(physeq) <- paste('samp', sample_names(physeq))"
      )
    }
  }
}
################################################################################


################################################################################
#' Subset samples using a conditional boolean vector.
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' The main objective of this function is to complete the
#' [phyloseq::subset_samples()] function by propose a more easy
#' (but more prone to error) way of subset_samples.
#' It replace the subsetting expression which used the name of the variable
#' in the sam_data by a boolean vector.
#'
#' Warnings: you must verify the result of this function as the
#' boolean condition must match the order of samples in the `sam_data`
#' slot.
#'
#' This function is robust when you use the sam_data slot of the phyloseq object
#' used in physeq (see examples)
#'
#' @inheritParams clean_pq
#' @param condition A boolean vector to subset samples. Length must fit
#'   the number of samples
#'
#' @examples
#'
#' cond_samp <- grepl("A1", data_fungi@sam_data[["Sample_names"]])
#' subset_samples_pq(data_fungi, cond_samp)
#'
#' subset_samples_pq(data_fungi, data_fungi@sam_data[["Height"]] == "Low")
#'
#' @return a new phyloseq object
#' @export
#'
subset_samples_pq <- function(physeq, condition) {
  if (length(condition) != nsamples(physeq)) {
    stop("Length of condition is different from the number of samples.")
  }
  if (is.null(physeq@sam_data)) {
    message("Nothing subset. No sample_data in physeq.\n")
    return(physeq)
  } else {
    old_DF <- as(sample_data(physeq), "data.frame")
    new_DF <- old_DF[condition, ]
    if (inherits(physeq, "sample_data")) {
      return(sample_data(new_DF))
    } else {
      sample_data(physeq) <- sample_data(new_DF)
      return(physeq)
    }
  }
}
################################################################################

################################################################################
#' Subset taxa using a conditional named boolean vector.
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' The main objective of this function is to complete the
#' [phyloseq::subset_taxa()] function by propose a more easy way of
#' subset_taxa using a named boolean vector. Names must match taxa_names.
#'
#' @inheritParams clean_pq
#' @param condition A named boolean vector to subset taxa. Length must fit
#'   the number of taxa and names must match taxa_names. Can also be a
#'   condition using a column of the tax_table slot (see examples). If
#'   the order of condition is the same as taxa_names(physeq),
#'   you can use the parameter `taxa_names_from_physeq = TRUE`.
#' @param clean_pq (logical)
#'   If set to TRUE, empty samples are discarded after subsetting ASV
#' @param verbose (logical) Informations are printed
#' @param taxa_names_from_physeq (logical) If set to TRUE, rename the
#'   condition vector using taxa_names(physeq). Carefully check the result
#'   of this function if you use this parameter. No effect if the condition
#'   is of class `tax_table`.
#' @examples
#'
#' subset_taxa_pq(data_fungi, data_fungi@tax_table[, "Phylum"] == "Ascomycota")
#'
#' cond_taxa <- grepl("Endophyte", data_fungi@tax_table[, "Guild"])
#' names(cond_taxa) <- taxa_names(data_fungi)
#' subset_taxa_pq(data_fungi, cond_taxa)
#'
#' subset_taxa_pq(data_fungi, grepl("mycor", data_fungi@tax_table[, "Guild"]),
#'   taxa_names_from_physeq = TRUE
#' )
#'
#' @return a new phyloseq object
#' @export
#'
subset_taxa_pq <- function(physeq,
                           condition,
                           verbose = TRUE,
                           clean_pq = TRUE,
                           taxa_names_from_physeq = FALSE) {
  if (inherits(condition, "taxonomyTable")) {
    condition_temp <- as.vector(condition)
    names(condition_temp) <- rownames(condition)
    condition <- condition_temp
  } else {
    if (taxa_names_from_physeq) {
      names(condition) <- taxa_names(physeq)
    }
  }

  if (!sum(names(condition) %in% taxa_names(physeq)) == length(condition)) {
    stop(paste(
      "Some names in condition do not fit taxa_names of physeq : ",
      paste(names(condition)[!names(condition) %in% taxa_names(physeq)],
        collapse = "/"
      )
    ))
  }

  new_physeq <- physeq

  if (!taxa_are_rows(new_physeq)) {
    new_physeq@otu_table <-
      otu_table(t(new_physeq@otu_table), taxa_are_rows = TRUE)
    taxa_are_rows(new_physeq) <- TRUE
  }

  cond <- condition[match(taxa_names(new_physeq), names(condition))]
  cond[is.na(cond)] <- FALSE

  old_MA <-
    as(otu_table(new_physeq, taxa_are_rows = TRUE), "matrix")
  new_MA <- old_MA[cond, ]

  if (!is.matrix(new_MA)) {
    new_MA <- as.matrix(new_MA)
    new_otu_table <- otu_table(new_MA, taxa_are_rows = TRUE)
    sample_names(new_otu_table) <- sample_names(new_physeq)
  } else {
    new_otu_table <- otu_table(new_MA, taxa_are_rows = TRUE)
  }

  otu_table(new_physeq) <- new_otu_table

  if (clean_pq) {
    new_physeq <- clean_pq(new_physeq, verbose = TRUE)
  }

  if (verbose) {
    message(paste("Number of non-matching ASV", sum(is.na(
      match(taxa_names(physeq), names(condition))
    ))))
    message(paste("Number of matching ASV", sum(!is.na(
      match(taxa_names(physeq), names(condition))
    ))))
    message(paste(
      "Number of filtered-out ASV",
      ntaxa(physeq) - ntaxa(new_physeq)
    ))
    message(paste("Number of kept ASV", ntaxa(new_physeq)))
    message(paste("Number of kept samples", nsamples(new_physeq)))
  }

  return(new_physeq)
}
################################################################################


################################################################################
#' Select one sample from a physeq object
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Mostly for internal used, for example in function [track_wkflow_samples()].
#'
#' @inheritParams clean_pq
#' @param sam_name (required) The sample name to select
#' @param silent (logical) If true, no message are printing.
#' @return A new \code{\link{phyloseq-class}} object with one sample
#'
#' @export
#'
#' @author Adrien Taudière
#'
#' @examples
#'
#' A8_005 <- select_one_sample(data_fungi, "A8-005_S4_MERGED.fastq.gz")
#' A8_005
select_one_sample <- function(physeq, sam_name, silent = FALSE) {
  if (sum(sample_names(physeq) %in% sam_name) == 0) {
    stop(
      paste0(
        "The sample ",
        sam_name,
        " is not present in the names of samples of your phyloseq physeq object.
        You may use the sample_names() function."
      )
    )
  }
  cl_sam <-
    clean_pq(subset_samples_pq(physeq, sample_names(physeq) == sam_name),
      silent = TRUE
    )

  if (!silent) {
    message(
      paste0(
        "You select 1 of ",
        nsamples(physeq),
        " samples and conserved ",
        ntaxa(cl_sam),
        " out of ",
        ntaxa(physeq),
        " taxa represented by ",
        sum(cl_sam@otu_table),
        " sequences (out of ",
        sum(physeq@otu_table),
        " sequences [",
        perc(sum(cl_sam@otu_table), sum(physeq@otu_table)),
        "%])"
      )
    )
  }

  return(cl_sam)
}
################################################################################


################################################################################
#' Add new taxonomic rank to a phyloseq object.
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' One of main use of this function is to add taxonomic assignment from
#' a new database.
#'
#' @inheritParams clean_pq
#' @param ref_fasta (required) A link to a database.
#'   Pass on to `dada2::assignTaxonomy`.
#' @param suffix (character) The suffix to name the new columns.
#'   If set to NULL (the default), the basename of the file reFasta
#'   is used.
#' @param ... Other arguments pass on to `dada2::assignTaxonomy`.
#' @return A new \code{\link{phyloseq-class}} object with a larger slot tax_table"
#'
#' @export
#'
#' @author Adrien Taudière
#'
add_new_taxonomy_pq <- function(physeq, ref_fasta, suffix = NULL, ...) {
  if (is.null(suffix)) {
    suffix <- basename(ref_fasta)
  }
  tax_tab <-
    dada2::assignTaxonomy(physeq@refseq, refFasta = ref_fasta, ...)
  colnames(tax_tab) <-
    make.unique(paste0(colnames(tax_tab), "_", suffix))
  new_tax_tab <- tax_table(cbind(physeq@tax_table, tax_tab))
  new_physeq <- physeq
  tax_table(new_physeq) <- new_tax_tab
  return(new_physeq)
}
################################################################################


################################################################################
#' Summarize information from sample data in a table
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' A wrapper for the [gtsummary::tbl_summary()] function in the case of `physeq`
#'   object.
#'
#' @inheritParams clean_pq
#' @param remove_col_unique_value (logical, default TRUE) Do we remove
#'  informative columns (categorical column with one value per samples),
#'   e.g. samples names ?
#' @param ... Other arguments pass on to [gtsummary::tbl_summary()].
#' @return A new \code{\link{phyloseq-class}} object with a larger slot tax_table
#'
#' @export
#' @author Adrien Taudière
#' @examples
#' if (requireNamespace("gtsummary")) {
#'   tbl_sum_samdata(data_fungi) %>%
#'     gtsummary::as_kable()
#'
#'   summary_samdata <- tbl_sum_samdata(data_fungi,
#'     include = c("Time", "Height"),
#'     type = list(Time ~ "continuous2", Height ~ "categorical"),
#'     statistic = list(Time ~ c("{median} ({p25}, {p75})", "{min}, {max}"))
#'   )
#' }
#' \donttest{
#' data(enterotype)
#' if (requireNamespace("gtsummary")) {
#'   summary_samdata <- tbl_sum_samdata(enterotype)
#'   summary_samdata <- tbl_sum_samdata(enterotype, include = !contains("SampleId"))
#' }
#' }
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please make a reference to `gtsummary::tbl_summary()` if you
#'   use this function.

tbl_sum_samdata <- function(physeq, remove_col_unique_value = TRUE, ...) {
  tbl <- tibble(data.frame(physeq@sam_data))
  if (remove_col_unique_value) {
    tbl <- tbl[, !apply(tbl, 2, function(x) {
      length(unique(x)) == nrow(tbl) && is.character(x)
    })]
  }
  tbl_sum <- tbl %>% gtsummary::tbl_summary(...)
  return(tbl_sum)
}
################################################################################
#' Add information about Guild for FUNGI the FUNGuild databse
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Please cite Nguyen et al. 2016 (\doi{doi:10.1016/j.funeco.2015.06.006})
#'
#' @inheritParams clean_pq
#' @param taxLevels Name of the 7 columns in tax_table required by funguild
#'
#' @return A new object of class `physeq` with Guild information added to
#'   `tax_table` slot
#' @export
#' @author Adrien Taudière
#' @examples
#' \donttest{
#' if (requireNamespace("httr")) {
#'   d_fung_mini <- add_funguild_info(data_fungi_mini,
#'     taxLevels = c(
#'       "Domain",
#'       "Phylum",
#'       "Class",
#'       "Order",
#'       "Family",
#'       "Genus",
#'       "Species"
#'     )
#'   )
#'   sort(table(d_fung_mini@tax_table[, "guild"]), decreasing = TRUE)
#' }
#' }
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please make a reference to `FUNGuildR` package and the associate
#'   [publication](https://www.sciencedirect.com/science/article/abs/pii/S1754504815000847) if you
#'   use this function.
#' @seealso [plot_guild_pq()]

add_funguild_info <- function(physeq,
                              taxLevels = c(
                                "Kingdom",
                                "Phylum",
                                "Class",
                                "Order",
                                "Family",
                                "Genus",
                                "Species"
                              )) {
  tax_tab <- physeq@tax_table
  FUNGuild_assign <-
    funguild_assign(data.frame(
      "Taxonomy" =
        apply(tax_tab[, taxLevels], 1,
          paste,
          collapse = ";"
        )
    ))
  tax_tab <-
    as.matrix(cbind(
      tax_tab,
      FUNGuild_assign
    ))
  physeq@tax_table <- tax_table(tax_tab)
  return(physeq)
}




################################################################################
#' Plot information about Guild from tax_table slot previously
#' created with [add_funguild_info()]
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Graphical function.
#' 
#' @inheritParams clean_pq
#' @param levels_order (Default NULL) A character vector to
#'   reorder the levels of guild. See examples.
#' @param clean_pq (logical, default TRUE): Does the phyloseq
#'   object is cleaned using the [clean_pq()] function?
#' @param ... Other params for be passed on to
#'   [clean_pq()] function
#' @return A ggplot2 object
#'
#' @export
#' @author Adrien Taudière
#' @examples
#' \donttest{
#' if (requireNamespace("httr")) {
#'   d_fung_mini <- add_funguild_info(data_fungi_mini,
#'     taxLevels = c(
#'       "Domain",
#'       "Phylum",
#'       "Class",
#'       "Order",
#'       "Family",
#'       "Genus",
#'       "Species"
#'     )
#'   )
#'   sort(table(d_fung_mini@tax_table[, "guild"]), decreasing = TRUE)
#'
#'   p <- plot_guild_pq(d_fung_mini)
#'   if (requireNamespace("patchwork")) {
#'     (plot_guild_pq(subset_samples(d_fung_mini, Height == "Low"),
#'       levels_order = p$data$Guild[order(p$data$nb_seq)]
#'     ) + theme(legend.position = "none")) +
#'       (plot_guild_pq(subset_samples(d_fung_mini, Height == "High"),
#'         levels_order = p$data$Guild[order(p$data$nb_seq)]
#'       ) + ylab("") + theme(axis.text.y = element_blank()))
#'   }
#' }
#' }
#' @seealso [add_funguild_info()]

plot_guild_pq <-
  function(physeq,
           levels_order = NULL,
           clean_pq = TRUE,
           ...) {
    if (clean_pq) {
      physeq <- clean_pq(physeq, ...)
    }
    guilds <-
      data.frame(sort(table(strsplit(
        paste(
          physeq@tax_table[, "guild"]
          [physeq@tax_table[, "confidenceRanking"] %in%
              c("Highly Probable", "Probable")],
          collapse = "-"
        ),
        split = "-"
      ))))

    guilds$Var1 <- as.vector(guilds$Var1)
    guilds <- guilds[guilds$Var1 != "NA", ]
    guilds <- guilds[guilds$Var1 != "NULL", ]
    guilds <- guilds[guilds$Var1 != "", ]

    # Number of sequences per guild
    nb_seq_by_guild <- c()
    for (i in seq(1, length(guilds$Var1))) {
      nb_seq_by_guild[i] <-
        sum(taxa_sums(physeq@otu_table)[grepl(
          guilds$Var1[i],
          physeq@tax_table[, "guild"]
        )])
    }
    names(nb_seq_by_guild) <- guilds$Var1
    guilds$seq <- nb_seq_by_guild

    names(guilds) <- c("Guild", "nb_asv", "nb_seq")
    guilds$nb_seq <- as.numeric(guilds$nb_seq)
    guilds$nb_asv <- as.numeric(guilds$nb_asv)

    guilds$Guild <- factor(as.vector(guilds$Guild),
      levels = guilds$Guild[order(guilds$nb_seq)]
    )


    COLORS <- rep("Others", nrow(guilds))
    COLORS[grepl("Sapro", guilds$Guild)] <- "Sapro"
    COLORS[grepl("Parasite", guilds$Guild)] <- "Parasite/pathogen"
    COLORS[grepl("Pathog", guilds$Guild)] <- "Parasite/pathogen"
    COLORS[grepl("ycorrh", guilds$Guild)] <- "Mutualist"
    COLORS[grepl("Lichen", guilds$Guild)] <- "Mutualist"
    COLORS[grepl("Endophy", guilds$Guild)] <- "Mutualist"

    guilds$colors <- COLORS
    guilds <- rbind(
      guilds,
      data.frame(
        "Guild" = "All ASV",
        "nb_asv" = ntaxa(physeq),
        "nb_seq" = sum(physeq@otu_table),
        "colors" = "ALL"
      )
    )
    guilds <- guilds[order(guilds$nb_seq), ]
    if (!is.null(levels_order)) {
      guilds$Guild <- factor(guilds$Guild, levels = levels_order)
    }

    ggplot(
      guilds,
      aes(
        y = Guild,
        x = log10(nb_seq),
        fill = colors
      )
    ) +
      geom_bar(stat = "identity") +
      annotation_logticks(sides = "b", alpha = 0.5) +
      ylab("GUILD by FUNGuild") +
      scale_fill_manual("Guild",
        values = c(
          "gray", "Olivedrab", "cyan4", "tomato3",
          "lightpink4"
        )
      ) +
      geom_text(aes(label = nb_asv, x = log10(nb_seq) + 0.2),
        family = "serif"
      ) +
      geom_text(aes(label = nb_seq, x = log10(nb_seq) / 2),
        family = "mono",
        col = "white"
      )
  }

################################################################################

################################################################################
#' Build phylogenetic trees from refseq slot of a phyloseq object
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' This function build tree phylogenetic tree and if nb_bootstrap is
#' set, it build also the 3 corresponding bootstrapped tree.
#'
#' Default parameters are based on \doi{doi:10.12688/f1000research.8986.2}
#' and phangorn vignette [Estimating phylogenetic trees with phangorn](https://klausvigo.github.io/phangorn/articles/Trees.html). You should understand your data, especially the markers,
#' before using this function.
#'
#' Note that phylogenetic reconstruction with markers used for metabarcoding are
#' not robust. You must verify the robustness of your phylogenetic tree using
#' taxonomic classification (see vignette [Tree visualization](https://adrientaudiere.github.io/MiscMetabar/articles/tree_visualization.html)) and bootstrap or multi-tree visualization
#'
#' @inheritParams clean_pq
#' @param nb_bootstrap (default 0): If a positive number is set,
#'   the function also build 3 bootstrapped trees using `nb_bootstrap`
#'   bootstrap samples
#' @param model allows to choose an amino acid models or nucleotide model,
#'   see [phangorn::optim.pml()] for more details
#' @param optInv 	Logical value indicating whether topology gets optimized
#'  (NNI). See [phangorn::optim.pml()] for more details
#' @param optGamma Logical value indicating whether gamma rate parameter gets
#'  optimized. See [phangorn::optim.pml()] for more details
#' @param rearrangement type of tree tree rearrangements to perform, one of
#'  "NNI", "stochastic" or "ratchet"
#'   see [phangorn::optim.pml()] for more details
#' @param control A list of parameters for controlling the fitting process.
#'   see [phangorn::optim.pml()] for more details
#' @param optNni Logical value indicating whether topology gets optimized (NNI).
#'   see [phangorn::optim.pml()] for more details
#' @param multicore	(logical) whether models should estimated in parallel.
#'   see [phangorn::bootstrap.pml()] for more details
#' @param ... Other params for be passed on to
#'   [phangorn::optim.pml()] function
#'
#' @return A list of phylogenetic tree
#' @export
#' @author Adrien Taudière
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please make a reference to `phangorn` package if you
#'   use this function.
#' @examplesIf tolower(Sys.info()[["sysname"]]) != "windows"
#' \donttest{
#' if (requireNamespace("phangorn")) {
#'   df <- subset_taxa_pq(data_fungi_mini, taxa_sums(data_fungi_mini) > 9000)
#'   df_tree <- build_phytree_pq(df, nb_bootstrap = 2)
#'   plot(df_tree$UPGMA)
#'   phangorn::plotBS(df_tree$UPGMA, df_tree$UPGMA_bs, main = "UPGMA")
#'   plot(df_tree$NJ, "unrooted")
#'   plot(df_tree$ML)
#'
#'   phangorn::plotBS(df_tree$ML$tree, df_tree$ML_bs, p = 20, frame = "circle")
#'   phangorn::plotBS(
#'     df_tree$ML$tree,
#'     df_tree$ML_bs,
#'     p = 20,
#'     frame = "circle",
#'     method = "TBE"
#'   )
#'   plot(phangorn::consensusNet(df_tree$ML_bs))
#'   plot(phangorn::consensusNet(df_tree$NJ_bs))
#'   ps_tree <- merge_phyloseq(df, df_tree$ML$tree)
#' }
#' }
build_phytree_pq <- function(physeq,
                             nb_bootstrap = 0,
                             model = "GTR",
                             optInv = TRUE,
                             optGamma = TRUE,
                             rearrangement = "NNI",
                             control = phangorn::pml.control(trace = 0),
                             optNni = TRUE,
                             multicore = FALSE,
                             ...) {
  seqs <- physeq@refseq
  alignment <-
    DECIPHER::AlignSeqs(Biostrings::DNAStringSet(seqs), anchor = NA)

  phang.align <-
    phangorn::phyDat(as(alignment, "matrix"), type = "DNA")
  dm <- phangorn::dist.ml(phang.align)
  treeNJ <- phangorn::NJ(dm) # Note, tip order != sequence order
  treeUPGMA <-
    phangorn::upgma(dm) # Note, tip order != sequence order
  fit <- phangorn::pml(treeNJ, data = phang.align)
  ## negative edges length changed to 0!
  fitGTR <- update(fit, k = 4, inv = 0.2)
  tree_ML <-
    phangorn::optim.pml(
      fitGTR,
      model = model,
      optInv = optInv,
      optGamma = optGamma,
      rearrangement = rearrangement,
      control = control,
      optNni = optNni,
      ...
    )
  if (nb_bootstrap > 0) {
    treeUPGMA_bs <-
      phangorn::bootstrap.phyDat(phang.align,
        function(x) {
          phangorn::upgma(phangorn::dist.ml(x))
        },
        bs = nb_bootstrap
      )
    if (rearrangement == "NNI") {
      tree_ML_bs <- phangorn::bootstrap.pml(
        tree_ML,
        bs = nb_bootstrap,
        multicore = multicore,
        rearrangement = "NNI",
        ...
      )
    } else if (rearrangement == "stochastic") {
      tree_ML_bs <- phangorn::bootstrap.pml(
        tree_ML,
        bs = nb_bootstrap,
        multicore = multicore,
        rearrangement = "stochastic",
        ...
      )
    } else if (rearrangement == "ratchet") {
      tree_ML_bs <- phangorn::bootstrap.pml(
        tree_ML,
        bs = nb_bootstrap,
        multicore = multicore,
        rearrangement = "ratchet",
        ...
      )
    } else {
      stop("rearrangement parameter one of the three value 'stochastic',
       'NNI' or 'ratchet'")
    }
    treeNJ_bs <- phangorn::bootstrap.phyDat(phang.align,
      function(x) {
        phangorn::NJ(phangorn::dist.ml(x))
      },
      bs = nb_bootstrap
    )
    return(
      list(
        "UPGMA" = treeUPGMA,
        "NJ" = treeNJ,
        "ML" = tree_ML,
        "UPGMA_bs" = treeUPGMA_bs,
        "NJ_bs" = treeNJ_bs,
        "ML_bs" = tree_ML_bs
      )
    )
  } else {
    return(list(
      "UPGMA" = treeUPGMA,
      "NJ" = treeNJ,
      "ML" = tree_ML
    ))
  }
}
################################################################################

################################################################################
#' Test if the mean number of sequences by samples is link to the modality of
#' a factor
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' The aim of this function is to provide a warnings if samples depth significantly
#' vary among the modalities of a factor present in the `sam_data` slot.
#'
#' This function apply a Kruskal-Wallis rank sum test to the number of sequences
#' per samples in function of the factor `fact`.
#'
#' @inheritParams clean_pq
#' @param fact (required): Name of the factor to cluster samples by modalities.
#'   Need to be in \code{physeq@sam_data}.
#' @param boxplot (logical) Do you want to plot boxplot?
#'
#' @return The result of a Kruskal-Wallis rank sum test
#' @export
#' @author Adrien Taudière
#' @importFrom stats kruskal.test
#' @examples
#'
#' are_modality_even_depth(data_fungi_mini, "Time")$p.value
#' are_modality_even_depth(rarefy_even_depth(data_fungi_mini), "Time")$p.value
#' are_modality_even_depth(data_fungi_mini, "Height", boxplot = TRUE)
are_modality_even_depth <- function(physeq, fact, boxplot = FALSE) {
  nb_seq <- sample_sums(physeq)
  fact <- factor(unclass(physeq@sam_data[, fact])[[1]])
  res <- kruskal.test(nb_seq ~ fact)
  if (boxplot) {
    boxplot(nb_seq ~ fact)
  }
  if (length(unique(tapply(nb_seq, fact, mean))) == 1) {
    message("All modality were undoubtedly rarefy in the physeq object.")
    res$p.value <- 1
  }
  return(res)
}
################################################################################


################################################################################
#' Reorder taxa in otu_table/tax_table/refseq slot of a phyloseq object
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Note that the taxa order in a physeq object with a tree is locked by
#' the order of leaf in the phylogenetic tree.
#'
#' @inheritParams clean_pq
#' @param names_ordered (required): Names of the taxa (must be the same
#'   as taxa in `taxa_names(physeq)`) in a given order
#' @param remove_phy_tree (logical, default FALSE) If TRUE, the phylogenetic
#'   tree is removed. It is
#' @return A phyloseq object
#' @export
#' @author Adrien Taudière
#' @examples
#'
#' data_fungi_ordered_by_genus <- reorder_taxa_pq(
#'   data_fungi,
#'   taxa_names(data_fungi)[order(as.vector(data_fungi@tax_table[, "Genus"]))]
#' )
#'
#' data_fungi_mini_asc_ordered_by_abundance <- reorder_taxa_pq(
#'   data_fungi_mini,
#'   taxa_names(data_fungi_mini)[order(taxa_sums(data_fungi_mini))]
#' )
reorder_taxa_pq <- function(physeq, names_ordered, remove_phy_tree = FALSE) {
  new_physeq <- physeq

  if (!is.null(phy_tree(new_physeq, FALSE))) {
    if (remove_phy_tree) {
      message("Removing phylogenetic tree!")
      new_physeq@phy_tree <- NULL
    } else {
      stop("The taxa order in a physeq object with a tree is locked by
      the order of leaf in the phylogenetic tree. You could use args
      remove_phy_tree = TRUE.")
    }
  }

  if (sum(!names_ordered %in% taxa_names(new_physeq)) > 0) {
    stop(
      "The taxa names in physeq and in the args names_ordered are not the same.
      You can try identify them usign match(names_ordered, taxa_names(physeq))"
    )
  }
  order_taxa_names <- match(names_ordered, taxa_names(new_physeq))
  if (!is.null(otu_table(new_physeq, FALSE))) {
    if (taxa_are_rows(new_physeq)) {
      new_physeq@otu_table <-
        otu_table(new_physeq, taxa_are_rows = TRUE)[order_taxa_names, ]
    } else {
      new_physeq@otu_table <-
        otu_table(new_physeq, taxa_are_rows = FALSE)[, order_taxa_names]
    }
  }
  if (!is.null(tax_table(new_physeq, FALSE))) {
    new_physeq@tax_table <-
      tax_table(new_physeq)[order_taxa_names, ]
  }
  if (!is.null(refseq(new_physeq, FALSE))) {
    new_physeq@refseq <-
      refseq(new_physeq)[order_taxa_names]
  }
  return(new_physeq)
}
################################################################################

################################################################################
#' @title Add information to sample_data slot of a phyloseq-class object
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Warning: The value nb_seq and nb_otu may be outdated if you transform your
#' phyloseq object, e.g. using the [subset_taxa_pq()] function
#'
#' @inheritParams clean_pq
#' @param df_info : A dataframe with rownames matching for sample names of the
#'   phyloseq object
#' @param add_nb_seq (Logical, default TRUE) Does we add a column nb_seq
#'   collecting the number of sequences per sample?
#' @param add_nb_otu (Logical, default TRUE) Does we add a column nb_otu
#'   collecting the number of OTUs per sample?
#'
#' @return A phyloseq object with an updated sam_data slot
#' @export
#'
#' @examples
#'
#' data_fungi <- add_info_to_sam_data(data_fungi)
#' boxplot(data_fungi@sam_data$nb_otu ~ data_fungi@sam_data$Time)
#'
#' new_df <- data.frame(
#'   variable_1 = runif(n = nsamples(data_fungi), min = 1, max = 20),
#'   variable_2 = runif(n = nsamples(data_fungi), min = 1, max = 2)
#' )
#' rownames(new_df) <- sample_names(data_fungi)
#' data_fungi <- add_info_to_sam_data(data_fungi, new_df)
#' plot(data_fungi@sam_data$nb_otu ~ data_fungi@sam_data$variable_1)
#' @author Adrien Taudière

add_info_to_sam_data <- function(physeq,
                                 df_info = NULL,
                                 add_nb_seq = TRUE,
                                 add_nb_otu = TRUE) {
  if (add_nb_seq) {
    physeq@sam_data$nb_seq <- sample_sums(physeq)
  }
  if (add_nb_otu) {
    physeq@sam_data$nb_otu <- sample_sums(as_binary_otu_table(physeq))
  }
  if (!is.null(df_info)) {
    if (sum(sample_names(physeq) %in% rownames(df_info)) == 0) {
      stop("Rownames of df_info must match the sample names of physeq.")
    }
    df_info <-
      df_info[match(sample_names(physeq), rownames(df_info)), ]
    physeq@sam_data <- sample_data(cbind(
      as.data.frame(physeq@sam_data),
      df_info
    ))
  }
  return(physeq)
}
################################################################################


###############################################################################
#' Return a DNAStringSet object from either a character vector of DNA sequences
#'   or the `refseq` slot of a phyloseq-class object
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-stable-green" alt="lifecycle-stable"></a>
#'
#'   Internally used in [vsearch_clustering()], [swarm_clustering()] and
#'   [asv2otu()].
#'
#' @inheritParams clean_pq
#' @param dna_seq You may directly use a character vector of DNA sequences
#'   in place of physeq args. When physeq is set, dna sequences take the value
#'   of `physeq@refseq`
#'
#' @return An object of class DNAStringSet (see the [Biostrings::DNAStringSet()]
#'   function)
#' @export
#'
#' @examples
#'
#' dna <- physeq_or_string_to_dna(data_fungi)
#' dna
#'
#' sequences_ex <- c(
#'   "TACCTATGTTGCCTTGGCGGCTAAACCTACCCGGGATTTGATGGGGCGAATTAATAACGAATTCATTGAATCA",
#'   "TACCTATGTTGCCTTGGCGGCTAAACCTACCCGGGATTTGATGGGGCGAATTACCTGGTAAGGCCCACTT",
#'   "TACCTATGTTGCCTTGGCGGCTAAACCTACCCGGGATTTGATGGGGCGAATTACCTGGTAGAGGTG",
#'   "TACCTATGTTGCCTTGGCGGCTAAACCTACC",
#'   "CGGGATTTGATGGCGAATTACCTGGTATTTTAGCCCACTTACCCGGTACCATGAGGTG",
#'   "GCGGCTAAACCTACCCGGGATTTGATGGCGAATTACCTGG",
#'   "GCGGCTAAACCTACCCGGGATTTGATGGCGAATTACAAAG",
#'   "GCGGCTAAACCTACCCGGGATTTGATGGCGAATTACAAAG",
#'   "GCGGCTAAACCTACCCGGGATTTGATGGCGAATTACAAAG"
#' )
#' dna2 <- physeq_or_string_to_dna(dna_seq = sequences_ex)
#' dna2
#'
#' @seealso [Biostrings::DNAStringSet()]
#' @author Adrien Taudière
physeq_or_string_to_dna <- function(physeq = NULL,
                                    dna_seq = NULL) {
  if (inherits(physeq, "phyloseq")) {
    verify_pq(physeq)
    if (is.null(physeq@refseq)) {
      stop("The phyloseq object do not contain a @refseq slot")
    }
    dna <- Biostrings::DNAStringSet(physeq@refseq)
    if (!is.null(dna_seq)) {
      stop("You must use either physeq or dna_seq args but not both")
    }
  } else if (inherits(dna_seq, "character")) {
    dna <- Biostrings::DNAStringSet(dna_seq)
  } else {
    stop(
      "You must set the args physeq (object of class phyloseq) or
    dna_seq (character vector)."
    )
  }
  return(dna)
}
###############################################################################



################################################################################
#' Remove primers using [cutadapt](https://github.com/marcelm/cutadapt/)
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' You need to install [Cutadapt](https://cutadapt.readthedocs.io/).
#'   See also https://github.com/VascoElbrecht/JAMP/blob/master/JAMP/R/Cutadapt.R for another call to cutadapt
#'   from R
#'
#' @param path_to_fastq (Required) A path to a folder with fastq files. See
#'   [list_fastq_files()] for help.
#' @inheritParams list_fastq_files
#' @param primer_fw (Required, String) The forward primer DNA sequence.
#' @param primer_rev (String)  The reverse primer DNA sequence.
#' @param folder_output The path to a folder for output files
#' @param nproc (default 1)
#'   Set to number of cpus/processors to use for the clustering
#' @param cmd_is_run (logical, default TRUE) Do the cutadapt command is run.
#'   If set to FALSE, the only effect of the function is to return a list of
#'   command to manually run in a terminal.
#' @param args_before_cutadapt (String) A one line bash command to run before
#' to run cutadapt. For examples, "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv &&" allow to bypass the conda init which asks to restart the shell
#'
#' @return a list of command 
#' @export
#' @author Adrien Taudière
#'
#' @examplesIf tolower(Sys.info()[["sysname"]]) != "windows"
#' \dontrun{
#' cutadapt_remove_primers("inst/extdata", "TTC", "GAA",
#'   folder_output = tempdir()
#' )
#'
#' cutadapt_remove_primers(
#'   system.file("extdata",
#'     package = "dada2"
#'   ),
#'   pattern_R1 = "F.fastq.gz",
#'   pattern_R2 = "R.fastq.gz",
#'   primer_fw = "TTC",
#'   primer_rev = "GAA",
#'   folder_output = tempdir()
#' )
#'
#' cutadapt_remove_primers(
#'   system.file("extdata",
#'     package = "dada2"
#'   ),
#'   pattern_R1 = "F.fastq.gz",
#'   primer_fw = "TTC",
#'   folder_output = tempdir(),
#'   cmd_is_run = FALSE
#' )
#'
#' unlink(tempdir(), recursive = TRUE)
#' }
#' @details
#' This function is mainly a wrapper of the work of others.
#'   Please cite cutadapt (\doi{doi:10.14806/ej.17.1.200}).


cutadapt_remove_primers <- function(path_to_fastq,
                                    primer_fw = NULL,
                                    primer_rev = NULL,
                                    folder_output = "wo_primers",
                                    nproc = 1,
                                    pattern = "fastq.gz",
                                    pattern_R1 = "_R1",
                                    pattern_R2 = "_R2",
                                    nb_files = Inf,
                                    cmd_is_run = TRUE,
                                    args_before_cutadapt = "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv && ") {
  cmd <- list()
  if (!dir.exists(folder_output)) {
    dir.create(folder_output)
  }

  if (is.null(primer_rev)) {
    lff <- list_fastq_files(
      path_to_fastq,
      paired_end = FALSE,
      pattern = pattern,
      pattern_R1 = pattern_R1,
      pattern_R2 = pattern_R2,
      nb_files = nb_files
    )
    for (f in lff$fnfs) {
      cmd[[f]] <-
        paste0(
          args_before_cutadapt,
          "cutadapt --cores=",
          nproc,
          " --discard-untrimmed -g '",
          primer_fw,
          "' -o ",
          folder_output,
          "/",
          basename(f),
          " ",
          f
        )
    }
  } else {
    lff <- list_fastq_files(path_to_fastq,
      paired_end = TRUE,
      pattern = pattern,
      pattern_R1 = pattern_R1,
      pattern_R2 = pattern_R2,
      nb_files = nb_files
    )

    primer_fw_RC <- dada2::rc(primer_fw)
    primer_rev_RC <- dada2::rc(primer_rev)
    for (f in lff$fnfs) {
      cmd[[f]] <-
        paste0(
          args_before_cutadapt,
          "cutadapt -n 2 --cores=",
          nproc,
          " --discard-untrimmed -g '",
          primer_fw,
          "' -G '",
          primer_rev,
          "' -a '",
          primer_rev_RC,
          "' -A '",
          primer_fw_RC,
          "' -o ",
          folder_output,
          "/",
          basename(f),
          " -p ",
          folder_output,
          "/",
          gsub(pattern_R1, pattern_R2, basename(f)),
          " ",
          f,
          " ",
          gsub(pattern_R1, pattern_R2, f)
        )
    }
  }
  if (cmd_is_run) {
    writeLines(unlist(cmd), paste0(tempdir(), "/script_cutadapt.sh"))
    system2("bash", paste0(tempdir(), "/script_cutadapt.sh"))
    message(paste0("Output files are available in the folder ", normalizePath(folder_output)))
    unlink(paste0(tempdir(), "/script_cutadapt.sh"))
  }
  return(cmd)
}
################################################################################

################################################################################
#' List the taxa founded only in one given level of a modality
#'
#' @description 
#'   
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#' 
#' Given one modality name in sam_data and one level of the modality,
#' return the taxa strictly specific of this level.
#'
#' @inheritParams clean_pq
#' @param modality (required) The name of a column present in the `@sam_data` slot
#'   of the physeq object. Must be a character vector or a factor.
#' @param level (required) The level (must be present in modality) of interest
#' @param min_nb_seq_taxa (default 0 = no filter) The minimum number of sequences per taxa
#' @param min_nb_samples_taxa (default 0 = no filter) The minimum number of samples per taxa
#'
#' @return A vector of taxa names
#' @export 
#'
#' @examples
#' # Taxa present only in low height samples
#' suppressMessages(suppressWarnings(taxa_only_in_one_level(data_fungi, "Height", "Low")))
#' # Number of taxa present only in sample of time equal to 15
#' suppressMessages(suppressWarnings(length(taxa_only_in_one_level(data_fungi, "Time", "15"))))
#' @seealso [ggvenn_pq()] and [upset_pq()]
#' @export
#' @author Adrien Taudière

taxa_only_in_one_level <- function(physeq,
                                   modality,
                                   level,
                                   min_nb_seq_taxa = 0,
                                   min_nb_samples_taxa = 0) {
  if (min_nb_seq_taxa > 0) {
    physeq <-
      subset_taxa_pq(physeq, taxa_sums(physeq) >= min_nb_seq_taxa)
  }
  if (min_nb_samples_taxa > 0) {
    physeq <-
      subset_taxa_pq(
        physeq,
        taxa_sums(as_binary_otu_table(physeq)) >= min_nb_samples_taxa
      )
  }

  physeq_merged <- clean_pq(merge_samples2(physeq, modality))

  physeq_merged_only_one_level <-
    subset_taxa_pq(physeq_merged, taxa_sums(as_binary_otu_table(physeq_merged)) ==
      1)
  physeq_merged_only_level_given <-
    clean_pq(subset_samples_pq(
      physeq_merged_only_one_level,
      rownames(physeq_merged_only_one_level@sam_data) == level
    ))
  return(taxa_names(physeq_merged_only_level_given))
}
################################################################################



################################################################################
#' Normalize OTU table using samples depth
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#'   This function implement the method proposed by
#'   McKnight et al. 2018 (\doi{doi:10.5061/dryad.tn8qs35})
#'
#' @inheritParams clean_pq
#'
#' @param base_log (integer, default 2) the base for log-transformation. If
#'   set to NULL or NA, no log-transformation is compute after normalization.
#' @param constante a constante to multiply the otu_table values
#' @param digits (default = 2) integer indicating the number of decimal places
#'   to be used (see `?round` for more information)
#'
#' @return A new \code{\link{phyloseq-class}} object with otu_table count
#'   normalize and log transformed (if base_log is an integer)
#' @export
#' @author Adrien Taudière
#' @examples
#' taxa_sums(data_fungi_mini)
#' data_f_norm <- normalize_prop_pq(data_fungi_mini)
#' taxa_sums(data_f_norm)
#' ggplot(data.frame(
#'   "norm" = scale(taxa_sums(data_f_norm)),
#'   "raw" = scale(taxa_sums(data_fungi_mini)),
#'   "name_otu" = taxa_names(data_f_norm)
#' )) +
#'   geom_point(aes(x = raw, y = norm))
#'
#' data_f_norm <- normalize_prop_pq(data_fungi_mini, base_log = NULL)
normalize_prop_pq <- function(physeq, base_log = 2, constante = 10000, digits = 4) {
  verify_pq(physeq)
  if (taxa_are_rows(physeq)) {
    new_otutab <- round((apply(physeq@otu_table, 2, function(x) {
      x / sum(x)
    })) * constante, digits = digits)
  } else {
    new_otutab <- round((apply(physeq@otu_table, 1, function(x) {
      x / sum(x)
    })) * constante, digits = digits)
  }

  if (!is.null(base_log) && !is.na(base_log)) {
    new_otutab <- round(log(new_otutab + 1, base = base_log), digits = digits)
  }

  new_physeq <- physeq
  new_physeq@otu_table <- otu_table(new_otutab, taxa_are_rows = taxa_are_rows(physeq))

  return(new_physeq)
}
################################################################################



################################################################################
#' Build a sample information tibble from physeq object
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#'   Hill numbers are the number of equiprobable species giving the same diversity
#'   value as the observed distribution.
#'
#'   Note that contrary to [hill_pq()], this function does not take into
#'   account for difference in the number of sequences per samples/modalities.
#'   You may use rarefy_by_sample = TRUE if the mean number of sequences per
#'   samples differs among modalities.
#' @inheritParams clean_pq
#' @param hill_scales (a vector of integer) The list of q values to compute
#'   the hill number H^q. If Null, no hill number are computed. Default value
#'   compute the Hill number 0 (Species richness), the Hill number 1
#'   (exponential of Shannon Index) and the Hill number 2 (inverse of Simpson
#'   Index).
#' @param filter_zero (logical, default TRUE) Do we filter non present OTU from
#'   samples ? For the moment, this has no effect on the result because the dataframe
#'   is grouped by samples with abundance summed across OTU.
#' @param rarefy_by_sample (logical, default FALSE) If TRUE, rarefy
#'   samples using [phyloseq::rarefy_even_depth()] function.
#' @param taxa_ranks A vector of taxonomic ranks. For examples c("Family","Genus").
#'   If taxa ranks is not set (default value = NULL), taxonomic information are not
#'   present in the resulting tibble.
#' @author Adrien Taudière
#' @export
#' @return A tibble with a row for each sample. Columns provide information
#'   from `sam_data` slot as well as hill numbers, Abundance (nb of sequences),
#'   and Abundance_log10 (*log10(1+Abundance)*).
#' @examples
#' if (requireNamespace("ggstatsplot")) {
#'   psm_tib <- psmelt_samples_pq(data_fungi_mini, hill_scales = c(0, 2, 7))
#'   ggstatsplot::ggbetweenstats(psm_tib, Height, Hill_0)
#'   ggstatsplot::ggbetweenstats(psm_tib, Height, Hill_7)
#'
#'   psm_tib_tax <- psmelt_samples_pq(data_fungi_mini, taxa_ranks = c("Class", "Family"))
#'   ggplot(filter(psm_tib_tax, Abundance > 2000), aes(y = Family, x = Abundance, fill = Time)) +
#'     geom_bar(stat = "identity") +
#'     facet_wrap(~Height)
#' }
psmelt_samples_pq <-
  function(physeq,
           hill_scales = c(0, 1, 2),
           filter_zero = TRUE,
           rarefy_by_sample = FALSE,
           taxa_ranks = NULL) {
    verify_pq(physeq)
    if (rarefy_by_sample) {
      physeq <- rarefy_even_depth(physeq)
    }
    psm <- psmelt(physeq)
    if (filter_zero) {
      psm <- psm |> filter(Abundance > 0)
    }
    if (is.null(taxa_ranks)) {
      psm <- psm |>
        select(Sample, OTU, Abundance, colnames(physeq@sam_data))
      nb_distinct_samp <- psm |>
        group_by(Sample) |>
        select(-OTU, -Abundance) |>
        distinct() |>
        nrow()

      if (nsamples(physeq) != nb_distinct_samp) {
        stop("The number of samples in physeq is different from the resulting
         number in psm tibble.")
      }
    } else {
      psm <- psm |>
        select(Sample, OTU, Abundance, colnames(physeq@sam_data), !!taxa_ranks)
    }

    if (is.null(taxa_ranks)) {
      psm_samp <- psm |>
        select(-OTU) |>
        group_by(Sample) |>
        summarise(
          Abundance = sum(Abundance),
          across(where(is.numeric) &
            !Abundance, ~ mean(.x, na.rm = TRUE)),
          across(where(is.character), ~ .x[1])
        )
    } else {
      psm_temp <- psm |>
        select(-OTU) |>
        group_by(Sample)

      for (i in seq_along(taxa_ranks)) {
        psm_temp <- psm_temp |>
          group_by(.data[[taxa_ranks[[i]]]], .add = TRUE)
      }

      psm_samp <- psm_temp |>
        summarise(
          Abundance = sum(Abundance),
          across(where(is.numeric) &
            !Abundance, ~ mean(.x, na.rm = TRUE)),
          across(where(is.character), ~ .x[1]),
          .groups = "drop"
        )
    }

    if (!is.null(hill_scales)) {
      physeq <- taxa_as_rows(physeq)
      df_hill <-
        vegan::renyi(t(physeq)@otu_table,
          scales = hill_scales,
          hill = TRUE
        )

      colnames(df_hill) <- paste0("Hill_", hill_scales)
      df_hill$Sample <- rownames(df_hill)

      psm_samp <- full_join(psm_samp, df_hill)
    }

    psm_samp <- psm_samp |> mutate(Abundance_log10 = log10(1 + Abundance))
    return(tibble(psm_samp))
  }
################################################################################

################################################################################
#' Force taxa to be in columns in the otu_table of a physeq object
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#' Mainly for internal use. It is a special case of clean_pq function.
#' 
#' @inheritParams clean_pq
#' @author Adrien Taudière
#' @export
#' @return A new \code{\link{phyloseq-class}} object
taxa_as_columns <- function(physeq) {
  physeq <- clean_pq(
    physeq,
    clean_samples_names = FALSE,
    remove_empty_samples = FALSE,
    remove_empty_taxa = FALSE,
    force_taxa_as_columns = TRUE,
    silent = TRUE
  )
  return(physeq)
}
################################################################################

################################################################################
#' Force taxa to be in columns in the otu_table of a physeq object
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-maturing-blue" alt="lifecycle-maturing"></a>
#'
#' Mainly for internal use. It is a special case of clean_pq function.
#' 
#' @inheritParams clean_pq
#' @author Adrien Taudière
#' @export
#' @return A new \code{\link{phyloseq-class}} object
taxa_as_rows <- function(physeq) {
  physeq <- clean_pq(
    physeq,
    clean_samples_names = FALSE,
    remove_empty_samples = FALSE,
    remove_empty_taxa = FALSE,
    force_taxa_as_rows = TRUE,
    silent = TRUE
  )
  return(physeq)
}
################################################################################

################################################################################
#' Rarefy (equalize) the number of samples per modality of a factor
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' This function randomly draw the same number of samples for each modality of factor.  
#' It is usefull to dissentangle the effect of different number of samples per modality 
#' on diversity. Internally used in [accu_plot_balanced_modality()].
#' 
#' @inheritParams clean_pq
#' @param fact (required): The variable to rarefy. Must be present in
#'   the `sam_data` slot of the physeq object.
#' @param rngseed	(Optional). A single integer value passed to set.seed,
#'   which is used to fix a seed for reproducibly random number generation
#'   (in this case, reproducibly random subsampling). If set to FALSE, then no
#'   iddling with the RNG seed is performed, and it is up to the user to
#'   appropriately call
#' @param verbose (logical). If TRUE, print additional informations.
#' @export
#' @author Adrien Taudière
#' @return A new \code{\link{phyloseq-class}} object.
#' @seealso [accu_plot_balanced_modality()]
#' @examples
#' table(data_fungi_mini@sam_data$Height)
#' data_fungi_mini2 <- rarefy_sample_count_by_modality(data_fungi_mini, "Height")
#' table(data_fungi_mini2@sam_data$Height)
#' if (requireNamespace("patchwork")) {
#'   ggvenn_pq(data_fungi_mini, "Height") + ggvenn_pq(data_fungi_mini2, "Height")
#' }
rarefy_sample_count_by_modality <-
  function(physeq, fact, rngseed = FALSE, verbose = TRUE) {
    if (as(rngseed, "logical")) {
      set.seed(rngseed)
      if (verbose) {
        message("`set.seed(", rngseed, ")` was used to initialize repeatable random subsampling.")
        message("Please record this for your records so others can reproduce.")
        message("Try `set.seed(", rngseed, "); .Random.seed` for the full vector",
          sep = ""
        )
        message("...")
      }
    } else if (verbose) {
      message(
        "You set `rngseed` to FALSE. Make sure you've set & recorded\n",
        " the random seed of your session for reproducibility.\n",
        "See `?set.seed`\n"
      )
      message("...")
    }
    mod <- as.factor(physeq@sam_data[[fact]])
    n_mod <- table(mod)
    samples_names <- sample_names(physeq)
    samp_to_keep <- c()
    for (modality in levels(mod)) {
      vec_samp_mod <- c(as.numeric(grep(modality, mod)))

      # To bypass the pb of vector of length 1
      # We build a vector of two equal values and we will take only one
      # It is cause by range base behavior:
      # 'If x has length 1, is numeric (in the sense of is.numeric) and x >= 1, sampling via sample takes place from 1:x.'
      if (length(vec_samp_mod) == 1) {
        vec_samp_mod <- c(vec_samp_mod, vec_samp_mod)
      }
      samp_to_keep <-
        c(
          samp_to_keep,
          sample(
            vec_samp_mod,
            size = min(n_mod),
            replace = FALSE
          )
        )
    }
    new_physeq <-
      subset_samples_pq(physeq, 1:nsamples(physeq) %in% samp_to_keep)

    if (length(table(new_physeq@sam_data[[fact]])) != length(table(mod))) {
      warning(
        paste0(
          "The number of final levels (sam_data of the output phyloseq
                    object) is not equal to the inital (sam_data of the input
                    phyloseq object) number of levels in the  factor: '",
          fact, "'"
        )
      )
    }

    return(new_physeq)
  }
################################################################################
adrientaudiere/MiscMetabar documentation built on July 6, 2024, 7:02 p.m.