R/process_import_qiime2.R

Defines functions is_dna_seq parse_q2taxonomy subset_taxa_in_feature read_q2sample_meta read_q2taxa read_q2biom read_qza import_qiime2

Documented in import_qiime2

# This function is modified from import-qiime2 in microbiomeMarker
# https://github.com/yiluheihei/microbiomeMarker/tree/master/R/import-qiime2.R

#' @title Convert the qiime2 output of dada2 into phyloseq object
#'
#' @description
#' Import the qiime2 artifacts, including feature table, taxonomic table,
#'   phylogenetic tree, representative sequence and sample metadata into
#'   phyloseq object.
#'
#' @author Created by Yang Cao; modified by Hua Zou (5/14/2022 Shenzhen China)
#'
#' @param otu_qza (Required). character, file path of the feature table from qiime2.
#' @param taxa_qza (Optional). character, file path of the taxonomic table from qiime2,
#'   default `NULL`.
#' @param sam_tab (Optional). character, file path of the sample metadata in tsv format,
#'   default `NULL`.
#' @param refseq_qza (Optional). character, file path of the representative sequences from
#'   qiime2, default `NULL`.
#' @param tree_qza (Optional). character, file path of the phylogenetic tree from
#'   qiime2, default `NULL`.
#' @param keep_taxa_rows (Optional). logical. whether keep taxa in rows or not
#' in the `otu_table` of the returned `phyloseq` object, default `TRUE`.
#'
#' @return [`phyloseq::phyloseq-class`] object.
#'
#' @import dplyr
#'
#' @export
#'
#' @examples
#' \dontrun{
#' otuqza_file <- system.file(
#'     "extdata", "table.qza",
#'     package = "MicrobiomeAnalysis"
#' )
#' taxaqza_file <- system.file(
#'     "extdata", "taxonomy.qza",
#'     package = "MicrobiomeAnalysis"
#' )
#' sample_file <- system.file(
#'     "extdata", "sample-metadata.tsv",
#'     package = "MicrobiomeAnalysis"
#' )
#' treeqza_file <- system.file(
#'     "extdata", "tree.qza",
#'     package = "MicrobiomeAnalysis"
#' )
#' ps <- import_qiime2(
#'     otu_qza = otuqza_file, taxa_qza = taxaqza_file,
#'     sam_tab = sample_file, tree_qza = treeqza_file
#' )
#' ps
#' }
import_qiime2 <- function(
    otu_qza,
    taxa_qza = NULL,
    sam_tab = NULL,
    refseq_qza = NULL,
    tree_qza = NULL,
    keep_taxa_rows = TRUE) {


  # otuqza_file <- system.file(
  #   "extdata", "table.qza",
  #   package = "MicrobiomeAnalysis")
  # taxaqza_file <- system.file(
  #   "extdata", "taxonomy.qza",
  #   package = "MicrobiomeAnalysis")
  # sample_file <- system.file(
  #   "extdata", "sample-metadata.tsv",
  #   package = "MicrobiomeAnalysis")
  # treeqza_file <- system.file(
  #   "extdata", "tree.qza",
  #   package = "MicrobiomeAnalysis")
  #
  # otu_qza = otuqza_file
  # taxa_qza = taxaqza_file
  # sam_tab = sample_file
  # refseq_qza = NULL
  # tree_qza = treeqza_file
  # refseq_qza = NULL
  # keep_taxa_rows = TRUE


  feature_tab <- read_qza(otu_qza)

  if (!is.null(taxa_qza)) {
    taxa_tab <- read_qza(taxa_qza)
    taxa_tab <- subset_taxa_in_feature(taxa_tab, feature_tab)
    taxa_tab <- parse_q2taxonomy(taxa_tab)
    tax_tab <- .import_dada2_taxa(dada2_taxa = taxa_tab)
  } else {
    taxa_tab <- NULL
  }

  if (!is.null(sam_tab)) {
    sam_tab <- read_q2sample_meta(sam_tab)
  } else {
    sam_tab <- NULL
  }

  if (!is.null(tree_qza)) {
    tree <- read_qza(tree_qza)
  } else {
    tree <- NULL
  }

  # check the row.names of feature table: DNA sequence or not
  # if is DNA, row.names(feature_tab) is set as refseq
  if (is_dna_seq(row.names(feature_tab))) {
    refseq <- row.names(feature_tab)
    refseq_nm <- paste0("OTU", seq_along(refseq))
    names(refseq) <- refseq_nm

    # set the rownames of feature and taxa tab as OTU1, OTU2,...
    if (!is.null(taxa_tab)) {
      rownames(taxa_tab) <- refseq_nm
    }
      rownames(feature_tab) <- refseq_nm
  } else {
    refseq <- NULL
  }

  if (!is.null(refseq_qza)) {
    refseq <- read_qza(refseq_qza)
  }

  if (!is.null(refseq)) {
    refseq <- Biostrings::DNAStringSet(refseq)
  } else {
    refseq <- NULL
  }

  ps <- phyloseq::phyloseq(
    otu_tab = feature_tab,
    tax_tab = tax_tab,
    sam_tab = sam_tab,
    phy_tree = tree,
    refseq = refseq)

  if (all(!ps@otu_table@taxa_are_rows, keep_taxa_rows)) {
    ps <- phyloseq::t(ps)
  }

  if (inherits(ps, "otu_table")) {
    warning("`otu_table` object is returned")
  }

  return(ps)
}


#' Read the qza file output from qiime2
#'
#' Import the qiime2 artifacts to R.
#'
#' @param file character, path of the input qza file. Only files in format of
#'   `BIOMV210DirFmt` (feature table), `TSVTaxonomyDirectoryFormat` (taxonomic
#'   table), `NewickDirectoryFormat` (phylogenetic tree ) and
#'    `DNASequencesDirectoryFormat` (representative sequences) are supported
#'    right now.
#' @param temp character, a temporary directory in which the qza file will be
#'   decompressed to, default `tempdir()`.
#'
#' @importFrom yaml read_yaml
#'
#' @return [`phyloseq::otu_table-class`] object for feature table,
#'   [`phyloseq::taxonomyTable-class`] object for taxonomic table,
#'   [`ape::phylo`] object for phylogenetic tree,
#'   [`Biostrings::DNAStringSet-class`] for representative sequences of taxa.
#'
#' @noRd
read_qza <- function(file,
                     temp = tempdir()) {

  unzipped_file <- utils::unzip(file, exdir = temp)
  meta_file <- grep("metadata.yaml", unzipped_file, value = TRUE)
  metadata <- yaml::read_yaml(meta_file[1])
  format <- metadata$format
  uuid <- metadata$uuid

  if (grepl("BIOMV", metadata$format)) {
    biom_file <- file.path(temp, uuid, "data/feature-table.biom")
    res <- read_q2biom(biom_file)
  } else if (format == "TSVTaxonomyDirectoryFormat") {
    taxa_file <- file.path(temp, uuid, "data/taxonomy.tsv")
    res <- read_q2taxa(taxa_file)
  } else if (format == "NewickDirectoryFormat") {
    tree_file <- file.path(temp, uuid, "data/tree.nwk")
    res <- read_tree(tree_file)
  } else if (format == "DNASequencesDirectoryFormat") {
    refseq_file <- file.path(temp, uuid, "data/dna-sequences.fasta")
    res <- Biostrings::readDNAStringSet(refseq_file)
  } else {
    stop("Only files in format of 'BIOMV210DirFmt' ",
         "'TSVTaxonomyDirectoryFormat', ",
         "'NewickDirectoryFormat' and 'DNASequencesDirectoryFormat'",
         " are supported."
    )
  }

  return(res)
}

#' Read qiime2 feature table
#'
#' Import qiime2 feature table into otu_table object
#'
#' @param file character, file name of the biom file.
#'
#' @importFrom biomformat read_biom biom_data
#'
#' @return [`phyloseq::otu_table-class`] object.
#'
#' @noRd
read_q2biom <- function(file) {

  biomobj <- read_biom(file)
  feature_tab <- as(biom_data(biomobj), "matrix")

  return(otu_table(feature_tab, taxa_are_rows = TRUE))
}

#' Read qiime2 taxa file
#' @keywords internal
#' @noRd
read_q2taxa <- function(file) {

  taxa <- utils::read.table(file, sep = "\t", header = TRUE)
  if ("Confidence" %in% names(taxa)) {
    taxa$Confidence <- NULL
  }

  return(tax_table(as.matrix(taxa)))
}

#' Read qiime2 sample meta data file
#' @keywords internal
#' @noRd
read_q2sample_meta <- function(file) {

  phyloseq::import_qiime_sample_data(file)
}

#' Subset taxa according to the taxa in feature table
#' @keywords internal
#' @noRd
subset_taxa_in_feature <- function(taxa_tab,
                                   feature_tab) {

  idx <- match(row.names(feature_tab), taxa_tab[, "Feature.ID"])
  taxa_tab <- taxa_tab[idx, , drop = FALSE]
  row.names(taxa_tab) <- taxa_tab[, "Feature.ID"]

  return(taxa_tab)
}

#' Parse qiime2 taxa in different taxonomic levels
#'
#' @param taxa `tax_table` object.
#' @param sep character string containing a regular expression, separator
#'  between different taxonomic levels, defaults to on compatible with both
#'  GreenGenes and SILVA `; |;"`.
#' @param trim_rank_prefix logical whether remove leading characters from
#'   taxonomic levels, e.g. k__ or D_0__, default `TRUE`.
#'
#' @importFrom tidyr separate
#' @importFrom purrr map_df
#'
#' @return [`phyloseq::taxonomyTable-class`] object.
#' @keywords internal
#' @noRd
parse_q2taxonomy <- function(
    taxa,
    sep = "; |;",
    trim_rank_prefix = TRUE) {

  taxa <- data.frame(taxa)
  if (trim_rank_prefix) {
    # remove leading characters from GG
    taxa$Taxon <- gsub("[kpcofgs]__", "", taxa$Taxon)
    # remove leading characters from SILVA
    taxa$Taxon <- gsub("D_\\d__", "", taxa$Taxon)
  }

  taxa <- tidyr::separate(taxa, .data$Taxon,
                          c("Kingdom", "Phylum", "Class", "Order",
                            "Family", "Genus", "Species"),
                          sep = sep,
                          fill = "right")

  taxa <- purrr::map_df(taxa, ~ ifelse(.x == "", NA_character_, .x)) %>%
    as.data.frame()
  rownames(taxa) <- taxa$Feature.ID
  taxa$Feature.ID <- NULL

  return(tax_table(as.matrix(taxa)))
}

#' check the row.names of feature table is DNA sequence or not
#' This function is from
#' {github}YuLab-SMU/MicrobiotaProcess/blob/master/R/import_qiime2.R#L169-L177
#' @keywords internal
#' @noRd
is_dna_seq <- function(names) {

  x <- unlist(strsplit(toupper(names[1]), split = ""))
  freq <- mean(x %in% c("A", "G", "T", "C", "N", "X", "-"))

  if (length(x) > 30 & freq > 0.9) {
    return(TRUE)
  } else {
    return(FALSE)
  }
}
HuaZou/MicrobiomeAnalysis documentation built on May 13, 2024, 11:10 a.m.