R/method-io.R

Defines functions mp_import_metaphlan mp_import_biom mp_import_qiime mp_import_dada2 import_dada2 mp_import_qiime2 import_qiime2

Documented in import_dada2 import_qiime2 mp_import_biom mp_import_dada2 mp_import_metaphlan mp_import_qiime mp_import_qiime2

#' @title Import function to load the output of qiime2.
#' 
#' @description
#' The function was designed to import the output of qiime2 and convert them to phyloseq
#' class.
#' @name ImportQiime2
#' @param otuqza character, the file contained otu table, the ouput of qiime2.
#' @param taxaqza character, the file contained taxonomy, the ouput of qiime2,
#' default is NULL.
#' @param mapfilename character, the file contained sample information,
#' the tsv format, default is NULL.
#' @param refseqqza character, the file contained reference sequences or the XStringSet object,
#' default is NULL.
#' @param treeqza character, the file contained the tree file or treedata object, which is the result 
#' parsed by functions of treeio, default is NULL.
#' @param parallel logical, whether parsing the column of taxonomy multi-parallel, default is FALSE.
#' @param ..., additional parameters.
#' @return MPSE-class or phyloseq-class contained the argument class.
#' @export
#' @author Shuangbin Xu
#' @examples
#' otuqzafile <- system.file("extdata", "table.qza",
#'                           package="MicrobiotaProcess")
#' taxaqzafile <- system.file("extdata", "taxa.qza",
#'                            package="MicrobiotaProcess")
#' mapfile <- system.file("extdata", "metadata_qza.txt",
#'                        package="MicrobiotaProcess")
#' mpse <- mp_import_qiime2(otuqza=otuqzafile, taxaqza=taxaqzafile,
#'                          mapfilename=mapfile)
#' mpse
import_qiime2 <- function(otuqza, taxaqza=NULL, mapfilename=NULL, 
                          refseqqza=NULL, treeqza=NULL,
                          parallel=FALSE, ...){
    params <- list(...)
    if ('sampledata' %in% names(params) && is.null(mapfilename)){
        sampleda <- params[['sampledata']]
    }    
    res <- .internal_import_qiime2(otuqza=otuqza, 
                                   taxaqza=taxaqza, 
                                   mapfilename=mapfilename, 
                                   refseqqza=refseqqza, 
                                   treeqza=treeqza, 
                                   parallel=parallel, 
                                   ...)
    ps <- .build_ps(res)
}

#' @rdname ImportQiime2
#' @export
mp_import_qiime2 <- function(otuqza, taxaqza=NULL, mapfilename=NULL,
                          refseqqza=NULL, treeqza=NULL, parallel=FALSE, ...){
    check_installed(c('yaml', 'biomformat'), reason = 'for `mp_import_qiime2()`.', action = BiocManager::install)
    params <- list(...)
    if ('sampledata' %in% names(params) && is.null(mapfilename)){
        mapfilename <- params[['sampledata']]
    }
    res <- .internal_import_qiime2(otuqza=otuqza,
                                   taxaqza=taxaqza,
                                   mapfilename=mapfilename,
                                   refseqqza=refseqqza,
                                   treeqza=treeqza,
                                   parallel=parallel, 
                                   ...)
    mpse <- .build_mpse(res=res)
    return(mpse)
}

#' @title Import function to load the feature table and taxonomy table of dada2
#'
#' @description
#' the function can import the ouput of dada2, and generated the phyloseq obj contained the
#' argument class.
#' @name ImportDada2
#' @param seqtab matrix, feature table, the output of \code{\link[dada2]{removeBimeraDenovo}}.
#' @param taxatab matrix, a taxonomic table, the output of \code{\link[dada2]{assignTaxonomy}},
#' or the ouput of \code{\link[dada2]{addSpecies}}.
#' @param reftree phylo, treedata or character, the treedata or phylo class of tree, or the tree file.
#' @param sampleda data.frame or character, the data.frame of sample information,
#' or the file of sample information, nrow samples X ncol factors.
#' @param ..., additional parameters.
#' @importFrom ape read.tree
#' @return phyloseq class contained the argument class.
#' @author Shuangbin Xu
#' @export
#' @examples
#' seqtabfile <- system.file("extdata", "seqtab.nochim.rds",
#'                           package="MicrobiotaProcess")
#' taxafile <- system.file("extdata", "taxa_tab.rds",
#'                         package="MicrobiotaProcess")
#' seqtab <- readRDS(seqtabfile)
#' taxa <- readRDS(taxafile)
#' sampleda <- system.file("extdata", "mouse.time.dada2.txt", 
#'                         package="MicrobiotaProcess")
#' mpse <- mp_import_dada2(seqtab=seqtab, taxatab=taxa,
#'                    sampleda=sampleda)
#' mpse
import_dada2 <- function(seqtab, taxatab=NULL, reftree=NULL, 
                         sampleda=NULL, 
                         ...){
    params <- list(...)
    if ('sampledata' %in% names(params) && is.null(sampleda)){
        sampleda <- params[['sampledata']]
    }
    res <- .internal_import_dada2(seqtab=seqtab,
                                  taxatab=taxatab,
                                  reftree=reftree,
                                  sampleda=sampleda)

    ps <- .build_ps(res)

    return(ps)
}

#' @rdname ImportDada2
#' @export
mp_import_dada2 <- function(seqtab,
                             taxatab=NULL,
                             reftree=NULL,
                             sampleda=NULL,
                             ...){
    params <- list(...)
    if ('sampledata' %in% names(params) && is.null(sampleda)){
        sampleda <- params[['sampledata']]
    }

    res <- .internal_import_dada2(seqtab=seqtab,
                                  taxatab=taxatab,
                                  reftree=reftree,
                                  sampleda=sampleda)

    mpse <- .build_mpse(res)

    return(mpse)
}

#' @title Import function to load the output of qiime.
#'
#' @description
#' The function was designed to import the output of qiime and convert them to MPSE
#' class.
#' @param otufilename character, the file contained otu table, the ouput of qiime.
#' @param mapfilename character, the file contained sample information,
#' the tsv format, default is NULL.
#' @param otutree treedata, phylo or character, the file contained reference sequences, or
#' treedata object, which is the result parsed by functions of treeio, default is NULL.
#' @param refseq XStringSet or character, the file contained the representation sequence file or 
#' XStringSet class to store the representation sequence, default is NULL.
#' @param ..., additional parameters.
#' @return MPSE-class.
#' @export
#' @author Shuangbin Xu
mp_import_qiime <- function(otufilename, 
                            mapfilename = NULL,
                            otutree = NULL, 
                            refseq = NULL,
                            ...){
    params <- list(...)
    if ('sampledata' %in% names(params) && is.null(mapfilename)){
        sampleda <- params[['sampledata']]
    }

    res <- .internal_import_qiime(
                                  otufilename = otufilename,
                                  mapfilename = mapfilename,
                                  otutree = otutree,
                                  refseq = refseq,
                                  ...
                                  )

    mpse <- .build_mpse(res)

    return(mpse)
}


#' @title building MPSE object from biom-format file.
#' @param biomfilename character the biom-format file path.
#' @inheritParams mp_import_qiime
#' @importFrom rlang check_installed
#' @param ... additional parameter, which is meaningless now.
#' @return MPSE-class
#' @export
mp_import_biom <- function(biomfilename, mapfilename = NULL, otutree = NULL, refseq = NULL, ...){
    check_installed("jsonlite", "for `mp_import_biom()`.")
    x <- .internal_biom(biomfilename)
    mpse <- as.MPSE(x)
    if (!is.null(mapfilename)){
        sample.da <- read_qiime_sample(mapfilename) %>%
                     avoid_conflict_names() %>%
                     tibble::as_tibble(rownames = 'Sample')
        mpse <- mpse %>% left_join(sample.da, by='Sample')
    }
    if (!is.null(otutree)){
        if (file.exists(otutree)){
            rlang::warn(c("The reftree is a tree file, it is parsing by read.tree function.",
                     "It is better to parse it with the function of treeio",
                     "then the treedata or phylo result all can be supported."))
            otutree <- ape::read.tree(otutree) %>% treeio::as.treedata()
        }else if (inherits(otutree, "phylo")){
            otutree <- otutree %>% treeio::as.treedata()
        }else if (!inherits(otutree, "treedata")){
            otutree <- NULL
        }
    }
    otutree(mpse) <- otutree
    if (!is.null(refseq)){
        if (file.exists(refseq)){
            refseq <- seqmagick::fa_read(refseq)
        }else if (inherits(refseq, "character")){
            refseq <- build_tree(refseq)
        }else if (!inherits(refseq, "XStringSet")){
            refseq <- NULL
        }
    }
    refsequence(mpse) <- refseq
    return(mpse)
}

#' Import function to load the output of MetaPhlAn.
#' @param profile the output file (text format) of MetaPhlAn.
#' @param mapfilename the sample information file or data.frame,
#' default is NULL.
#' @param treefile the path of MetaPhlAn tree file (
#' mpa_v30_CHOCOPhlAn_201901_species_tree.nwk), default is NULL.
#' @param linenum a integer, sometimes the output file of MetaPhlAn ( < 3)
#' contained the sample information in the first several lines.
#' The linenum should be required.
#' for example:
#' \preformatted{
#' group A A A A B B B B
#' subgroup A1 A1 A2 A2 B1 B1 B2 B2
#' subject S1 S2 S3 S4 S5 S6 S7 S8
#' Bacteria 99 99 99 99 99 99 99 99
#' ...
#' }
#' the \code{linenum} should be set to 3.
#' \preformatted{
#' sampleid A1 A2 A3 A4 A5
#' Bacteria 99 99 99 99 99
#' ...
#' }
#' The \code{linenum} should be set to 1.
#' @param ... additional parameters, meaningless now.
#' @details
#' When the output abundance of MetaPhlAn is relative abundance, the \code{force} of \code{mp_cal_abundance}
#' should be set to TRUE, and the \code{relative} of \code{mp_cal_abundance} should be set to FALSE.
#' Because the abundance profile will be rarefied in the default (force=FALSE), which requires the integer (count) 
#' abundance, then the relative abundance will be calculated in the default (relative=TRUE).
#' @author Shuangbin Xu
#' @export
#' @examples
#' file1 <- system.file("extdata/MetaPhlAn", "metaphlan_test.txt", package="MicrobiotaProcess")
#' sample.file <- system.file("extdata/MetaPhlAn", "sample_test.txt", package="MicrobiotaProcess")
#' readLines(file1, n=3) %>% writeLines()
#' mpse1 <- mp_import_metaphlan(profile=file1, mapfilename=sample.file)
#' mpse1
mp_import_metaphlan <- function(profile, mapfilename=NULL, treefile=NULL, linenum=NULL, ...){
    skipnrow <- guess_skip_nrow(profile)
    if (!is.null(linenum)){
        sampleda <- utils::read.table(profile, sep="\t", skip=skipnrow, nrow=linenum, comment.char="", quote="", check.names=FALSE) %>%
                    dplyr::mutate_at("V1", as.character) %>%
                    dplyr::mutate(V1=tidyr::replace_na(.data$V1, paste0("unknown", seq_len(sum(is.na(.data$V1)))))) %>%
                    tibble::column_to_rownames(var="V1") %>%
                    t() %>% as.data.frame()
        da <- utils::read.table(profile, sep="\t", skip=linenum + skipnrow, comment.char="", quote="", check.names=FALSE)
        sampleindx <- da %>%
                      vapply(.,function(x)is.numeric(x), logical(1)) %>%
                      which(.) 
        colnames(da)[sampleindx] <- paste0("sample", sampleindx - 1)
        sampleda %<>%
            dplyr::slice(sampleindx - 1) %>%
            magrittr::set_rownames(paste0("sample", sampleindx -1))
    }else{
        da <- utils::read.table(profile, sep="\t", head=TRUE, skip=skipnrow, comment.char="", quote="", check.names=FALSE)
        sampleda <- NULL
    }

    if (is.null(linenum) && !is.null(mapfilename)){
        if (inherits(mapfilename, "character") && file.exists(mapfilename)){
            sampleda <- read_qiime_sample(mapfilename)
        }else if (!inherits(mapfilename, "data.frame")){
            rlang::abort("The mapfilename should be a file or data.frame contained sample information!")
        }else{
            sampleda <- mapfilename
        }
    }

    if (!is.null(treefile)){
        otutree <- ape::read.tree(treefile)
        otutree$tip.label %<>% 
             strsplit("\\|") %>% 
             lapply(., function(x)x[length(x)]) %>% 
             unlist()
        otutree %<>% treeio::as.treedata()
    }else{
        otutree <- NULL
    }

    clnm <- colnames(da)

    max.sep <- da %>%
               dplyr::pull(1) %>%
               gregexpr("\\|", .) %>%
               lapply(length) %>%
               unlist %>%
               max

    dat <- da %>%
           dplyr::filter(!!as.symbol(clnm[1]) %>%
                         gregexpr("\\|", .) %>%
                         lapply(length) %>%
                         unlist %>%
                         magrittr::equals(max.sep))

    dat %<>% magrittr::set_rownames(paste0('row', seq_len(nrow(dat))))

    taxatab <- dat %>%
                 dplyr::select(1) %>%
                 split_str_to_list(sep="\\|") %>%
                 magrittr::set_colnames(c(taxaclass[seq_len(max.sep)], "OTU")) %>%
                 fillNAtax()  
    
    assay <- dat %>%
             dplyr::mutate(!!as.symbol(clnm[1]):=strsplit(!!as.symbol(clnm[1]), split="\\|") %>%
                           lapply(function(x) x %>% magrittr::extract2(max.sep+1))) %>%
             dplyr::select(-1)
    
    featureIndex <- intersect(rownames(assay), rownames(taxatab))
    taxatab %<>% 
        magrittr::extract(featureIndex,) %>%
        magrittr::set_rownames(NULL) %>%
        tibble::column_to_rownames(var="OTU")

    assay %<>% 
        magrittr::extract(featureIndex,) %>%
        magrittr::set_rownames(rownames(taxatab))

    #rownames(assay) <- rownames(taxatab)

    rowdaindx <- assay %>%
                 vapply(.,function(x)!is.numeric(x), logical(1)) %>%
                 which() %>%
                 unname()
                 
    rowdaindx <- union(rowdaindx, which(grepl("NCBI_tax_id", colnames(assay), ignore.case=TRUE)))
    
    if (length(rowdaindx)>0){
        rowda <- assay %>%
                 dplyr::select(rowdaindx)
        assay %<>% dplyr::select(-rowdaindx)
    }else{
        rowda <- NULL
    }

    res <- list(otutab = assay, 
                otutree = otutree,
                sampleda = sampleda, 
                taxatab = taxatab,
                otu.metada = rowda
               )
    mpse <- .build_mpse(res)

    return(mpse)
}    


#' Import function to load the output of human_regroup_table in HUMAnN.
#' @param profile the output file (text format) of human_regroup_table in HUMAnN.
#' @param mapfilename the sample information file or data.frame,
#' @param rm.unknown logical whether remove the unmapped and ungrouped features.
#' @param keep.contribute.abundance logical whether keep the abundance of contributed taxa,
#' default is FALSE, it will consume more memory if it set to TRUE.
#' @param ... additional parameters, meaningless now.
#' @author Shuangbin Xu
#' @export
mp_import_humann_regroup <- function(profile, mapfilename=NULL, rm.unknown=TRUE, keep.contribute.abundance=FALSE, ...){
    da <- read.table(profile, header=TRUE, sep='\t', comment.char="",
                     check.names=FALSE, row.names=1, quote="")
    if (rm.unknown){
        da <- da[!grepl('UNMAPPED|UNGROUPED|unclassified', rownames(da)),,drop=FALSE]
    }
    index <- grepl("\\|", rownames(da))
    feature.meta <- da[index, ,drop=FALSE]
    feature.meta <- read.table(text = rownames(feature.meta), sep='|')
    feature.meta$V2 <- gsub('.*s__', "s__", feature.meta$V2)
    colnames(feature.meta) <- c('OTU', 'contribute.taxa')
    if(keep.contribute.abundance){
       feature.meta <- cbind(feature.meta, da[index, ,drop = FALSE])
       rownames(feature.meta) <- NULL
       check_installed('tidyr', 'for `mp_import_humann_regroup()` with `keep.contribute.abundance=TRUE`.')
       feature.meta <- tidyr::nest(feature.meta, contribute.taxa = !'OTU')
    }
    res <- list(otutab = da[!index, ,drop=FALSE])
    if (!is.null(mapfilename)){
        if (inherits(mapfilename, 'data.frame')){
            res$sampleda <- mapfilename
        }else{
            res$sampleda <- read_qiime_sample(mapfilename)
        }
    }
    mpse <- .build_mpse(res)
    mpse <- dplyr::left_join(mpse, feature.meta, by="OTU")
    return(mpse)
}

#' @title read the qza file, output of qiime2.
#' @description
#' the function was designed to read the ouput of qiime2.
#' @param qzafile character, the format of file should be one of
#' `BIOMV210DirFmt`, `TSVTaxonomyDirectoryFormat`, `NewickDirectoryFormat`
#' and `DNASequencesDirectoryFormat`.
#' @param parallel logical, whether parsing the taxonomy by multi-parallel,
#' efault is FALSE.
#' @return list contained one or multiple object of feature table,
#' taxonomy table, tree and represent sequences.
#' @importFrom ape read.tree
#' @importFrom utils unzip
#' @export
#' @examples
#' \dontrun{
#' otuqzafile <- system.file("extdata", "table.qza",
#'                           package="MicrobiotaProcess")
#' otuqza <- read_qza(otuqzafile)
#' str(otuqza)
#' }
read_qza <- function(qzafile, parallel=FALSE){
    tmpdir <- tempdir()
    unzipfiles <- unzip(qzafile, exdir=tmpdir)
    metadafile <- unzipfiles[grep("metadata.yaml", unzipfiles)[1]]
    metaflag <- yaml::read_yaml(metadafile)
    formatflag <- metaflag$format
    datafile <- unzipfiles[3]
    formats <- c("BIOMV210DirFmt", "TSVTaxonomyDirectoryFormat",
                 "NewickDirectoryFormat","DNASequencesDirectoryFormat")
    formatflag <- match.arg(formatflag, formats)
    switch(formatflag,
           BIOMV210DirFmt={
               datafile <- unzipfiles[grep(".biom", unzipfiles)]
               x <- read.featuretab(datafile)},
           TSVTaxonomyDirectoryFormat={
               datafile <- unzipfiles[grep("data/taxonomy.tsv", unzipfiles)]
               x <- read.taxa(datafile, parallel=parallel)},
           DNASequencesDirectoryFormat={
               datafile <- unzipfiles[grep("data/.*\\.fasta", unzipfiles)]
               x <- Biostrings::readDNAStringSet(datafile)},
           NewickDirectoryFormat = {
               datafile <- unzipfiles[grep("data/tree.nwk", unzipfiles)]
               x <- read.tree(datafile)})
    return(x)
}

#' @keywords internal
read.featuretab <- function(file){
    obj <- suppressWarnings(biomformat::read_biom(file))
    res <- .internal_parse_biom(biomobj = obj)
    return(res)
}

#' @importFrom utils read.table
#' @keywords internal
read.taxa <- function(file, parallel=FALSE){
    skipn <- guess_skip_nrow(file)
    x <- utils::read.table(file, sep="\t", row.names=1, header=TRUE, comment.char="", skip=skipn, check.names=FALSE)
    if (ncol(x)>1){
       otu.metada <- x[,-1,drop=FALSE]
    }else{
       otu.metada <- NULL
    }
     
    x <- x[,1,drop=FALSE] 
    taxatab <- x %>% apply(., 2, function(i)gsub(";\\s+", ";", i)) %>% data.frame() %>%
        split_str_to_list(sep=";") %>%
        magrittr::set_colnames(taxaclass[seq_len(ncol(.))]) %>%
        magrittr::set_rownames(rownames(x)) %>%
        fillNAtax()
    return(list(taxatab = taxatab, otu.metada = otu.metada))
}


#' @title building tree
#'
#' @description
#' The function can be used to building tree.
#' @param seqs DNAStringSet or DNAbin, the object of R. 
#' @param ..., additional parameters, see also \code{\link[DECIPHER]{AlignSeqs}}.
#' @return the phylo class of tree.
#' @author Shuangbin Xu
#' @export
#' @examples
#' \dontrun{
#'     seqtabfile <- system.file("extdata", "seqtab.nochim.rds", 
#'                               package="MicrobiotaProcess")
#'     seqtab <- readRDS(seqtabfile)
#'     refseq <- colnames(seqtab)
#'     names(refseq) <- paste0("OTU_",seq_len(length(refseq)))
#'     refseq <- Biostrings::DNAStringSet(refseq)
#'     tree <- build_tree(refseq)
#'     or
#'     tree <- build_tree(refseq) 
#' }
setGeneric("build_tree", function(seqs, ...){standardGeneric("build_tree")})

#' @aliases build_tree,DNAStringSet
#' @rdname build_tree
#' @importFrom stats update
#' @export
setMethod("build_tree", "DNAStringSet", function(seqs, ...){
    alignment <- DECIPHER::AlignSeqs(seqs, anchor=NA, ...)
    phalign <- phangorn::phyDat(as.matrix(alignment), type="DNA")
    dm <- phangorn::dist.ml(phalign)
    treeNJ <- phangorn::NJ(dm)
    fit <- phangorn::pml(treeNJ, data=phalign)
    treeGTR <- update(fit, k=4, inv=0.2)
    tree <- phangorn::optim.pml(treeGTR, 
                      model="GTR", 
                      optInv=TRUE, 
                      optGamma=TRUE,
                      rearrangement = "stochastic", 
                      control = phangorn::pml.control(trace = 0))
    tree <- tree$tree
    return(tree)
})

#' @aliases build_tree,DNAbin
#' @rdname build_tree
#' @export
setMethod("build_tree", "DNAbin", function(seqs, ...){
    seqs <- DNAbin2DNASet(seqs)
    tree <- build_tree(seqs, ...)
    return(tree)
})

#' @aliases build_tree,character
#' @rdname build_tree
#' @export
setMethod("build_tree", "character", function(seqs,...){
    if (length(seqs)==1){
        seqs <- Biostrings::readDNAStringSet(seqs)
    }else{
        seqs <- Biostrings::DNAStringSet(seqs)
    }
    tree <- build_tree(seqs, ...)
    return (tree)
})

#' @keywords internal
DNAbin2DNASet <- function(seqs){
    seqs <- toupper(unlist(lapply(as.character(seqs), paste0, collapse="")))
    seqs <- Biostrings::DNAStringSet(seqs)
    return (seqs)
}
xiangpin/MicrobitaProcess documentation built on Nov. 6, 2024, 1:15 a.m.