#' Import Metaphlan results to \code{TreeSummarizedExperiment}
#
#' @param file a single \code{character} value defining the file
#' path of the Metaphlan file. The file must be in merged Metaphlan format.
#'
#' @param colData a DataFrame-like object that includes sample names in
#' rownames, or a single \code{character} value defining the file
#' path of the sample metadata file. The file must be in \code{tsv} format
#' (default: \code{colData = NULL}).
#'
#' @param sample_meta a DataFrame-like object that includes sample names in
#' rownames, or a single \code{character} value defining the file
#' path of the sample metadata file. The file must be in \code{tsv} format
#' (default: \code{sample_meta = NULL}).
#'
#' @param phy_tree a single \code{character} value defining the file
#' path of the phylogenetic tree.
#' (default: \code{phy_tree = NULL}).
#'
#' @param ... additional arguments:
#' \itemize{
#' \item{\code{assay.type}:} {A single character value for naming
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}}
#' (default: \code{assay.type = "counts"})}
#' \item{\code{assay_name}:} {A single \code{character} value for specifying which
#' assay to use for calculation. (Please use \code{assay.type} instead.
#' At some point \code{assay_name} will be disabled.)}
#' \item{\code{removeTaxaPrefixes}:} {\code{TRUE} or \code{FALSE}: Should
#' taxonomic prefixes be removed? (default:
#' \code{removeTaxaPrefixes = FALSE})}
#' \item{\code{remove.suffix}:} {\code{TRUE} or \code{FALSE}: Should
#' suffixes of sample names be removed? Metaphlan pipeline adds suffixes
#' to sample names. Suffixes are formed from file names. By selecting
#' \code{remove.suffix = TRUE}, you can remove pattern from end of sample
#' names that is shared by all. (default: \code{remove.suffix = FALSE})}
#' \item{\code{set.ranks}:} {\code{TRUE} or \code{FALSE}: Should the columns
#' in the rowData that are treated as taxonomy ranks be updated according to
#' the ranks found in the imported data?
#' (default: \code{set.ranks = FALSE})}
#' }
#'
#' @details
#' Import Metaphlan (versions 2, 3 and 4 supported) results.
#' Input must be in merged Metaphlan format.
#' (See
#' \href{https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-4#merging-tables}{
#' the Metaphlan documentation and \code{merge_metaphlan_tables} method.})
#' Data is imported so that data at the lowest rank is imported as a
#' \code{TreeSummarizedExperiment} object. Data at higher rank is imported as a
#' \code{SummarizedExperiment} objects which are stored to \code{altExp} of
#' \code{TreeSummarizedExperiment} object.
#'
#' Currently Metaphlan versions 2, 3, and 4 are supported.
#'
#' @return A
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object
#'
#' @name importMetaPhlAn
#' @seealso
#' \code{\link[=importHUMAnN]{importHUMAnN}}
#' \code{\link[=makeTreeSEFromPhyloseq]{makeTreeSEFromPhyloseq}}
#' \code{\link[=makeTreeSEFromBiom]{makeTreeSEFromBiom}}
#' \code{\link[=makeTreeSEFromDADA2]{makeTreeSEFromDADA2}}
#' \code{\link[=importQIIME2]{importQIIME2}}
#' \code{\link[=importMothur]{importMothur}}
#'
#' @export
#' @author Leo Lahti and Tuomas Borman. Contact: \url{microbiome.github.io}
#'
#' @references
#' Beghini F, McIver LJ, Blanco-MÃguez A, Dubois L, Asnicar F, Maharjan S, Mailyan A,
#' Manghi P, Scholz M, Thomas AM, Valles-Colomer M, Weingart G, Zhang Y, Zolfo M,
#' Huttenhower C, Franzosa EA, & Segata N (2021) Integrating taxonomic, functional,
#' and strain-level profiling of diverse microbial communities with bioBakery 3.
#' \emph{eLife}. 10:e65088. doi: 10.7554/eLife.65088
#'
#' @examples
#' # (Data is from tutorial
#' # https://github.com/biobakery/biobakery/wiki/metaphlan3#merge-outputs)
#'
#' # File path
#' file_path <- system.file("extdata", "merged_abundance_table.txt", package = "mia")
#' # Import data
#' tse <- importMetaPhlAn(file_path)
#' # Data at the lowest rank
#' tse
#' # Data at higher rank is stored in altExp
#' altExps(tse)
#' # Higher rank data is in SE format, for example, Phylum rank
#' altExp(tse, "Phylum")
#'
NULL
importMetaPhlAn <- function(
file, colData = sample_meta, sample_meta = NULL, phy_tree = NULL, ...){
################################ Input check ################################
if(!.is_non_empty_string(file)){
stop("'file' must be a single character value.",
call. = FALSE)
}
if (!file.exists(file)) {
stop(file, " does not exist", call. = FALSE)
}
if(!is.null(colData) &&
!(.is_non_empty_string(colData) || is.data.frame(colData) ||
is.matrix(colData) || is(colData, "DataFrame")) ){
stop("'colData' must be a single character value, DataFrame or NULL.",
call. = FALSE)
}
if(!is.null(phy_tree) && !.is_non_empty_string(phy_tree)){
stop("'phy_tree' must be a single character value or NULL.",
call. = FALSE)
}
############################## Input check end #############################
# Get rowdata columns. metaphlan v2 has ID column. Metaphlan > v2 has
# clade_name for taxonomy names. Some has taxonomy.
rowdata_col <- c("clade_name", "ID", "_id", "taxonomy")
# Read metaphlan data
data <- .read_metaphlan(file, rowdata_col, ...)
# Parse data into separate tables, which include data at certain taxonomy rank
tables <- .parse_metaphlan(data, ...)
# Create multiple SE objects at different rank from the data
available_ranks <- names(tables)
se_objects <- lapply(tables, function(x){
.create_se_from_metaphlan(
x, rowdata_col, returned.ranks = available_ranks, ...)
})
# Get the object with lowest rank
tse <- se_objects[[ length(se_objects) ]]
# Convert it to TreeSE so that it has altExp
tse <- as(tse, "TreeSummarizedExperiment")
# Remove it, so that it is not added multiple times
se_objects[[ length(se_objects) ]] <- NULL
# Add rest of the objects to altExp
if( length(se_objects) > 0 ){
for( rank in names(se_objects) ){
# Add SEs to altExp of tse, give name according to rank
altExp(tse, rank) <- se_objects[[rank]]
}
}
# Set taxonomy ranks using .set_taxonomy_ranks
.set_ranks_based_on_rowdata(tse,...)
# Load sample meta data if it is provided
if( !is.null(colData) ) {
tse <- .add_coldata(tse, colData)
}
# Load tree if it is provided
if (!is.null(phy_tree)) {
tree <- ape::read.tree(phy_tree)
rowTree(tse) <- tree
}
return(tse)
}
################################ HELP FUNCTIONS ################################
# Read Metaphlan file, catch error if it occurs
.read_metaphlan <- function(file, rowdata_col, remove.suffix = FALSE, ...){
################################ Input check ###############################
if(!.is_a_bool(remove.suffix)){
stop("'remove.suffix' must be TRUE or FALSE.", call. = FALSE)
}
############################## Input check end #############################
# Read the table. Catch error and give more informative message
table <- tryCatch(
{
read.table(file, header = TRUE, comment.char = "#", check.names = FALSE)
},
error = function(condition){
stop("Error while reading ", file,
"\nPlease check that the file is in merged Metaphlan file format.",
call. = FALSE)
}
)
# Check that file is in right format
if( .check_metaphlan(table, rowdata_col) ){
stop("Error while reading ", file,
"\nPlease check that the file is in merged Metaphlan file format.",
call. = FALSE)
}
# Remove possible suffix from the colnames if user has specified
if( remove.suffix ){
table <- .remove_suffix(table, rowdata_col)
}
return(table)
}
# Check that metaphlan file contains correct information
.check_metaphlan <- function(data, rowdata_col){
# Get rowData columns
rowdata_id <- unlist(lapply(rowdata_col, grep, colnames(data)))
rowdata_columns <- data[ , rowdata_id, drop = FALSE]
# Get columns that go to assay
assay_columns <- data[ , -rowdata_id, drop = FALSE]
# Initialize result
result <- TRUE
# Check rowdata column names that they contain right information, and check that
# rest of the columns represents abundances in samples.
# If these requirements are met, give FALSE. Otherwise, give TRUE.
if( length(rowdata_id) > 0 &&
all(unlist(lapply(assay_columns, is.numeric))) ){
result <- FALSE
}
return(result)
}
# Get metaphlan table as input and return multiple tables which each include data at
# certain taxonomy rank
.parse_metaphlan <- function(table, ...){
# ID in Metaphlan v2, > 2 clade_name
col <- colnames(table) %in% c("clade_name", "ID")
if( sum(col) != 1 ){
stop("Error in parsing Metaphlan file.", call. = FALSE)
}
# Get the lowest level of each row
levels <- lapply(table[ , col], FUN = .get_lowest_taxonomic_level)
# Convert list to vector
levels <- unlist(levels)
# Split table so that each individual table contains information only
# at specific rank
tables <- split(table, levels)
# Get the order
metaphlan_tax = c(TAXONOMY_RANKS, "strain")
indices <- match(metaphlan_tax, tolower(names(tables)))
# Remove NAs which occurs if rank is not included
indices <- indices[!is.na(indices)]
# Order tables
tables <- tables[indices]
return(tables)
}
# Get the lowest level of the string that contains multiple taxonomic levels with prefixes
# Output is single character that specifies the rank, e.g, "s" == "Species"
.get_lowest_taxonomic_level <- function(string){
# Get indices that specify location of rank prefixes
levels <- gregexpr("([kpcofgst]+)__", string)[[1]]
# Get the location of lowest rank
lowest_level_ind <- levels[length(levels)]
# Get the lowest rank that was found
lowest_level <- substr(string, start = lowest_level_ind, stop = lowest_level_ind)
# List all ranks and what prefix they correspond
ranks <- c(
Domain = "d", Kingdom = "k", Phylum = "p", Class = "c", Order = "o",
Family = "f", Genus = "g", Species = "s", Strain = "t")
# Convert prefix into full rank name
lowest_level <- names(ranks[ match(lowest_level, ranks) ])
return(lowest_level)
}
# Create SE object that include rowdata and assay, from the metaphlan table
.create_se_from_metaphlan <- function(
table, rowdata_col, assay.type = assay_name, assay_name = "counts",
...){
# Check assay.type
if( !.is_non_empty_character(assay.type) ){
stop("'assay.type' must be a non-empty character value.",
call. = FALSE)
}
# Get rowdata columns
rowdata_id <- unlist(lapply(rowdata_col, grep, colnames(table)))
# Get those columns that belong to rowData
rowdata <- table[, rowdata_id, drop = FALSE]
# Get those columns that belong to assay
assay <- table[, -rowdata_id, drop = FALSE]
# Parse taxonomic levels
taxonomy <- .parse_taxonomy(
rowdata[ , 1, drop = FALSE], sep = "\\|",
column_name = colnames(rowdata)[[1]], ...)
# Add parsed taxonomy level information to rowdata
rowdata <- cbind(taxonomy, rowdata)
# Ensure that rowData is DataFrame
rowdata <- DataFrame(rowdata, check.names = FALSE)
# Ensure that assay is matrix
assay <- as.matrix(assay)
# Create assays list and add assay with specific name
assays <- S4Vectors::SimpleList()
assays[[assay.type]] <- assay
# Create TreeSE
se <- TreeSummarizedExperiment(
assays = assays, rowData = rowdata)
# Add taxonomy labels
rownames(se) <- getTaxonomyLabels(se)
return(se)
}
# This function can be used to add colData to TreeSE. It checks that sample
# names match (full or partial) and adds the metadata to altExps also.
.add_coldata <- function(tse, coldata){
# If the coldata is character specifying the path
if( .is_non_empty_character(coldata) ){
coldata <- read.table(file = coldata, header = TRUE, sep = "\t")
rownames(coldata) <- coldata[, 1]
}
# Ensure that the coldata is DF
coldata <- DataFrame(coldata)
# Usually when metaphlan sample names are created, the workflow adds file
# name as a suffix. Because of that sample names do not match anymore
# necessarily. Try partial match if full match is not found.
if( all(rownames(coldata) %in% colnames(tse)) ){
sample_names <- rownames(coldata)
names(sample_names) <- sample_names
} else{
sample_names <- lapply(rownames(coldata), function(x){
x <- colnames(tse)[grep(x, colnames(tse))]
if( length(x) != 1 ){
x <- NULL
}
return(x)
})
names(sample_names) <- rownames(coldata)
sample_names <- unlist(sample_names)
}
# Check if all samples were found. (In partial match certain sample name
# is missing if one matching name was not found.). In this part, all
# colnames should be found if data sets are matching. (More samples in
# metadata is allowed.)
if( !(all(colnames(tse) %in% sample_names) &&
length(sample_names) == ncol(tse)) ){
warning(
"The sample names in 'colData' do not match with the data. ",
"The sample metadata is not added.", call. = FALSE
)
return(tse)
}
# Reorder sample metadata based on the data
sample_names <- sample_names[match(colnames(tse), sample_names)]
coldata <- coldata[names(sample_names), ]
# Give warning if partial match was used
if( !all(rownames(coldata) %in% colnames(tse)) ){
warning("Partial match was used to match sample names between ",
"'colData' and the data. Please check that they are correct.",
call. = FALSE
)
# Replace colnames with names from sample metadata. They are without
# additional suffix.
coldata[["colnames_data"]] <- colnames(tse)
coldata[["colnames_metadata"]] <- rownames(coldata)
colnames(tse) <- rownames(coldata)
}
# Add sample metadata
colData(tse) <- coldata
# If there are altExps, add the metadata also there
if( length(altExps(tse)) ){
altexps <- altExps(tse)
altexps <- lapply(altexps, function(x){
colnames(x) <- rownames(coldata)
colData(x) <- coldata
return(x)
})
altexps <- SimpleList(altexps)
altExps(tse) <- altexps
}
return(tse)
}
# In humann/metaphlan pipeline, suffix is added to colnames. The suffix comes
# from file names. This can cause problems when, e.g., taxonomy and pathway
# information is combined. Because their suffixes differ, the sample names
# differ. The purpose of the function is to remove those file names.
.remove_suffix <- function(
data, rowdata_col = c("clade_name", "ID", "_id", "taxonomy")){
# Get rowdata columns
rowdata_id <- unlist(lapply(rowdata_col, grep, colnames(data)))
# Get sample names
sample_names <- colnames(data)[-rowdata_id]
# Get the possible suffixes, i.e., words that are divided by underscores
# based on first sample name.
suffix <- strsplit(sample_names[[1]], "_")[[1]]
# Loop over these suffixes. From the end, add words one by one. Store those
# suffixes that match with all sample names. The result is suffix that is
# the longest.
shared <- NULL
for( i in length(suffix):1 ){
pattern <- paste0("_", paste(suffix[i:length(suffix)], collapse="_"))
if( all(grepl(pattern, sample_names)) ){
shared <- pattern
}
}
# Remove suffix from sample names
if( !is.null(shared) ){
sample_names <- gsub(shared, "", sample_names)
colnames(data)[-rowdata_id] <- sample_names
}
return(data)
}
# This function sets taxonomy ranks based on rowData of TreeSE. With this,
# user can automatically set ranks based on imported data.
.set_ranks_based_on_rowdata <- function(tse, set.ranks = FALSE, ...){
#
if( !.is_a_bool(set.ranks) ){
stop("'set.ranks' must be TRUE or FALSE.", call. = FALSE)
}
#
# If user do not want to set ranks
if( !set.ranks ){
return(NULL)
}
# Get ranks from rowData
ranks <- colnames(rowData(tse))
# Ranks must be character columns
is_char <- lapply(rowData(tse), function(x) is.character(x) || is.factor(x))
is_char <- unlist(is_char)
ranks <- ranks[ is_char ]
# rowData is empty, cannot set ranks
if( length(ranks) == 0 ){
warning(
"Ranks cannot be set. rowData(x) does not include columns ",
"specifying character values.", call. = FALSE)
return(NULL)
}
# Finally, set ranks and give message
temp <- setTaxonomyRanks(ranks)
message("TAXONOMY_RANKS set to: '", paste0(ranks, collapse = "', '"), "'")
return(NULL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.