#' Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM}
#' results
#'
#' For convenience, a few functions are available to convert BIOM, DADA2 and
#' phyloseq objects to
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' objects, and
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' objects to phyloseq objects.
#'
#' @param prefix.rm \code{Logical scalar}. Should
#' taxonomic prefixes be removed? The prefixes is removed only from detected
#' taxa columns meaning that \code{rank.from.prefix} should be enabled in the
#' most cases. (Default: \code{FALSE})
#'
#' @param removeTaxaPrefixes Deprecated. Use \code{prefix.rm} instead.
#'
#' @param rank.from.prefix \code{Logical scalar}. If file does not have
#' taxonomic ranks on feature table, should they be scraped from prefixes?
#' (Default: \code{FALSE})
#'
#' @param rankFromPrefix Deprecated.Use \code{rank.from.prefix} instead.
#'
#' @param artifact.rm \code{Logical scalar}. If file have
#' some taxonomic character naming artifacts, should they be removed.
#' (default (Default: \code{FALSE})
#'
#' @param remove.artifacts Deprecated. Use \code{artifact.rm} instead.
#'
#' @details
#' \code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}}
#' object to a
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object.
#'
#' \code{convertToBIOM} coerces a
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object to a \code{\link[biomformat:biom-class]{biom}} object.
#'
#' @return
#' \code{convertFromBIOM} returns an object of class
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#'
#' @name importBIOM
#'
#' @seealso
#' \code{\link[=importQIIME2]{importQIIME2}}
#' \code{\link[=importMothur]{importMothur}}
#'
#' @examples
#'
#' # Convert BIOM results to a TreeSE
#' # Load biom file
#' library(biomformat)
#' biom_file <- system.file("extdata", "rich_dense_otu_table.biom",
#' package = "biomformat")
#'
#' # Make TreeSE from BIOM object
#' biom_object <- biomformat::read_biom(biom_file)
#' tse <- convertFromBIOM(biom_object)
#'
#' # Convert TreeSE object to BIOM
#' biom <- convertToBIOM(tse)
#'
NULL
#' Import BIOM results to \code{TreeSummarizedExperiment}
#'
#' @param file BIOM file location
#'
#' @param ... additional arguments to be passed to \code{convertFromBIOM}
#'
#' @details
#' \code{importBIOM} loads a BIOM file and creates a
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object from the BIOM object contained in the loaded file.
#'
#' @return
#' \code{importBIOM} returns an object of class
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#'
#' @name importBIOM
#'
#' @seealso
#' \code{\link[=importMetaPhlAn]{importMetaPhlAn}}
#' \code{\link[=convertFromPhyloseq]{convertFromPhyloseq}}
#' \code{\link[=convertFromBIOM]{convertFromBIOM}}
#' \code{\link[=convertFromDADA2]{convertFromDADA2}}
#' \code{\link[=importQIIME2]{importQIIME2}}
#' \code{\link[=importMothur]{importMothur}}
#' \code{\link[=importHUMAnN]{importHUMAnN}}
#'
#' @examples
#' # Load biom file
#' library(biomformat)
#' biom_file <- system.file(
#' "extdata", "rich_dense_otu_table.biom", package = "biomformat")
#'
#' # Make TreeSE from biom file
#' tse <- importBIOM(biom_file)
#'
#' # Get taxonomyRanks from prefixes and remove prefixes
#' tse <- importBIOM(
#' biom_file, rank.from.prefix = TRUE, prefix.rm = TRUE)
#'
#' # Load another biom file
#' biom_file <- system.file(
#' "extdata", "Aggregated_humanization2.biom", package = "mia")
#'
#' # Clean artifacts from taxonomic data
#' tse <- importBIOM(biom_file, artifact.rm = TRUE)
#'
#' @export
importBIOM <- function(file, ...) {
.require_package("biomformat")
biom <- biomformat::read_biom(file)
convertFromBIOM(biom, ...)
}
#' @rdname importBIOM
#'
#' @param x object of type \code{\link[biomformat:biom-class]{biom}}
#'
#' @export
#' @importFrom S4Vectors make_zero_col_DFrame DataFrame
#' @importFrom dplyr %>% bind_rows
convertFromBIOM <- function(
x, prefix.rm = removeTaxaPrefixes,
removeTaxaPrefixes = FALSE, rank.from.prefix = rankFromPrefix,
rankFromPrefix = FALSE,
artifact.rm = remove.artifacts, remove.artifacts = FALSE, ...){
# input check
.require_package("biomformat")
if(!is(x,"biom")){
stop("'x' must be a 'biom' object", call. = FALSE)
}
if( !.is_a_bool(prefix.rm) ){
stop("'prefix.rm' must be TRUE or FALSE.", call. = FALSE)
}
if( !.is_a_bool(rank.from.prefix) ){
stop("'rank.from.prefix' must be TRUE or FALSE.", call. = FALSE)
}
if( !.is_a_bool(artifact.rm) ){
stop("'artifact.rm' must be TRUE or FALSE.", call. = FALSE)
}
#
counts <- as(biomformat::biom_data(x), "matrix")
sample_data <- biomformat::sample_metadata(x)
feature_data <- biomformat::observation_metadata(x)
# colData is initialized with empty tables with rownames if it is NULL
if( is.null(sample_data) ){
sample_data <- S4Vectors::make_zero_col_DFrame(ncol(counts))
rownames(sample_data) <- colnames(counts)
# Otherwise convert it into correct format if it is a list
} else if( is(sample_data, "list") ){
# Merge list of data.frames into one
sample_data <- bind_rows(sample_data)
sample_data < as.data.frame(sample_data)
}
# rowData is initialized with empty tables with rownames if it is NULL
if( is.null(feature_data) ){
feature_data <- S4Vectors::make_zero_col_DFrame(nrow(counts))
rownames(feature_data) <- rownames(counts)
# Otherwise convert it into correct format if it is a list
} else if( is(feature_data, "list") ){
# Feature data is a list of taxa info. Dfs are merged together
# differently than sample metadata since the column names are only
# "Taxonomy". If there is only one taxonomy level, the column name does
# not get a suffix.
# --> bind rows based on the index of column.
# Get the maximum length of list
max_length <- max( lengths(feature_data) )
# Get the column names from the taxa info that has all the levels that
# occurs in the data
colnames <- names( head(
feature_data[ lengths(feature_data) == max_length ], 1)[[1]])
# Convert the list so that all individual taxa info have the max length
# of the list objects. All vectors are appended with NAs, if they do not
# have all the levels. E.g., if only Kingdom level is found, all lower
# ranks are now NA
feature_data <- lapply(feature_data, function(x){
length(x) <- max_length
return(x)
})
# Create a data.frame from the list
feature_data <- do.call(rbind, feature_data)
# Transposing feature_data and make it df object
feature_data <- as.data.frame(feature_data)
# Add column that includes all the data
merged_col <- apply(feature_data, 1, paste0, collapse = ";")
feature_data <- cbind(feature_data, merged_col)
# Add correct colnames
colnames(feature_data) <- c(colnames, "taxonomy_unparsed")
}
# If there is only one column in the feature data, it is the most probable
# that the taxonomy is not parsed. Try to parse it.
if( ncol(feature_data) == 1 ){
colnames(feature_data) <- "taxonomy_unparsed"
tax_tab <- .parse_taxonomy(
feature_data, column_name = colnames(feature_data))
feature_data <- cbind(tax_tab, feature_data)
feature_data <- as.data.frame(feature_data)
}
# Clean feature_data from possible character artifacts if specified.
if( artifact.rm ){
feature_data <- .detect_taxa_artifacts_and_clean(feature_data, ...)
}
# Replace taxonomy ranks with ranks found based on prefixes. Test first
# if columns are already ranks. If they are, do not change them.
cols_not_already_ranks <- lapply(
colnames(feature_data), function(x) !x %in% TAXONOMY_RANKS) |>
unlist() |> all()
if( rank.from.prefix && cols_not_already_ranks ){
# Find ranks
ranks <- lapply(
colnames(feature_data), .replace_colnames_based_on_prefix,
x = feature_data)
# Replace old ranks with found ranks
colnames(feature_data) <- unlist(ranks)
}
# Adjust row and colnames
rownames(counts) <- rownames(feature_data) <- biomformat::rownames(x)
colnames(counts) <- rownames(sample_data) <- biomformat::colnames(x)
# Convert into DataFrame
sample_data <- DataFrame(sample_data)
feature_data <- DataFrame(feature_data)
# Convert into list
assays <- SimpleList(counts = counts)
# Create TreeSE
tse <- TreeSummarizedExperiment(
assays = assays,
colData = sample_data,
rowData = feature_data)
# Set ranks based on rowData columns if user has specified to do so
temp <- .set_ranks_based_on_rowdata(tse, ...)
# Remove prefixes if specified and rowData includes info
if(prefix.rm && ncol(rowData(tse)) > 0){
rowData(tse) <- .remove_prefixes_from_taxa(rowData(tse), ...)
}
return(tse)
}
#' @rdname importBIOM
#'
#' @param x
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#'
#' @param assay.type \code{Character scaler}. The name of assay.
#' (Default: \code{"counts"})
#'
#' @param ... Additional arguments. Not used currently.
#'
#'
#' @export
setGeneric(
"convertToBIOM", signature = c("x"),
function(
x, assay.type = "counts", ...)
standardGeneric("convertToBIOM"))
#' @rdname importBIOM
#' @export
setMethod(
"convertToBIOM", signature = c(x = "SummarizedExperiment"),
function(x, assay.type = "counts", ...){
############################## Input check #############################
.require_package("biomformat")
.check_assay_present(assay.type, x)
############################ Input check end ###########################
x <- .convert_se_to_biom(x, assay.type)
return(x)
}
)
################################ HELP FUNCTIONS ################################
# This function removes prefixes from taxonomy names
.remove_prefixes_from_taxa <- function(
feature_tab,
prefixes = paste0(
"(", paste0(.taxonomy_rank_prefixes, collapse = "|"), ")__"),
only.taxa.col = TRUE, ignore.col = "taxonomy_unparsed", ...){
#
if( !.is_a_bool(only.taxa.col) ){
stop("'only.taxa.col' must be TRUE or FALSE.", call. = FALSE)
}
#
if( !.is_a_string(prefixes) ){
stop("'prefixes' must be a single character value.", call. = FALSE)
}
#
if( !(is.character(ignore.col) || is.null(ignore.col)) ){
stop("'ignore.col' must be a character value or NULL.", call. = FALSE)
}
#
# Subset by taking only taxonomy info if user want to remove the pattern
# only from those. (Might be too restricting, e.g., if taxonomy columns are
# not detected in previous steps. That is way the default is FALSE)
ind <- rep(TRUE, ncol(feature_tab))
if( only.taxa.col ){
ind <- tolower(colnames(feature_tab)) %in% TAXONOMY_RANKS
}
# Also remove those columns that are specified to ignore (by default the
# column that has unparsed data)
ind <- ind & !colnames(feature_tab) %in% ignore.col
temp <- feature_tab[, ind, drop = FALSE]
# If there are columns left for removing the pattern
if( ncol(temp) > 0 ){
# Remove patterns
temp <- lapply(
temp, gsub, pattern = prefixes, replacement = "")
temp <- as.data.frame(temp)
# If cell had only prefix, it is now empty string. Convert to NA
temp[ temp == "" ] <- NA
# Combine table
feature_tab[, ind] <- temp
}
# Ensure that returned value is DF
feature_tab <- DataFrame(feature_tab)
return(feature_tab)
}
# Find taxonomy rank based on prefixes. If found, return
# corresponding rank. Otherwise, return the original
# rank that is fed to function.
.replace_colnames_based_on_prefix <- function(colname, x){
# Get column
col <- x[ , colname]
# List prefixes
all_ranks <- .taxonomy_rank_prefixes
prefixes <- paste0("^", all_ranks, "__")
names(prefixes) <- names(all_ranks)
# Find which prefix is found from each column value, if none.
found_rank <- lapply(prefixes, FUN = function(pref){
all(grepl(pattern = pref, col) | is.na(col)) && !all(is.na(col))
})
found_rank <- unlist(found_rank)
# If only one prefix was found (like it should be), get the corresponding
# rank name.
if( sum(found_rank) == 1 ){
colname <- names(prefixes)[found_rank]
}
return(colname)
}
# Detect and clean non wanted characters from Taxonomy data if needed.
.detect_taxa_artifacts_and_clean <- function(
x, pattern = "auto", ignore.col = "taxonomy_unparsed", ...) {
#
if( !.is_non_empty_character(pattern) ){
stop("'pattern' must be a single character value.", call. = FALSE)
}
#
if( !(is.character(ignore.col) || is.null(ignore.col)) ){
stop("'ignore.col' must be a character value or NULL.", call. = FALSE)
}
#
# Store rownames because they are lost during modifications
row_names <- rownames(x)
# Get those columns that are not modified
not_mod <- x[ , colnames(x) %in% ignore.col, drop = FALSE]
# Subset the data being modified so that it includes only those that user
# wants to modify
x <- x[ , !colnames(x) %in% ignore.col, drop = FALSE]
# If there are still columns to clean after subsetting
if( ncol(x) > 0 ){
# Remove artifacts
if( pattern == "auto" ){
.require_package("stringr")
# Remove all but these characters
pattern <- "[[:alnum:]]|-|_|\\[|\\]|,|;\\||[[:space:]]"
x <- lapply(x, function(col){
# Take all specified characters as a matrix where each column
# is a character
temp <- stringr::str_extract_all(
col, pattern = pattern, simplify = TRUE)
# Collapse matrix to strings
temp <- apply(temp, 1, paste, collapse = "")
# Now NAs are converted into characters. Convert them back
temp[ temp == "NA" ] <- NA
# Convert also empty strings to NA
temp[ temp == "" ] <- NA
return(temp)
})
} else{
# Remove pattern specified by user
x <- lapply(x, gsub, pattern = pattern, replacement = "")
}
}
# Ensure that the data is data.frame
x <- as.data.frame(x)
# Combine modified and non-modified data
x <- cbind(x, not_mod)
# Add rownames because they are dropped while removing artifacts
rownames(x) <- row_names
return(x)
}
# This function get's SE object as input and returns BIOM object.
.convert_se_to_biom <- function(x, assay.type){
# Get assay, rowData and colData
assay <- assay(x, assay.type)
rd <- rowData(x)
cd <- colData(x)
# Convert rowData and colData to data.frame since biomformat cannot handle
# DF.
rd <- as.data.frame(rd)
cd <- as.data.frame(cd)
# Check if assay contains integers or floats. biom constructor
# requires that information since the default value is "int".
mat_type <- ifelse(all(assay %% 1 == 0), "int", "float")
# Create argument list
args <- list(data = assay, matrix_element_type = mat_type)
# Add rowData and colData only if they contain information
if( ncol(rd) != 0 ){
args[["observation_metadata"]] <- rd
}
if( ncol(cd) != 0 ){
args[["sample_metadata"]] <- cd
}
# Create biom
biom <- do.call(biomformat::make_biom, args)
return(biom)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.