#' Get dominant taxa
#'
#' These functions return information about the most dominant taxa in a
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#' object.
#'
#' @param x A
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#' object.
#'
#' @param assay.type A single character value for selecting the
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}}
#' to use for identifying dominant taxa.
#'
#' @param assay_name a single \code{character} value for specifying which
#' assay to use for calculation.
#' (Please use \code{assay.type} instead. At some point \code{assay_name}
#' will be disabled.)
#'
#' @param rank A single character defining a taxonomic rank. Must be a value of
#' the output of \code{taxonomyRanks()}.
#'
#' @param name A name for the column of the \code{colData} where the dominant
#' taxa will be stored in when using \code{addPerSampleDominantFeatures}.
#'
#' @param other.name A name for features that are not included in n the most frequent
#' dominant features in the data. Default is "Other".
#'
#' @param n The number of features that are the most frequent dominant features.
#' Default is NULL, which defaults that each sample is assigned a dominant taxon.
#'
#' @param complete A boolean value to manage multiple dominant taxa for a sample.
#' Default for perSampleDominantTaxa is TRUE to include all equally dominant taxa
#' for each sample. complete = FALSE samples one taxa for the samples that have
#' multiple.
#' Default for addPerSampleDominantTaxa is FALSE to add a column with only one
#' dominant taxon assigned for each sample into colData. complete = TRUE adds a
#' list that includes all dominant taxa for each sample into colData.
#'
#' @param ... Additional arguments passed on to \code{agglomerateByRank()} when
#' \code{rank} is specified.
#'
#' @details
#' \code{addPerSampleDominantFeatures} extracts the most abundant taxa in a
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#' object, and stores the information in the \code{colData}. It is a wrapper for
#' \code{perSampleDominantFeatures}.
#'
#' With \code{rank} parameter, it is possible to agglomerate taxa based on taxonomic
#' ranks. E.g. if 'Genus' rank is used, all abundances of same Genus are added
#' together, and those families are returned. See \code{agglomerateByRank()} for
#' additional arguments to deal with missing values or special characters.
#'
#' @return \code{perSampleDominantFeatures} returns a named character vector \code{x}
#' while \code{addPerSampleDominantFeatures} returns
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#' with additional column in \code{\link{colData}} named \code{*name*}.
#'
#' @name perSampleDominantTaxa
#' @export
#'
#' @author Leo Lahti, Tuomas Borman and Sudarshan A. Shetty.
#'
#' @examples
#' data(GlobalPatterns)
#' x <- GlobalPatterns
#'
#' # Finds the dominant taxa.
#' sim.dom <- perSampleDominantFeatures(x, rank="Genus")
#'
#' # Add information to colData
#' x <- addPerSampleDominantFeatures(x, rank = "Genus", name="dominant_genera")
#' colData(x)
NULL
#' @rdname perSampleDominantTaxa
#' @aliases perSampleDominantTaxa
#' @export
setGeneric("perSampleDominantFeatures",signature = c("x"),
function(x, assay.type = assay_name, assay_name = "counts",
rank = NULL, other.name = "Other", n = NULL,
complete = TRUE, ...)
standardGeneric("perSampleDominantFeatures"))
#' @rdname perSampleDominantTaxa
#' @aliases perSampleDominantTaxa
#' @importFrom IRanges relist
#' @export
setMethod("perSampleDominantFeatures", signature = c(x = "SummarizedExperiment"),
function(x, assay.type = assay_name, assay_name = "counts",
rank = NULL, other.name = "Other", n = NULL, complete = TRUE, ...){
# Input check
# Check assay.type
.check_assay_present(assay.type, x)
# rank check
if(!is.null(rank)){
if(!.is_a_string(rank)){
stop("'rank' must be an single character value.",
call. = FALSE)
}
.check_taxonomic_rank(rank, x)
}
# If "rank" is not NULL, species are aggregated according to the
# taxonomic rank that is specified by user.
if(!is.null(rank)){
x <- agglomerateByRank(x, rank, ...)
mat <- assay(x, assay.type)
} # Otherwise, if "rank" is NULL, abundances are stored without ranking
else {
mat <- assay(x, assay.type)
}
# apply() function finds the indices of taxa's that has the highest
# abundance.
# rownames() returns the names of taxa that are the most abundant.
idx <- as.list(apply(t(mat) == colMaxs(mat),1L,which))
# Get rownames based on indices
taxa <- rownames(mat)[unlist(idx)]
# If multiple dominant taxa were found, names contain taxa in addition to
# sample name. Names are converted so that they include only sample names.
names(taxa) <- rep( names(idx), times = lengths(idx) )
# If individual sample contains multiple dominant taxa (they have equal
# counts) and if complete is FALSE, the an arbitrarily chosen dominant
# taxa is returned
if( length(taxa)>ncol(x) && !complete){
# Store order
order <- unique(names(taxa))
# there are multiple dominant taxa in one sample (counts are equal), length
# of dominant is greater than rows in colData.
taxa <- split(taxa, rep(names(taxa), lengths(taxa)) )
# Order the data
taxa <- taxa[order]
names <- names(taxa)
# If complete is set FALSE, and there are multiple dominant taxa,
# one of them is arbitrarily chosen
taxa <- lapply(taxa, function(item) {
return(sample(item, 1)) })
taxa <- unname(sapply(taxa, function (x) {
unlist(x)}))
names(taxa) <- names
warning("Multiple dominant taxa were found for some samples. Use complete = TRUE for details.", call. = FALSE)
}
# Name "Other" the features that are not included in n the most abundant
# in the data
if(!is.null(n)){
flat_taxa <- unlist(taxa, recursive = TRUE)
top <- .top_n(flat_taxa, n=n)
top <- names(top)
# Group the rest into the "Other" category
taxa <- lapply(taxa, function(x){
ind <- vapply(x, function(y) y %in% top, FUN.VALUE = logical(1))
if( any(ind) ){
res <- x[ ind ]
} else{
res <- other.name
}
return(res)
})
if ( all(lengths(taxa) == 1 ) ){
taxa <- unlist(taxa)
}
}
return(taxa)
}
)
#' @rdname perSampleDominantTaxa
#' @aliases perSampleDominantFeatures
#' @export
setGeneric("perSampleDominantTaxa", signature = c("x"),
function(x, ...)
standardGeneric("perSampleDominantTaxa"))
#' @rdname perSampleDominantTaxa
#' @aliases perSampleDominantFeatures
#' @export
setMethod("perSampleDominantTaxa", signature = c(x = "SummarizedExperiment"),
function(x, ...){
.Deprecated(old ="perSampleDominantTaxa",
new = "perSampleDominantFeatures",
msg = "The 'perSampleDominantTaxa' function is deprecated. Use 'perSampleDominantFeatures' instead.")
perSampleDominantFeatures(x, ...)
}
)
#' @rdname perSampleDominantTaxa
#' @aliases addPerSampleDominantTaxa
#' @export
setGeneric("addPerSampleDominantFeatures", signature = c("x"),
function(x, name = "dominant_taxa", other.name = "Other", n = NULL, ...)
standardGeneric("addPerSampleDominantFeatures"))
#' @rdname perSampleDominantTaxa
#' @aliases addPerSampleDominantTaxa
#' @export
setMethod("addPerSampleDominantFeatures", signature = c(x = "SummarizedExperiment"),
function(x, name = "dominant_taxa", other.name = "Other", n = NULL,
complete = FALSE, ...) {
# name check
if(!.is_non_empty_string(name)){
stop("'name' must be a non-empty single character value.",
call. = FALSE)
}
# other.name check
if(!.is_non_empty_string(other.name)){
stop("'other.name' must be a non-empty single character value.",
call. = FALSE)
}
dom.taxa <- perSampleDominantFeatures(x, other.name = other.name, n = n,
complete = complete, ...)
# Add list into colData if there are multiple dominant taxa
if(length(unique(names(dom.taxa))) < length(names(dom.taxa))) {
# Store order
order <- unique(names(dom.taxa))
grouped <- split(dom.taxa, rep(names(dom.taxa)), lengths(dom.taxa))
grouped <- grouped[order]
dom.taxa <- grouped
warning("A new column that was added in colData(x) is a list",
call. = FALSE)
}
colData(x)[[name]] <- dom.taxa
return(x)
}
)
#' @rdname perSampleDominantTaxa
#' @aliases addPerSampleDominantFeatures
#' @export
setGeneric("addPerSampleDominantTaxa", signature = c("x"),
function(x, ...)
standardGeneric("addPerSampleDominantTaxa"))
#' @rdname perSampleDominantTaxa
#' @aliases addPerSampleDominantFeatures
#' @export
setMethod("addPerSampleDominantTaxa", signature = c(x = "SummarizedExperiment"),
function(x, ...){
.Deprecated(old ="addPerSampleDominantTaxa",
new = "addPerSampleDominantFeatures",
msg = "The 'addPerSampleDominantTaxa' function is deprecated. Use 'addPerSampleDominantFeatures' instead.")
addPerSampleDominantFeatures(x, ...)
}
)
########################## HELP FUNCTIONS summary ##############################
# top entries in a vector or given field in a data frame
.top_n <- function(x, n = NULL, na.rm = FALSE) {
if (na.rm){
inds <- which(x == "NA")
if (length(inds) > 0){
x[inds] <- NA
warning(paste("Interpreting NA string as missing value NA.
Removing", length(inds), "entries"), call. = FALSE)
}
x <- x[!is.na(x)]
}
# Create a frequency table of unique values of the dominant taxa for each sample
s <- rev(sort(table(x)))
# Include only n the most frequent taxa
if (!is.null(n)){
s <- s[seq_len(min(n, length(s)))]
}
return(s)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.