#' Replace unknown, NA, or short tax_table values
#'
#' @description
#' Identifies phyloseq tax_table values as unknown or uninformative and
#' replaces them with the first informative value from a higher taxonomic rank.
#' - Short values in phyloseq tax_table are typically empty strings or " ", or "g__" etc.
#' so it is helpful to replace them. (If this is unwanted: set `min_length` = 0 to avoid filtering on length.)
#' - Values in `unknowns` are also removed, even if longer than `min_length`.
#' It is up to the user to specify sensible values in `unknowns` if their dataset has other unwanted values.
#' - NA values are also replaced.
#'
#' See this article for an extended discussion of tax_table fixing.
#' \url{https://david-barnett.github.io/microViz/articles/web-only/tax-fixing.html}
#'
#' @details
#' By default (unknowns = NA), unknowns is set to a vector containing:
#'
#' 's__' 'g__' 'f__' 'o__' 'c__' 'p__' 'k__' 'S__' 'G__' 'F__' 'O__' 'C__' 'P__' 'K__' 'NA' 'NaN' ' ' ''
#' 'unknown' 'Unknown' 's__unknown' 's__Unknown' 's__NA' 'g__unknown' 'g__Unknown' 'g__NA'
#' 'f__unknown' 'f__Unknown' 'f__NA' 'o__unknown' 'o__Unknown' 'o__NA' 'c__unknown' 'c__Unknown' 'c__NA'
#' 'p__unknown' 'p__Unknown' 'p__NA' 'k__unknown' 'k__Unknown' 'k__NA' 'S__unknown' 'S__Unknown' 'S__NA'
#' 'G__unknown' 'G__Unknown' 'G__NA' 'F__unknown' 'F__Unknown' 'F__NA' 'O__unknown' 'O__Unknown' 'O__NA'
#' 'C__unknown' 'C__Unknown' 'C__NA' 'P__unknown' 'P__Unknown' 'P__NA' 'K__unknown' 'K__Unknown' 'K__NA'
#'
#'
#' @param ps
#' phyloseq or tax_table (taxonomyTable)
#' @param min_length
#' replace strings shorter than this, must be integer > 0
#' @param unknowns
#' also replace strings matching any in this vector,
#' NA default vector shown in details!
#' @param suffix_rank
#' "classified" (default) or "current", when replacing an entry,
#' should the suffix be taken from the lowest classified rank
#' for that taxon, "classified", or the "current" unclassified rank?
#' @param sep
#' character(s) separating new name and taxonomic rank level suffix (see suffix_rank)
#' @param anon_unique
#' make anonymous taxa unique by replacing unknowns with taxa_name?
#' otherwise they are replaced with paste("unknown", first_rank_name),
#' which is therefore the same for every anonymous taxon, meaning they will be merged if tax_agg is used.
#' (anonymous taxa are taxa with all unknown values in their tax_table row,
#' i.e. cannot be classified even at highest rank available)
#' @param verbose
#' emit warnings when cannot replace with informative name?
#'
#' @return object same class as ps
#' @export
#'
#' @seealso \code{\link{tax_fix_interactive}} for interactive tax_fix help
#'
#' @examples
#' library(dplyr)
#' library(phyloseq)
#'
#' data(dietswap, package = "microbiome")
#' ps <- dietswap
#'
#' # create unknowns to test filling
#' tt <- tax_table(ps)
#' ntax <- ntaxa(ps)
#' set.seed(123)
#' g <- sample(1:ntax, 30)
#' f <- sample(g, 10)
#' p <- sample(f, 3)
#' tt[g, 3] <- "g__"
#' tt[f, 2] <- "f__"
#' tt[p, 1] <- "p__"
#' tt[sample(1:ntax, 10), 3] <- "unknown"
#' # create a row with only NAs
#' tt[1, ] <- NA
#'
#' tax_table(ps) <- tax_table(tt)
#'
#' ps
#' # tax_fix with defaults should solve most problems
#' tax_table(ps) %>% head(50)
#'
#' # this will replace "unknown"s as well as short values including "g__" and "f__"
#' tax_fix(ps) %>%
#' tax_table() %>%
#' head(50)
#'
#' # This will only replace short entries, and so won't replace literal "unknown" values
#' ps %>%
#' tax_fix(unknowns = NULL) %>%
#' tax_table() %>%
#' head(50)
#'
#' # Change rank suffix and separator settings
#' tax_fix(ps, suffix_rank = "current", sep = " - ") %>%
#' tax_table() %>%
#' head(50)
#'
#' # by default, completely unclassified (anonymous) taxa are named by their
#' # taxa_names / rownames at all ranks.
#' # This makes anonymous taxa distinct from each other,
#' # and so they won't be merged on aggregation with tax_agg.
#' # If you think your anonymous taxa should merge on tax_agg,
#' # or you just want them to be named the all same for another reason,
#' # set anon_unique = FALSE (compare the warning messages)
#' tax_fix(ps, anon_unique = FALSE)
#' tax_fix(ps, anon_unique = TRUE)
#'
#' # here's a larger example tax_table shows its still fast with 1000s rows,
#' # from microbiomeutilities package
#' # library(microbiomeutilities)
#' # data("hmp2")
#' # system.time(tax_fix(hmp2, min_length = 1))
tax_fix <- function(ps,
min_length = 4,
unknowns = NA,
suffix_rank = "classified", # or current
sep = " ",
anon_unique = TRUE,
verbose = TRUE) {
if (!rlang::is_scalar_integerish(min_length) || min_length < 1L) {
stop("min_length must be an integer greater than 0")
}
if (is(ps, "psExtra")) stop("ps is a psExtra, run tax_fix BEFORE tax_agg etc")
if (methods::is(ps, "phyloseq")) {
tt <- unclass(phyloseq::tax_table(ps))
} else if (inherits(ps, "taxonomyTable")) {
tt <- unclass(ps)
} else {
stop("ps must be phyloseq or taxonomyTable class, it is class: ", paste(class(ps), collapse = " "))
}
if (identical(unknowns, NA)) {
unknowns <- tax_common_unknowns(min_length = min_length)
}
# get rownames to ensure order doesn't change
original_rownames <- rownames(tt)
ranknames <- colnames(tt)
levels <- ranknames # TODO either fix levels functionality or remove completely
tt[is.na(tt) | nchar(tt) < min_length | tt %in% unknowns] <- ""
rowLengthOut <- ncol(tt) # save number of cols before adding .rownames.
tt_extra <- cbind(tt, .rownames. = original_rownames)
# split the matrix tt (splits as a vector: see https://stackoverflow.com/questions/6819804/how-to-convert-a-matrix-to-a-list-of-column-vectors-in-r/6823557)
tt_list <- split(tt_extra, row(tt_extra))
# repair lost names
names(tt_list) <- original_rownames
# replace tax_table unknowns
tt_out <- vapply(
X = tt_list,
FUN.VALUE = character(length = rowLengthOut),
FUN = function(vec) {
vec <- unlist(vec)
is_unknown <- vec == "" # all unknowns now replaced first!
# replace unknowns with nearest known if required and possible
if (any(is_unknown[1:rowLengthOut])) {
if (all(is_unknown[1:rowLengthOut])) {
if (isTRUE(anon_unique)) {
out <- vec[rowLengthOut + 1]
} else {
out <- paste("unclassified", ranknames[[1]])
}
if (isTRUE(verbose)) {
message(
"Row named: ", vec[[rowLengthOut + 1]],
"\ncontains no non-unknown values, returning:\n'",
out, "' for all replaced levels.\n",
"Consider editing this tax_table entry manually."
)
}
vec <- rep(out, times = rowLengthOut)
} else {
# edit each unknown value in this row
vec <- tax_fix_value(
vec = vec, is_unknown = is_unknown, ranknames = ranknames,
levels = levels, sep = sep, suffix_rank = suffix_rank,
rowLengthOut = rowLengthOut
)
}
}
return(vec[1:rowLengthOut])
}
)
# make match original tt format (taxa as rows, ranks as cols)
if (inherits(tt_out, "matrix")) {
tt_out <- t(tt_out)
} else {
# vapply returns vector if tt had only 1 rank
tt_out <- as.matrix(tt_out) # returns 1-column matrix
}
# ensure original row order
tt_out <- tt_out[original_rownames, , drop = FALSE]
# repair colnames
colnames(tt_out) <- ranknames
# preserve any columns not listed in levels
preserved <- !ranknames %in% levels
if (any(preserved)) tt_out[, preserved] <- tt[, preserved]
# return phyloseq or tax table (same as input)
if (inherits(ps, "phyloseq")) {
phyloseq::tax_table(ps) <- tt_out
return(ps)
} else {
return(phyloseq::tax_table(tt_out))
}
}
#' internal helper for tax_fix
#'
#' @param vec row over which iteration is occuring
#' @param is_unknown logical vector of length length(row)
#' @param ranknames character vector of length length(row)
#' @param rowLengthOut number of ranks, originally
#'
#' @inheritParams tax_fix
#' @return fixed value
#' @noRd
tax_fix_value <- function(vec,
is_unknown,
ranknames,
levels,
rowLengthOut,
suffix_rank,
sep) {
vec_out <- vapply(
X = seq_along(vec),
FUN.VALUE = vec[1],
FUN = function(i) {
if (is_unknown[i]) {
known_above <- !is_unknown[1:(i - 1)]
if (!any(known_above)) {
if (ranknames[[i]] %in% levels) {
stop(
"Unknown values detected to the left of known values\n",
"in row named: ", vec[[rowLengthOut + 1]],
"\nThis should not happen. Check/fix this row:\n",
paste(vec[1:rowLengthOut], collapse = "; ")
)
}
return(vec[i])
}
nearest_known <- max(which(known_above))
tax <- vec[nearest_known]
if (identical(suffix_rank, "classified")) {
level <- ranknames[[nearest_known]]
} else {
level <- ranknames[[i]]
}
tax <- paste(tax, level, sep = sep)
return(tax)
} else {
return(vec[i])
}
}
)
return(vec_out)
}
#' Helper function returns vector of common unknown/uninformative tax table entries
#'
#' @description
#' Returns values often found in tax_tables (as assigned by e.g. DECIPHER or NGTAX2)
#' This function is used inside tax_fix (these values are replaced)
#' This function is used inside phyloseq_validate (values are reported only)
#'
#' @param min_length
#' minimum length taxa allowed to be to not be flagged or replaced on grounds of nchar!
#' (set for purposes of enclosing function, not this function)
#' @return character vector
#'
#' @noRd
tax_common_unknowns <- function(min_length) {
# check tax_table for uninformative entries
unknowns <- c("unknown", "Unknown")
rank_letters <- c("s", "g", "f", "o", "c", "p", "k")
rank_letters <- c(rank_letters, toupper(rank_letters))
rank_stubs <- paste0(rank_letters, "__")
rank_junk <- sapply(X = rank_stubs, paste0, c(unknowns, "NA"))
unknowns <- c(unknowns, as.vector(rank_junk))
# include shorter names if min_tax_length means nchar won't catch them
if (min_length <= 1) unknowns <- c(" ", "", unknowns)
if (min_length <= 2) unknowns <- c("NA", unknowns)
if (min_length <= 3) unknowns <- c(rank_stubs, unknowns, "NaN")
return(unknowns)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.