#' @title A function to aggregate low prevalence, abundance, or unwanted taxa together
#' @name otu_filter
#' @description Will take a tidi_micro set and aggregate the raw counts of taxa with a low prevalence and/or abundance into a new "Other" taxa. Can also find specific taxa you'd like to include in the "Other" taxa counts. Once the counts are aggregated taxa relative abundance, centered log ratio (CLR) transformations, and presence will be recalculated. This recalculation will only change the "Other" category
#' @param micro_set A tidy_micro data set
#' @param prev_cutoff Minimum percent of subjects with OTU counts above 0
#' @param ra_cutoff At leat one subject must have RA above this subject
#' @param exclude_taxa A character vector of OTU names that you would like filter into your "Other" category
#' @details \eqn{\frac{1}{Total}}{1/Total} will be added to each taxa count for CLR tranformations in order to avoid issues with log(0)
#' @return Returns a tidy_micro set
#' @author Charlie Carpenter and Dan Frank
#' @examples
#' data(phy); data(cla); data(ord); data(fam); data(met)
#'
#' otu_tabs = list(Phylum = phy, Class = cla, Order = ord, Family = fam)
#' set <- tidy_micro(otu_tabs = otu_tabs, meta = met) %>%
#' filter(day == 7) ## Only including the first week
#'
#' filter_set <- set %>%
#' otu_filter(prev_cutoff = 5, ## 5% of subjects must have this bug, or it is filtered
#' ra_cutoff = 1, ## At least 1 subject must have RA of 1, or it is filtered
#' excluted_taxa = c("Unclassified", "Bacteria") ## Unclassified taxa we don't want
#' )
#' @export
otu_filter <- function(micro_set, prev_cutoff = 0, ra_cutoff = 0, exclude_taxa = NULL){
## Table, Lib, Taxa name, meta data left over
met <- micro_set %>% dplyr::select(-dplyr::matches("Table"), -dplyr::matches("Taxa"),
-dplyr::matches("Total"), -dplyr::matches("bin"),
-dplyr::matches("cts"), -dplyr::matches("clr"),
-dplyr::matches("ra")
) %>%
dplyr::distinct(Lib, .keep_all = T)
## Applies function to each rank
micro_set %>%
plyr::ddply(plyr::.(Table), function(set, meta){
## Pulls out Lib, Taxa, and counts and spreads to original otu structure
otu <- set %>%
dplyr::select(Lib, Taxa, cts) %>%
dplyr::filter(!is.na(Taxa)) %>%
tidyr::spread(Taxa, cts)
Lib <- otu$Lib; total <- rowSums(otu[,-1]); rr <- unique(as.character(set$Table))
## applies all filters specified
otu.f <- mul_filter(otu[,-1], prev_cutoff, ra_cutoff, total = total,
ex = exclude_taxa, rr = rr)
## recalculates ra, clr, and bin (just to recalculate the "Other")
ra <- my_ra(otu.f, total) %>% dplyr::mutate(Lib = Lib)
clr <- my_clr(otu.f, total) %>% dplyr::mutate(Lib = Lib)
bin <- my_bin(otu.f, total) %>% dplyr::mutate(Lib = Lib)
## Recalculating totals (for "Other" category) and remaking cts
Total <- rowSums(otu.f)
cts <- otu.f %>% dplyr::mutate(Lib = Lib, Total = Total)
## Melted data
m.ra <- ra %>% tidyr::gather(Taxa, ra, -Lib)
m.clr <- clr %>% tidyr::gather(Taxa, clr, -Lib)
m.bin <- bin %>% tidyr::gather(Taxa, bin, -Lib)
m.cts <- cts %>% tidyr::gather(Taxa, cts, -Lib, -Total)
## Joining all counts and transformations
long_OTU <- dplyr::left_join(m.bin, m.cts, by = c("Lib", "Taxa")) %>%
dplyr::left_join(m.clr, by = c("Lib", "Taxa")) %>%
dplyr::left_join(m.ra, by = c("Lib", "Taxa")) %>%
dplyr::mutate(Table = rr) %>%
dplyr::left_join(meta, by = "Lib")
return(long_OTU)
}, meta = met) %>%
## Reordering to original order
dplyr::select(Table, Lib, Taxa, Total, bin, cts, clr, ra, everything()) %>%
dplyr::filter(!is.na(Total)) %>%
return()
}
## Filter functions to be applied within tidy_mibi
mul_filter <- function(otu, prev_cutoff, ra_cutoff, total, ex, rr){
## Prints out curent filtering Table if any filtering is done
if( any( c(!is.null(ex), prev_cutoff > 0, ra_cutoff > 0) ) ){
cat("Filter for", rr, "counts\n\n")
}
## Different steps
if(!is.null(ex)) otu %<>% ex_filter(ex)
if(prev_cutoff > 0) otu %<>% prev_filter(prev_cutoff)
if(ra_cutoff > 0) otu %<>% ra_filter(ra_cutoff, total)
return(otu)
}
prev_filter <- function(otu, prev_cutoff){
if ((prev_cutoff > 100) | (prev_cutoff < 0)) stop("Prevalence Cutoff must be between 0 and 100")
starting_cts <- sum(otu)
prevs <- apply(otu, 2, function(x) 100*sum(x > 0)/length(x))
## calculating prevalence and converting to a percent
cat("Prevalence cutoff: ", prev_cutoff, "% (i.e., at least ", prev_cutoff,
"% of libaries must be represented to keep OTU)\n",
sep = "")
cat("Found", ncol(otu), "OTUs.\n")
if(sum(prevs < prev_cutoff) == ncol(otu)) stop("All OTUs would be excluded with this cutoff. Halting operation")
if(sum(prevs < prev_cutoff) > 0) {
include <- otu[,prevs >= prev_cutoff]
exclude <- otu[,prevs < prev_cutoff]
if(class(exclude) %in% c("matrix", "data.frame")) other <- rowSums(exclude)
else if(is.numeric(exclude)) other <- exclude
if ("Other" %in% names(include)) {
include[,"Other"] <- include[,"Other"] + other
cat("Found 'Other' category in input data.\n")
} else {
include$Other <- other
cat("Created new 'Other' Taxa in OTU table for", ncol(otu)+1, "counts.\n")
}
if("Other" %in% names(exclude)){
cat("Collapsed",sum(prevs < prev_cutoff)-1,"OTUs into 'Other' in OTU table.\n")
}
if("Other" %nin% names(exclude)){
cat("Collapsed",sum(prevs < prev_cutoff),"OTUs into 'Other' in OTU table.\n")
}
cat("Converted",sum(exclude),"counts to 'Other' in otu category.\n")
cat("Remaining OTUs:",ncol(include)," (Including 'Other').\n\n")
ending_cts <- sum(include)
if (ending_cts != starting_cts) stop("Ending counts not equal to starting counts!\n")
return(include)
}
else {
cat("No OTUs with prevalence < ", prev_cutoff, "\n\n", sep = "")
return(otu)
}
}
ra_filter <- function(otu, ra_cutoff, total) {
if ((ra_cutoff > 100) | (ra_cutoff < 0)){
stop("Relative abundance cutoff must be between 0 and 100 (%)")
}
starting_cts <- sum(otu) ## Starting counts
cat("Relative abundance cutoff:",ra_cutoff,
"\b% (i.e., at least one library must have RA >",ra_cutoff,
"\b% to keep OTU).\n")
cat("Found", ncol(otu), "OTUs.\n")
ra <- my_ra(otu,total) ## Relative abundance
ras <- apply(ra, 2, max) ## Max RA
below_cutoff <- sum(ras < ra_cutoff) #store TRUE/FALSE for whether an OTU has a max abundance < cutoff
if(below_cutoff == ncol(ra)) stop("All OTUs would be excluded with this RA cutoff. Halting operation!")
if(below_cutoff > 0) {
include <- otu[,ras >= ra_cutoff] ## OTUs above ra_cutoff
exclude <- otu[,ras < ra_cutoff] ## OTUs below ra_cutoff
if(class(exclude) %in% c("data.frame", "matrix")) other <- rowSums(exclude)
else if(is.numeric(exclude)) other <- exclude
if("Other" %in% names(otu)){
cat("Found 'Other' category in input data.\n")
} else cat("Created new 'Other' Taxa in OTU table for", ncol(otu)+1, "counts.\n")
if ("Other" %in% names(include)) {
include[,"Other"] <- include[,"Other"] + other
} else include$Other <- other
ending_cts <- sum(include)
if (ending_cts != starting_cts) stop("Ending counts not equal to starting counts!\n")
if("Other" %in% names(exclude)){
cat("Collapsed",below_cutoff - 1,"OTUs into 'Other' in OTU table.\n")
}
if("Other" %nin% names(exclude)){
cat("Collapsed",below_cutoff,"OTUs into 'Other' in OTU table.\n")
}
cat("Converted",sum(exclude),"counts to 'Other' otu category.\n")
cat("Remaining OTUs:",ncol(include)," (Including 'Other').\n\n")
return(include)
}
else {
cat("No OTUs with max relative abundance <", ra_cutoff, "\b%.\n\n")
return(otu)
}
}
ex_filter <- function(otu, ex){
starting_cts <- sum(otu) ## Starting counts
starting_nms <- names(otu) ## Starting names
if("Other" %in% ex){
warning("Can not remove 'Other' category from OTU tables. 'Other' will be the aggregated counts of filtered OTUs")
ex <- ex[ex != "Other"] ## Removing "Other" from ex
}
if(any(ex %in% colnames(otu))){
## Printing the unwanted taxa that were found
unc_sum <- 0
for(i in 1:length(ex)) {
if(ex[i] %in% names(otu)){
cat("Found '", ex[i], "' category in input data.\n", sep = "")
uncounts <- otu[,ex[i]] ## Pulling out the unwanted taxa
unc_sum <- unc_sum + uncounts ## Running counter for counts collapsed
if("Other" %in% colnames(otu)) { ## Combining unclassified with other if Other exists
if(class(uncounts) %in% c("data.frame", "matrix")){
otu[ ,"Other"] <- otu[ ,"Other"] + rowSums(uncounts) ## rowSums if many unclassified
} else otu[ ,"Other"] <- otu[ ,"Other"] + uncounts ## for only a vector of unclassified
# cat("Found 'Other' category in input data.\n")
} else {
if(class(uncounts) %in% c("data.frame", "matrix")){
otu$Other <- rowSums(uncounts) ## rowSums if many unclassified
} else otu$Other <- uncounts ## Making Other column if one doesn't exist
cat("Created new 'Other' category.\n")
}
} else{
cat("'", ex[i], "' not found in input data.","'", ex[i], "' might have been misspelled.
\n", sep = "")
}
}
otu <- otu[,names(otu) %nin% ex] ## Taking out ex so that it isn't there twice
cat("Found", length(starting_nms), "OTUs.\n")
cat("Collapsed",sum(ex %in% starting_nms),"OTUs into 'Other' in OTU table.\n")
cat("Converted",sum(unc_sum),"counts to 'Other' otu category.\n")
cat("Remaining OTUs:", ncol(otu)," (Including 'Other').\n\n")
} else {
cat("No taxa found in 'exlude_taxa' found: OTU counts unchanged.\n\n")
}
ending_cts <- sum(otu)
if (ending_cts != starting_cts) stop("Ending counts not equal to starting counts!\n")
otu
}
sum_fun <- function(Exp, obj, r){
if(!is.numeric(Exp[,obj])) stop("obj from tidy_mibi must be numeric")
if(!is.character(r)) stop("Table must be a character")
Exp <- Exp %>% filter(Table==r)
n <- tapply(Exp[,obj], Exp$Taxa, median) %>% names()
m <- tapply(Exp[,obj],Exp$Taxa,median) %>% as.data.frame()
i <- tapply(Exp[,obj],Exp$Taxa,IQR) %>% as.data.frame()
Q2 <- tapply(Exp[,obj],Exp$Taxa,quantile,0.025) %>% as.data.frame()
Q9 <- tapply(Exp[,obj],Exp$Taxa,quantile,0.975) %>% as.data.frame()
if(r != "phylum") cat("Note: Phylum level names are unclassified taxa.")
data.frame(
n = n[!is.na(m)],
Median = m[!is.na(m)],
IQR = i[!is.na(m)],
Quantile_2.5 = Q2[!is.na(Q2)],
Quantile_97.5 = Q9[!is.na(Q9)]
) %>% return()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.