#' Differential prevalence testing
#'
#' @description Perform differential prevalence test on individual taxa in a phylseq object.
#' If \code{block} is specified, then Cochran-Mantel-Haenszel test is used.
#' If \code{block} is NULL, then Fisher's exact test is used.
#'
#' @param physeq Phyloseq object.
#' @param group Name of column in sample metadata to perform group-wise comparisons on.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#' Can be ignored or set to \code{NULL} when there are only 2 levels in the factor.
#' @param block Name of column in sample metadata to control for the group-wise comparisons.
#' @param p.adjust.method Method for adjusting P-values for multiple hypothesis testing.
#' @param minCount Minimal threshold for a taxon to be considered present in a sample.
#' Note: the check is \code{count > minCount} to distinguish zero from non-zero.
#'
#' @return Returns a list containing the following:
#' \itemize{
#' \item \code{table}: data frame containing the prevalence test results, with the following columns: \cr
#' \code{Taxon, pvalue, Direction, OR, OR_lb, OR_ub, padj, Significance, tax_table(phyloseq)}. \cr
#' \code{OR_lb} and \code{OR_ub} represent 95-percent confidence intervals for odds-ratio (OR).
#' \item \code{test_name}: Name of test used - one of \code{c("Fisher's exact test", "Cochran-Mantel-Haenszel test")}.
#' \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @importFrom stats mantelhaen.test fisher.test
#'
#' @examples
#' \dontrun{
#'
#' # If there are only 2 levels for \code{Diet}
#' res <- test_differential_prevalence(physeq, group = "Diet", block = "Gender")
#'
#' # If there are multiple levels for \code{Diet}
#' res <- test_differential_prevalence(physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- test_differential_prevalence(physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
test_differential_prevalence <-
function(physeq,
group,
compare = NULL,
block = NULL,
p.adjust.method = "fdr",
minCount = 0L) {
if (taxa_are_rows(physeq)) {
physeq <- t(physeq)
}
CT <- as(otu_table(physeq), "matrix")
group_fac <- sample_data(physeq)[[group]]
if (!is.null(compare)) {
group_var_levels <- compare
} else {
group_var_levels <- levels(group_fac)
}
if (length(group_var_levels) > 2) {
stop(paste0("test_differential_prevalence() can only generate results for comparing 2 groups - you asked for ", paste(group_var_levels, collapse=",")))
}
if (!is.null(block)) {
test_name = "Cochran-Mantel-Haenszel test"
} else {
test_name = "Fisher's exact test"
}
i <- group_var_levels[1]
j <- group_var_levels[2]
# Apply to every taxon
#
res_mat <- apply(CT, 2, function(taxon_counts) {
# taxon_counts <- CT[,index]
# taxon_counts = CT[,13]
x <- taxon_counts[group_fac == i]
y <- taxon_counts[group_fac == j]
presence <- taxon_counts
presence[presence > minCount] = 1
presence <- as.factor(presence)
data <-
as.data.frame(cbind(presence, as.character(sample_data(physeq)[[group]])))
pvalue <- NA
OR <- NA
OR_lb <- NA
OR_ub <- NA
direction <- NA
if (sum(x == 0) != 0 | sum(y == 0) != 0) {
if (!is.null(block)) {
data <- cbind(data, as.character(sample_data(physeq)[[block]]))
colnames(data) <- c("Presence", group, block)
block_fac <- factor(sample_data(physeq)[[block]])
block_levels <- levels(block_fac)
w1 <-
with(subset(data, get(block) == block_levels[1]),
table(Presence, get(group)))
w2 <-
with(subset(data, get(block) == block_levels[2]),
table(Presence, get(group)))
if (sum(w1) > 1 && sum(w2) > 1) {
dim_names = list(
Presence = c("Yes", "No"),
var2 = c(i, j),
var3 = c(block_levels[1], block_levels[2])
)
names(dim_names) <- c("Presence", group, block)
w <-
array(
c(as.vector(w1), as.vector(w2)),
dim = c(2, 2, 2),
dimnames = dim_names
)
test_results <-
mantelhaen.test(
w,
alternative = "two.sided",
correct = TRUE,
conf.level = 0.95
)
pvalue <- test_results$p.value
OR <- round(test_results$estimate[[1]], 2)
#print(paste(pvalue, OR))
if (!is.nan(OR)) {
if (OR > 1) {
direction <- j
OR_lb <- round(test_results$conf.int[1], 2)
OR_ub <- round(test_results$conf.int[2], 2)
} else {
direction <- i
OR <- round(1/OR, 2)
OR_lb <- round(1/test_results$conf.int[2], 2)
OR_ub <- round(1/test_results$conf.int[1], 2)
}
}
}
} else {
colnames(data) <- c("Presence", group)
w <- with(data, table(Presence, get(group)))
if (dim(w)[1] > 1) {
test_results <-
fisher.test(
w,
alternative = "two.sided",
conf.int = TRUE,
conf.level = 0.95
)
pvalue <- test_results$p.value
OR <- round(test_results$estimate[[1]], 2)
if (OR > 1) {
direction <- j
OR_lb <- round(test_results$conf.int[1], 2)
OR_ub <- round(test_results$conf.int[2], 2)
} else {
direction <- i
OR <- round(1/OR, 2)
OR_lb <- round(1/test_results$conf.int[2], 2)
OR_ub <- round(1/test_results$conf.int[1], 2)
}
}
}
}
c(
pvalue = pvalue,
Direction = direction,
OR = OR,
OR_lb = OR_lb,
OR_ub = OR_ub
)
})
res_mat <- t(res_mat)
padj <- p.adjust(res_mat[, "pvalue"], method = p.adjust.method)
significance <- get_significance_string(padj)
DF <-
data.frame(
Taxon = rownames(res_mat),
res_mat,
padj = padj,
Significance = significance,
stringsAsFactors = FALSE
)
DF <- cbind(DF, tax_table(physeq))
my_results <- vector(mode="list")
my_results$table <- DF
my_results$test_name <- test_name
my_results$block <- block
return(my_results)
}
#
# Run DESeq2 pipeline
#
#######################################
### gm_own: calculate geometric mean
#######################################
# NB: this function comes from
# <http://stackoverflow.com/questions/2602583/geometric-mean-is-there-a-built-in>
## Input:
# x numeric vector
# na.rm:
# zeros.count: This is IMPORTANT, if TRUE 0s count, so if x contains a 0 the GM will be lower than when zero.count = FALSE, in
# which case the gm is calculated for the same vector in which all 0 have been removed.
## Output:
# the geometric mean of x
#' Calculate geometric mean of a vector.
#'
#' @param x Numeric vector
#' @param na.rm Should NA values be removed? If FALSE, you get NA when "x" data contains NA.
#' If TRUE NA gets ignored (treated as 1 in multiplication). (but NOTE these zeros always count also when zero.count = FALSE)
#' @param zeros.count Should I count 0's when estimating the mean?
#'
#' @return Geometric mean of elements in \code{x}
#' @export
#'
#' @examples
#' x <- c(1,64,0)
#'
#' # Returns 64^3 = 4
#' gm_own(x, zeros.count = TRUE)
#'
#' # Returns 64^2 = 8
#' gm_own(x, zeros.count = FALSE)
gm_own = function(x,
na.rm = FALSE,
zeros.count = TRUE) {
# If any negative values, then return NaN
if (any(x < 0, na.rm = TRUE)) {
return(NaN)
}
if (zeros.count) {
exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
} else {
exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x[x > 0]))
}
}
run_contrast <- function(dds, physeq, group, block = NULL, group_var_levels) {
my_group_fac <- sample_data(physeq)[[group]]
DF <- as.data.frame(results(dds, contrast = c(group, group_var_levels), cooksCutoff = TRUE))
CT <- counts(dds, normalized = TRUE)
DF$Mean_grp1 <- apply(CT[, my_group_fac == group_var_levels[1]], 1, mean)
DF$Mean_grp2 <- apply(CT[, my_group_fac == group_var_levels[2]], 1, mean)
DF$sig <- get_significance_string(DF$padj)
DF$sig <- as.factor(DF$sig)
DF$dir <- rep(group_var_levels[2], nrow(DF))
DF$dir[DF$log2FoldChange > 0] <- group_var_levels[1]
DF$log2FoldChange = round(DF$log2FoldChange, digits=2)
DF$Taxon <- rownames(DF)
DF <- dplyr::select(DF, Taxon, teststat = stat, pvalue, padj,
Significance = sig, Direction = dir, Mean_grp1,
Mean_grp2, baseMean, log2FC = log2FoldChange)
# add the taxonomic information and order based on test-statistic (since some pvalues are NA, not sure why)
DF <- cbind(DF, tax_table(physeq))
DF <- dplyr::arrange(DF, Direction, desc(abs(teststat)))
my_results <- vector(mode="list")
my_results$table <- DF
my_results$dds <- dds
#my_results$physeq_adj <- physeq_DESadjust
my_results$test_name <- "Negative binomial Wald test"
my_results$contrast <- paste0(group_var_levels, collapse = " vs ")
my_results$block <- block
return(my_results)
}
#' Extract DESeq2 results
#'
#' @description Extract differential results for two classes from DESeq2 results object.
#'
#' @param dds DESeq2 results object.
#' @param physeq Phyloseq object.
#' @param group Name of column in sample metadata to perform group-wise comparisons on.
#' @param block Name of column in sample metadata to control for the group-wise comparisons.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#' @param cooksCutoff Cook's cutoff passed on to \code{contrast()} function in DESeq2.
#'
#' @return Returns a list containing the following:
#' \itemize{
#' \item \code{table}: data frame containing the negative binomial Wald test results from DESeq2,
#' with the following columns: \cr
#' \code{Taxon, teststat, pvalue, padj, significance, direction, Mean_grp1, Mean_grp2, baseMean, log2FC, tax_table(phyloseq)}.
#' \item \code{dds}: raw analysis results from DESeq2.
#' \item \code{physeq_adj}: adjusted counts from DESeq2.
#' \item \code{test_name}: Name of test used - in this case \code{"Negative binomial Wald test"}.
#' \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @import DESeq2
#' @import dplyr
#' @import phyloseq
#'
#' @examples
#' \dontrun{
#'
#' res <- run_contrast_2_levels(dds, physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- run_contrast_2_levels(dds, physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
run_contrast_2_levels <- function(dds, physeq, group, block = NULL, compare = NULL, cooksCutoff = TRUE) {
# - get the results of the DESeq2 differential abundance analysis and add further info -
my_group_fac <- sample_data(physeq)[[group]]
if (!is.null(compare)) {
group_var_levels <- compare
} else {
group_var_levels <- levels(sample_data(physeq)[[group]])
}
if (length(group_var_levels) > 2) {
stop(paste0("DESeq2 can only generate results for comparing 2 groups - you asked for ", paste(group_var_levels, collapse=",")))
}
DF <- as.data.frame(results(dds, contrast = c(group, group_var_levels), cooksCutoff = cooksCutoff))
CT <- counts(dds, normalized = TRUE)
DF$Mean_grp1 <- apply(CT[, my_group_fac == group_var_levels[1]], 1, mean)
DF$Mean_grp2 <- apply(CT[, my_group_fac == group_var_levels[2]], 1, mean)
DF$sig <- get_significance_string(DF$padj)
DF$sig <- as.factor(DF$sig)
DF$dir <- rep(group_var_levels[2], nrow(DF))
DF$dir[DF$log2FoldChange > 0] <- group_var_levels[1]
DF$log2FoldChange = round(DF$log2FoldChange, digits=2)
DF$Taxon <- rownames(DF)
DF <- dplyr::select(DF, Taxon, teststat = stat, pvalue, padj,
Significance = sig, Direction = dir, Mean_grp1,
Mean_grp2, baseMean, log2FC = log2FoldChange)
# add the taxonomic information and order based on test-statistic (since some pvalues are NA, not sure why)
DF <- cbind(DF, tax_table(physeq))
DF <- dplyr::arrange(DF, Direction, desc(abs(teststat)))
# Transformed counts
physeq_DESadjust <- physeq
otu_table(physeq_DESadjust) <- otu_table(t(counts(dds, normalized = TRUE)), taxa_are_rows = FALSE)
# Results
my_results <- vector(mode="list")
my_results$table <- DF
my_results$dds <- dds
my_results$physeq_adj <- physeq_DESadjust
my_results$test_name <- "Negative binomial Wald test"
my_results$block <- block
return(my_results)
}
#' DESeq2 - negative binomial Wald test
#'
#' @description Run negative binomial Wald test using DESeq2 on all taxa in a phylseq object.
#' If \code{block} is specified, then it is used as an additional factor.
#' If \code{block} is NULL, then standard test is used.
#'
#' @param physeq Phyloseq object.
#' @param group Name of column in sample metadata to perform group-wise comparisons on.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#' @param block Name of column in sample metadata to control for the group-wise comparisons.
#' @param cooksCutoff Cook's cutoff passed on to \code{contrast()} function in DESeq2.
#'
#' @return Returns a list containing the following:
#' \itemize{
#' \item \code{table}: data frame containing the negative binomial Wald test results from DESeq2,
#' with the following columns: \cr
#' \code{Taxon, teststat, pvalue, padj, significance, direction, Mean_grp1, Mean_grp2, baseMean, log2FC, tax_table(phyloseq)}.
#' \item \code{dds}: raw analysis results from DESeq2.
#' \item \code{physeq_adj}: adjusted counts from DESeq2.
#' \item \code{test_name}: Name of test used - in this case \code{"Negative binomial Wald test"}.
#' \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @import DESeq2
#' @import phyloseq
#'
#' @examples
#' \dontrun{
#'
#' # If there are only 2 levels for \code{Diet}
#' res <- test_differential_abundance_DESeq2(physeq, group = "Diet", block = "Gender")
#'
#' # If there are multiple levels for \code{Diet}
#' res <- test_differential_abundance_DESeq2(physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- test_differential_abundance_DESeq2(physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
test_differential_abundance_DESeq2 <-
function(physeq, group, compare = NULL, block = NULL, cooksCutoff = TRUE) {
# NB: DESeq2 can not deal with ordered factors, therefore we have to "unorder" the group_fac
sample_data(physeq)[[group]] <-
factor(sample_data(physeq)[[group]],
levels = levels(sample_data(physeq)[[group]]),
ordered = FALSE)
if (!is.null(block)) {
sample_data(physeq)[[block]] <-
factor(sample_data(physeq)[[block]],
levels = levels(sample_data(physeq)[[block]]),
ordered = FALSE)
}
if (!is.null(block)) {
formula <- paste0("~", block, "+", group)
} else {
formula <- paste0("~", group)
}
DES = phyloseq_to_deseq2(physeq, as.formula(formula))
# Estimate size factors etc
# Custom solution:
# calculate geometric means of the species over all samples
# Thorsten had zeros.count = FALSE. I changed it, because it gave taxa with 0 everywhere
# as significant!
#
#GM <- apply(otu_table(physeq), 2, gm_own, zeros.count = TRUE)
#dds <- estimateSizeFactors(DES, type = "ratio", geoMeans = GM)
#sizeFactors(dds) # would give you the used size factors for library size adjustment
#rm(GM)
# DESeq2 standard solution:
dds <- estimateSizeFactors(DES, type = "poscounts")
dds <- DESeq(dds,
fitType = "parametric",
test = "Wald",
quiet = TRUE,
minReplicatesForReplace = Inf
# parallel = TRUE, BPPARAM = SnowParam(workers = 3, type = "SOCK")
)
# --
# # - get the results of the DESeq2 differential abundance analysis and add further info -
# my_group_fac <- sample_data(physeq)[[group]]
# if (!is.null(compare)) {
# group_var_levels <- compare
# } else {
# group_var_levels <- levels(sample_data(physeq)[[group]])
# }
#
# if (length(group_var_levels) > 2) {
# stop(paste0("DESeq2 can only generate results for comparing 2 groups - you asked for ", paste(group_var_levels, collapse=",")))
# }
#
# DF <- as.data.frame(results(dds, contrast = c(group, group_var_levels), cooksCutoff = TRUE))
# CT <- counts(dds, normalized = TRUE)
#
# DF$Mean_grp1 <- apply(CT[, my_group_fac == group_var_levels[1]], 1, mean)
# DF$Mean_grp2 <- apply(CT[, my_group_fac == group_var_levels[2]], 1, mean)
# DF$sig <- get_significance_string(DF$padj)
# DF$sig <- as.factor(DF$sig)
# DF$dir <- rep(group_var_levels[2], nrow(DF))
# DF$dir[DF$log2FoldChange > 0] <- group_var_levels[1]
# DF$log2FoldChange = round(DF$log2FoldChange, digits=2)
# DF$Taxon <- rownames(DF)
# DF <- dplyr::select(DF, Taxon, teststat = stat, pvalue, padj,
# Significance = sig, Direction = dir, Mean_grp1,
# Mean_grp2, baseMean, log2FC = log2FoldChange)
#
# # add the taxonomic information and order based on test-statistic (since some pvalues are NA, not sure why)
# DF <- cbind(DF, tax_table(physeq))
# DF <- dplyr::arrange(DF, Direction, desc(abs(teststat)))
#
# my_results <- vector(mode="list")
# my_results$table <- DF
# my_results$dds <- dds
# my_results$physeq_adj <- physeq_DESadjust
# my_results$test_name <- "Negative binomial Wald test"
# my_results$block <- block
#
# return(my_results)
run_contrast_2_levels(dds, physeq, group, block, compare, cooksCutoff)
}
#######################################
### Kruskal-Wallis Test
#######################################
#' Kruskal-Wallis differential abundance test
#'
#' @description Perform Kruskal-Wallis test on individual taxa in a phylseq object.
#' If \code{block} is specified, then it is used as a blocking factor.
#' If \code{block} is NULL, then standard Kruskal-Wallis test is applied.
#'
#' @param physeq Phyloseq object.
#' @param group Name of column in sample metadata to perform group-wise comparisons on.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#' Can be ignored or set to \code{NULL} when there are only 2 levels in the factor.
#' @param block Name of column in sample metadata to control for the group-wise comparisons.
#' @param p.adjust.method Method for adjusting P-values for multiple hypothesis testing.
#'
#' @return Returns a list containing the following:
#' \itemize{
#' \item \code{table}: data frame containing the prevalence test results, with the following columns: \cr
#' \code{Taxon, H, pvalue, padj, Significance, tax_table(phyloseq)}. \cr
#' \item \code{test_name}: Name of test used - in this case \code{"Kruskal-Wallis test"}.
#' \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @importFrom coin wilcox_test
#'
#' @examples
#' \dontrun{
#'
#' # If there are only 2 levels for \code{Diet}
#' res <- test_differential_abundance_Kruskal(physeq, group = "Diet", block = "Gender")
#'
#' # If there are multiple levels for \code{Diet}
#' res <- test_differential_abundance_Kruskal(physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- test_differential_abundance_Kruskal(physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
test_differential_abundance_Kruskal <-
function(physeq,
group,
compare = NULL,
block = NULL,
p.adjust.method = "fdr") {
if (taxa_are_rows(physeq)) {
physeq <- t(physeq)
}
CT <- as(otu_table(physeq), "matrix")
group_fac <- sample_data(physeq)[[group]]
if (!is.null(compare)) {
group_var_levels <- compare
subset <- group_fac %in% group_var_levels
} else {
group_var_levels <- levels(group_fac)
subset <- NULL
}
# Apply to every taxon
#
res_mat <- apply(CT, 2, function(taxon_counts) {
data <- cbind("RA" = taxon_counts, sample_data(physeq))
formula <- paste0("RA ~ ", group)
if (!is.null(block)) {
formula <- paste0(formula, " | ", block)
}
formula <- as.formula(formula)
pval <- NA
H <- NA
standStat <- NA
wt <-
coin::kruskal_test(
formula,
data = data,
subset = subset,
conf.int = FALSE,
distribution = "asymptotic",
alternative = "two.sided"
)
pval <- pvalue(wt)
H <- statistic(wt)
c(
H,
pval = pval
)
})
res_mat <- as.data.frame(t(res_mat))
padj <-
p.adjust(res_mat$pval, method = p.adjust.method)
significance <- get_significance_string(padj)
colnames(res_mat) <-
c(
"H",
"pvalue"
)
DF <-
data.frame(
Taxon = as.character(rownames(res_mat)),
res_mat,
padj = padj,
Significance = significance
)
DF <- cbind(DF, tax_table(physeq))
DF <- dplyr::arrange(DF, desc(abs(H)))
DF$Direction = NA
my_results <- vector(mode="list")
my_results$table <- DF
my_results$test_name <- "Kruskal-Wallis test"
my_results$block <- block
return(my_results)
}
#######################################
### Wilcoxon Test
#######################################
#' Wilcoxon rank sum differential abundance test
#'
#' @description Perform Wilcoxon rank sum test on individual taxa in a phylseq object.
#' If \code{block} is specified, then it is used as a blocking factor.
#' If \code{block} is NULL, then standard Wilcoxon test is applied.
#'
#' @param physeq Phyloseq object.
#' @param group Name of column in sample metadata to perform group-wise comparisons on.
#' @param compare List of 2 groups to be compared, which are present in the \code{group} column.
#' Can be ignored or set to \code{NULL} when there are only 2 levels in the factor.
#' @param block Name of column in sample metadata to control for the group-wise comparisons.
#' @param excludeZeros logical. Should \code{abundance=0} for a taxon in a sample be ignored when performing Wilcoxon test?
#' @param p.adjust.method Method for adjusting P-values for multiple hypothesis testing.
#'
#' @return Returns a list containing the following:
#' \itemize{
#' \item \code{table}: data frame containing the prevalence test results, with the following columns: \cr
#' \code{Taxon, teststat, W, pvalue, Med_1, Med_2, padj, Significance, Direction, tax_table(phyloseq)}. \cr
#' The teststatistic is based on the standardized teststatistic, equation provided by \code{multtest::mt.minP}.
#' \item \code{test_name}: Name of test used - in this case \code{"Wilcoxon rank sum test"}.
#' \item \code{block} : Name of variable used as blocking factor.
#' }
#'
#' @export
#'
#' @importFrom coin wilcox_test
#'
#' @examples
#' \dontrun{
#'
#' # If there are only 2 levels for \code{Diet}
#' res <- test_differential_abundance_Wilcoxon(physeq, group = "Diet", block = "Gender")
#'
#' # If there are multiple levels for \code{Diet}
#' res <- test_differential_abundance_Wilcoxon(physeq, group = "Diet", compare=c("Control", "High_Fat"), block = "Gender")
#' res <- test_differential_abundance_Wilcoxon(physeq, group = "Diet", compare=c("Control", "High_Fibre"), block = "Gender")
#' }
test_differential_abundance_Wilcoxon <-
function(physeq,
group,
compare = NULL,
block = NULL,
excludeZeros = FALSE,
p.adjust.method = "fdr") {
if (taxa_are_rows(physeq)) {
physeq <- t(physeq)
}
CT <- as(otu_table(physeq), "matrix")
group_fac <- sample_data(physeq)[[group]]
if (!is.null(compare)) {
group_var_levels <- compare
subset <- group_fac %in% group_var_levels
} else {
group_var_levels <- levels(group_fac)
subset = NULL
}
if (length(group_var_levels) > 2) {
stop(paste0("test_differential_prevalence() can only generate results for comparing 2 groups - you asked for ", paste(group_var_levels, collapse=",")))
}
i <- group_var_levels[1]
j <- group_var_levels[2]
# Apply to every taxon
#
res_mat <- apply(CT, 2, function(taxon_counts) {
x <- taxon_counts[group_fac == i]
if (excludeZeros) {
x <- x[x != 0]
}
Median_grp1 <- median(x, na.rm = T) # NA in case all 0
y <- taxon_counts[group_fac == j]
if (excludeZeros) {
#if(all(y == 0)){y[1] <- ceiling(mean(taxon_counts))+1}
y <- y[y != 0]
}
Median_grp2 <- median(y, na.rm = T)
# Do not worry about the subsetting of x and y above under excludeZeros.
# The data used by the actual test is untouched - it is the original data.
data <- cbind("RA" = taxon_counts, sample_data(physeq))
formula <- paste0("RA ~ ", group)
if (!is.null(block)) {
formula <- paste0(formula, " | ", block)
}
formula <- as.formula(formula)
pval <- NA
W <- NA
standStat <- NA
if (length(x) != 0 && length(y) != 0) {
wt <-
coin::wilcox_test(
formula,
data = data,
subset = subset,
conf.int = FALSE,
distribution = "asymptotic",
alternative = "two.sided"
)
pval <- pvalue(wt)
W <- statistic(wt)
Ranks <- rank(c(x, y))
n1 <- length(x)
n2 <- length(y)
standStat <-
-1 * ((sum(Ranks[1:n1]) - n1 * (n1 + n2 + 1) / 2) / sqrt(n1 * n2 * (n1 +
n2 + 1) / 12))
}
c(
teststat = standStat,
W,
pval = pval,
Median_grp1,
Median_grp2
)
})
res_mat <- as.data.frame(t(res_mat))
padj <-
p.adjust(res_mat$pval, method = p.adjust.method)
significance <- get_significance_string(padj)
direction <- rep(i, length(padj))
direction[res_mat$teststat > 0] <- j
colnames(res_mat) <-
c(
"teststat",
"W",
"pvalue",
"Med_1",
"Med_2"
)
DF <-
data.frame(
Taxon = as.character(rownames(res_mat)),
res_mat,
padj = padj,
Significance = significance,
Direction = direction
)
DF <- cbind(DF, tax_table(physeq))
DF <- dplyr::arrange(DF, Direction, desc(abs(teststat)))
my_results <- vector(mode="list")
my_results$table <- DF
my_results$test_name <- "Wilcoxon rank sum test"
my_results$block <- block
return(my_results)
}
#' Format differential test results to be presentable
#'
#' @description Format differential abundance/prevalence results for elegant presentation, selecting only the significant hits.
#'
#' @param results Results from \code{test_differential_abundance_DESeq2()},
#' or \code{test_differential_abundance_Wilcoxon()} or \code{test_differential_prevalence()}.
#' @param p.adjust.threshold Adjusted P-value threshold to use when returning the significant hits.
#' @param p.adjust.method P-value adjustment method, in case you want it to be re-estimated.
#' Use this only if you know what you are doing!!
#'
#' @return Returns a list containing all the entities in the input \code{results} variable,
#' with the following changes:
#' \itemize{
#' \item \code{table}: Data frame consisting of taxa that pass given FDR threshold, with names formatted for elegant presentation.
#' All hits up to and including the P-value threshold will be returned.
#' Columns depend on the type of results.\cr
#' Common columns are: \code{Taxon, Annotation, padj, Significance, Direction}.\cr
#' Depending on the result type, the following extra columns will be present:
#' \itemize{
#' \item Prevalence test: \code{OR, OR_CE}
#' \item DESeq2: \code{log2FC}
#' \item Wilcoxon: \code{Med_1, Med_2}
#' }
#' \item \code{nhits}: Number of significant hits that pass the threshold for adjusted P-value.
#' }
#'
#' @export
#'
#' @examples
#' \dontrun{
#'
#' # Run DESeq2
#'
#' x <- test_differential_abundance_DESeq2(physeq, group = "Health_status", compare = c("T2D", "Healthy"))
#' y <- format_hits(x, p.adjust.method= "bonferroni") # Overwrite DESeq's FDR padj with Bonferroni padj
#' z <- format_hits_for_heatmap(y)
#' draw_taxa_heatmap(physeq, taxa_data = z, group = "Health_status", compare = c("T2D", "Healthy"))
#'
#' # Run Wilcoxon rank sum test
#'
#' x <- test_differential_abundance_Wilcoxon(physeq, group = "Health_status")
#' y <- format_hits(x)
#' z <- format_hits_for_heatmap(y)
#' draw_taxa_heatmap(physeq, taxa_data = z, group = "Health_status", block = "Enterotype")
#'
#' # Set up custom palette
#' pal = list(Health_status = "Set2", Enterotype = "Pastel1", Significance = "PuRd")
#'
#' # Run differential prevalence test
#'
#' x <- test_differential_prevalence(physeq, group = "Health_status")
#' y <- format_hits(x)
#' z <- format_hits_for_heatmap(y, p.adjust.threshold = 0.01)
#' draw_taxa_heatmap(physeq, taxa_data = z, group = "Health_status", custom_palette = pal)
#' }
format_hits <- function(results, p.adjust.threshold = 0.10, p.adjust.method = NULL) {
df <- results$table
df <- dplyr::filter(df, !is.na(pvalue) & !is.na(padj))
if (!is.null(p.adjust.method)) {
df$padj = p.adjust(df$pvalue, method = p.adjust.method)
}
df <- dplyr::filter(df, padj <= p.adjust.threshold)
df$Annotation <- apply(df, 1, get_pretty_taxon_name)
specific_fields <- NULL
# Handle Fisher/Cochran
if (!is.null(df$OR)) {
df$OR_lb <- as.numeric(df$OR_lb)
df <- dplyr::arrange(df, Direction, desc(OR_lb), padj, OR, Taxon)
df$OR_CE <- paste0("[", df$OR_lb, ", ", df$OR_ub, "]")
specific_fields = c("OR", "OR_CE")
}
# Handle DESeq2
if (!is.null(df$log2FC)) {
df <- dplyr::arrange(df, Direction, desc(abs(log2FC)), padj, Taxon)
specific_fields = c("log2FC")
}
# Handle Wilcoxon
if (!is.null(df$W)) {
df <- dplyr::arrange(df, Direction, desc(abs(teststat)), padj, Med_1, Taxon)
specific_fields = c("Med_1", "Med_2")
}
# Handle Kruskal
if (!is.null(df$H)) {
df <- dplyr::arrange(df, Direction, desc(abs(H)), padj, Taxon)
specific_fields = c("H")
}
df <-
dplyr::select(df,
Taxon,
Annotation,
padj,
Significance,
Direction,
specific_fields)
rownames(df) <- df$Taxon
my_results <- results
my_results$table <- df
my_results$nhits <- dim(df)[1]
return(my_results)
}
write_hit_table <- function(tab, filename = NULL) {
write.table(tab, file = filename, sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)
}
read_hit_table <-function(filename = NULL) {
read.table(file = filename, sep = "\t", header = TRUE, row.names = 1, quote = "")
}
#' Filter differential test results for specific taxa
#'
#' @description Filter differential rest results by removing taxa. For example, a genus level table obtained
#' using \code{tax_glom()} function will put all unknown genera under a family as \code{Genus = NA}
#' under that family. But this is a Frankenstein genus that could potentially contain multiple
#' genera. Thus it would be a good idea to remove such genera from the results table, and only
#' report those with \code{Genus != NA}.
#' @param results Results from \code{test_differential_abundance_DESeq2()},
#' or \code{test_differential_abundance_Wilcoxon()} or \code{test_differential_prevalence()}.
#' @param rank Name of taxonomic rank to be tested for any \code{NA} values.
#'
#' @return Results object with same variables as in the input \code{results} object, except that
#' the \code{table} variable is now free of any entries that have \code{NA} and the \code{rank} level.
#' @export
#'
#' @examples
#' \dontrun{
#' res <- test_differential_prevalence(physeq, group = "Diet")
#'
#' # Remove Genus=NA because it is a chimeric genus.
#' res <- filter_results(res, "Genus")
#'
#' # Now format them
#' hits <- format_hits(res)
#' }
filter_results <- function(results, rank) {
table <- results$table
filtered_table <- dplyr::filter(table, !is.na(get(rank)))
my_results <- results
my_results$table <- filtered_table
return(my_results)
}
#' Make knitr table of differential results
#'
#' @description Make a table using knitr reporting the differential abundance/prevalence test results.
#' Table caption is standardized based on the test and comparisons made.
#'
#' @param hits results object containing significant hits, obtained from \code{format_hits()}.
#' @param units units that were used for differential test, one of \code{c("OTUs", "Genera")}.
#' @param group1 First group that is being compared.
#' @param group2 Second group that is being compared.
#' @param bold_rows List of row identifiers that should be highlighted.
#'
#' @return \code{knitr} table object.
#' @export
#'
#' @examples
#' \dontrun{
#' x <- test_differential_prevalence(physeq, group = "Health_status", compare = c("T2D", "Healthy"))
#' y <- format_hits(x)
#' z <- knit_hits_table(y, units="OTUs", group1 = "T2D", group2 = "Healthy")
#' z
#'
#' # Finding overlap between two result sets and highlighting common taxa.
#'
#' res1 <- test_differential_abundance_Wilcoxon(physeq = physeq, group = "Health_status", compare = c("T2D", "Healthy"))
#' hits1 <- format_hits(res1)
#'
#' res2 <- test_differential_abundance_Wilcoxon(physeq = physeq, group = "Health_status", compare = c("Prediabetic", "Healthy"))
#' hits2 <- format_hits(res2)
#'
#' overlap <- intersect(hits1$table$Taxon, hits2$table$Taxon)
#'
#' o1 = which(rownames(hits1$table) %in% overlap)
#' o2 = which(rownames(hits2$table) %in% overlap)
#'
#' knit_hits_table(hits1, units="OTUs", group1="T2D", group2="Healthy", bold_rows = o1)
#' knit_hits_table(hits2, units="OTUs", group1="Prediabetic", group2="Healthy", bold_rows = o2)
#' }
knit_hits_table <- function(hits, units = NULL, group1 = NULL, group2 = NULL, bold_rows = NULL) {
df <- hits$table
test_name <- hits$test_name
# Handle Fisher/Cochran
if (!is.null(df$OR)) {
test_type <- "prevalence"
adverb <- "prevalent"
extra_fields <- "OR - Odds ratio; OR-CE - 95 percent confidence interval for odds ratio."
}
# Handle DESeq2
if (!is.null(df$log2FC)) {
test_type <- "abundance"
adverb <- "abundant"
extra_fields <- "log2FC - log2 of fold change."
}
# Handle Wilcoxon
if (!is.null(df$Med_1)) {
test_type <- "abundance"
adverb <- "abundant"
extra_fields <- paste0("Med-1 - Median in 1st group; Med-2 - Median in 2nd group.")
df$Med_1 <- format(df$Med_1, digits = 3)
df$Med_2 <- format(df$Med_2, digits = 3)
}
# Handle Kruskal-Wallis
if (!is.null(df$H)) {
test_type <- "abundance"
adverb <- "abundant"
extra_fields <- paste0(".")
}
if (is.null(test_name)) {
stop("Cannot handle the significant hits list. Please check format.")
}
my_caption = paste(
test_name,
":",
units,
"that are differentially",
adverb,
"between",
group1,
"and",
group2,
"individuals.",
"padj - adjusted P-value;",
"Significance level, *** - P < 0.001, ** - P < 0.01, * - P < 0.05; . - P < 0.10",
"Direction - direction of enrichment;",
extra_fields
)
df$padj <- format(df$padj, digits=3)
rownames(df) <- NULL
longtable = FALSE
if (hits$nhits > 40) {
longtable = TRUE
}
my_table <- NULL
if (longtable == TRUE) {
my_table <- knitr::kable(df, caption = my_caption, longtable = longtable) %>%
kable_styling(font_size = 7, latex_options = c("repeat_header")) %>%
{if(!is.null(bold_rows)) row_spec(., row=bold_rows, bold = T, color="white", background="black") else .}
} else {
my_table <- knitr::kable(df, caption = my_caption) %>%
kable_styling(font_size = 7) %>%
{if(!is.null(bold_rows)) row_spec(., row=bold_rows, bold = T, color="white", background="black") else .}
}
return(my_table)
}
#' Format significant hits results so that it can be shown as an annotation by \code{pheatmap()}.
#'
#' @param hits results object consisting the significant hits obtained from \code{format_hits()}.
#'
#' @return data frame consisting the significant hits and their statistical significance as
#' a character vector - \code{c(" NS", "*", "**", "***")}.
#' @export
#'
#' @examples
#' \dontrun{
#'
#' x <- test_differential_prevalence(physeq, group = "Diet", block = "Enterotype")
#' y <- format_hits(x)
#' z <- format_hits_for_heatmap(y)
#' p <- draw_taxa_heatmap(physeq, taxa_data = z, group = "Diet", block = "Enterotype")
#' }
format_hits_for_heatmap <- function(hits) {
df <- hits$table
heatmap_data <- dplyr::select(df, Taxon, Significance)
rownames(heatmap_data) <- heatmap_data$Taxon
heatmap_data$Taxon <- NULL
return(heatmap_data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.