#####
#Hacking phyloseq
#####
################################################################################
# Define S3 generic extract_eigenvalue function; formerly S4 generic get_eigenvalue()
# Function is used by `plot_scree` to get the eigenvalue vector from different
# types of ordination objects.
# Used S3 generic in this case because many ordination objects, the input, are
# not formally-defined S4 classes, but vaguely-/un-defined S3. This throws
# warnings during package build if extract_eigenvalue were S4 generic method,
# because the ordination classes don't appear to have any definition in phyloseq
# or dependencies.
#' @keywords internal
extract_eigenvalue = function(ordination) UseMethod("extract_eigenvalue", ordination)
# Default is to return NULL (e.g. for NMDS, or non-supported ordinations/classes).
extract_eigenvalue.default = function(ordination) NULL
# for pcoa objects
extract_eigenvalue.pcoa = function(ordination) ordination$values$Relative_eig
# for CCA objects
extract_eigenvalue.cca = function(ordination) c(ordination$CCA$eig, ordination$CA$eig)
# for RDA objects
extract_eigenvalue.rda = function(ordination) c(ordination$CCA$eig, ordination$CA$eig)
# for dpcoa objects
extract_eigenvalue.dpcoa = function(ordination) ordination$eig
# for decorana (dca) objects
extract_eigenvalue.decorana = function(ordination) ordination$evals
################################################################################
rm.na.phyloseq <- function(DF, key.var){
# (1) Remove elements from DF if key.var has NA
# DF[!is.na(DF[, key.var]), ]
DF <- subset(DF, !is.na(eval(parse(text=key.var))))
# (2) Remove NA from the factor level, if a factor.
if( class(DF[, key.var]) == "factor" ){
DF[, key.var] <- factor(as(DF[, key.var], "character"))
}
return(DF)
}
my_plot <- function (physeq, ordination, type = "samples", axes = 1:2, color = NULL,
shape = NULL, label = NULL, title = NULL, justDF = FALSE)
{
if (length(type) > 1) {
warning("`type` can only be a single option,\\n but more than one provided. Using only the first.")
type <- type[[1]]
}
if (length(color) > 1) {
warning("The `color` variable argument should have length equal to 1.",
"Taking first value.")
color = color[[1]][1]
}
if (length(shape) > 1) {
warning("The `shape` variable argument should have length equal to 1.",
"Taking first value.")
shape = shape[[1]][1]
}
if (length(label) > 1) {
warning("The `label` variable argument should have length equal to 1.",
"Taking first value.")
label = label[[1]][1]
}
official_types = c("sites", "species", "biplot", "split",
"scree")
if (!inherits(physeq, "phyloseq")) {
if (inherits(physeq, "character")) {
if (physeq == "list") {
return(official_types)
}
}
warning("Full functionality requires `physeq` be phyloseq-class ",
"with multiple components.")
}
type = gsub("^.*site[s]*.*$", "sites", type, ignore.case = TRUE)
type = gsub("^.*sample[s]*.*$", "sites", type, ignore.case = TRUE)
type = gsub("^.*species.*$", "species", type, ignore.case = TRUE)
type = gsub("^.*taxa.*$", "species", type, ignore.case = TRUE)
type = gsub("^.*OTU[s]*.*$", "species", type, ignore.case = TRUE)
type = gsub("^.*biplot[s]*.*$", "biplot", type, ignore.case = TRUE)
type = gsub("^.*split[s]*.*$", "split", type, ignore.case = TRUE)
type = gsub("^.*scree[s]*.*$", "scree", type, ignore.case = TRUE)
if (!type %in% official_types) {
warning("type argument not supported. `type` set to 'samples'.\\n",
"See `plot_ordination('list')`")
type <- "sites"
}
if (type %in% c("scree")) {
return(plot_scree(ordination, title = title))
}
is_empty = function(x) {
length(x) < 2 | suppressWarnings(all(is.na(x)))
}
specDF = siteDF = NULL
trash1 = try({
siteDF <- scores(ordination, choices = axes, display = "sites",
physeq = physeq)
}, silent = TRUE)
trash2 = try({
specDF <- scores(ordination, choices = axes, display = "species",
physeq = physeq)
}, silent = TRUE)
siteSampIntx = length(intersect(rownames(siteDF), sample_names(physeq)))
siteTaxaIntx = length(intersect(rownames(siteDF), taxa_names(physeq)))
specSampIntx = length(intersect(rownames(specDF), sample_names(physeq)))
specTaxaIntx = length(intersect(rownames(specDF), taxa_names(physeq)))
if (siteSampIntx < specSampIntx & specTaxaIntx < siteTaxaIntx) {
co = specDF
specDF <- siteDF
siteDF <- co
rm(co)
} else {
if (siteSampIntx < specSampIntx) {
siteDF <- specDF
specDF <- NULL
}
if (specTaxaIntx < siteTaxaIntx) {
specDF <- siteDF
siteDF <- NULL
}
}
if (is_empty(siteDF) & is_empty(specDF)) {
warning("Could not obtain coordinates from the provided `ordination`. \\n",
"Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.")
return(NULL)
}
if (is_empty(specDF) & type != "sites") {
message("Species coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)")
specDF <- data.frame(wascores(siteDF, w = veganifyOTU(physeq)),
stringsAsFactors = FALSE)
}
if (is_empty(siteDF) & type != "species") {
message("Species coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)")
siteDF <- data.frame(wascores(specDF, w = t(veganifyOTU(physeq))),
stringsAsFactors = FALSE)
}
specTaxaIntx <- siteSampIntx <- NULL
siteSampIntx <- length(intersect(rownames(siteDF), sample_names(physeq)))
specTaxaIntx <- length(intersect(rownames(specDF), taxa_names(physeq)))
if (siteSampIntx < 1L & !is_empty(siteDF)) {
warning("`Ordination site/sample coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.")
siteDF <- NULL
}
if (specTaxaIntx < 1L & !is_empty(specDF)) {
warning("`Ordination species/OTU/taxa coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.")
specDF <- NULL
}
if (is_empty(siteDF) & is_empty(specDF)) {
warning("Could not obtain coordinates from the provided `ordination`. \\n",
"Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.")
return(NULL)
}
if (type %in% c("biplot", "split") & (is_empty(siteDF) |
is_empty(specDF))) {
if (is_empty(siteDF)) {
warning("Could not access/evaluate site/sample coordinates. Switching type to 'species'")
type <- "species"
}
if (is_empty(specDF)) {
warning("Could not access/evaluate species/taxa/OTU coordinates. Switching type to 'sites'")
type <- "sites"
}
}
if (type != "species") {
sdf = NULL
sdf = data.frame(access(physeq, slot = "sam_data"), stringsAsFactors = FALSE)
if (!is_empty(sdf) & !is_empty(siteDF)) {
siteDF <- cbind(siteDF, sdf[rownames(siteDF), ])
}
}
if (type != "sites") {
tdf = NULL
tdf = data.frame(access(physeq, slot = "tax_table"),
stringsAsFactors = FALSE)
if (!is_empty(tdf) & !is_empty(specDF)) {
specDF = cbind(specDF, tdf[rownames(specDF), ])
}
}
if (!inherits(siteDF, "data.frame")) {
siteDF <- as.data.frame(siteDF, stringsAsFactors = FALSE)
}
if (!inherits(specDF, "data.frame")) {
specDF <- as.data.frame(specDF, stringsAsFactors = FALSE)
}
DF = NULL
DF <- switch(EXPR = type, sites = siteDF, species = specDF,
{
specDF$id.type <- "Taxa"
siteDF$id.type <- "Samples"
colnames(specDF)[1:2] <- colnames(siteDF)[1:2]
DF = merge(specDF, siteDF, all = TRUE)
if (!is.null(shape)) {
DF <- rp.joint.fill(DF, shape, "Samples")
}
if (!is.null(shape)) {
DF <- rp.joint.fill(DF, shape, "Taxa")
}
if (!is.null(color)) {
DF <- rp.joint.fill(DF, color, "Samples")
}
if (!is.null(color)) {
DF <- rp.joint.fill(DF, color, "Taxa")
}
DF
})
if (justDF) {
return(DF)
}
if (!is.null(color)) {
if (!color %in% names(DF)) {
warning("Color variable was not found in the available data you provided.",
"No color mapped.")
color <- NULL
}
}
if (!is.null(shape)) {
if (!shape %in% names(DF)) {
warning("Shape variable was not found in the available data you provided.",
"No shape mapped.")
shape <- NULL
}
}
if (!is.null(label)) {
if (!label %in% names(DF)) {
warning("Label variable was not found in the available data you provided.",
"No label mapped.")
label <- NULL
}
}
x = colnames(DF)[1]
y = colnames(DF)[2]
if (ncol(DF) <= 2) {
message("No available covariate data to map on the points for this plot `type`")
ord_map = aes_string(x = x, y = y)
}
else if (type %in% c("sites", "species", "split")) {
ord_map = aes_string(x = x, y = y, color = color, shape = shape,
na.rm = TRUE)
}
else if (type == "biplot") {
if (is.null(color)) {
ord_map = aes_string(x = x, y = y, size = "id.type",
color = "id.type", shape = shape, na.rm = TRUE)
}
else {
ord_map = aes_string(x = x, y = y, size = "id.type",
color = color, shape = shape, na.rm = TRUE)
}
}
p <- ggplot(DF, ord_map) + geom_point(na.rm = TRUE)
if (type == "split") {
p <- p + facet_wrap(~id.type, nrow = 1)
}
if (type == "biplot") {
if (is.null(color)) {
p <- update_labels(p, list(colour = "Ordination Type"))
}
p <- p + scale_size_manual("type", values = c(Samples = 5,
Taxa = 2))
}
if (!is.null(label)) {
label_map <- aes_string(x = x, y = y, label = label,
na.rm = TRUE)
p = p + geom_text(label_map, data = rm.na.phyloseq(DF,
label), size = 2, vjust = 1.5, na.rm = TRUE)
}
if (!is.null(title)) {
p = p + ggtitle(title)
}
if (length(extract_eigenvalue(ordination)[axes]) > 0) {
eigvec = extract_eigenvalue(ordination)
fracvar = eigvec[axes]/sum(eigvec)
percvar = round(100 * fracvar, 1)
strivar = as(c(p$label$x, p$label$y), "character")
strivar = paste0(strivar, " [", percvar, "%]")
p = p + xlab(strivar[1]) + ylab(strivar[2])
}
return(p)
}
#####
# Local methods
#####
# Multiple plot function
# (from http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/)
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
pam.clustering <-
function(x, k) {
# x is a distance matrix and k the number of clusters
require(cluster)
c = as.vector(pam(as.dist(x), k, diss = TRUE)$clustering)
return(c)
}
# Two functions below correspond to
# more efficient implementations of pairwise JSD and cohortwise JSD.
# Orders of magnitude faster than the one on enterotype tutorial website.
# Also does not use Pseudocount because only non-zero values are used.
# x - List of R.A. in this sample
# ShortX - Those in 'x' that are non-zero
# FlagX - Flag suggesting whether element in 'x' is non-zero, essentially result of 'x>0'
# Same for y
JSD <- function(x, ShortX, FlagX, y, ShortY, FlagY) {
M <- (x + y) / 2
sqrt((sum(ShortX * log(ShortX / M[FlagX])) + sum(ShortY * log(ShortY /
M[FlagY]))) / 2)
}
dist.JSD <- function(inMatrix, ...) {
matrixColSize <- length(colnames(inMatrix))
matrixRowSize <- length(rownames(inMatrix))
colnames <- colnames(inMatrix)
resultsMatrix <- matrix(0, matrixColSize, matrixColSize)
vec <- vector(mode = "list", length = matrixColSize)
shortvec <- vector(mode = "list", length = matrixColSize)
flags <- vector(mode = "list", length = matrixColSize)
for (i in 1:matrixColSize) {
v <- as.vector(inMatrix[, i])
f <- v > 0
s <- v[f]
vec[[i]] <- v
flags[[i]] <- f
shortvec[[i]] <- s
}
for (i in 1:(matrixColSize - 1)) {
resultsMatrix[i, i] <- 0
for (j in (i + 1):matrixColSize) {
d <-
JSD(vec[[i]], shortvec[[i]], flags[[i]], vec[[j]], shortvec[[j]], flags[[j]])
resultsMatrix[i, j] <- d
resultsMatrix[j, i] <- d
}
}
resultsMatrix[matrixColSize, matrixColSize] <- 0
colnames -> colnames(resultsMatrix) -> rownames(resultsMatrix)
resultsMatrix <- as.dist(resultsMatrix)
attr(resultsMatrix, "method") <- "dist"
return(resultsMatrix)
}
#' Within- and between-group beta-diversity analysis
#'
#' @param dist_list Named list of dist objects (one object for each distance)
#' @param physeq Phyloseq object
#' @param group_var Character identifying the grouping variable in the sample_data of the phyloseq
#'
#' @return list of two named lists, one with the plots and one with the data_frames of the p_values from pairwise t.tests
#' @export
#'
distance_t_analyse <- function(dist_list, physeq, group_var) {
TrList <- vector(mode = "list", length = length(dist_list))
pValList <- vector(mode = "list", length = length(dist_list))
for (i in 1:length(dist_list)) {
DistMat <- as(dist_list[[i]], "matrix")
rowCol <- expand.grid(rownames(DistMat), colnames(DistMat))
labs <- rowCol[as.vector(lower.tri(DistMat, diag = F)), ]
df <- cbind(labs, DistMat[lower.tri(DistMat, diag = F)])
colnames(df) <- c("Row", "Col", "Distance")
samdf <-
data.frame(Sample = sample_names(physeq), Group = sample_data(physeq)[[group_var]])
df$Row_Group <- samdf$Group[match(df$Row, samdf$Sample)]
df$Col_Group <- samdf$Group[match(df$Col, samdf$Sample)]
# add GroupvsGroup using sort (order + match) to make sure that Case vs Control and Control vs Case are equal
df$GroupvsGroup <-
apply(df[, c("Row_Group", "Col_Group")], 1, function(x) {
paste(x[order(match(x, levels(samdf$Group)))], collapse = " vs ")
})
# to make sure group comparisons are in the "order" of levels "group_var"
df$GroupvsGroupOrder <-
apply(df[, c("Row_Group", "Col_Group")], 1, function(x) {
x1 <- match(x[1], levels(samdf$Group))
x2 <- match(x[2], levels(samdf$Group))
if (x1 <= x2) {
as.numeric(paste(x1, x2, sep = ""))
} else {
as.numeric(paste(x2, x1, sep = ""))
}
})
df$Type <- "between"
df$Type[df$Row_Group == df$Col_Group] <- "within"
# NB: for within group distances, you have samples*(samples-1)/2 distances, for between group distances
# you have samplesgrp1 * samplesgrp2 distances.
GvsGLeveldf <-
unique(df[, c("GroupvsGroup", "GroupvsGroupOrder", "Type")])
GroupvsGroupLevels <-
GvsGLeveldf[order(GvsGLeveldf$GroupvsGroupOrder),]$GroupvsGroup
GvsGLeveldfBetween <-
GvsGLeveldf[GvsGLeveldf$Type == "between",]
GroupvsGroupLevelsBetween <-
GvsGLeveldfBetween[order(GvsGLeveldfBetween$GroupvsGroupOrder),]$GroupvsGroup
df$GroupvsGroup <-
factor(df$GroupvsGroup, levels = GroupvsGroupLevels, ordered = TRUE)
pairwise_dist_ttest <-
pairwise.t.test(
x = df$Distance,
g = df$GroupvsGroup,
alternative = "two",
p.adjust.method = "none",
var.equal = F,
pool.sd = F
)
pairwise_dist_ttest <- pairwise_dist_ttest$p.value
rowCol <-
expand.grid(rownames(pairwise_dist_ttest),
colnames(pairwise_dist_ttest))
labs <-
rowCol[as.vector(lower.tri(pairwise_dist_ttest, diag = T)), ]
df_p <-
cbind(labs, pairwise_dist_ttest[lower.tri(pairwise_dist_ttest, diag = T)])
colnames(df_p) <- c("Distances_2", "Distances_1", "p_value")
df_p <- df_p[, c("Distances_1", "Distances_2", "p_value")]
df_p$Significance <- ""
df_p$Significance[df_p$p_value <= 0.05] <- "*"
df_p$Significance[df_p$p_value <= 0.01] <- "**"
df_p$Significance[df_p$p_value <= 0.001] <- "***"
pValList[[i]] <- df_p
# in case there are more than two groups in group_var I want a faceted plot, i.e. one facet for each GroupvsGroupLevelsBetween, for this I need to duplicate some data
if (length(GroupvsGroupLevels) > 1) {
df_list <- lapply(GroupvsGroupLevelsBetween, function(level) {
grps <- unlist(strsplit(level, " vs "))
df_current <-
df[df$Row_Group %in% grps & df$Col_Group %in% grps,]
df_current$Level <- level
df_current
})
df_plot <- do.call(rbind, df_list)
df_plot$Level <-
factor(df_plot$Level, levels = GroupvsGroupLevelsBetween, ordered = TRUE)
Tr <-
ggplot(df_plot,
aes(x = GroupvsGroup, y = Distance, col = GroupvsGroup))
Tr <- Tr + geom_boxplot(outlier.color = NA) +
# geom_jitter(position = position_jitter(width = 0.3, height = 0), alpha = 0.65) +
facet_wrap( ~ Level, ncol = 2, scales = "free_x") +
theme_bw() +
xlab("") +
ylab(paste(names(dist_list)[i], "distance", sep = " ")) +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 0.5
),
legend.title = element_blank()
)
if (length(GroupvsGroupLevels) <= 7) {
Tr <- Tr +
scale_color_manual(values = cbPalette[2:8])
}
} else {
Tr <-
ggplot(df, aes(x = GroupvsGroup, y = Distance, col = GroupvsGroup))
Tr <- Tr + geom_boxplot(outlier.color = NA) +
geom_jitter(position = position_jitter(width = 0.3, height = 0),
alpha = 0.65) +
theme_bw() +
xlab("") +
ylab(paste(names(dist_list)[i], "distance", sep = " ")) +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 0.5
),
legend.title = element_blank()
)
if (length(GroupvsGroupLevels) <= 7) {
Tr <- Tr +
scale_color_manual(values = cbPalette[2:8])
}
}
TrList[[i]] <- Tr
}
names(TrList) <- names(dist_list)
names(pValList) <- names(dist_list)
out <- list(DistanceBoxplots = TrList, DistancePValues = pValList)
}
#' Make knitr table from Permanova results
#'
#' @param adonis Results object from a call to \code{vegan::adonis()}.
#'
#' @return knitr table object.
#' @export
#'
#' @examples
#' \dontrun{
#' adonis_results <- vegan::adonis(dist ~ Diet + Gender)
#' get_permanova_table(adonis_results)
#' }
get_permanova_table <- function(adonis) {
kable_table <- adonis[[1]]
kable_table[["Pr(>F)"]] <- format(kable_table[["Pr(>F)"]], digits = 3)
knitr::kable(kable_table,
digits = c(0, 3, 3, 3, 5, 3),
caption = "Permanova results")
}
####################################
## calculate_raw_TbTmatrixes:
###################################
## Input:
# - physeq = a phyloseq object
# - group_var, name of the group_fac in sample_data(physeq)
## Output:
# - list of TbTmatrixes, one list for each combi of levels in group_fac, list items are named by level_vs_level
calculate_raw_TbTmatrixes = function(physeq, group_var) {
if (taxa_are_rows(physeq)) {
physeq <- t(physeq)
}
group_fac <- factor(sample_data(physeq)[[group_var]])
fac_levels <- levels(group_fac)
combinations <- combn(fac_levels, m=2, simplify=FALSE)
TbTmatrixes_list <- vector("list", length = length(combinations))
for (k in 1:length(combinations)) {
i <- combinations[[k]][1]
j <- combinations[[k]][2]
CT <-
t(as(otu_table(physeq), 'matrix')) # now taxa are rows and samples are columns
group_fac_current <-
droplevels(group_fac[group_fac %in% c(i, j)])
CT <- CT[, group_fac %in% group_fac_current]
TbTmatrixes <-
lapply(1:nrow(CT), function(i) {
apply(CT, 2, function(samp_cnts) {
samp_cnts[i] / samp_cnts
})
})
# produces for each taxon (= host taxon) a TbTMatrix
# NB: there are -Inf, and NaN values in the matrixes, specifically
# 0/x = 0, x/0 = Inf; 0/0 = NaN!
names(TbTmatrixes) <- rownames(TbTmatrixes[[1]])
TbTmatrixes_list[[k]] <- TbTmatrixes
names(TbTmatrixes_list)[[k]] <-
paste(i, "_vs_", j, sep = "")
}
TbTmatrixes_list
}
################################## plot functions ##########################
cbPalette <-
c(
"#999999",
"#E69F00",
"#56B4E9",
"#009E73",
"#F0E442",
"#0072B2",
"#D55E00",
"#CC79A7"
)
#######################################
#### plot_abundance_prev_filter
#######################################
plot_abundance_prev_filter <-
function(physeq, prevalence, taxa_sums_quantile = 100) {
df_ab_prev <- data.frame(
SV_ID = 1:ntaxa(physeq),
total_counts_of_ASV = taxa_sums(physeq),
prevalence = colSums(as(otu_table(physeq), "matrix") != 0),
sparsity = colSums(as(otu_table(physeq), "matrix") == 0),
mean_count_nonzero = apply(as(otu_table(physeq), "matrix"), 2, function(x) {
mean(x[x > 0])
}),
median_count_nonzero = apply(as(otu_table(physeq), "matrix"), 2, function(x) {
median(x[x > 0])
})
)
for (i in 1:ncol(df_ab_prev)) {
df_ab_prev[is.nan(df_ab_prev[, i]), i] <- NA
}
df_ab_prev <- cbind(df_ab_prev, tax_table(physeq))
prev_thresh <- (prevalence / 100) * nsamples(physeq)
abund_thresh <-
quantile(taxa_sums(physeq), probs = taxa_sums_quantile / 100)
df_ab_prev_filt <-
dplyr::filter(df_ab_prev,
prevalence > prev_thresh | total_counts_of_ASV > abund_thresh)
no_samples <- df_ab_prev$prevalence[1] + df_ab_prev$sparsity[1]
shade_df <- data.frame(total_counts_of_ASV = 0, prevalence = 0)
Tr_prev_vs_log10_ab <-
ggplot(df_ab_prev, aes(x = total_counts_of_ASV, y = prevalence))
Tr_prev_vs_log10_ab <- Tr_prev_vs_log10_ab +
geom_point(col = cbPalette[2],
size = 2,
alpha = 0.7) +
scale_x_log10() +
geom_rect(
data = shade_df,
xmin = -Inf,
xmax = log10(abund_thresh),
ymin = -Inf,
ymax = prev_thresh,
fill = "#660033",
alpha = 0.4
) +
geom_hline(
yintercept = (prevalence / 100) * nsamples(physeq),
col = cbPalette[1],
lty = "dashed"
) +
geom_vline(
xintercept = quantile(taxa_sums(physeq), probs = taxa_sums_quantile / 100),
col = cbPalette[1],
lty = "dashed"
) +
xlab("total abundance in cohort") +
theme_bw(12) +
ggtitle(
paste(
nrow(df_ab_prev_filt),
" of ",
nrow(df_ab_prev),
" features (",
round(100 * nrow(df_ab_prev_filt) / nrow(df_ab_prev), 1),
" %) and ",
round(sum(df_ab_prev_filt$total_counts_of_ASV)),
" of ",
round(sum(df_ab_prev$total_counts_of_ASV)),
" total abundance (",
round((
sum(df_ab_prev_filt$total_counts_of_ASV) / sum(df_ab_prev$total_counts_of_ASV)
) * 100, 1),
" %) remain",
sep = ""
)
)
Tr_prev_vs_log10_ab_col <-
ggplot(df_ab_prev, aes(x = total_counts_of_ASV, y = prevalence))
Tr_prev_vs_log10_ab_col <- Tr_prev_vs_log10_ab_col +
geom_point(aes(col = Phylum), size = 2, alpha = 0.7) +
scale_x_log10() +
geom_rect(
data = shade_df,
xmin = -Inf,
xmax = log10(abund_thresh),
ymin = -Inf,
ymax = prev_thresh,
fill = "#660033",
alpha = 0.4
) +
geom_hline(
yintercept = (prevalence / 100) * nsamples(physeq),
col = cbPalette[1],
lty = "dashed"
) +
geom_vline(
xintercept = quantile(taxa_sums(physeq), probs = taxa_sums_quantile / 100),
col = cbPalette[1],
lty = "dashed"
) +
xlab("total abundance in cohort") +
facet_wrap( ~ Phylum) +
theme_bw(12) +
theme(legend.position = "none")
out <- list(Tr_prev_vs_log10_ab = Tr_prev_vs_log10_ab,
Tr_prev_vs_log10_ab_col = Tr_prev_vs_log10_ab_col)
}
#######################################
### plot_top_abundances_boxAndviolin##
#################
# generates box and violin plots of the taxons in physeq in the order given by tax_order and named by tax_names
# if tax_order and tax_names are NULL the order and names of physeq will be used
# facet_cols determines ncol in facet_wrap
# OUTPUT:
# generates for each level combination of the grouping factor (defined by group_var) seven plots, so for each combi a list of 7 plots
plot_top_abundances_boxAndviolin <-
function(physeq,
group_var,
tax_order = NULL,
tax_names = NULL,
facet_cols = 5) {
if (taxa_are_rows(physeq)) {
physeq <- t(physeq)
}
# I prefer taxa_are_rows = FALSE so rows (= observations = samples), and colums = variables = taxa
if (!is.factor(sample_data(physeq)[[group_var]])) {
sample_data(physeq)[[group_var]] <-
as.factor(sample_data(physeq)[[group_var]])
}
CT <- as(otu_table(physeq), "matrix") # taxa are columns
DF_CT <- as.data.frame(t(CT))
if (is.null(tax_order)) {
tax_order <- taxa_names(physeq)
}
if (length(tax_order) != nrow(DF_CT) ||
!all(tax_order %in% rownames(DF_CT))) {
stop(
"the given tax_order must contain all taxa_names(physeq). If you want to subset,
do it before with prune_taxa"
)
}
DF_CT <- DF_CT[tax_order,]
if (is.null(tax_names)) {
tax_names <- paste("Taxon_", 1:nrow(DF_CT), sep = "")
}
if (length(tax_names) != nrow(DF_CT)) {
stop("the given tax_names must be a character vector of length = ntaxa(physeq)")
}
rownames(DF_CT) <- make.unique(tax_names)
DF_CT$Taxa <- rownames(DF_CT)
DF_CT <- tidyr::gather(DF_CT, key = Sample , value = Count, -Taxa)
DF_CT$Taxa <-
factor(DF_CT$Taxa,
levels = unique(DF_CT$Taxa),
ordered = TRUE)
LookUpDF <-
data.frame(Sample = sample_names(physeq), Group = sample_data(physeq)[[group_var]])
DF_CT$Group <-
LookUpDF$Group[match(DF_CT$Sample, LookUpDF$Sample)]
group_fac <- factor(sample_data(physeq)[[group_var]])
fac_levels <- levels(group_fac)
# -- get the level combis --
fac_levels_num <- setNames(seq_along(fac_levels), fac_levels)
i_s <- outer(fac_levels_num, fac_levels_num, function(ivec, jvec) {
sapply(seq_along(ivec), function(x) {
i <- ivec[x]
})
})
j_s <- outer(fac_levels_num, fac_levels_num, function(ivec, jvec) {
sapply(seq_along(ivec), function(x) {
j <- jvec[x]
})
})
i_s <- i_s[upper.tri(i_s)]
j_s <- j_s[upper.tri(j_s)]
# ----
if (length(levels(LookUpDF$Group)) <= 7) {
color_lookup <-
data.frame(level = levels(LookUpDF$Group), color = cbPalette[2:(length(levels(LookUpDF$Group)) + 1)])
} else {
color_lookup <-
data.frame(level = levels(LookUpDF$Group),
color = viridis(length(levels(LookUpDF$Group))))
}
for (e in seq_along(color_lookup)) {
color_lookup[, e] <- as.character(color_lookup[, e])
}
colors_to_use <- color_lookup$color
names(colors_to_use) <- color_lookup$level
plot_list <- vector("list", length = length(i_s))
for (k in seq_along(i_s)) {
i <- i_s[k]
j <- j_s[k]
group_fac_current <-
droplevels(group_fac[as.numeric(group_fac) %in% c(i, j)])
DF_CT_current <- filter(DF_CT, Group %in% group_fac_current)
DF_CT_current$Group <-
factor(
DF_CT_current$Group,
levels = c(fac_levels[i], fac_levels[j]),
ordered = TRUE
)
Tr <-
ggplot(DF_CT_current, aes(x = Taxa, y = Count, col = Group))
Tr <- Tr +
geom_boxplot(outlier.color = NA) +
geom_point(size = 1,
alpha = 0.6,
position = position_jitterdodge()) +
scale_color_manual("", values = colors_to_use) +
xlab("") +
ylab("abundance") +
theme_bw() +
theme(axis.text.x = element_text(
angle = 90,
hjust = 0,
vjust = 0
))
Tr1 <-
ggplot(DF_CT_current, aes(x = Group, y = Count, col = Group))
Tr1 <- Tr1 +
geom_boxplot(outlier.color = NA) +
geom_point(size = 1,
alpha = 0.6,
position = position_jitterdodge()) +
facet_wrap( ~ Taxa, ncol = facet_cols, scales = "free_y") +
scale_color_manual("", values = colors_to_use) +
xlab("") +
ylab("abundance") +
theme_bw()
Tr2 <-
ggplot(DF_CT_current, aes(x = Taxa, y = Count, col = Group))
Tr2 <- Tr2 +
geom_violin(fill = NA) +
geom_point(size = 1,
alpha = 0.6,
position = position_jitterdodge()) +
scale_color_manual("", values = colors_to_use) +
xlab("") +
ylab("abundance") +
theme_bw() +
theme(axis.text.x = element_text(
angle = 90,
hjust = 0,
vjust = 0
))
Tr3 <-
ggplot(DF_CT_current, aes(x = Group, y = Count, col = Group))
Tr3 <- Tr3 +
geom_violin(fill = NA) +
geom_point(size = 1,
alpha = 0.6,
position = position_jitterdodge()) +
facet_wrap( ~ Taxa, ncol = facet_cols, scales = "free_y") +
scale_color_manual("", values = colors_to_use) +
xlab("") +
ylab("abundance") +
theme_bw() +
theme(legend.position = "none")
# same for log10
# if (min(DF_CT_current$Count[DF_CT_current$Count > 0]) > 1e-6) {
# DF_CT_current$Count[DF_CT_current$Count == 0] <- 1e-6
# } else {
DF_CT_current$Count[DF_CT_current$Count == 0] <-
min(DF_CT_current$Count[DF_CT_current$Count > 0])
# }
Tr4 <-
ggplot(DF_CT_current, aes(x = Taxa, y = Count, col = Group))
Tr4 <- Tr4 +
geom_boxplot(outlier.color = NA) +
geom_point(size = 1,
alpha = 0.6,
position = position_jitterdodge()) +
scale_color_manual("", values = colors_to_use) +
xlab("") +
ylab("abundance") +
theme_bw() +
theme(axis.text.x = element_text(
angle = 90,
hjust = 0,
vjust = 0
)) +
scale_y_log10()
Tr5 <-
ggplot(DF_CT_current, aes(x = Group, y = Count, col = Group))
Tr5 <- Tr5 +
geom_boxplot(outlier.color = NA) +
geom_point(size = 1,
alpha = 0.6,
position = position_jitterdodge()) +
facet_wrap( ~ Taxa, ncol = facet_cols, scales = "free_y") +
scale_color_manual("", values = colors_to_use) +
xlab("") +
ylab("abundance") +
theme_bw() +
scale_y_log10() +
theme(legend.position = "none")
Tr6 <-
ggplot(DF_CT_current, aes(x = Taxa, y = Count, col = Group))
Tr6 <- Tr6 +
geom_violin(fill = NA) +
geom_point(size = 1,
alpha = 0.6,
position = position_jitterdodge()) +
scale_color_manual("", values = colors_to_use) +
xlab("") +
ylab("abundance") +
theme_bw() +
theme(axis.text.x = element_text(
angle = 90,
hjust = 0,
vjust = 0
)) +
scale_y_log10()
Tr7 <-
ggplot(DF_CT_current, aes(x = Group, y = Count, col = Group))
Tr7 <- Tr7 +
geom_violin(fill = NA) +
geom_point(size = 1,
alpha = 0.6,
position = position_jitterdodge()) +
facet_wrap( ~ Taxa, ncol = facet_cols, scales = "free_y") +
scale_color_manual("", values = colors_to_use) +
xlab("") +
ylab("abundance") +
theme_bw() +
scale_y_log10() +
theme(legend.position = "none")
plot_list[[k]] <- list(Tr, Tr1, Tr2, Tr3, Tr4, Tr5, Tr6, Tr7)
names(plot_list)[k] <-
paste(fac_levels[i], "_vs_", fac_levels[j], sep = "")
}
plot_list
}
#######################################
#### plot_bar_own
#######################################
# based on plot_bar from phyloseq, difference, orders and colors the Samples based on group_var, and orders the fill based on abundance
# I guess inputs can be guessed on
plot_bar_own <-
function(physeq,
x = "Sample",
y = "Abundance",
group_var,
fill = NULL,
color_sample_names = TRUE,
facet_grid = NULL) {
if (taxa_are_rows(physeq)) {
physeq <- t(physeq)
}
if (!is.factor(sample_data(physeq)[[group_var]])) {
sample_data(physeq)[[group_var]] <-
as.factor(sample_data(physeq)[[group_var]])
}
if (is.null(fill)) {
fill = "Phylum"
}
mdf <- psmelt(physeq)
# order samples accoridng to levels
LookUpDF <-
data.frame(Sample = sample_names(physeq), Group = sample_data(physeq)[[group_var]])
LookUpDF <-
LookUpDF[order(match(LookUpDF$Group, levels(LookUpDF$Group))),]
mdf$Sample <-
factor(mdf$Sample, levels = LookUpDF$Sample, ordered = TRUE)
# order fill levels according to abundance over all samples
mdf[, fill] <- as.character(mdf[, fill])
mdf[is.na(mdf[, fill]), fill] <- "NA"
sums <-
group_by_(mdf, fill) %>% summarise(sum_abundance = sum(Abundance)) %>% arrange(sum_abundance)
mdf[, fill] <-
factor(mdf[, fill], levels = as.character(sums[[1]]), ordered = TRUE)
if (length(levels(LookUpDF$Group)) <= 7 && color_sample_names) {
color_lookup <-
data.frame(level = levels(LookUpDF$Group), color = cbPalette[2:(length(levels(LookUpDF$Group)) + 1)])
#LookUpDF$Group[match(levels(mdf$Sample), LookUpDF$Sample)]
colxaxis <-
as.character(color_lookup$color[match(LookUpDF$Group[match(levels(mdf$Sample), LookUpDF$Sample)], color_lookup$level)])
} else {
colxaxis <- rep("black", nrow(LookUpDF))
}
if (length(levels(mdf[, fill])) <= 8) {
fill_colors <- cbPalette[1:length(levels(mdf[, fill]))]
names(fill_colors) <- rev(levels(mdf[, fill]))
} else {
fill_colors <- rev(viridis(length(levels(mdf[, fill]))))
names(fill_colors) <- rev(levels(mdf[, fill]))
}
Tr <- ggplot(mdf, aes_string(x = x, y = y, fill = fill))
Tr <- Tr +
geom_bar(stat = "identity", position = "stack") +
theme_bw() +
scale_fill_manual(values = fill_colors) +
xlab("") +
theme(axis.text.x = element_text(
angle = 90,
hjust = 0,
vjust = 0,
colour = colxaxis
))
if (!is.null(facet_grid)) {
Tr <- Tr + facet_grid(facet_grid)
}
return(Tr)
}
####################################
## create_taxa_ratio_violin_plots:
###################################
# wilcoxon.test is used
## Input:
# - TbTmatrixes_list: The list with the lists of TbTmatrixes for each level combi in group factor
# NB: here it should be raw_TbTmatrixes with 0 = 0/x, Inf = x/0, NaN = 0/0, all these values will be ignored in the ratio plots by being set to NA!!
# - physeq: used for TbTmatrixes_list generation
# - group_var: the factor used to separate the samples
# - tax_names: the names of the taxa in physeq you want to use, e.g. taxa_names(physeq) if you like them, if NULL T1 to Tn will be used
# - taxa_nom: the nominator taxon of the abundance ratios, NB: only 1 allowed, must be included in tax_names
# - taxa_den: the denominator taxa of the abundance ratios, several allowed, all must be included in tax_names, if NULL all are used,
# i.e you get plots facet_wrapped around the taxa_den taxa
# - test: either "t.test" or "wilcoxon"
# - p_adjust: default "BH", method in p.adjust
## Output:
# - list of pVals data frame (the pValues from t.test or wilcox.test after p.adjust) plus Violin plot,
# so for each level combi in group_var a list of length 2
create_taxa_ratio_violin_plots <-
function(TbTmatrixes_list,
physeq,
group_var,
tax_names = NULL,
taxa_nom = "Firmicutes",
taxa_den = NULL,
test = "t.test",
p_adjust = "BH") {
if (!identical(length(TbTmatrixes_list[[1]]), ntaxa(physeq))) {
stop("TbTmatrixes can not fit to physeq")
}
group_fac <- factor(sample_data(physeq)[[group_var]])
fac_levels <- levels(group_fac)
# -- get the level combis --
fac_levels_num <- setNames(seq_along(fac_levels), fac_levels)
i_s <- outer(fac_levels_num, fac_levels_num, function(ivec, jvec) {
sapply(seq_along(ivec), function(x) {
i <- ivec[x]
})
})
j_s <- outer(fac_levels_num, fac_levels_num, function(ivec, jvec) {
sapply(seq_along(ivec), function(x) {
j <- jvec[x]
})
})
i_s <- i_s[upper.tri(i_s)]
j_s <- j_s[upper.tri(j_s)]
# ----
result_list <- vector("list", length = length(i_s))
LookUpDF <-
data.frame(Sample = sample_names(physeq), Group = sample_data(physeq)[[group_var]])
LookUpDF <-
LookUpDF[order(match(LookUpDF$Group, levels(LookUpDF$Group))),]
# Color the sample names based on levels in group fac
if (length(levels(LookUpDF$Group)) <= 7) {
color_lookup <-
data.frame(level = levels(LookUpDF$Group), color = cbPalette[2:(length(levels(LookUpDF$Group)) + 1)])
} else {
color_lookup <-
data.frame(level = levels(LookUpDF$Group),
color = viridis(length(levels(LookUpDF$Group))))
}
for (e in seq_along(color_lookup)) {
color_lookup[, e] <- as.character(color_lookup[, e])
}
colors_to_use <- color_lookup$color
names(colors_to_use) <- color_lookup$level
for (k in seq_along(i_s)) {
i <- i_s[k]
j <- j_s[k]
TbTmatrixes <- TbTmatrixes_list[[k]]
if (is.null(tax_names)) {
tax_names <- paste("T", 1:length(TbTmatrixes), sep = "_")
} else {
if (!identical(length(TbTmatrixes), length(tax_names))) {
stop("tax_names do not fit in length to TbTmatrixes")
}
}
names(TbTmatrixes) <- tax_names
TbTmatrix <- TbTmatrixes[[taxa_nom]]
if (is.null(TbTmatrix)) {
stop("taxa_nom not found")
}
rownames(TbTmatrix) <- tax_names
TbT_DF <- as.data.frame(TbTmatrix)
TbT_DF$Taxon <- rownames(TbT_DF)
if (is.null(taxa_den)) {
taxa_den <- tax_names
}
TbT_DF <- TbT_DF[TbT_DF$Taxon %in% taxa_den,]
TbT_DF_l <- gather(TbT_DF, key = Sample, value = Ratio, -Taxon)
TbT_DF_l$Group <-
as.character(LookUpDF$Group[match(TbT_DF_l$Sample, LookUpDF$Sample)])
group_fac_current <-
droplevels(group_fac[as.numeric(group_fac) %in% c(i, j)])
TbT_DF_l$Group <-
factor(TbT_DF_l$Group,
levels = levels(group_fac_current),
ordered = T)
TbT_DF_l$Ratio[!is.finite(TbT_DF_l$Ratio) |
TbT_DF_l$Ratio == 0] <- NA
var_plus_length_check <-
group_by(TbT_DF_l, Taxon, Group) %>% summarise(Variance = var(Ratio, na.rm = T),
NotNA = sum(!is.na(Ratio)))
if (test == "t.test") {
var_plus_length_check <-
filter(var_plus_length_check, !(Variance > 0) | NotNA < 2)
} else if (test == "wilcoxon") {
var_plus_length_check <- filter(var_plus_length_check, NotNA < 1)
}
if (nrow(var_plus_length_check) != 0) {
TbT_DF_l_test <-
filter(TbT_DF_l, !(Taxon %in% unique(var_plus_length_check$Taxon)))
} else {
TbT_DF_l_test <- TbT_DF_l
}
# needs probably some sanity checks here on TbT_DF_l_test
TbT_DF_l_test <- group_by(TbT_DF_l_test, Taxon)
if (test == "t.test") {
pVals <-
summarise(TbT_DF_l_test,
pVal = t.test(Ratio ~ Group, alternative = "two")$p.value)
} else if (test == "wilcoxon") {
pVals <-
summarise(
TbT_DF_l_test,
pVal = wilcox.test(
Ratio ~ Group,
alternative = "two",
paired = F,
exact = F
)$p.value
)
}
pVals$padj <- p.adjust(pVals$pVal, method = p_adjust)
pVals$significance <- ""
pVals$significance[pVals$padj <= 0.05] <- "*"
pVals$significance[pVals$padj <= 0.01] <- "**"
pVals$significance[pVals$padj <= 0.001] <- "***"
pVals$Name <- paste(pVals$Taxon, pVals$significance)
TbT_DF_l$Taxon[TbT_DF_l$Taxon %in% pVals$Taxon] <-
pVals$Name[match(TbT_DF_l$Taxon[TbT_DF_l$Taxon %in% pVals$Taxon], pVals$Taxon)]
# order levels of TbT_DF_l$Taxon based on pValues
pVals <- arrange(pVals, pVal)
TbT_DF_l$Taxon <-
factor(TbT_DF_l$Taxon,
levels = c(pVals$Name, unique(TbT_DF_l$Taxon)[!(unique(TbT_DF_l$Taxon) %in% pVals$Name)]),
ordered = T)
Tr <- ggplot(TbT_DF_l, aes(x = Group, y = Ratio, col = Group))
Tr <- Tr +
geom_violin() +
geom_point(
size = 1,
alpha = 0.6,
position = position_jitterdodge(dodge.width = 1)
) +
# scale_color_manual(values = c(color_lookup$color[i], color_lookup$color[j])) +
scale_color_manual(values = colors_to_use) +
facet_wrap( ~ Taxon, scales = "free_y") +
xlab("") +
ylab(paste("abundance ratio of", taxa_nom, "to stated taxon")) +
theme_bw() +
theme(legend.position = "none")
result_list[[k]] <- list(pVals, Tr)
rm(Tr, TbT_DF_l, TbT_DF, TbT_DF_l_test, pVals)
}
names(result_list) <- names(TbTmatrixes_list)
result_list
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.