knitr::opts_chunk$set( message = FALSE, warning = FALSE, collapse = TRUE, comment = "#>" )
Load the required libraries.
library(llperm) library(ggplot2)
Use parallel processing (optional).
library(foreach) library(doParallel) numCores <- 16 #detectCores() registerDoParallel(numCores)
Load the data set. We have two data frames: OTU table and person metadata.
fn.sequence <- system.file("extdata", "VEGA_sequences.txt", package = "llperm") fn.metadata <- system.file("extdata", "VEGA_metadata.txt", package = "llperm") otu.table <- read.table(fn.sequence, sep="\t", header=T, row.names=1) otu.metadata <- read.table(fn.metadata, sep="\t", header=T, row.names=1, stringsAsFactors=T)
These are all the taxa:
otu.names <- colnames(otu.table)
This is a data frame of abundances as raw counts
otu.counts <- cbind(otu.table, otu.metadata) otu.counts$DietRandom <- otu.counts$Diet[sample(1:nrow(otu.counts))] otu.counts$Sample <- rownames(otu.counts) otu.counts$library_size <- rowSums(otu.table)
This is a data frame of relative abundances
otu.relative <- cbind(log((otu.table + 1) / rowSums(otu.table + 1)), otu.metadata) otu.relative$DietRandom <- otu.relative$Diet[sample(1:nrow(otu.relative))] otu.relative$Sample <- rownames(otu.relative)
Visualize the 'ASV159' as an example taxa:
p <- ggplot(otu.counts, aes(x=ASV159, color=Diet, fill=Diet)) + geom_histogram(bins=30) + facet_wrap(~Diet) print(p)
The original implementation of glmperm does not support factors with several levels, so we create a new binary indicator vegetarian or vegan = Y/N
# Binary vegetarian Y/N otu.relative$DietVeg <- factor(otu.relative$Diet, levels=c("meateater", "fisheater", "vegan", "vegetarian"), labels=c("no", "no", "yes", "yes")) otu.relative$DietVeg <- as.numeric(otu.relative$DietVeg == "yes") # Binary vegetarian Y/N from randomly permuted diet otu.relative$DietVegRandom <- factor(otu.relative$DietRandom, levels=c("meateater", "fisheater", "vegan", "vegetarian"), labels=c("no", "no", "yes", "yes")) otu.relative$DietVegRandom <- as.numeric(otu.relative$DietVegRandom == "yes")
Apply the original glmperm implementation to the example OTU:
fit <- prr.test(ASV159 ~ DietVeg + AgeCat + UrbanizationCat, var = "DietVeg", data=otu.relative, family="gaussian", nrep=1000) summary(fit)
Apply the original glmperm implementation all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:
results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% { taxa = otu.names[[i]] eq1 <- as.formula(sprintf("%s ~ DietVeg + AgeCat + UrbanizationCat ", taxa)) eq2 <- as.formula(sprintf("%s ~ DietVegRandom + AgeCat + UrbanizationCat ", taxa)) fit1 <- prr.test(eq1, data=otu.relative, var = "DietVeg", nrep=100) fit2 <- prr.test(eq2, data=otu.relative, var = "DietVegRandom", nrep=100) c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.perm$p0[[1]], loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.perm$p0[[1]]) })
Likelihood:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("Likelihood: Histogram of p-values")
PRR test:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("PRR test: Histogram of p-values")
Apply the new implementation to the example OTU:
prr.test.glm(ASV159 ~ Diet + AgeCat + UrbanizationCat, data=otu.relative, test.var = "Diet", n.rep=1000, family=gaussian())
Apply the new implementation all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:
results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% { taxa = otu.names[[i]] eq1 <- as.formula(sprintf("%s ~ Diet + AgeCat + UrbanizationCat ", taxa)) eq2 <- as.formula(sprintf("%s ~ DietRandom + AgeCat + UrbanizationCat ", taxa)) fit1 <- prr.test.glm(eq1, data=otu.relative, test.var = "Diet", n.rep=100, family=gaussian()) fit2 <- prr.test.glm(eq2, data=otu.relative, test.var = "DietRandom", n.rep=100, family=gaussian()) c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.sim, loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.sim) })
Likelihood:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("Likelihood: Histogram of p-values")
PRR test:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("PRR test: Histogram of p-values")
Apply the new implementation to the example OTU:
prr.test.glm(ASV159 ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)), data=otu.counts, test.var = "Diet", n.rep=1000, family=poisson())
Apply the new implementation all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:
results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% { taxa = otu.names[[i]] eq1 <- as.formula(sprintf("%s ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size))", taxa)) eq2 <- as.formula(sprintf("%s ~ DietRandom + AgeCat + UrbanizationCat + offset(log(library_size))", taxa)) fit1 <- prr.test.glm(eq1, data=otu.counts, test.var = "Diet", n.rep=100, family=poisson()) fit2 <- prr.test.glm(eq2, data=otu.counts, test.var = "DietRandom", n.rep=100, family=poisson()) c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.sim, loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.sim) })
Likelihood:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("Likelihood: Histogram of p-values")
PRR test:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("PRR test: Histogram of p-values")
This is a standard negative binomial model:
fit <- prr.test.ll(ASV159 ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)), otu.counts, test.var="Diet", n.rep=1000, family=NegativeBinomial()) fit[c("p.value.obs", "p.value.sim")]
Apply the new implementation to all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:
results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% { taxa = otu.names[[i]] eq1 <- as.formula(sprintf("%s ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size))", taxa)) eq2 <- as.formula(sprintf("%s ~ DietRandom + AgeCat + UrbanizationCat + offset(log(library_size)) ", taxa)) fit1 <- prr.test.ll(eq1, otu.counts, test.var = "Diet", test="both", n.rep=100, family=NegativeBinomial()) fit2 <- prr.test.ll(eq2, otu.counts, test.var = "DietRandom", test="both", n.rep=100, family=NegativeBinomial()) c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.sim, loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.sim) })
Likelihood:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("Likelihood: Histogram of p-values")
PRR test:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("PRR test: Histogram of p-values")
The zero-inflated negative binomial often provides a better fit. We define a model specification for both 'count' and 'zero' components:
formula = ASV159 ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)) | Diet + AgeCat + offset(log(library_size))
This tests both 'count' and 'zero' components for association with diet:
fit <- prr.test.ll(formula, otu.counts, test.var="Diet", n.rep=1000, test="both", family=ZINegativeBinomial()) fit[c("p.value.obs", "p.value.sim")]
we can also test just the 'count' component:
fit <- prr.test.ll(formula, otu.counts, test.var="Diet", test="count", n.rep=1000, family=ZINegativeBinomial()) fit[c("p.value.obs", "p.value.sim")]
... or just the 'zero' component:
fit <- prr.test.ll(formula, otu.counts, test.var="Diet", test="zero", n.rep=1000, family=ZINegativeBinomial()) fit[c("p.value.obs", "p.value.sim")]
Apply the new implementation to all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:
results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% { taxa = otu.names[[i]] eq1 <- as.formula(sprintf("%s ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)) | Diet + AgeCat + offset(log(library_size)) ", taxa)) eq2 <- as.formula(sprintf("%s ~ DietRandom + AgeCat + UrbanizationCat + offset(log(library_size)) | DietRandom + AgeCat + offset(log(library_size)) ", taxa)) fit1 <- prr.test.ll(eq1, otu.counts, test.var = "Diet", test="both", n.rep=100, family=ZINegativeBinomial()) fit2 <- prr.test.ll(eq2, otu.counts, test.var = "DietRandom", test="both", n.rep=100, family=ZINegativeBinomial()) c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.sim, loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.sim) })
Likelihood:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("Likelihood: Histogram of p-values")
PRR test:
df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"), data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet")) ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) + ggtitle("PRR test: Histogram of p-values")
Test the part of the function that splits the data into two model matrices, xyz=TRUE means to return the implicit model matrix 1) X: the count model matrix (before '|' in formula) 2) Z: the zero-inflation model matrix (after '|' in formula)
formula = ASV159 ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)) | Diet + AgeCat + offset(log(library_size)) data <- prr.test.ll(formula, otu.counts, test.var="Diet", xyz=TRUE) data$Y2 <- with(otu.counts, cbind(ASV159, library_size - ASV159)) str(data)
Test the part of the function that fits an arbitrary distribution to data:
for (model in c("Poisson", "ZIPoisson", "Binomial", "ZIBinomial", "NegativeBinomial", "ZINegativeBinomial", "BetaBinomial", "ZIBetaBinomial")) { Y <- if (model %in% c("Binomial", "ZIBinomial", "BetaBinomial", "ZIBetaBinomial")) data$Y2 else data$Y family <- get(model, mode = "function", envir = parent.frame()) fit <- fitdist(data$X, Y, data$Z, 0, 0, data$weights, family=family(), control=fitdist.control(trace=F)) print(model) print(fit$loglik) }
stopImplicitCluster()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.