knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  collapse = TRUE,
  comment = "#>"
)

Preliminaries

Load the required libraries.

library(llperm)
library(ggplot2)

Use parallel processing (optional).

library(foreach)
library(doParallel)
numCores <- 16 #detectCores()
registerDoParallel(numCores) 

Load the data set

Load the data set. We have two data frames: OTU table and person metadata.

fn.sequence <- system.file("extdata", "VEGA_sequences.txt", package = "llperm")
fn.metadata <- system.file("extdata", "VEGA_metadata.txt", package = "llperm")

otu.table    <- read.table(fn.sequence, sep="\t", header=T, row.names=1)
otu.metadata <- read.table(fn.metadata, sep="\t", header=T, row.names=1, stringsAsFactors=T)

These are all the taxa:

otu.names <- colnames(otu.table)

This is a data frame of abundances as raw counts

otu.counts <- cbind(otu.table, otu.metadata)
otu.counts$DietRandom <- otu.counts$Diet[sample(1:nrow(otu.counts))] 
otu.counts$Sample <- rownames(otu.counts)
otu.counts$library_size <- rowSums(otu.table)

This is a data frame of relative abundances

otu.relative <- cbind(log((otu.table + 1) / rowSums(otu.table + 1)), otu.metadata)
otu.relative$DietRandom <- otu.relative$Diet[sample(1:nrow(otu.relative))] 
otu.relative$Sample <- rownames(otu.relative)

Visualize the 'ASV159' as an example taxa:

p <- ggplot(otu.counts, aes(x=ASV159, color=Diet, fill=Diet)) + geom_histogram(bins=30) + facet_wrap(~Diet)
print(p)

Experiments

Original PRR-test: Linear model on relative abundances (DietVeg = yes/no)

The original implementation of glmperm does not support factors with several levels, so we create a new binary indicator vegetarian or vegan = Y/N

# Binary vegetarian Y/N
otu.relative$DietVeg <- factor(otu.relative$Diet, 
                               levels=c("meateater", "fisheater", "vegan", "vegetarian"), 
                               labels=c("no", "no", "yes", "yes"))
otu.relative$DietVeg <- as.numeric(otu.relative$DietVeg == "yes")

# Binary vegetarian Y/N from randomly permuted diet
otu.relative$DietVegRandom <- factor(otu.relative$DietRandom, 
                                     levels=c("meateater", "fisheater", "vegan", "vegetarian"), 
                                     labels=c("no", "no", "yes", "yes"))
otu.relative$DietVegRandom <- as.numeric(otu.relative$DietVegRandom == "yes")

Apply the original glmperm implementation to the example OTU:

fit <- prr.test(ASV159 ~ DietVeg + AgeCat + UrbanizationCat, var = "DietVeg", data=otu.relative, family="gaussian", nrep=1000)
summary(fit)

Apply the original glmperm implementation all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:

results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% {
  taxa = otu.names[[i]]
  eq1 <- as.formula(sprintf("%s ~ DietVeg + AgeCat + UrbanizationCat ", taxa))
  eq2 <- as.formula(sprintf("%s ~ DietVegRandom + AgeCat + UrbanizationCat ", taxa))
  fit1 <- prr.test(eq1, data=otu.relative, var = "DietVeg", nrep=100)
  fit2 <- prr.test(eq2, data=otu.relative, var = "DietVegRandom", nrep=100) 
  c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.perm$p0[[1]], 
    loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.perm$p0[[1]])
})

Likelihood:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("Likelihood: Histogram of p-values")

PRR test:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("PRR test: Histogram of p-values")

New PRR-test: Linear model on relative abundances (Diet)

Apply the new implementation to the example OTU:

prr.test.glm(ASV159 ~ Diet + AgeCat + UrbanizationCat, data=otu.relative, test.var = "Diet", n.rep=1000, family=gaussian())

Apply the new implementation all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:

results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% {
  taxa = otu.names[[i]]
  eq1 <- as.formula(sprintf("%s ~ Diet + AgeCat + UrbanizationCat ", taxa))
  eq2 <- as.formula(sprintf("%s ~ DietRandom + AgeCat + UrbanizationCat ", taxa))
  fit1 <- prr.test.glm(eq1, data=otu.relative, test.var = "Diet", n.rep=100, family=gaussian())
  fit2 <- prr.test.glm(eq2, data=otu.relative, test.var = "DietRandom", n.rep=100, family=gaussian()) 
  c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.sim, 
    loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.sim)
})

Likelihood:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("Likelihood: Histogram of p-values")

PRR test:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("PRR test: Histogram of p-values")

New PRR-test: Poisson model on counts (Diet)

Apply the new implementation to the example OTU:

prr.test.glm(ASV159 ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)), 
             data=otu.counts, test.var = "Diet", n.rep=1000, family=poisson())

Apply the new implementation all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:

results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% {
  taxa = otu.names[[i]]
  eq1 <- as.formula(sprintf("%s ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size))", taxa))
  eq2 <- as.formula(sprintf("%s ~ DietRandom + AgeCat + UrbanizationCat + offset(log(library_size))", taxa))
  fit1 <- prr.test.glm(eq1, data=otu.counts, test.var = "Diet", n.rep=100, family=poisson())
  fit2 <- prr.test.glm(eq2, data=otu.counts, test.var = "DietRandom", n.rep=100, family=poisson()) 
  c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.sim, 
    loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.sim)
})

Likelihood:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("Likelihood: Histogram of p-values")

PRR test:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("PRR test: Histogram of p-values")

New PRR-test implementation for any model

Standard Negative Binomial

This is a standard negative binomial model:

fit <- prr.test.ll(ASV159 ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)), 
                   otu.counts, test.var="Diet", n.rep=1000, family=NegativeBinomial())
fit[c("p.value.obs", "p.value.sim")]

Apply the new implementation to all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:

results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% {
  taxa = otu.names[[i]]
  eq1 <- as.formula(sprintf("%s ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size))", taxa))
  eq2 <- as.formula(sprintf("%s ~ DietRandom + AgeCat + UrbanizationCat + offset(log(library_size)) ", taxa))
  fit1 <- prr.test.ll(eq1, otu.counts, test.var = "Diet", test="both", n.rep=100, family=NegativeBinomial())
  fit2 <- prr.test.ll(eq2, otu.counts, test.var = "DietRandom", test="both", n.rep=100, family=NegativeBinomial()) 
  c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.sim, 
    loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.sim)
})

Likelihood:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("Likelihood: Histogram of p-values")

PRR test:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("PRR test: Histogram of p-values")

Zero-inflated Negative Binomial

The zero-inflated negative binomial often provides a better fit. We define a model specification for both 'count' and 'zero' components:

formula = ASV159 ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)) | Diet + AgeCat + offset(log(library_size))

This tests both 'count' and 'zero' components for association with diet:

fit <- prr.test.ll(formula, otu.counts, test.var="Diet", n.rep=1000, test="both", family=ZINegativeBinomial())
fit[c("p.value.obs", "p.value.sim")]

we can also test just the 'count' component:

fit <- prr.test.ll(formula, otu.counts, test.var="Diet", test="count", n.rep=1000, family=ZINegativeBinomial())
fit[c("p.value.obs", "p.value.sim")]

... or just the 'zero' component:

fit <- prr.test.ll(formula, otu.counts, test.var="Diet", test="zero", n.rep=1000, family=ZINegativeBinomial())
fit[c("p.value.obs", "p.value.sim")]

Apply the new implementation to all OTUs, saving the likelihood and PRR test based p-values for Diet and DietRandom:

results <- as.data.frame(foreach (i=1:length(otu.names), .combine=rbind) %dopar% {
  taxa = otu.names[[i]]
  eq1 <- as.formula(sprintf("%s ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)) | Diet + AgeCat + offset(log(library_size)) ", taxa))
  eq2 <- as.formula(sprintf("%s ~ DietRandom + AgeCat + UrbanizationCat + offset(log(library_size)) | DietRandom + AgeCat + offset(log(library_size)) ", taxa))
  fit1 <- prr.test.ll(eq1, otu.counts, test.var = "Diet", test="both", n.rep=100, family=ZINegativeBinomial())
  fit2 <- prr.test.ll(eq2, otu.counts, test.var = "DietRandom", test="both", n.rep=100, family=ZINegativeBinomial()) 
  c(loglik.diet=fit1$p.value.obs, llperm.diet=fit1$p.value.sim, 
    loglik.null=fit2$p.value.obs, llperm.null=fit2$p.value.sim)
})

Likelihood:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$loglik.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$loglik.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("Likelihood: Histogram of p-values")

PRR test:

df <- rbind(data.frame(experiment='p-values: diet in data set', p=results$llperm.diet, Variable="Diet"),
            data.frame(experiment='p-values: null hypothesis', p=results$llperm.null, Variable="Random Diet"))
ggplot(df, aes(x=p, color=Variable, fill=Variable)) + geom_histogram(alpha=0.6, breaks = seq(0,1,0.1)) + facet_wrap(~experiment) +
  ggtitle("PRR test: Histogram of p-values")

Test and illustrate implementation

Test the part of the function that splits the data into two model matrices, xyz=TRUE means to return the implicit model matrix 1) X: the count model matrix (before '|' in formula) 2) Z: the zero-inflation model matrix (after '|' in formula)

formula = ASV159 ~ Diet + AgeCat + UrbanizationCat + offset(log(library_size)) | Diet + AgeCat + offset(log(library_size)) 
data   <- prr.test.ll(formula, otu.counts, test.var="Diet", xyz=TRUE)
data$Y2 <- with(otu.counts, cbind(ASV159, library_size - ASV159))
str(data)

Test the part of the function that fits an arbitrary distribution to data:

for (model in c("Poisson", "ZIPoisson", "Binomial", "ZIBinomial", "NegativeBinomial", "ZINegativeBinomial", "BetaBinomial", "ZIBetaBinomial")) {
  Y <- if (model %in% c("Binomial", "ZIBinomial", "BetaBinomial", "ZIBetaBinomial")) data$Y2 else data$Y
  family <- get(model, mode = "function", envir = parent.frame())
  fit <- fitdist(data$X, Y, data$Z, 0, 0, data$weights, family=family(), control=fitdist.control(trace=F))
  print(model)
  print(fit$loglik)
}

Finally

stopImplicitCluster()


majuvi/llperm documentation built on May 2, 2022, 5:20 p.m.