knitr::opts_chunk$set(echo = TRUE, rows.print=25)
Load the required libraries.
library(coin)
library(llperm)
library(ggplot2)
library(dplyr)
library(tidyr)
library(stringr)
library(RColorBrewer)
library(ggpubr)
Use parallel processing (optional).
library(foreach)
library(doParallel)
numCores <- 10  # detectCores()
registerDoParallel(numCores)
# Data sets:
# - Data: Real / Simulated (ZINB, ZIBB, log.reg + log-linear ls)
# - Variables: Diet group, Diet group + confounding variables
# - Diet group signal: Counts / Zeros / Both
# - Confounding non-signal: library size, counts variance
# - Setting: Sample size, Effect size, (Overdispersion)
#
# Distributions
# Poisson family   ----------- overdispersion ------------>
#  | zeros   Poisson                  Negative Binomial
#  \_/       Zero-Inflated Poisson    Zero-Inflated Negative Binomial
#
# Binomial family  ----------- overdispersion ------------>
#  | zeros   Binomial                 Beta Binomial
#  \_/       Zero-Inflated Binomial   Zero-Inflated Beta Binomial
#
Define two functions to generate a simulated data set based on a real data set:
# Generate simulated signal from strata defined by a combination of features,
# each feature value makes given OTUs independently differentially abundant
generate_signal <- function(strata, sample.size, otu.names, diff.abundant, effects) {
  # Randomly sample a data frame of features from a given strata definition
  idx <- sample(1:nrow(strata), sample.size, replace=T, prob=strata$Probability)
  df.features <- strata[idx,]
  # Define effects X
  X <- array(1, c(sample.size, length(otu.names)), list(1:sample.size, otu.names))
  # Loop over every person and define their OTU effect
  for (i in 1:sample.size) {
    person <- df.features[i,]
    for (feature in names(diff.abundant)) {
      otus <- diff.abundant[[feature]][[person[[feature]]]]
      X[i, otus] = X[i, otus] * effects[[feature]][[person[[feature]]]]
    }
  }
  # These OTUs are truly differentially abundant (affected by Group)
  diff.abundant <- unique(unlist(diff.abundant$Group))
  return(list(df.effects=X, df.features=df.features, diff.abundant=diff.abundant))
}

# Given a real data set of OTU counts, simulate signal in the data set by multiplying counts
data_simulate <- function(otu.counts, otu.names, strata=NULL, signal, which="both") {
  # These are the real data OTU counts
  otu.simulated <- otu.counts[,otu.names]
  # Add signal and features if relevant
  if (!is.null(strata)) {
    if (which != "count") {
      for (i in 1:ncol(otu.simulated)) {
        otu.n <- otu.simulated[,i]
        otu.n.nonzero <- otu.n[otu.n > 0]
        odds <- sum(otu.n > 0) / sum(otu.n == 0) * signal$df.effects[,i]
        non.zero <- sample(1:length(otu.n), size=length(otu.n.nonzero), replace=F, prob=1 / (1 + 1/odds))
        otu.simulated[,i] <- 0
        otu.simulated[non.zero,i] <- otu.n.nonzero
      }
    }
    if (which != "zero")
      otu.simulated <- round(otu.simulated * signal$df.effects)
    otu.simulated$library_size <- rowSums(otu.simulated)
    otu.simulated <- data.frame(otu.simulated, signal$df.features)
  }
  return(list(X=otu.simulated, y=if (!is.null(strata)) signal$diff.abundant else NULL))
}
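As a quick illustration of the two helpers above, the following minimal sketch runs them on a hypothetical toy data set (three OTUs, two groups, counts doubled for 'OTU1' and 'OTU2' in the "case" group); the toy names and effect sizes are illustrative only and are not part of the study data.

# Toy example (hypothetical data): two of three OTUs are differentially abundant in "case"
set.seed(1)
toy.counts <- data.frame(OTU1 = rpois(20, 50), OTU2 = rpois(20, 20), OTU3 = rpois(20, 5))
toy.names  <- colnames(toy.counts)
toy.strata <- data.frame(Group = c("control", "case"), Probability = c(0.5, 0.5))
toy.diff   <- list(Group = list(control = character(0), case = c("OTU1", "OTU2")))
toy.eff    <- list(Group = list(control = 1, case = 2))  # +100% counts in cases
toy.signal <- generate_signal(toy.strata, sample.size = nrow(toy.counts), toy.names, toy.diff, toy.eff)
toy.data   <- data_simulate(toy.counts, toy.names, strata = toy.strata, signal = toy.signal, which = "count")
head(toy.data$X)  # simulated counts plus Group and library_size
toy.data$y        # the truly differentially abundant OTUs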
Calculating model metrics:
# Simple ROC curve calculation
simple_roc <- function(labels, scores) {
  labels <- labels[order(scores, decreasing=TRUE)]
  data.frame(TPR=cumsum(labels)/sum(labels), FPR=cumsum(!labels)/sum(!labels), labels)
}

# Simple ROC curve calculation (with interpolation)
simple_roc2 <- function(labels, scores, FPR.out=seq(0.00, 1.00, 0.01)) {
  labels <- labels[order(scores, decreasing=TRUE)]
  temp <- data.frame(TPR=cumsum(labels)/sum(labels), FPR=cumsum(!labels)/sum(!labels))
  if (!is.null(FPR.out))
    temp <- suppressWarnings(approx(temp$FPR, temp$TPR, xout=FPR.out, yleft=0, yright=1))
  data.frame(FPR=temp$x, TPR=temp$y)
}

# Simple statistics calculation
simple_stats <- function(labels, scores, threshold=0.05, threshold.auc=0.10, decreasing=FALSE) {
  labels <- labels[order(scores, decreasing=decreasing)]
  scores <- scores[order(scores, decreasing=decreasing)]
  # Calculate AUC: 1 - sum(labels * cumsum(!labels)) / (sum(!labels) * sum(labels))
  AUC <- sum(labels * (sum(!labels) - cumsum(!labels))) / (sum(!labels) * sum(labels))
  TPR <- cumsum(labels)/sum(labels)
  FPR <- cumsum(!labels)/sum(!labels)
  # Calculate Power at a given False Discovery Rate (FDR)
  idx <- which.min(abs(FPR - threshold))
  PowerAt <- TPR[(1:length(labels))[(FPR - threshold) >= 0][1]]
  idx <- (1:length(labels))[(FPR - threshold.auc) >= 0][1]
  AUCAt <- sum(labels[1:idx] * (sum(!labels[1:idx]) - cumsum(!labels[1:idx]))) /
    (sum(!labels[1:idx]) * sum(labels[1:idx]))
  # Calculate Power = TP/P and FPR = FP/N
  labels_pred <- if (decreasing) scores > threshold else scores <= threshold
  Power <- sum(labels_pred & labels)/sum(labels)
  FPR <- sum(labels_pred & !labels)/sum(!labels)
  results <- data.frame(AUC=AUC, PowerAt=PowerAt, AUCAt=AUCAt, Power=Power, FPR=FPR)
  return(results)
}
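To illustrate what these helpers return, here is a minimal sketch with hypothetical p-values: 10 truly differentially abundant OTUs out of 50, where the true positives tend to receive small p-values.

# Toy example (hypothetical p-values) for the metric helpers
set.seed(1)
toy.labels <- c(rep(1, 10), rep(0, 40))               # 1 = truly differentially abundant
toy.pvals  <- c(runif(10, 0, 0.05), runif(40, 0, 1))  # true positives tend to have small p-values
simple_stats(toy.labels, toy.pvals)                   # returns AUC, PowerAt, AUCAt, Power and FPR
head(simple_roc2(toy.labels, 1 - toy.pvals))          # interpolated ROC curve points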
Load the data set. We have two data frames: OTU table and person metadata.
fn.sequence <- system.file("extdata", "VEGA_sequences.txt", package = "llperm")
fn.metadata <- system.file("extdata", "VEGA_metadata.txt", package = "llperm")
otu.table <- read.table(fn.sequence, sep="\t", header=T, row.names=1)
otu.metadata <- read.table(fn.metadata, sep="\t", header=T, row.names=1, stringsAsFactors=T)
Combine these into a single data frame:
otu.names <- colnames(otu.table)
otu.counts <- cbind(otu.table, otu.metadata)
otu.counts$Sample <- rownames(otu.counts)
otu.counts$library_size <- rowSums(otu.table)
Visualize 'ASV305' as an example taxon:
p <- ggplot(otu.counts, aes(x=ASV305, color=Diet, fill=Diet)) +
  geom_histogram(bins=30) +
  facet_wrap(~Diet) +
  scale_x_continuous() +
  theme_bw() +
  theme(legend.position = "none")
print(p)
ggsave("example.png", height = 4, width = 5, dpi=240)
Calculate mean and variance:
with(otu.counts, print(sprintf("%.2f, %.2f", mean(ASV305), var(ASV305))))
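As a rough check (not part of the original analysis), the variance-to-mean ratio and the fraction of zeros for the same taxon indicate whether overdispersion and zero inflation are plausible, which motivates the Negative Binomial and zero-inflated families compared below.

# Optional check: overdispersion (variance much larger than mean) and zero inflation for ASV305
with(otu.counts, c(dispersion = var(ASV305) / mean(ASV305), zero_fraction = mean(ASV305 == 0)))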
set.seed(42)
n.simulations <- 50  # simulate data and repeat the experiments n times to reduce sampling variance
effects <- c(1.25, 1.5, 2.0, 3.0, 5.0)  # multiply counts by effect, i.e. +25%, +50%, +100%, +200%, +400%
model.names <- c("Poisson", "ZIPoisson", "NegativeBinomial", "ZINegativeBinomial",
                 "Binomial", "ZIBinomial", "BetaBinomial", "ZIBetaBinomial")
model.names <- with(expand.grid(c("loglik", "llperm"), model.names),
                    paste0(sprintf("%s (%s)", Var2, Var1)))
#test.names <- with(expand.grid(c("basic"), c("oneway_test", "kruskal_test")),
#                   paste0(sprintf("%s (%s)", Var2, Var1)))
#model.names <- c(test.names, model.names)
model.family <- setNames(c(rep("Poisson", 8), rep("Binomial", 8)), model.names)
Specification of models:
tests <- list(
  "oneway_test" = list(basic="%s ~ Group"),
  "kruskal_test" = list(basic="%s ~ Group")
)
models <- list(
  "Poisson" = "%s ~ Group + offset(log(library_size))",
  "Binomial" = "cbind(%s, library_size - %s) ~ Group",
  "NegativeBinomial" = "%s ~ Group + offset(log(library_size))",
  "BetaBinomial" = "cbind(%s, library_size - %s) ~ Group",
  "ZIPoisson" = "%s ~ Group + offset(log(library_size)) | Group + offset(log(library_size))",
  "ZIBinomial" = "cbind(%s, library_size - %s) ~ Group | Group",
  "ZINegativeBinomial" = "%s ~ Group + offset(log(library_size)) | Group + offset(log(library_size))",
  "ZIBetaBinomial" = "cbind(%s, library_size - %s) ~ Group | Group"
)
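The '%s' placeholders in these templates are filled with the OTU name via sprintf, exactly as done inside the experiment loops below; for example, for the BetaBinomial model and the taxon 'ASV305':

# Example: instantiate a model formula for a single OTU from its "%s" template
str.model <- models[["BetaBinomial"]]
otu <- "ASV305"
eq <- as.formula(do.call("sprintf", as.list(c(str.model, rep(otu, str_count(str.model, "%s"))))))
eq  # cbind(ASV305, library_size - ASV305) ~ Group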
Specification of signal:
# Specify a simple categorical distribution for each group
strata <- data.frame(Group = c("meateater", "fisheater", "vegetarian", "vegan"),
                     Probability = c(0.25, 0.25, 0.25, 0.25))

# These taxa are differentially abundant in each group
diff.abundant <- list(
  "Group" = list(
    "meateater"  = sample(otu.names, length(otu.names)/10),
    "fisheater"  = sample(otu.names, length(otu.names)/10),
    "vegetarian" = sample(otu.names, length(otu.names)/10),
    "vegan"      = sample(otu.names, length(otu.names)/10)
  )
)
Experiment:
fn <- "results_group_count.csv" if (file.exists(fn)) file.remove(fn) # Loop over generated data sets for (i in 1:n.simulations) { print(sprintf("===== Experiment %d =====", i)) for (effect in effects) { print(sprintf("*** Effect %.2f ***", effect)) # Create simulated signal in real data diff.effects <- list( "Group" = list("meateater" = effect, "fisheater" = effect, "vegetarian" = effect, "vegan" = effect) ) signal <- generate_signal(strata, sample.size=nrow(otu.counts), otu.names, diff.abundant, diff.effects) data <- data_simulate(otu.counts, otu.names, strata=strata, signal, which="count") # This is the data.frame of simulated data and true answers otu.simulated <- data$X otu.simulated$Group <- as.factor(otu.simulated$Group) otu.true <- data.frame(otu=otu.names, y_true=as.integer(otu.names %in% data$y)) # Loop over statistical tests results.tests <- list() for (test in names(tests)) { print(test) start <- Sys.time() str.tests <- tests[[test]] testf <- get(test, mode = "function", envir = parent.frame()) # Loop over OTUs in paraller results.test <- foreach (i=1:length(otu.names), .combine=rbind) %dopar% { otu <- otu.names[[i]] pval <- list() for (test.name in names(str.tests)) { str.test <- str.tests[[test.name]] eq <- as.formula(do.call("sprintf", as.list(c(str.test, rep(otu, str_count(str.test, "%s")))))) subset = NULL if (test.name == "strata") { n.in.strata <- table(otu.simulated$Strata) subset = otu.simulated$Strata %in% names(n.in.strata[n.in.strata > 1]) } fit <- testf(eq, data=otu.simulated, subset=subset, distribution=approximate(nresample=500)) pval[[test.name]] <- pvalue(fit) } otu.result <- c(otu=otu, unlist(pval)) otu.result } # Save p-values from different test types results.test <- as.data.frame(results.test) %>% mutate(basic = basic %>% as.numeric) %>% inner_join(otu.true, by="otu") %>% pivot_longer(c(basic), names_to="type", values_to="p_value") %>% mutate(y_pred = p_value <= 0.05) results.tests[[test]] <- results.test # Time per test end <- Sys.time() print(end-start) } results.tests <- bind_rows(results.tests, .id = "family") # Loop over models results.models <- list() for (model in names(models)) { print(model) start <- Sys.time() str.model <- models[[model]] family <- get(model, mode = "function", envir = parent.frame()) # Loop over OTUs in paraller results.model <- foreach (i=1:length(otu.names), .combine=rbind) %dopar% { otu <- otu.names[[i]] eq <- as.formula(do.call("sprintf", as.list(c(str.model, rep(otu, str_count(str.model, "%s")))))) fit <- prr.test.ll(eq, otu.simulated, test.var="Group", test="both", n.rep=500, family=family) c(otu=otu, loglik=fit$p.value.obs, llperm=fit$p.value.sim) } # Save p-values from both log likelihood and prr-test results.model <- as.data.frame(results.model) %>% mutate(loglik = loglik %>% as.numeric, llperm = llperm %>% as.numeric) %>% inner_join(otu.true, by="otu") %>% pivot_longer(c(loglik, llperm), names_to="type", values_to="p_value") %>% mutate(y_pred = p_value <= 0.05) results.models[[model]] <- results.model # Time per model end <- Sys.time() print(end-start) } results.models <- bind_rows(results.models, .id = "family") results <- rbind(results.tests, results.models) results$effect <- effect results$experiment <- i suppressWarnings({write.table(results, fn, sep=";", dec=",", append=T, col.names=F, row.names=F)}) } }
results <- read.table(file="results_group_count.csv", sep=";", dec=",", header = F) colnames(results) <- c("family","otu","y_true","type","p_value","y_pred","effect","experiment") results <- subset(results, subset = !(family %in% c("oneway_test", "kruskal_test"))) # Combine model and p-value calculation method into a single factor results$model <- factor(paste0(sprintf("%s (%s)", results$family, results$type)), levels=model.names)
Table with effect +100%:
results.1 <- results %>% filter(effect == 2.0)
# results.1 <- results %>% filter(effect == unique(results$effect)[[3]])

# Calculate statistics for each experiment and display the mean
stats <- results.1 %>%
  group_by(experiment, model) %>%
  group_modify(~ simple_stats(.x$y_true, .x$p_value)) %>%
  group_by(model) %>%
  summarize(AUC.avg = mean(AUC), AUC.sd = round(sd(AUC)/sqrt(n.simulations), 2),
            Power.avg = mean(Power), Power.sd = round(sd(Power)/sqrt(n.simulations), 2),
            FPR.avg = mean(FPR), FPR.sd = round(sd(FPR)/sqrt(n.simulations), 2),
            PowerAt.avg = mean(PowerAt), PowerAt.sd = round(sd(PowerAt)/sqrt(n.simulations), 2),
            AUCAt.avg = mean(AUCAt), AUCAt.sd = round(sd(AUCAt)/sqrt(n.simulations), 2)) %>%
  mutate(across(where(is.numeric), round, 2)) %>%
  separate(model, c("Family", "Type"), sep=" ") %>%
  transmute(Family=Family, Type=Type,
            #AUC = sprintf("%.2f ($\\pm%.2f$)", AUC.avg, AUC.sd),
            Power = sprintf("%.2f ($\\pm%.2f$)", Power.avg, Power.sd),
            FPR = sprintf("%.2f ($\\pm%.2f$)", FPR.avg, FPR.sd),
            PowerAt = sprintf("%.2f ($\\pm%.2f$)", PowerAt.avg, PowerAt.sd),
            AUCAt = sprintf("%.2f ($\\pm%.2f$)", AUCAt.avg, AUCAt.sd)) %>%
  rename("Power"="Power", "FPR"="FPR", "Power@0.05"="PowerAt", "AUC@0.10"="AUCAt")

write.table(stats, "stats_group_count.csv", row.names=F, sep=";", dec=",")
print.data.frame(stats)
Plot with effect sizes +25%, ..., +400%:
results.2 <- results %>% mutate(effect=(effect-1)*100)
# results.2 <- results %>% mutate(effect=(exp(effect)-1)*100)

# Calculate statistics for each effect size
stats <- results.2 %>%
  group_by(experiment, model, effect) %>%
  group_modify(~ simple_stats(.x$y_true, .x$p_value)) %>%
  group_by(model, effect) %>%
  summarize(AUC = mean(AUC), Power = mean(Power), FPR = mean(FPR),
            PowerAt = mean(PowerAt), AUCAt = mean(AUCAt)) %>%
  pivot_longer(c(AUC, Power, FPR, PowerAt, AUCAt), names_to = "metric", values_to = "value") %>%
  mutate(family = recode(model, !!!model.family)) %>%
  filter(metric %in% c("Power", "FPR", "PowerAt", "AUCAt")) %>%
  mutate(metric = factor(metric, levels = c("Power", "FPR", "PowerAt", "AUCAt"))) %>%
  mutate(metric = recode_factor(metric, Power="Power", FPR="FPR", PowerAt="Power@0.05", AUCAt="AUC@0.10"))

# Plot statistics for each effect size
p1 <- ggplot(stats, aes(x=effect, y=value, col=model, linetype=model)) +
  geom_line() +
  scale_linetype_manual(values = rep(c("dashed", "solid"), length.out=16)) +
  scale_color_manual(values = rep(brewer.pal(8, "Accent"), each=2)) +
  facet_grid(family ~ metric) +
  theme_bw() +
  #xlab("log(effect)")
  labs(title="Signal in counts: Metrics as a function of effect size", x="Effect size", y="value") +
  scale_x_continuous(trans="log", breaks=c(25, 50, 100, 200, 400),
                     labels=paste0("+", c(25, 50, 100, 200, 400), "%")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        legend.text = element_text(size=10),
        # legend.title = element_blank(),
        legend.position="bottom")
ggsave("metrics_group_count.eps", height = 5.25, width = 6.75, dpi=240)
Experiment:
fn <- "results_group_both.csv" if (file.exists(fn)) file.remove(fn) # Loop over generated data sets for (i in 1:n.simulations) { print(sprintf("===== Experiment %d =====", i)) for (effect in effects) { print(sprintf("*** Effect %.2f ***", effect)) # Create simulated signal in real data diff.effects <- list( "Group" = list("meateater" = effect, "fisheater" = effect, "vegetarian" = effect, "vegan" = effect) ) signal <- generate_signal(strata, sample.size=nrow(otu.counts), otu.names, diff.abundant, diff.effects) data <- data_simulate(otu.counts, otu.names, strata=strata, signal, which="both") # This is the data.frame of simulated data and true answers otu.simulated <- data$X otu.simulated$Group <- as.factor(otu.simulated$Group) otu.true <- data.frame(otu=otu.names, y_true=as.integer(otu.names %in% data$y)) # Loop over statistical tests results.tests <- list() for (test in names(tests)) { print(test) start <- Sys.time() str.tests <- tests[[test]] testf <- get(test, mode = "function", envir = parent.frame()) # Loop over OTUs in paraller results.test <- foreach (i=1:length(otu.names), .combine=rbind) %dopar% { otu <- otu.names[[i]] pval <- list() for (test.name in names(str.tests)) { str.test <- str.tests[[test.name]] eq <- as.formula(do.call("sprintf", as.list(c(str.test, rep(otu, str_count(str.test, "%s")))))) subset = NULL if (test.name == "strata") { n.in.strata <- table(otu.simulated$Strata) subset = otu.simulated$Strata %in% names(n.in.strata[n.in.strata > 1]) } fit <- testf(eq, data=otu.simulated, subset=subset, distribution=approximate(nresample=500)) pval[[test.name]] <- pvalue(fit) } otu.result <- c(otu=otu, unlist(pval)) otu.result } # Save p-values from different test types results.test <- as.data.frame(results.test) %>% mutate(basic = basic %>% as.numeric) %>% inner_join(otu.true, by="otu") %>% pivot_longer(c(basic), names_to="type", values_to="p_value") %>% mutate(y_pred = p_value <= 0.05) results.tests[[test]] <- results.test # Time per test end <- Sys.time() print(end-start) } results.tests <- bind_rows(results.tests, .id = "family") # Loop over models results.models <- list() for (model in names(models)) { print(model) start <- Sys.time() str.model <- models[[model]] family <- get(model, mode = "function", envir = parent.frame()) # Loop over OTUs in paraller results.model <- foreach (i=1:length(otu.names), .combine=rbind) %dopar% { otu <- otu.names[[i]] eq <- as.formula(do.call("sprintf", as.list(c(str.model, rep(otu, str_count(str.model, "%s")))))) fit <- prr.test.ll(eq, otu.simulated, test.var="Group", test="both", n.rep=500, family=family) c(otu=otu, loglik=fit$p.value.obs, llperm=fit$p.value.sim) } # Save p-values from both log likelihood and prr-test results.model <- as.data.frame(results.model) %>% mutate(loglik = loglik %>% as.numeric, llperm = llperm %>% as.numeric) %>% inner_join(otu.true, by="otu") %>% pivot_longer(c(loglik, llperm), names_to="type", values_to="p_value") %>% mutate(y_pred = p_value <= 0.05) results.models[[model]] <- results.model # Time per model end <- Sys.time() print(end-start) } results.models <- bind_rows(results.models, .id = "family") results <- rbind(results.tests, results.models) results$effect <- effect results$experiment <- i suppressWarnings({write.table(results, fn, sep=";", dec=",", append=T, col.names=F, row.names=F)}) } }
results <- read.table(file="results_group_both.csv", sep=";", dec=",", header = F) colnames(results) <- c("family","otu","y_true","type","p_value","y_pred","effect","experiment") results <- subset(results, subset = !(family %in% c("oneway_test", "kruskal_test"))) # Combine model and p-value calculation method into a single factor results$model <- factor(paste0(sprintf("%s (%s)", results$family, results$type)), levels=model.names)
Table with effect +100%:
results.1 <- results %>% filter(effect == 2.0)
# results.1 <- results %>% filter(effect == unique(results$effect)[[3]])

# Calculate statistics for each experiment and display the mean
stats <- results.1 %>%
  group_by(experiment, model) %>%
  group_modify(~ simple_stats(.x$y_true, .x$p_value)) %>%
  group_by(model) %>%
  summarize(#AUC.avg = mean(AUC),
            #AUC.sd = round(sd(AUC)/sqrt(n.simulations), 2),
            Power.avg = mean(Power), Power.sd = round(sd(Power)/sqrt(n.simulations), 2),
            FPR.avg = mean(FPR), FPR.sd = round(sd(FPR)/sqrt(n.simulations), 2),
            PowerAt.avg = mean(PowerAt), PowerAt.sd = round(sd(PowerAt)/sqrt(n.simulations), 2),
            AUCAt.avg = mean(AUCAt), AUCAt.sd = round(sd(AUCAt)/sqrt(n.simulations), 2)) %>%
  mutate(across(where(is.numeric), round, 2)) %>%
  separate(model, c("Family", "Type"), sep=" ") %>%
  transmute(Family=Family, Type=Type,
            #AUC = sprintf("%.2f ($\\pm%.2f$)", AUC.avg, AUC.sd),
            Power = sprintf("%.2f ($\\pm%.2f$)", Power.avg, Power.sd),
            FPR = sprintf("%.2f ($\\pm%.2f$)", FPR.avg, FPR.sd),
            PowerAt = sprintf("%.2f ($\\pm%.2f$)", PowerAt.avg, PowerAt.sd),
            AUCAt = sprintf("%.2f ($\\pm%.2f$)", AUCAt.avg, AUCAt.sd)) %>%
  rename("Power"="Power", "FPR"="FPR", "Power@0.05"="PowerAt", "AUC@0.10"="AUCAt")

write.table(stats, "stats_group_both.csv", row.names=F, sep=";", dec=",")
print.data.frame(stats)
results.2 <- results %>% mutate(effect=(effect-1)*100)
# results.2 <- results %>% mutate(effect=(exp(effect)-1)*100)

# Calculate statistics for each effect size
stats <- results.2 %>%
  group_by(experiment, model, effect) %>%
  group_modify(~ simple_stats(.x$y_true, .x$p_value)) %>%
  group_by(model, effect) %>%
  summarize(AUC = mean(AUC), Power = mean(Power), FPR = mean(FPR),
            PowerAt = mean(PowerAt), AUCAt = mean(AUCAt)) %>%
  pivot_longer(c(AUC, Power, FPR, PowerAt, AUCAt), names_to = "metric", values_to = "value") %>%
  mutate(family = recode(model, !!!model.family)) %>%
  filter(metric %in% c("Power", "FPR", "PowerAt", "AUCAt")) %>%
  mutate(metric = factor(metric, levels = c("Power", "FPR", "PowerAt", "AUCAt"))) %>%
  mutate(metric = recode_factor(metric, Power="Power", FPR="FPR", PowerAt="Power@0.05", AUCAt="AUC@0.10"))

# Plot statistics for each effect size
p2 <- ggplot(stats, aes(x=effect, y=value, col=model, linetype=model)) +
  geom_line() +
  scale_linetype_manual(values = rep(c("dashed", "solid"), length.out=16)) +
  scale_color_manual(values = rep(brewer.pal(8, "Accent"), each=2)) +
  facet_grid(family ~ metric) +
  theme_bw() +
  #xlab("log(effect)")
  labs(title="Signal in counts and zeros: Metrics as a function of effect size", x="Effect size", y="value") +
  scale_x_continuous(trans="log", breaks=c(25, 50, 100, 200, 400),
                     labels=paste0("+", c(25, 50, 100, 200, 400), "%")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        legend.text = element_text(size=10),
        # legend.title = element_blank(),
        legend.position="bottom")
ggsave("metrics_group_both.eps", height = 5.25, width = 6.75, dpi=240)
ggarrange(p1, p2, ncol=2, common.legend=T, legend="bottom")
ggsave("metrics_group.eps", height = 5.25, width = 12, dpi=240)
Specification of models:
tests <- list(
  "oneway_test" = list(basic="%s ~ Group", strata="%s ~ Group | Strata"),
  "kruskal_test" = list(basic="%s ~ Group", strata="%s ~ Group | Strata")
)
models <- list(
  "Poisson" = "%s ~ Group + Covariate + Age + offset(log(library_size))",
  "Binomial" = "cbind(%s, library_size - %s) ~ Group + Covariate + Age",
  "NegativeBinomial" = "%s ~ Group + Covariate + Age + offset(log(library_size))",
  "BetaBinomial" = "cbind(%s, library_size - %s) ~ Group + Covariate + Age",
  "ZIPoisson" = "%s ~ Group + Covariate + Age + offset(log(library_size)) | Group + Covariate + Age + offset(log(library_size))",
  "ZIBinomial" = "cbind(%s, library_size - %s) ~ Group + Covariate + Age | Group + Covariate + Age",
  "ZINegativeBinomial" = "%s ~ Group + Covariate + Age + offset(log(library_size)) | Group + Covariate + Age + offset(log(library_size))",
  "ZIBetaBinomial" = "cbind(%s, library_size - %s) ~ Group + Covariate + Age | Group + Covariate + Age"
)
Specification of signal:
ages <- as.character(seq(20, 69))
strata.1 <- data.frame(Group = rep(c("meateater", "fisheater", "vegetarian", "vegan"), 2),
                       Covariate = c(rep("urbanlow", 4), rep("urbanhigh", 4)),
                       Probability = c(c(0.4, 0.3, 0.2, 0.1)/2, c(0.1, 0.2, 0.3, 0.4)/2))
strata.2 <- data.frame(Group = c(rep("meateater", 50), rep("fisheater", 50), rep("vegetarian", 50), rep("vegan", 50)),
                       Age = rep(ages, 4),
                       Probability = c(c(rep(0.00,10)/10, rep(0.10,10)/10, rep(0.20,10)/10, rep(0.30,10)/10, rep(0.40,10)/10),
                                       c(rep(0.10,10)/10, rep(0.15,10)/10, rep(0.20,10)/10, rep(0.25,10)/10, rep(0.30,10)/10),
                                       c(rep(0.30,10)/10, rep(0.25,10)/10, rep(0.20,10)/10, rep(0.15,10)/10, rep(0.10,10)/10),
                                       c(rep(0.40,10)/10, rep(0.30,10)/10, rep(0.20,10)/10, rep(0.10,10)/10, rep(0.00,10)/10)))
# Alternative: binned age
# strata.2 <- data.frame(Group = c(rep("meateater", 5), rep("fisheater", 5), rep("vegetarian", 5), rep("vegan", 5)),
#                        Age = rep(c(20, 30, 40, 50, 60), 2),
#                        Probability = c(c(0.00, 0.10, 0.20, 0.30, 0.40),
#                                        c(0.10, 0.15, 0.20, 0.25, 0.30),
#                                        c(0.30, 0.25, 0.20, 0.15, 0.10),
#                                        c(0.40, 0.30, 0.20, 0.10, 0.0)))
strata <- merge(strata.1, strata.2, by="Group")
strata$Probability = strata$Probability.x * strata$Probability.y
strata$Probability.x <- NULL
strata$Probability.y <- NULL

# Same OTUs are differentially abundant for each age, but the effect increases log-linearly from 1 to 5 over age
age.otus <- sample(otu.names, length(otu.names)/10)
age.diff <- setNames(lapply(ages, function (age) age.otus), ages)
age.effect <- setNames(exp(seq(log(1), log(5), length.out=50)), ages)

# These taxa are differentially abundant in each group
diff.abundant <- list(
  "Group" = list(
    "meateater"  = sample(otu.names, length(otu.names)/10),
    "fisheater"  = sample(otu.names, length(otu.names)/10),
    "vegetarian" = sample(otu.names, length(otu.names)/10),
    "vegan"      = sample(otu.names, length(otu.names)/10)
  ),
  "Covariate" = list(
    "urbanlow"  = sample(otu.names, length(otu.names)/10),
    "urbanhigh" = sample(otu.names, length(otu.names)/10)
  ),
  "Age" = age.diff
)
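An optional sanity check (not part of the original analysis): the merged strata probabilities should sum to one, and each diet group should keep its marginal probability of 0.25.

# Optional sanity check of the merged strata definition
sum(strata$Probability)                             # should be 1
aggregate(Probability ~ Group, data = strata, sum)  # each group should be 0.25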
Experiment:
fn <- "results_regression_count.csv" if (file.exists(fn)) file.remove(fn) # Loop over generated data sets for (i in 1:n.simulations) { print(sprintf("===== Experiment %d =====", i)) for (effect in effects) { print(sprintf("*** Effect %.2f ***", effect)) # Create simulated signal in real data diff.effects <- list( "Group" = list("meateater" = effect, "fisheater" = effect, "vegetarian" = effect, "vegan" = effect), "Covariate" = list("urbanlow" = 3, "urbanhigh" = 3), "Age" = age.effect ) signal <- generate_signal(strata, sample.size=nrow(otu.counts), otu.names, diff.abundant, diff.effects) data <- data_simulate(otu.counts, otu.names, strata=strata, signal, which="count") # This is the data.frame of simulated data and true answers otu.simulated <- data$X otu.simulated$Group = as.factor(otu.simulated$Group) otu.simulated$Covariate = as.factor(otu.simulated$Covariate) otu.simulated$Age = as.numeric(otu.simulated$Age) otu.simulated$AgeCat = cut(otu.simulated$Age, c(20,30,40,50,60,70), right=F) otu.simulated$Strata <- factor(with(otu.simulated, paste(Covariate, AgeCat, sep=","))) otu.true <- data.frame(otu=otu.names, y_true=as.integer(otu.names %in% data$y)) # Loop over statistical tests results.tests <- list() for (test in names(tests)) { print(test) start <- Sys.time() str.tests <- tests[[test]] testf <- get(test, mode = "function", envir = parent.frame()) # Loop over OTUs in paraller results.test <- foreach (i=1:length(otu.names), .combine=rbind) %dopar% { otu <- otu.names[[i]] pval <- list() for (test.name in names(str.tests)) { str.test <- str.tests[[test.name]] eq <- as.formula(do.call("sprintf", as.list(c(str.test, rep(otu, str_count(str.test, "%s")))))) subset = NULL if (test.name == "strata") { n.in.strata <- table(otu.simulated$Strata) subset = otu.simulated$Strata %in% names(n.in.strata[n.in.strata > 1]) } fit <- testf(eq, data=otu.simulated, subset=subset, distribution=approximate(nresample=500)) pval[[test.name]] <- pvalue(fit) } otu.result <- c(otu=otu, unlist(pval)) otu.result } # Save p-values from different test types results.test <- as.data.frame(results.test) %>% mutate(basic = basic %>% as.numeric, strata = strata %>% as.numeric) %>% inner_join(otu.true, by="otu") %>% pivot_longer(c(basic, strata), names_to="type", values_to="p_value") %>% mutate(y_pred = p_value <= 0.05) results.tests[[test]] <- results.test # Time per test end <- Sys.time() print(end-start) } results.tests <- bind_rows(results.tests, .id = "family") # Loop over models results.models <- list() for (model in names(models)) { print(model) start <- Sys.time() str.model <- models[[model]] family <- get(model, mode = "function", envir = parent.frame()) # Loop over OTUs in paraller results.model <- foreach (i=1:length(otu.names), .combine=rbind) %dopar% { otu <- otu.names[[i]] eq <- as.formula(do.call("sprintf", as.list(c(str.model, rep(otu, str_count(str.model, "%s")))))) fit <- prr.test.ll(eq, otu.simulated, test.var="Group", test="both", n.rep=500, family=family) c(otu=otu, loglik=fit$p.value.obs, llperm=fit$p.value.sim) } # Save p-values from both log likelihood and prr-test results.model <- as.data.frame(results.model) %>% mutate(loglik = loglik %>% as.numeric, llperm = llperm %>% as.numeric) %>% inner_join(otu.true, by="otu") %>% pivot_longer(c(loglik, llperm), names_to="type", values_to="p_value") %>% mutate(y_pred = p_value <= 0.05) results.models[[model]] <- results.model # Time per model end <- Sys.time() print(end-start) } results.models <- bind_rows(results.models, .id = "family") results <- 
rbind(results.tests, results.models) results$effect <- effect results$experiment <- i suppressWarnings({write.table(results, fn, sep=";", dec=",", append=T, col.names=F, row.names=F)}) } }
results <- read.table(file="results_regression_count.csv", sep=";", dec=",", header = F) colnames(results) <- c("family","otu","y_true","type","p_value","y_pred","effect","experiment") results <- subset(results, subset = !(family %in% c("oneway_test", "kruskal_test"))) # Combine model and p-value calculation method into a single factor results$model <- factor(paste0(sprintf("%s (%s)", results$family, results$type)), levels=model.names)
Table with effect +100%:
results.1 <- results %>% filter(effect == 2.0)
# results.1 <- results %>% filter(effect == unique(results$effect)[[3]])

# Calculate statistics for each experiment and display the mean
stats <- results.1 %>%
  group_by(experiment, model) %>%
  group_modify(~ simple_stats(.x$y_true, .x$p_value)) %>%
  group_by(model) %>%
  summarize(AUC.avg = mean(AUC), AUC.sd = round(sd(AUC)/sqrt(n.simulations), 2),
            Power.avg = mean(Power), Power.sd = round(sd(Power)/sqrt(n.simulations), 2),
            FPR.avg = mean(FPR), FPR.sd = round(sd(FPR)/sqrt(n.simulations), 2),
            PowerAt.avg = mean(PowerAt), PowerAt.sd = round(sd(PowerAt)/sqrt(n.simulations), 2),
            AUCAt.avg = mean(AUCAt), AUCAt.sd = round(sd(AUCAt)/sqrt(n.simulations), 2)) %>%
  mutate(across(where(is.numeric), round, 2)) %>%
  separate(model, c("Family", "Type"), sep=" ") %>%
  transmute(Family=Family, Type=Type,
            #AUC = sprintf("%.2f ($\\pm%.2f$)", AUC.avg, AUC.sd),
            Power = sprintf("%.2f ($\\pm%.2f$)", Power.avg, Power.sd),
            FPR = sprintf("%.2f ($\\pm%.2f$)", FPR.avg, FPR.sd),
            PowerAt = sprintf("%.2f ($\\pm%.2f$)", PowerAt.avg, PowerAt.sd),
            AUCAt = sprintf("%.2f ($\\pm%.2f$)", AUCAt.avg, AUCAt.sd)) %>%
  rename("Power"="Power", "FPR"="FPR", "Power@0.05"="PowerAt", "AUC@0.10"="AUCAt")

write.table(stats, "stats_regression_count.csv", row.names=F, sep=";", dec=",")
print.data.frame(stats)
Plot with effect sizes +25%, ..., +400%:
results.2 <- results %>% mutate(effect=(effect-1)*100)
# results.2 <- results %>% mutate(effect=(exp(effect)-1)*100)

# Calculate statistics for each effect size
stats <- results.2 %>%
  group_by(experiment, model, effect) %>%
  group_modify(~ simple_stats(.x$y_true, .x$p_value)) %>%
  group_by(model, effect) %>%
  summarize(AUC = mean(AUC), Power = mean(Power), FPR = mean(FPR),
            PowerAt = mean(PowerAt), AUCAt = mean(AUCAt)) %>%
  pivot_longer(c(AUC, Power, FPR, PowerAt, AUCAt), names_to = "metric", values_to = "value") %>%
  mutate(family = recode(model, !!!model.family)) %>%
  filter(metric %in% c("Power", "FPR", "PowerAt", "AUCAt")) %>%
  mutate(metric = factor(metric, levels = c("Power", "FPR", "PowerAt", "AUCAt"))) %>%
  mutate(metric = recode_factor(metric, Power="Power", FPR="FPR", PowerAt="Power@0.05", AUCAt="AUC@0.10"))

# Plot statistics for each effect size
p1 <- ggplot(stats, aes(x=effect, y=value, col=model, linetype=model)) +
  geom_line() +
  scale_linetype_manual(values = rep(c("dashed", "solid"), length.out=16)) +
  scale_color_manual(values = rep(brewer.pal(8, "Accent"), each=2)) +
  facet_grid(family ~ metric) +
  theme_bw() +
  #xlab("log(effect)")
  labs(title="Signal in counts: Metrics as a function of effect size", x="Effect size", y="value") +
  scale_x_continuous(trans="log", breaks=c(25, 50, 100, 200, 400),
                     labels=paste0("+", c(25, 50, 100, 200, 400), "%")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        legend.text = element_text(size=10),
        # legend.title = element_blank(),
        legend.position="bottom")
ggsave("metrics_regression_count.eps", height = 5.25, width = 6.75, dpi=240)
Experiments:
fn <- "results_regression_both.csv" if (file.exists(fn)) file.remove(fn) # Loop over generated data sets for (i in 1:n.simulations) { print(sprintf("===== Experiment %d =====", i)) for (effect in effects) { print(sprintf("*** Effect %.2f ***", effect)) # Create simulated signal in real data diff.effects <- list( "Group" = list("meateater" = effect, "fisheater" = effect, "vegetarian" = effect, "vegan" = effect), "Covariate" = list("urbanlow" = 3, "urbanhigh" = 3), "Age" = age.effect ) signal <- generate_signal(strata, sample.size=nrow(otu.counts), otu.names, diff.abundant, diff.effects) data <- data_simulate(otu.counts, otu.names, strata=strata, signal, which="both") # This is the data.frame of simulated data and true answers otu.simulated <- data$X otu.simulated$Group = as.factor(otu.simulated$Group) otu.simulated$Covariate = as.factor(otu.simulated$Covariate) otu.simulated$Age = as.numeric(otu.simulated$Age) otu.simulated$AgeCat = cut(otu.simulated$Age, c(20,30,40,50,60,70), right=F) otu.simulated$Strata <- factor(with(otu.simulated, paste(Covariate, AgeCat, sep=","))) otu.true <- data.frame(otu=otu.names, y_true=as.integer(otu.names %in% data$y)) # Loop over statistical tests results.tests <- list() for (test in names(tests)) { print(test) start <- Sys.time() str.tests <- tests[[test]] testf <- get(test, mode = "function", envir = parent.frame()) # Loop over OTUs in paraller results.test <- foreach (i=1:length(otu.names), .combine=rbind) %dopar% { otu <- otu.names[[i]] pval <- list() for (test.name in names(str.tests)) { str.test <- str.tests[[test.name]] eq <- as.formula(do.call("sprintf", as.list(c(str.test, rep(otu, str_count(str.test, "%s")))))) subset = NULL if (test.name == "strata") { n.in.strata <- table(otu.simulated$Strata) subset = otu.simulated$Strata %in% names(n.in.strata[n.in.strata > 1]) } fit <- testf(eq, data=otu.simulated, subset=subset, distribution=approximate(nresample=500)) pval[[test.name]] <- pvalue(fit) } otu.result <- c(otu=otu, unlist(pval)) otu.result } # Save p-values from different test types results.test <- as.data.frame(results.test) %>% mutate(basic = basic %>% as.numeric, strata = strata %>% as.numeric) %>% inner_join(otu.true, by="otu") %>% pivot_longer(c(basic, strata), names_to="type", values_to="p_value") %>% mutate(y_pred = p_value <= 0.05) results.tests[[test]] <- results.test # Time per test end <- Sys.time() print(end-start) } results.tests <- bind_rows(results.tests, .id = "family") # Loop over models results.models <- list() for (model in names(models)) { print(model) start <- Sys.time() str.model <- models[[model]] family <- get(model, mode = "function", envir = parent.frame()) # Loop over OTUs in paraller results.model <- foreach (i=1:length(otu.names), .combine=rbind) %dopar% { otu <- otu.names[[i]] eq <- as.formula(do.call("sprintf", as.list(c(str.model, rep(otu, str_count(str.model, "%s")))))) fit <- prr.test.ll(eq, otu.simulated, test.var="Group", test="both", n.rep=500, family=family) c(otu=otu, loglik=fit$p.value.obs, llperm=fit$p.value.sim) } # Save p-values from both log likelihood and prr-test results.model <- as.data.frame(results.model) %>% mutate(loglik = loglik %>% as.numeric, llperm = llperm %>% as.numeric) %>% inner_join(otu.true, by="otu") %>% pivot_longer(c(loglik, llperm), names_to="type", values_to="p_value") %>% mutate(y_pred = p_value <= 0.05) results.models[[model]] <- results.model # Time per model end <- Sys.time() print(end-start) } results.models <- bind_rows(results.models, .id = "family") results <- 
rbind(results.tests, results.models) results$effect <- effect results$experiment <- i suppressWarnings({write.table(results, fn, sep=";", dec=",", append=T, col.names=F, row.names=F)}) } }
results <- read.table(file="results_regression_both.csv", sep=";", dec=",", header = F) colnames(results) <- c("family","otu","y_true","type","p_value","y_pred","effect","experiment") results <- subset(results, subset = !(family %in% c("oneway_test", "kruskal_test"))) # Combine model and p-value calculation method into a single factor results$model <- factor(paste0(sprintf("%s (%s)", results$family, results$type)), levels=model.names)
results.1 <- results %>% filter(effect == 2.0)
# results.1 <- results %>% filter(effect == unique(results$effect)[[3]])

# Calculate statistics for each experiment and display the mean
stats <- results.1 %>%
  group_by(experiment, model) %>%
  group_modify(~ simple_stats(.x$y_true, .x$p_value)) %>%
  group_by(model) %>%
  summarize(#AUC.avg = mean(AUC),
            #AUC.sd = round(sd(AUC)/sqrt(n.simulations), 2),
            Power.avg = mean(Power), Power.sd = round(sd(Power)/sqrt(n.simulations), 2),
            FPR.avg = mean(FPR), FPR.sd = round(sd(FPR)/sqrt(n.simulations), 2),
            PowerAt.avg = mean(PowerAt), PowerAt.sd = round(sd(PowerAt)/sqrt(n.simulations), 2),
            AUCAt.avg = mean(AUCAt), AUCAt.sd = round(sd(AUCAt)/sqrt(n.simulations), 2)) %>%
  mutate(across(where(is.numeric), round, 2)) %>%
  separate(model, c("Family", "Type"), sep=" ") %>%
  transmute(Family=Family, Type=Type,
            #AUC = sprintf("%.2f ($\\pm%.2f$)", AUC.avg, AUC.sd),
            Power = sprintf("%.2f ($\\pm%.2f$)", Power.avg, Power.sd),
            FPR = sprintf("%.2f ($\\pm%.2f$)", FPR.avg, FPR.sd),
            PowerAt = sprintf("%.2f ($\\pm%.2f$)", PowerAt.avg, PowerAt.sd),
            AUCAt = sprintf("%.2f ($\\pm%.2f$)", AUCAt.avg, AUCAt.sd)) %>%
  rename("Power"="Power", "FPR"="FPR", "Power@0.05"="PowerAt", "AUC@0.10"="AUCAt")

write.table(stats, "stats_regression_both.csv", row.names=F, sep=";", dec=",")
print.data.frame(stats)
results.2 <- results %>% mutate(effect=(effect-1)*100)
# results.2 <- results %>% mutate(effect=(exp(effect)-1)*100)

# Calculate statistics for each effect size
stats <- results.2 %>%
  group_by(experiment, model, effect) %>%
  group_modify(~ simple_stats(.x$y_true, .x$p_value)) %>%
  group_by(model, effect) %>%
  summarize(AUC = mean(AUC), Power = mean(Power), FPR = mean(FPR),
            PowerAt = mean(PowerAt), AUCAt = mean(AUCAt)) %>%
  pivot_longer(c(AUC, Power, FPR, PowerAt, AUCAt), names_to = "metric", values_to = "value") %>%
  mutate(family = recode(model, !!!model.family)) %>%
  filter(metric %in% c("Power", "FPR", "PowerAt", "AUCAt")) %>%
  mutate(metric = factor(metric, levels = c("Power", "FPR", "PowerAt", "AUCAt"))) %>%
  mutate(metric = recode_factor(metric, Power="Power", FPR="FPR", PowerAt="Power@0.05", AUCAt="AUC@0.10"))

# Plot statistics for each effect size
p2 <- ggplot(stats, aes(x=effect, y=value, col=model, linetype=model)) +
  geom_line() +
  scale_linetype_manual(values = rep(c("dashed", "solid"), length.out=16)) +
  scale_color_manual(values = rep(brewer.pal(8, "Accent"), each=2)) +
  facet_grid(family ~ metric) +
  theme_bw() +
  #xlab("log(effect)")
  labs(title="Signal in counts and zeros: Metrics as a function of effect size", x="Effect size", y="value") +
  scale_x_continuous(trans="log", breaks=c(25, 50, 100, 200, 400),
                     labels=paste0("+", c(25, 50, 100, 200, 400), "%")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        legend.text = element_text(size=10),
        # legend.title = element_blank(),
        legend.position="bottom")
ggsave("metrics_regression_both.eps", height = 5.25, width = 6.75, dpi=240)
ggarrange(p1, p2, ncol=2, common.legend=T, legend="bottom")
ggsave("metrics_regression.eps", height = 5.25, width = 12, dpi=240)
# Calculate the mean ROC curve across experiments
roc <- results.1 %>%
  group_by(experiment, model) %>%
  group_modify(~ simple_roc2(.x$y_true, 1-.x$p_value)) %>%
  group_by(model, FPR) %>%
  summarize(TPR.mean=mean(TPR)) %>%
  mutate(family = recode(model, !!!model.family))

# Plot
ggplot(roc, aes(x=FPR, y=TPR.mean, col=model, linetype=model)) +
  geom_step() +
  scale_linetype_manual(values = rep(c("solid", "dashed"), length.out=16)) +
  scale_color_manual(values = rep(brewer.pal(8, "Accent"), each=2)) +
  labs(title="ROC curve (effect size +100%)", x="FPR", y="TPR") +
  theme_bw() +
  facet_grid(family ~ .)
ggsave("roc_regression_both.eps", height = 4.5, width = 6.5, dpi=240)
mark.1 = stats %>%
  filter((model == "ZIBetaBinomial (loglik)") & (effect == 100)) %>%
  pivot_wider(names_from="metric", values_from="value")
#mark.2 = roc %>% filter((model == "ZIBetaBinomial (loglik)") & (FPR <= 0.10))

ggplot(roc, aes(x=FPR, y=TPR.mean, col=model, linetype=model)) +
  geom_step() +
  scale_linetype_manual(values = rep(c("solid", "dashed"), length.out=16)) +
  scale_color_manual(values = rep(brewer.pal(8, "Accent"), each=2)) +
  labs(title="ROC curve", x="FPR", y="TPR") +
  theme_bw() +
  #facet_grid(family ~ .) +
  annotate("segment", y=-0.05, yend=mark.1$`Power@0.05`, x=0.05, xend=0.05, color="#666666") +
  annotate("segment", y=mark.1$`Power@0.05`, yend=mark.1$`Power@0.05`, x=0, xend=0.05, color="#666666") +
  annotate("segment", y=0, yend=mark.1$Power, x=mark.1$FPR, xend=mark.1$FPR, color="#666666") +
  annotate("segment", y=mark.1$Power, yend=mark.1$Power, x=0, xend=mark.1$FPR, color="#666666") +
  #annotate("text", y=-0.11, x=0.00, label=".... AUC up to threshold such that FPR = 0.10", color="#666666", hjust = 0, vjust=0, fontface=1) +
  annotate("text", y=-0.05, x=0.05+0.01, label="TPR at threshold such that FPR = 0.05", color="#666666", hjust = 0, vjust=0, fontface=2) +
  annotate("text", y=0.01, x=mark.1$FPR+0.01, label="FPR & TPR at threshold set to 0.05", color="#666666", hjust = 0, vjust=0, fontface=2) +
  labs(x="FPR", y="TPR", title="ROC curve and metric definition") +
  guides(fill = "none") # +
  #geom_rect(data=mark.2, aes(xmin=FPR, xmax=lead(FPR), ymin=0, ymax=TPR.mean), fill=NA, lty="dotted", size=0.1)
ggsave("roc_new.eps", height = 4.5, width = 6.5, dpi=240)
stopImplicitCluster()