In MayaGans/CoreMicro: Testing Widely Used Core Microbiome Assignment Methods

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(CoreMicro)
library(MCMCpack)
library(tidyverse)
library(ggplot2)
library(RColorBrewer)
library(gridExtra)
library(patchwork)

set the structure of our simlulated taxa table

ntaxa<-1000
ncore<-25
nsites<-50
nreads<-2e06
nsims<-250


corenoncoreratio <-c(1,2,5,10,25)
p.core <-corenoncoreratio/(ntaxa - ncore + corenoncoreratio*ncore)
p.noncore <-( 1-(p.core*ncore) )/ (ntaxa - ncore)

theta <-c(1, 2, 10, 25, 50) #intensity parameter


## do a set of simulations across input parameter values and number of replicates ------
## set up the storage of input and output for the 25 simulation conditions
## quantify TP (true positives) and FP (false positives)
sim.df<- data.frame(expand.grid(corenoncoreratio=corenoncoreratio, theta=theta), 
                    p.core=rep(p.core, length(theta)), 
                    p.noncore=rep(p.noncore, length(theta)),
                    HardCut_TP= numeric(length(p.core) * length(theta)),
                    PropRead_TP = numeric(length(p.core) * length(theta)),
                    PropReps_TP = numeric(length(p.core) * length(theta)),
                    PropReadReps_TP = numeric(length(p.core) * length(theta)),
                    HardCut_FP= numeric(length(p.core) * length(theta)),
                    PropRead_FP = numeric(length(p.core) * length(theta)),
                    PropReps_FP = numeric(length(p.core) * length(theta)),
                    PropReadReps_FP = numeric(length(p.core) * length(theta)),
                    HardCut_FP_num= numeric(length(p.core) * length(theta)),
                    PropRead_FP_num = numeric(length(p.core) * length(theta)),
                    PropReps_FP_num = numeric(length(p.core) * length(theta)),
                    PropReadReps_FP_num = numeric(length(p.core) * length(theta)),
                    HardCut_TP_num= numeric(length(p.core) * length(theta)),
                    PropRead_TP_num = numeric(length(p.core) * length(theta)),
                    PropReps_TP_num = numeric(length(p.core) * length(theta)),
                    PropReadReps_TP_num = numeric(length(p.core) * length(theta))
)

#error2.5 = numeric(length(p.core) * length(theta)),
#error97.5 = numeric(length(p.core) * length(theta)))
#sim.df

for(trt in 1:(nrow(sim.df))){

  # create empty vectors to store output for each method for every simulation
  tmp.err.HardCutTP <-  numeric(nsims)
  tmp.err.PropReadTP <- numeric(nsims)
  tmp.err.PropRepsTP <- numeric(nsims)
  tmp.err.PropReadRepsTP <- numeric(nsims)
  tmp.err.HardCutFP <-  numeric(nsims)
  tmp.err.PropReadFP <- numeric(nsims)
  tmp.err.PropRepsFP <- numeric(nsims)
  tmp.err.PropReadRepsFP <- numeric(nsims)
  #tmp.err.HardCutTP_num <-  numeric(nsims)
  #tmp.err.PropReadTP_num <- numeric(nsims)
  #tmp.err.PropRepsTP_num <- numeric(nsims)
  #tmp.err.PropReadRepsTP_num <- numeric(nsims)
  #tmp.err.HardCutFP_num <-  numeric(nsims)
  #tmp.err.PropReadFP_num <- numeric(nsims)
  #tmp.err.PropRepsFP_num <- numeric(nsims)
  #tmp.err.PropReadRepsFP_num <- numeric(nsims)
  for(i in 1:nsims){
    # create a single OTU table for all four methods to be compared against
    replicateSim <- round(rdirichlet(nsites, c(rep(sim.df$p.core[trt] * sim.df$theta[trt], ncore), 
                                               rep(sim.df$p.noncore[trt] * sim.df$theta[trt], ntaxa-ncore))) * nreads)

    # find true and false positives
    # true positives are those that meet in the criterion in the first 1:ncore taxa.  The remaining taxa are not part of the core, so any taxa meeting the criterion in that set are false positives

    # hard cut off method
    # tmp.err.HardCut[i] <-  sum(colSums((replicateSim >= 25)) >= 5) - ncore
    tmp.err.HardCutTP[i] <-  sum((colSums((replicateSim >= 25)) >= 5)[1:ncore])
    tmp.err.HardCutFP[i] <-  sum((colSums((replicateSim >= 25)) >= 5)[(ncore+1):ntaxa])

    # Prop reads
    #tmp.err.PropRead[i] <- sum(cumsum(sort(colSums(replicateSim)/sum(rowSums(replicateSim))))>=0.25)  - ncore
    tmp.err.PropReadTP[i] <- sum((cumsum(sort(colSums(replicateSim)/sum(rowSums(replicateSim)), decreasing = T))<= 0.75)[1:ncore])
    tmp.err.PropReadFP[i] <- sum((cumsum(sort(colSums(replicateSim)/sum(rowSums(replicateSim)), decreasing = T))<= 0.75)[(ncore+1):ntaxa])  

    # Prop Reps
    #tmp.err.PropReps[i] <- sum(colSums(replicateSim)>10*nrow(replicateSim))  - ncore
    #Sites in rows, otus in Columns
    tmp.err.PropRepsTP[i] <- sum((colSums(replicateSim>0) >= (0.5 * nsites))[1:ncore])  
    tmp.err.PropRepsFP[i] <- sum((colSums(replicateSim>0) >= (0.5 * nsites))[(ncore+1):ntaxa])   

    # Proportion of reps and reads
    #tmp.err.PropReadReps[i] <- sum((colSums(replicateSim>0) >= (0.5 * nsites)) & (colSums(replicateSim) >= sum(replicateSim) * 1/1000) )  - ncore
    tmp.err.PropReadRepsTP[i] <- sum(((colSums(replicateSim>0) >= (0.5 * nsites)) & (colSums(replicateSim) >= sum(replicateSim) * 1/1000) )[1:ncore]) 
    tmp.err.PropReadRepsFP[i] <- sum(((colSums(replicateSim>0) >= (0.5 * nsites)) & (colSums(replicateSim) >= sum(replicateSim) * 1/1000) )[(ncore+1):ntaxa]) 
  }

  # fill the dataframe from the chunk above with these values for later plotting
  sim.df$HardCut_TP[trt] <- mean(tmp.err.HardCutTP) / ncore
  sim.df$PropRead_TP[trt] <- mean(tmp.err.PropReadTP) / ncore
  sim.df$PropReps_TP[trt] <- mean(tmp.err.PropRepsTP) / ncore
  sim.df$PropReadReps_TP[trt] <- mean(tmp.err.PropReadRepsTP) / ncore

  sim.df$HardCut_FP[trt] <- mean(tmp.err.HardCutFP) / (ntaxa - ncore)
  sim.df$PropRead_FP[trt] <- mean(tmp.err.PropReadFP) / (ntaxa - ncore)
  sim.df$PropReps_FP[trt] <- mean(tmp.err.PropRepsFP) / (ntaxa - ncore)
  sim.df$PropReadReps_FP[trt] <- mean(tmp.err.PropReadRepsFP) / (ntaxa - ncore)

  sim.df$HardCut_TP_num[trt] <- mean(tmp.err.HardCutTP) 
  sim.df$PropRead_TP_num[trt] <- mean(tmp.err.PropReadTP) 
  sim.df$PropReps_TP_num[trt] <- mean(tmp.err.PropRepsTP) 
  sim.df$PropReadReps_TP_num[trt] <- mean(tmp.err.PropReadRepsTP) 

  sim.df$HardCut_FP_num[trt] <- mean(tmp.err.HardCutFP) 
  sim.df$PropRead_FP_num[trt] <- mean(tmp.err.PropReadFP) 
  sim.df$PropReps_FP_num[trt] <- mean(tmp.err.PropRepsFP) 
  sim.df$PropReadReps_FP_num[trt] <- mean(tmp.err.PropReadRepsFP)
}

myPalette <- c("#800000", "#A04040", "#DFBFBF", "#ffffff", "#0b45a3")
myPalette2 <- c("#ffffff", "#C2D1E8", "#678BC6", "#0b45a3")
myPalette3 <- c("#ffffff", "#C2D1E8")

Here we change the TP-FP to an integer not the ratio. With the ratios we were assuming all calls TP vs. FP were equally valuable. With this we just look at the differencea and make no assumiption of the value.

sim.df$HardCut_ratio <- sim.df$HardCut_TP_num - sim.df$HardCut_FP_num
sim.df$PropRead_ratio <- sim.df$PropRead_TP_num - sim.df$PropRead_FP_num
sim.df$PropReps_ratio <- sim.df$PropReps_TP_num - sim.df$PropReps_FP_num
sim.df$PropReadReps_ratio <- sim.df$PropReadReps_TP_num - sim.df$PropReadReps_FP_num

sim.df_all <- sim.df %>%
  gather(method, mean, c(5:8, 9:12, 21:24)) %>%
  separate(method, into = c('method', 'error'), sep = "_") %>%
  mutate(error =
           case_when(
             error == "TP" ~ "True Positive",
             error == "FP" ~ "False Positive",
             error == "ratio" ~ "TP-FP"
           ) %>%
           factor(levels = c("True Positive", "False Positive", "TP-FP"))
  ) %>%
  mutate(method =
           case_when(
             method == "HardCut" ~ "Hard Cutoff",
             method == "PropRead" ~ "Summation Sequence Reads",
             method == "PropReps" ~ "Proportion of Sequence Replicates",
             method == "PropReadReps" ~ "Proportion of Sequence Reads and Replicates"
           ) %>%
           factor(levels = c("Summation Sequence Reads", "Proportion of Sequence Replicates", "Proportion of Sequence Reads and Replicates", "Hard Cutoff"))
  )

tp <- sim.df_all %>%
  filter(sim.df_all$error == "True Positive")

levels(tp$method)[1] <- 'Abundance'
levels(tp$method)[2] <- 'Occupancy'
levels(tp$method)[3] <- 'Abundance and Occupancy'
levels(tp$method)[4] <- 'Hard Cutoffs'

tp_plot <- ggplot(tp, aes(x = as.factor(corenoncoreratio), y = as.factor(theta))) +
  geom_tile(aes(fill = mean), colour = "white") +
  facet_grid(error~method, labeller = label_wrap_gen(multi_line = TRUE)) +
  theme(legend.text=element_text(size=2)) +
  theme_minimal() +
  scale_fill_gradientn(colours = myPalette2, values = scales::rescale(c(-1000, -500, -25,0, 100, 500, 1000))) +
  geom_text(aes(label = round(mean, digits = 2)), size = 8) +
   labs(x = expression(paste("Ratio of core:non-core abundance", " (", pi, " core / ", pi, " non-core)" )),
       y = expression(paste("Variance (", theta, ")")),
       fill = "Mean Error") +
  theme(strip.text.x = element_text(size = 12)) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  theme(text = element_text(size = 20)) + guides(fill=guide_legend(title="Proportion of\nTrue Positives"))+
   theme(strip.text.x = element_text(size = 20))  +
  theme(text = element_text(size = 20)) + theme(legend.text=element_text(size=22)) + theme(legend.title = element_text(size = 24))

tp_plot

fp <- sim.df_all %>%
  filter(sim.df_all$error == "False Positive")

levels(fp$method)[1] <- 'Abundance'
levels(fp$method)[2] <- 'Occupancy'
levels(fp$method)[3] <- 'Abundance and Occupancy'
levels(fp$method)[4] <- 'Hard Cutoffs'

fp_plot <- ggplot(fp, aes(x = as.factor(corenoncoreratio), y = as.factor(theta))) +
  geom_tile(aes(fill = mean), colour = "white") +
  facet_grid(error~method, labeller = label_wrap_gen(multi_line = TRUE)) +
  theme(legend.text=element_text(size=2)) +
  theme_minimal() +
  scale_fill_gradientn(colours = myPalette2, values = scales::rescale(c(1000, 500, 25,0, -100, -500, -1000))) +
  geom_text(aes(label = round(fp$mean, digits = 2)), size = 8) +
   labs(x = expression(paste("Ratio of core:non-core abundance", " (", pi, " core / ", pi, " non-core)" )),
       y = expression(paste("Variance (", theta, ")")),
       fill = "Mean Error") +
  theme(strip.text.x = element_text(size = 0)) +
  theme(text = element_text(size = 20)) + guides(fill=guide_legend(title="Proportion of\nFalse Positives")) + theme(legend.text=element_text(size=22)) + theme(legend.title = element_text(size = 24))

fp_plot

tf <- sim.df_all %>%
  filter(sim.df_all$error == "TP-FP")

levels(tf$method)[1] <- 'Abundance'
levels(tf$method)[2] <- 'Occupancy'
levels(tf$method)[3] <- 'Abundance and Occupancy'
levels(tf$method)[4] <- 'Hard Cutoffs'


breaks <- c(-900, -400, 0, 25)

tf_plot <- ggplot(tf, aes(x = as.factor(corenoncoreratio), y = as.factor(theta))) +
  geom_tile(aes(fill = mean), colour = "white") +
  #facet_grid(rows = error ~ method) +
  facet_wrap(~ method, ncol = 2) +
  theme(legend.text=element_text(size=12)) +
  theme_minimal() +
 theme(axis.title = element_text(size = 28)) +
  scale_fill_gradientn(colours = myPalette, breaks = breaks) +
  geom_text(aes(label = round(tf$mean, digits = 2)), size=8) +
  labs(x = expression(paste("Ratio of core:non-core abundance", " (", pi, " core / ", pi, " non-core)" )),
       y = expression(paste("Variance (", theta, ")")),
       fill = "Mean Error") +
   theme(strip.text.x = element_text(size = 24))  +
  theme(text = element_text(size = 20)) + guides(fill=guide_legend(title="TP-FP")) + theme(legend.text=element_text(size=22)) + theme(legend.title = element_text(size = 24))

tf_plot