In microbiome/miaSim: Microbiome Data Simulation

knitr::opts_chunk$set(
  cache = FALSE,
  fig.width = 9,
  message = FALSE,
  warning = FALSE)

Introduction

This document provides the detailed code required to replicate case study 3 discussed in Gao et al. (2023). Methods in Ecology and Evolution. DOI: 10.1111/2041-210X.14129

This replication study has been implemented with phyloseq. The TreeSE package has been since then upgraded to use the TreeSE data container.

For general instructions and examples on using the miaSim package tools, see the vignette. miaSim implements tools for microbiome data simulation based on different ecological modeling assumptions. These can be used to simulate species abundance matrices, including time series. For a detailed function documentation, see the function reference page

Case study 3: Environmental complexity impacts on different communities

(Reference: Non-additive microbial community responses to environmental complexity)

The aim of this case study is to recalculate the patterns shown in the existing publication in the above reference. More specifically, the left part of this Figure 2.

In this case study, consumer-resource model was used. The number of species in 3 distinct communities varied so that communities are named after the number of species (i.e. com13, com3, and com4).

For each community, a floor of different numbers of carbon resources (nutrients) is set from 1, 2, 4, 8, 16, to 32.

In the original experiment, value of OD600 is selected as y-axis to represent the growth yield. In our simulations, however, the total number of organisms(number of all individuals) is used to reflect the growth yield.

Setup

Load dependencies

library(ggplot2)
library(miaSim)
library(vegan)
library(ggplot2)
library(reshape2)

Set random seed

set.seed(42)

Set number of repeats

n_rep <- 50

Initial the output dataframes to store data

result_df <- data.frame(
    n_species = integer(),
    theta = numeric(),
    i = integer(),
    n_resources = integer(),
    value = numeric()
)

result_df2 <- data.frame(
  matrix(NA, nrow = 0, ncol = 13, dimnames = list(NULL, paste0("sp", seq_len(13))))
)

sorensen_df <- data.frame(
    n_species = integer(),
    theta = numeric(),
    rho_mean = numeric(),
    rho_sd = numeric()
)

Generating function. This function generates a data frame, where each row is arranged in an increasing dissimilarity to the first row.

gradient_df_generator <- function(n_row, n_col, density_row, max_gradient, error_interval){
  list_initial <- list()
  dissimilarity.gradient <- seq(from = 0, to = max_gradient, length.out = n_row)
  for (i in seq_len(n_row)){
    print(i)
    if (i == 1){
      row_temp <- rbeta(n_col, 1, n_col)
      col_to_remove <- sample(x = seq_len(n_col), size = n_col-n_col*density_row)
      row_temp[col_to_remove] <- 0
      list_initial[[i]] <- row_temp
    } else {
      while (length(list_initial) < i) {
        row_temp <- rbeta(n_col, 1, n_col)
        col_to_remove <- sample(x = seq_len(n_col), size = n_col-n_col*density_row)
        row_temp[col_to_remove] <- 0
        diff_temp <- abs(vegdist(rbind(list_initial[[1]], row_temp), method = "bray") - dissimilarity.gradient[i])
        if (diff_temp < error_interval) {
          list_initial[[i]] <- row_temp
        }
      }
    }
  }
  dataframe_to_return <- as.data.frame(t(matrix(unlist(list_initial), ncol = n_row)))
  return(dataframe_to_return)
}

Load parameters used by Pacheco et al., and initialized the community data frame. Note that in this step, the value range of theta has been extended.

n_species_types <- c(13, 3, 4)
theta_types <- c(1, 0.75, 0.5, 0.25, 0.1, 0.05)
n_resources_types <- c(1,2,4,8,16,32)

community.initial.df <- as.list(
  lapply(n_species_types, 
         gradient_df_generator, 
         n_row = n_rep, 
         density_row = 1,
         max_gradient = 0.7,
         error_interval = 0.15)
)

Loop for different combinations of (number of species X theta X number of repetations X number of resources)

for (n_species in n_species_types){
    for (theta in theta_types) {
        sorensen <- c()
        for (i in seq_len(n_rep)){
            for (n_resources in n_resources_types) {
                ### generate E ####
                Etest <- randomE(n_species = n_species,
                                 n_resources = n_resources,
                                 mean_consumption = theta*n_resources,
                                 exact = TRUE)

                ### calculate rho ####
                if (n_resources == max(n_resources_types)){
                    Etest_pos <- Etest
                    Etest_pos[Etest_pos<0] <- 0
                    for (j in seq_len(n_species - 1)){
                        for (k in 2:n_species){
                            sorensen <- c(sorensen,
                                          sum(apply(Etest_pos[c(j,k),], 2, min)))
                        }
                    }
                }

                if (n_resources > 1){
                    Priority <- t(apply(matrix(sample(n_species * n_resources), nrow = n_species), 1, order))
                    Priority <- (Etest > 0) * Priority
                } else {
                    Priority <- NULL
                }

                print(paste0("n_species=",n_species, " theta=",theta, " i=", i, " n_resources=", n_resources))
                x0temp <- as.numeric(community.initial.df[[match(n_species, n_species_types)]][i,])
                x0temp <- 10*x0temp/sum(x0temp)
                CRMtest <- simulateConsumerResource(n_species = n_species,
                                                    n_resources = n_resources,
                                                    x0 = x0temp, #rep(10, n_species),
                                                    resources = rep(100, n_resources),
                                                    E = Etest,
                                                    # trophic_priority = Priority,
                                                    stochastic = TRUE,
                                                    t_end = 1000,
                                                    t_step = 1,
                                                    t_store = 1000)
                CRMspecies <- getCommunity(CRMtest)
                CRMspeciesTotal <- sum(CRMspecies)
                result_df[nrow(result_df)+1,] <- c(n_species, theta, i, n_resources, CRMspeciesTotal)
                result_df2[nrow(result_df2)+1,] <- c(CRMspecies, rep(NA, 13-length(CRMspecies)))
                # makePlotRes(CRMtest$resources)
                # makePlot(CRMtest$matrix)
            }
        }
        rho_mean <- mean(sorensen)
        rho_sd <- var(sorensen)
        sorensen_df[nrow(sorensen_df)+1, ] <- c(n_species, theta, rho_mean, rho_sd)
    }
}

p_fig2_result_df <- ggplot(result_df, aes(x = n_resources, y = value, group = n_resources)) +
    geom_boxplot(outlier.shape = NA) +
    geom_jitter(alpha = 0.2, width = 0.2) +
    facet_grid(. ~ factor(n_species, levels = n_species_types)) +
    theme_bw() +
    scale_x_continuous(trans = "log2", breaks = n_resources_types) +
    xlab("number of resources") +
    ylab("growth yield (number of individuals)")
p_fig2_result_df