In RIVM-IIV-Microbiome/biomeStats: Methods for carrying out association studies of the microbiome

knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)

Import libraries

library(biomeStats)
library(biomeUtils)
library(microbiome)
library(dplyr)

dis_col <- c(L1="#6883ba", C = "#166f65", `NA`="black")

Import data

data("FuentesIliGutData")
ps <- subset_samples(FuentesIliGutData, ILI %in% c("C", "L1"))
ps.gen <- tax_glom(ps, "Genus")
#saveRDS(ps.gen, "data-raw/ps.gen.rds")
ps.gen <- readRDS("../data-raw/ps.gen.rds")
ps.rel <- microbiome::transform(ps.gen, "compositional")
taxa_names(ps.rel) <- make.unique(tax_table(ps.rel)[,"Genus"])
sample_data(ps.rel)$sample_ids <- sample_names(ps.rel)

sample_variables(ps.rel)[1:10]

# "ILI" "BMI" "sex" "age"
# "antidepressants" "antibiotics" "T2.diabetes.medication"

getSampleTibble(ps.rel) %>% 
  dplyr::count(ILI)

biomeUtils::getSampleTibble(ps.rel) %>% 
  dplyr::count(sex)

biomeUtils::getSampleTibble(ps.rel) %>% 
  dplyr::count(antidepressants)

biomeUtils::getSampleTibble(ps.rel) %>% 
  dplyr::count(anticoagulants)

biomeUtils::getSampleTibble(ps.rel) %>% 
  dplyr::count(antibiotics)

biomeUtils::getSampleTibble(ps.rel) %>% 
  dplyr::count(statins)

biomeUtils::getSampleTibble(ps.rel) %>% 
  dplyr::count(alphabeta.blocker)

Abundance data

Prepare a tibble from phyloseq object. This tibble consists of ID (for sample) relative abundance of taxa, average and SD of counts of taxa in sample, prop.positive, entropy and C.V.

gen_tab <- prepTaxaAbundanceData(ps.rel,
                                 detection_threshold= 0.001,
                                 prevalence_threshold=50/100)

dim(gen_tab)

Variables to test

There can be numerous variables that a user might want to test. It is advised to select and specify variables of interest. This has to be done manually because sample data my consists of several other data that are not useful, like primer/barcode sequences, internal codes from sequencing centre, etc.

#main_var <- "ILI"
comord.vars <- c("BMI")
#"antimicrobial","ARB",
med.vars <- c("anticoagulants", "antibiotics", "statins","alphabeta.blocker")

all.vars <- c(comord.vars,med.vars)

sample_data(ps.rel)$sex <- ifelse(sample_data(ps.rel)$sex =="M",0,1)
sample_data(ps.rel)$antidepressants <- ifelse(sample_data(ps.rel)$antidepressants =="N",0,1)
sample_data(ps.rel)$anticoagulants <- ifelse(sample_data(ps.rel)$anticoagulants =="N",0,1)
sample_data(ps.rel)$antibiotics <- ifelse(sample_data(ps.rel)$antibiotics =="N",0,1)
sample_data(ps.rel)$statins <- ifelse(sample_data(ps.rel)$statins =="N",0,1)
sample_data(ps.rel)$alphabeta.blocker <- ifelse(sample_data(ps.rel)$alphabeta.blocker =="N",0,1)

Prep metadata

phenotypic.data.1 <- prepSampleData(ps.rel,
                                    sample_ids = "sample_ids",
                                    reference_group ="C",
                                    compare = "ILI",
                                    confounder_variables = c("sex", "age"),
                                    variable_interest=all.vars)

head(phenotypic.data.1)

Strata frequency

frequency.of.strata <- table(phenotypic.data.1$stratum)
#mean(frequency.of.strata)
frequency.of.strata

Data for testing

# First column is sample ID so start from second column and stop at column 5 from last
taxa_select <- colnames(gen_tab)[2:(ncol(gen_tab)-5)]

# the last five columns are
other <- c("average","SD","prop.positive","entropy","CV")

#vars.test <- colnames(phenotypic.data.1)[c(2,5:39)]
# get variables
vars.test <- c("ILI",all.vars)


data.set.1 <- phenotypic.data.1 %>%
  dplyr::left_join(gen_tab, by = "sample_ids") %>%
  dplyr::relocate(stratum, .after = last_col()) %>%
  dplyr::select(!c(sample_ids, sex, age, age.group)) # age and gender removed as we treat them as confounders
# sum statistic with sex and age stratum as blocking factor.
head(data.set.1)
variables_to_test <- c("ILI",all.vars)

microbiome

Type of vars

IMPORTANT STEP

variables_to_test <- c("ILI",all.vars)
# BMI is continuous variable
TYPES <- c("categorical","continuous",
           rep("ordinal",4),
           rep("ordinal",length(taxa_select)),
           rep("continuous",5),
           "categorical")

types <- c("categorical","continuous",
           rep("categorical",4),
           rep("dirac.and.continuous",length(taxa_select)),
           rep("continuous",5),
           "categorical")

Test associations

#get column number that match bacterial abundance
bac_cols <- which(colnames(data.set.1) %in% taxa_select)

#no.of.factors <- length(variables_to_test)
#aux.columns <- bac_cols
#variables.of.interest <- colnames(data.set.1)[1:(ncol(data.set.1)-6)]

dir.create("../../test_stat")

doAssociationAnalysis(data_for_testing = data.set.1,
                      B = 1000,
                      no_of_factors = length(variables_to_test),
                      aux_columns = bac_cols,
                      log_scale = TRUE,
                      nominal_bound_on_FDR = 0.20,
                      phen_data = phenotypic.data.1,
                      compare_label ="case_control",
                      aux_TYPES = TYPES,
                      cat_ypes = types,
                      path_loc="../../test_stat/",
                      name_of_stratification= "Sex and age-group")