knitr::opts_chunk$set( echo = FALSE, warning = FALSE, message = FALSE, collapse = TRUE, comment = "#>", fig.wide=TRUE ) suppressPackageStartupMessages({ library(phyloseq) library(ape) library(vegan) library(dplyr) library(magrittr) library(ggplot2) library(nychanesmicrobiome) #devtools::load_all() })
#full NYC-HANES2 dataset nychanes_sas <- sas7bdat::read.sas7bdat("https://med.nyu.edu/departments-institutes/population-health/divisions-sections-centers/epidemiology/sites/default/files/nyc-hanes-datasets-and-resources-analytic-data-sets-sas-file.sas7bdat") nychanes_full <- annotateFullDataset(nychanes_sas) #pilot microbiome data suppressPackageStartupMessages({ NYC_HANES_raw <- loadQiimeData() NYC_HANES <- annotateFactors(NYC_HANES_raw) }) metadata <- data.frame(sample_data(NYC_HANES))
variablesToLook <- c("SPAGE", "AGEGRP5C", "GENDER","EDU4CAT","INC3C","DMQ_2","RACE","US_BORN", "DMQ_7YEAR","A1C","GLUCOSE", "AGEGRP3C", "EDU3CAT", "OHQ_3", "OHQ_5_3CAT","DBQ_10_3CAT") table1_data <- rbind(transform(formatMetadata(NYC_HANES), dataset="Oral Microbiome Subsample"), transform(formatMetadata(nychanes_full), dataset="Full NYC HANES Sample")) names(table1_data) <- c(names(formatMetadata(NYC_HANES)), "dataset") table1 <- tableone::CreateTableOne(vars=names(table1_data)[-which(names(table1_data)=="dataset")], strata="dataset",data=table1_data, includeNA = TRUE, test=FALSE) table1 <- print(table1, smd=TRUE, printToggle=FALSE) rownames(table1) <- rownames(prettytable_autoindent(vars = names(table1_data)[-which(names(table1_data)=="dataset")],data = table1_data, includeNA = TRUE)) rownames(table1)[1] <- "Total" rownames(table1) <- gsub("NA", "<i>Missing</i>", rownames(table1)) rownames(table1)[2] <- "Age in years -- median [range]" table1[2,1:2 ] <- c(paste0(median(table1_data$`Age (yrs)`[table1_data$dataset=="Oral Microbiome Subsample"]), " [", min(table1_data$`Age (yrs)`[table1_data$dataset=="Oral Microbiome Subsample"]), " to ", max(table1_data$`Age (yrs)`[table1_data$dataset=="Oral Microbiome Subsample"]), "]"), paste0(median(table1_data$`Age (yrs)`[table1_data$dataset=="Full NYC HANES Sample"],na.rm=TRUE), " [", min(table1_data$`Age (yrs)`[table1_data$dataset=="Full NYC HANES Sample"],na.rm=TRUE), " to ", max(table1_data$`Age (yrs)`[table1_data$dataset=="Full NYC HANES Sample"],na.rm=TRUE), "]")) knitr::kable(table1, format="markdown", caption="Table 1. Sample demographics and oral health behavioral characteristics.")
cramer_vars <- c('AGEGRP5C','GENDER','EDU4CAT','INC3C','DMQ_2','RACE','US_BORN','OHQ_3','OHQ_5_3CAT','SMOKER4CAT','DBQ_10_3CAT') cramer_varlabs <-c("Age (5 cat)","Gender", "Education (4 cat)","Family Income (3 cat)", "Marital Status","Race/Ethnicity","US- vs. Foreign-Born","Gum Disease","Mouthwash Use", "Smoking Status (4 cat)","Sugar-sweetened beverages (3 cat)") df_cramer <- nychanes_full[,cramer_vars] names(df_cramer) <- cramer_varlabs cramersV_matrix <- function (data) { mtrx <- sapply(1:ncol(data), function(var1) sapply(1:ncol(data), function(var2) lsr::cramersV(data[, var1], data[, var2]))) diag(mtrx) <- 1 rownames(mtrx) <- colnames(mtrx) <- colnames(data) mtrx } mtrx_cramersV <- abs(cramersV_matrix(data=df_cramer)) color_args <- list(name="Cramer's V",low = "blue", high = "darkorange") # Make a triangle mtrx_cramersV[lower.tri(mtrx_cramersV)] <- NA diag(mtrx_cramersV) <- NA mtrx_cramersV <- as.data.frame(mtrx_cramersV) # Convert to a data frame, and add labels df_cramer <- as.data.frame(mtrx_cramersV) df_cramer$var2 <- seq_along(rownames(df_cramer)) # Reshape to suit ggplot, remove NAs, and sort the labels df_cramer <- na.omit(reshape2::melt(df_cramer, id.vars="var2")) df_cramer$variable <- factor(df_cramer$variable, levels=rev(levels(df_cramer$variable))) p1 <- ggplot(df_cramer, aes(x=variable, y=var2, fill=value)) + geom_tile() + do.call(scale_fill_gradient, color_args) + scale_y_continuous(breaks=1:11, labels=rev(levels(df_cramer$variable))) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, vjust=1, hjust = 1), legend.position = "none") + labs(x=NULL, y=NULL) #create boxplot mtrx_cramersV$var2 <- rownames(mtrx_cramersV) data_boxplot <- reshape2::melt(mtrx_cramersV, "var2") %>% na.omit %>% mutate(grp=NA, alph=-1*value + 1, Name = paste0(var2, "*", variable)) p2 <- ggplot(data_boxplot, aes(y=value, x=grp, col=value)) + stat_boxplot(geom="errorbar") + geom_boxplot(outlier.shape=NA) + geom_jitter(size=2) + do.call(scale_color_gradient, color_args) + theme(axis.ticks = element_blank(), axis.text = element_blank(), axis.title = element_blank(), panel.background = element_blank(), plot.margin = unit(c(0, 0.1, 4, 0.5), "cm")) gt1 <- ggplot_gtable(ggplot_build(p1)) gt2 <- ggplot_gtable(ggplot_build(p2)) gt <- gtable::gtable(widths = unit(c(5.5, 2.5), "null"), height = unit(8, "null")) # Instert gt1, gt2 and gt3 into the new gtable gt <- gtable::gtable_add_grob(gt, gt1, 1, 1) gt <- gtable::gtable_add_grob(gt, gt2, 1, 2) grid::grid.newpage() grid::grid.draw(gt) grid::grid.rect(x = 0.5, y = 0.5, height = 0.995, width = 0.995, default.units = "npc", gp = grid::gpar(col="transparent", fill = NA, lwd = 1))
get_phyla_colors <- function(v,s) { set.seed(48) rev(sample(rainbow(n = 8, start = 0.1, end = 1, s = s, v=v), size=8, replace=FALSE))[c(2,1,3:8)] } phyla_colors <- c(rbind(get_phyla_colors(v=.7,s=.5)[c(2,4,6,8)], get_phyla_colors(v=.9,s=.7)[c(1,3,5,7)])) g <- plot_abundance(NYC_HANES, PALETTE = phyla_colors, top_n = 8, prop = "total") + labs(x = paste0("└────── ", length(sample_names(NYC_HANES)), " Samples ───────┘"), y = "", title = "") + scale_x_continuous(labels = NULL) + scale_y_continuous(labels = seq(0,100,25)) p <- plot_abundance(NYC_HANES, top_n = 8, PALETTE = phyla_colors, taxrank="Genus", prop = "total", type="area") p <- p + labs(x="", y="", title="") + scale_x_continuous(labels=NULL) + scale_y_continuous(labels=seq(0,100,25)) gridExtra::grid.arrange(p + theme(plot.margin=unit(c(1,0.95,-0.7,0.1), "cm"), panel.grid = element_blank()), g + theme(plot.margin=unit(c(-0.7,0.7,1,-0.1), "cm"), panel.grid = element_blank()), nrow=2, left="Abundance (%)")
df_alpha <- cbind(table1_data[table1_data$dataset=="Oral Microbiome Subsample", -which(names(table1_data) %in% c("Years in United States", "Age (yrs)", "Gum disease (self-reported)", "Mouthwash use (times per week)", "Smoking status", "Sugar-sweetened beverages (per week)", "dataset")) ]) names(df_alpha)[c(3,4,7)] <- c("Educational \nachievement", "Family \nincome","Place of \nbirth") #calculate p-values df_alpha <- cbind(df_alpha, estimate_richness(NYC_HANES, measures = "Chao1")["Chao1"]) alpha_p <- "\n(P=" %>% paste0(sapply(names(df_alpha), function(i) kruskal.test(Chao1 ~ eval(as.name(i)), data=df_alpha)$p.value) %>% formatC(digits=2, format = "f") ) %>% paste0(")") names(df_alpha)[-8] %<>% paste0(alpha_p) plot_alpha_by(NYC_HANES, metadata=df_alpha[-8], notch=FALSE)
model_vars = c('GENDER','AGEGRP3C','EDU3CAT','INC3C','DMQ_2','RACE','US_BORN') model_varlabs = c('Gender','Age','Education','Family income','Marital status','Race/ethnicity','US- vs. foreign-born') #drop empty levels sample_data(NYC_HANES) <- droplevels(data.frame(sample_data(NYC_HANES))) #Get a dataframe of toptags from all models df_tile <- get_all_edgeR_models(model_vars, model_varlabs, to.data.frame=TRUE) #label variables with the number of unique OTUs significant df_tile$variable <- paste0(as.character(df_tile$variable), " (", sapply(df_tile$variable, function(i) sum(df_tile$variable==i)), ")") #Attach genus information df_tile %<>% left_join(data.frame(OTU=rownames(tax_table(NYC_HANES)), Genus = tax_table(NYC_HANES)[,"Genus"])) #Sort genus by decreasing number of appearances df_tile$Genus <- factor(df_tile$Genus, levels=names(sort(table(df_tile$Genus), decreasing = TRUE))) #create a translator dataframe for coefficient labels and left join it to the toptags dataframe varnames <- c("GENDER","AGEGRP3C","EDU3CAT","INC3C","DMQ_2","RACE","US_BORN") varlevels <- unlist(sapply(varnames, function(i) levels(factor(eval(as.name(i), envir = metadata)))[-1])) varlabs <- c("Gender","Age group (3 cat)","Education (3 cat)","Income (3 cat)","Marital Status","Race/ethnicity","U.S. vs. foreign-born") times <- sapply(varnames, function(i) nlevels(factor(eval(as.name(i), envir = metadata)))-1) df_translate <- data.frame(coef=paste0(rep(varnames, times=times), varlevels), varlevel=varlevels, coef_lab=paste(rep(varlabs, times=times), "=",varlevels), stringsAsFactors = FALSE) df_tile %<>% left_join(df_translate, by="coef") df_tile$varlevel %<>% factor(levels = rev(levels(factor(df_tile$varlevel))[c(2,3, 15,7, 1,9, 12,8,4,13, 6, 10,11,14,16,5)])) df_tile$variable %<>% as.factor() plot_logfc <- df_tile %>% na.omit %>% ggplot(aes(x=Genus, y=coef_lab, fill=logFC)) + geom_tile(color="black") + theme_minimal() + theme(axis.text.x = element_text(angle = 45, vjust=1, hjust = 1), strip.placement = "outside", panel.spacing = unit(.5, "line")) + scale_fill_gradient2(low = "seagreen4", mid = "white", high = "purple3") + labs(y=NULL)
f <- c("Neisseria","Prevotella","Streptococcus") phylo.otut <- list() for (x in f) { otumap <- read.table(paste0("../inst/extdata/", x,"_MATRIX-COUNT.txt"), header = T,row.names = 1) %>% t colnames(otumap) <- stringr::str_replace_all(colnames(otumap), 'R|c','') otumap <- otumap[,rownames(sample_data(NYC_HANES))] taxmat <- matrix(sample(letters, 7*nrow(otumap), replace = TRUE), nrow = nrow(otumap), ncol = 7) rownames(taxmat) <- rownames(otumap) colnames(taxmat) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species") phylo.otut[x] <- phyloseq(otu_table(otumap, taxa_are_rows = TRUE), tax_table(taxmat), sample_data(NYC_HANES)) }
#relevel education and income according to new levels in annotateFactors() levels(sample_data(phylo.otut$Neisseria)$EDU3CAT) <- levels(sample_data(phylo.otut$Prevotella)$EDU3CAT) <- levels(sample_data(phylo.otut$Streptococcus)$EDU3CAT) <- levels(metadata$EDU3CAT) levels(sample_data(phylo.otut$Neisseria)$INC3C) <- levels(sample_data(phylo.otut$Prevotella)$INC3C) <- levels(sample_data(phylo.otut$Streptococcus)$INC3C) <- levels(metadata$INC3C) #get all models for each oligotype pseq df_neisseria <- get_all_edgeR_models(vars=model_vars, varlabels = model_varlabs, pseq=phylo.otut$Neisseria) df_prevotella <- get_all_edgeR_models(vars=model_vars, varlabels = model_varlabs, pseq=phylo.otut$Prevotella) df_streptococcus <- get_all_edgeR_models(vars=model_vars, varlabels = model_varlabs, pseq=phylo.otut$Streptococcus) #combine into one data.frame for plotting df_oligotypes <- rbind(data.frame(df_neisseria, Genus="Neisseria"), data.frame(df_prevotella, Genus="Prevotella"), data.frame(df_streptococcus, Genus="Streptococcus")) #attach coefficient level labels df_oligotypes %<>% left_join(df_translate, by="coef") #plot plot_oligo <- df_oligotypes %>% ggplot(aes(x=OTU, y=coef_lab, fill=logFC)) + geom_tile(color="black") + theme_minimal() + theme(axis.text.x = element_blank(), panel.spacing = unit(.75, "cm")) + scale_fill_gradient2(low = "seagreen4", mid = "white", high = "purple3") + labs(y=NULL, x="Oligotype") + facet_grid(.~Genus,scales="free_x", space="free_x", drop=TRUE)
gridExtra::grid.arrange(plot_logfc, plot_oligo, heights=c(4,3.3))
vars <- c("GENDER","AGEGRP3C","EDU3CAT", "INC3C", "DMQ_2", "RACE", "US_BORN" ) varlabels <- c("Gender", "Age", "Education", "Family income", "Marital status", "Race/ethnicity", "US- vs. foreign-born") df_crude <- df_tile df_crude_nofilter <- get_all_edgeR_models(vars=vars, varlabels=varlabels, alph=1) df_ohq <- get_all_edgeR_models(vars=vars, varlabels = varlabels, adjusted_for="OHQ_5_3CAT", alph=1) %>% dplyr::rename(logFC_ohq = logFC) df_smoke <- get_all_edgeR_models(vars=vars, varlabels = varlabels, adjusted_for="smokingstatus", alph=1) %>% dplyr::rename(logFC_smoke = logFC) df_age_gender <- get_all_edgeR_models(vars=vars, varlabels = varlabels, adjusted_for=c("AGEGRP3C","GENDER"), alph=1) %>% dplyr::rename(logFC_age_gender = logFC) df_sugar <- get_all_edgeR_models(vars=vars, varlabels = varlabels, adjusted_for=c("DBQ_10_3CAT"), alph=1) %>% dplyr::rename(logFC_sugar = logFC)
df_crude[,c(1,2,4)] %<>% lapply(as.character) df_crude$variable <- gsub("[0-9]| \\(|\\)","",df_crude$variable) df_crude_nofilter[,c(1,2,4)] %<>% lapply(as.character) df_ohq[,c(1,2,4)] %<>% lapply(as.character) df_smoke[,c(1,2,4)] %<>% lapply(as.character) df_age_gender[,c(1,2,4)] %<>% lapply(as.character) df_sugar[,c(1,2,4)] %<>% lapply(as.character) create_crude_vs_adjusted_boxplot_df <- function(df_crude) { df_crude %<>% inner_join(df_ohq, by=c("OTU","coef","variable")) %>% inner_join(df_smoke, by=c("OTU","coef","variable")) %>% inner_join(df_age_gender, by=c("OTU","coef","variable"))%>% inner_join(df_sugar, by=c("OTU","coef","variable")) df_crude[,grep("logFC", names(df_crude))] %<>% lapply(function(i) abs(as.numeric(i))) df_crude$smoke_diff <- (df_crude$logFC_smoke - df_crude$logFC) / df_crude$logFC df_crude$ohq_diff <- (df_crude$logFC_ohq - df_crude$logFC) / df_crude$logFC df_crude$agegender_diff <- (df_crude$logFC_age_gender - df_crude$logFC) / df_crude$logFC df_crude$sugar_diff <- (df_crude$logFC_sugar - df_crude$logFC) / df_crude$logFC df_crude } df_fdr <- create_crude_vs_adjusted_boxplot_df(df_crude) df_nofdr <- create_crude_vs_adjusted_boxplot_df(df_crude_nofilter) #for boxplot comparing absolute values (FDR) df_fdr_melt <- reshape2::melt(df_fdr, id.vars = "variable",measure.vars = c("logFC","logFC_ohq","logFC_smoke","logFC_age_gender","logFC_sugar"), variable.name = "logFC") names(df_fdr_melt) <- c("variable", "logFC","value" ) dodge_amount = 1 figure5 <- ggplot(df_fdr_melt, aes(y=value, x=variable, fill=logFC)) + stat_boxplot(geom="errorbar", position=position_dodge(dodge_amount)) + geom_boxplot(outlier.shape=NA, alpha=.3, position=position_dodge(dodge_amount)) + geom_point(aes(color=logFC), size=.5, position=position_jitterdodge(jitter.width=.3, dodge.width = dodge_amount)) + scale_fill_discrete(labels=c("Crude","Adjusted for Mouthwash Use", "Adjusted for Smoking","Adjusted for Age and Gender","Adjusted for Sugar-sweetened Beverage Consumption"), name="") + scale_color_discrete(guide="none")+ theme_minimal() + guides(fill=guide_legend(ncol=2)) + theme(axis.text.x = element_text(angle=40,hjust=1,vjust=1), legend.position = "top", panel.spacing = unit(.6, "lines"), strip.text = element_blank()) + labs(y="Absolute Value of logFC", x=NULL) + facet_grid(.~variable, scales="free_x", space="free_x") figure5
df_table <- df_crude %>% mutate(direction=ifelse(as.numeric(logFC)>0, "Greater abundance in:","Lower abundance in:")) commalist <- function(x) { sentence = x[1] if(length(x) > 1) { for(i in 2:(length(x))) { sentence <- paste0(sentence, "<br> ", x[i]) } } sentence } selected_genera = c("Streptococcus", "Lactobacillus", "Lactococcus", "Prevotella", "Porphyromonas", "Fusobacterium") reference_groups = sapply(vars, function(i) levels(metadata[[i]])[1]) names(reference_groups) <- varlabels df_table %>% filter(Genus %in% selected_genera) %>% group_by(Genus, direction) %>% summarize(lst = commalist(unique(as.character(coef_lab))) ) %>% reshape2::dcast(Genus ~ direction) %>% knitr::kable(format="markdown", caption = "Table 2. Differential abundance findings for OTUs selected based on clinical relevance. Greater or lower abundance indicates false discovery rate (FDR) <0.01.") %>% kableExtra::kable_styling(bootstrap_options = "condensed") tfooter = paste0("Reference groups for sociodemographic variables are as follows: ", paste(paste0(names(reference_groups), ": ", reference_groups), collapse=", ")) tfooter
permanova_var_names <- c("AGEGRP5C","GENDER","EDU4CAT","INC3C","DMQ_2","RACE","US_BORN","smokingstatus") df_permanova <- na.omit(metadata[, c("Burklab_ID", permanova_var_names)]) # #creating an NA factor level for INC3C because permanova doesn't allow NAs # df_permanova$INC3C %<>% as.character # df_permanova$INC3C[is.na(df_permanova$INC3C)] <- "NA" dist_perm <- phyloseq::distance( subset_samples(NYC_HANES, sample_names(NYC_HANES) %in% df_permanova$Burklab_ID ), method="wunifrac") list_permanovas <- list( `Age (5 cat)` = adonis2(dist_perm ~ AGEGRP5C, data=df_permanova), Gender = adonis2(dist_perm ~ GENDER, data=df_permanova), `Education (4 cat)` = adonis2(dist_perm ~ EDU4CAT, data=df_permanova), `Income (tertiles)` = adonis2(dist_perm ~ INC3C, data=df_permanova), `Marital Status` = adonis2(dist_perm ~ DMQ_2, data=df_permanova), `Race/ethnicity` = adonis2(dist_perm ~ RACE, data=df_permanova), `US-born` = adonis2(dist_perm ~ US_BORN, data=df_permanova), `Smoking status` = adonis2(dist_perm ~ smokingstatus, data=df_permanova) ) #SumOfSqs <- unlist(lapply(list_permanovas, function(i) i[["SumOfSqs"]][1])) r_squared <- sapply(list_permanovas, function(i) i$SumOfSqs[1]/sum(i$SumOfSqs[1:2])) perm_results <- data.frame( r_squared, F = unlist(lapply(list_permanovas, function(i) i[["F"]][1])), p_value = unlist(lapply(list_permanovas, function(i) i[["Pr(>F)"]][1])) ) perm_results <- perm_results[names(sort(r_squared, decreasing = TRUE)),]
knitr::kable( perm_results)
pb <- plot_beta_by( pseq=subset_samples(NYC_HANES, sample_names(NYC_HANES) %in% df_permanova$Burklab_ID), vars = permanova_var_names, dist_mtrx = dist_perm) pb <- pb+ theme(legend.title = element_blank(), strip.text.x = element_blank(), axis.text.x = element_text(angle=60,hjust=1,vjust=1), legend.position = "bottom") + scale_x_discrete(name=NULL, breaks=permanova_var_names, labels=c("Age (5 cat)", "Gender","Education (4 cat)","Income (3 cat)", "Marital Status","Race/ethnicity","Nativity","Smoking Status")) + scale_fill_brewer(palette = "Set2", labels=c("Within group","Between groups")) + scale_color_brewer(palette = "Set2", guide="none") + labs(y="Weighted UniFrac Distance") pb
# number of significant OTUs in total and for each sociodemographic variable df_tile$variable <- gsub(" \\(|[1-9]|)", "", df_tile$variable) n_sig <- structure( c(length(unique(df_tile$OTU)), table(df_tile$variable)[model_varlabs]), names = c("ALL", model_vars)) # number of significant OTUs for confounders conf_vars <- c("smokingstatus","DBQ_10_3CAT","OHQ_5_3CAT","OHQ_3") df_tile_conf <- get_all_edgeR_models(vars = conf_vars, varlabels=conf_vars, to.data.frame = TRUE) n_sig <- c(n_sig, table(df_tile_conf$variable)) #replace any single-digit numbers with words numbers=c("one","two","three","four","five","six","seven","eight","nine","ten") n_sig[n_sig < 10] <-numbers[n_sig[n_sig < 10]]
Background
Variations in the oral microbiome are potentially implicated in health inequalities, but existing studies of the oral microbiome have minimal sociodemographic diversity. We describe sociodemographic variation of the oral microbiome in a diverse sample.
Methods
In a subsample (n=r nrow(metadata)
) of the 2013-14 population-based New York City Health and Nutrition Examination Study (NYC-HANES). Mouthwash samples were processed using using 16S v4 rRNA amplicon sequencing. We examined differential abundance of 216 operational taxonomic units (OTUs), in addition to alpha and beta diversity by sociodemographic variables including age, sex, income, education, nativity, and race/ethnicity.
Results
r n_sig["ALL"]
OTUs were differentially abundant by any sociodemographic variable (false discovery rate < 0.01), including r n_sig["RACE"]
by race/ethnicity, r n_sig["INC3C"]
by family income, r n_sig["EDU3CAT"]
by education, r n_sig["GENDER"]
by sex. We found r n_sig["smokingstatus"]
by smoking status, r n_sig["DBQ_10_3CAT"]
by sugar-sweetened beverage consumption, r n_sig["OHQ_5_3CAT"]
by mouthwash use. Genera differing for multiple sociodemographic characteristics included Lactobacillus, Prevotella, Porphyromonas, Fusobacterium.
Discussion
We identified variations in the oral microbiome consistent with health inequalities, with more taxa differing by race/ethnicity than sugar-sweetened beverage consumption, and more by SES variables than mouthwash use. Further investigation is warranted into possible mediating effects of the oral microbiome in social disparities in diabetes, inflammation, oral health, and preterm birth.
perc = function(x, level, digits=1) formatC(100 * prop.table(table(x))[level], digits=digits, format="f") age = metadata$SPAGE %>% (function(i) paste0(median(i), " [", min(i), " to ", max(i), "]")) female = perc(metadata$GENDER, "Female") white = perc(metadata$RACE, "Non-Hispanic White") black = perc(metadata$RACE, "Non-Hispanic Black") hispa = perc(metadata$RACE, "Hispanic") inc1 = perc(metadata$INC3C, 2) inc2 = perc(metadata$INC3C, 1) edu1 = perc(metadata$EDU4CAT, "Less than High school diploma") edu2 = perc(metadata$EDU4CAT, "College graduate or more") #alpha p_alp <- names(df_alpha)[-8] names(p_alp) <- substr(p_alp, 1, sapply(p_alp, function(i) gregexpr(pattern = " |\n", text = i)[[1]][1]-1)) p_alp <- tolower( sapply(p_alp, function(i) substr(i, max(gregexpr(pattern = "\\n", text = i)[[1]])+1, nchar(i))) ) p_alp["Place"] <- paste0(substr(p_alp["Place"],1,nchar(p_alp["Place"])-1), ", Figure 3)")
The initial subsample included 297 participants; after removing samples with less than 1000 reads, there were r nrow(sample_data(NYC_HANES))
participants remaining for analysis. Table 1 shows sociodemographic variation in the final analytic sample with respect to age (median [range]: r age
), gender (r female
% female), race/ethnicity (r white
% non-Hispanic White, r black
% non-Hispanic Black, r hispa
% Hispanic), annual family income (r inc1
% less than \$30K, r inc2
% \$60k or more), and educational achievement (r edu1
% less than high school diploma, r edu2
% college degree or greater). Cramer’s V on all pairwise combinations of sociodemographic variables indicated only minor collinearity (all V<.35) (Supplemental Figure 1).
Relative Abundance and Alpha Diversity
Oral microbiomes were characterized at the phylum level by a gradient between Firmicutes and Bacteroides abundance, with overall dominance by Firmicutes (mean=52±10%). Streptococcus was the most abundant genus (36±10%) followed by Prevotella (17±8%). (Figure 1).
The overall mean chao1 was 462±118, with no differences by age group r p_alp["Age"]
, gender r p_alp["Gender"]
, educational achievement r p_alp["Educational"]
, annual family income r p_alp["Family"]
, marital status r p_alp["Marital"]
, race/ethnicity r p_alp["Race/ethnicity"]
, or nativity r p_alp["Place"]
. (Supplemental Figure 2)
Differential Abundance and Oligotyping
Numerous taxa were differentially abundant by race/ethnicity, nativity, marital status, gender, family income tertiles, education, and age groups. Figure 2a displays log fold change (logFC), or coefficient from edgeR log linear models, for each comparison group and all significant OTUs.
A total of r n_sig["ALL"]
OTUs were differentially abundant by any sociodemographic variable, including r n_sig["AGEGRP3C"]
by age group, r n_sig["RACE"]
by race/ethnicity, r n_sig["INC3C"]
by family income, r n_sig["EDU3CAT"]
by education, r n_sig["DMQ_2"]
by marital status, r n_sig["US_BORN"]
by nativity, and r n_sig["GENDER"]
by gender. We also found r n_sig["OHQ_5_3CAT"]
by mouthwash use, r n_sig["OHQ_3"]
by self-reported gum disease, r n_sig["smokingstatus"]
by smoking status, and r n_sig["DBQ_10_3CAT"]
by sugar-sweetened beverage consumption. The most frequently differentially abundant were Lactobacillus (all variables), and Prevotella (age, education, family income, marital status, race/ethnicity, nativity, Figure 2a) Differential abundance findings for selected taxa are presented in Table 2.
df_tile$logFC %<>% as.numeric df_strongest <- df_tile %>% arrange(variable, desc(abs(logFC))) %>% group_by(variable) %>% filter(!duplicated(Genus))%>% top_n(5, wt=logFC) %>% dplyr::select(variable, varlevel, Genus, logFC, FDR) df_important_genera <- df_tile %>% #filter(Genus %in% c("Fusobacterium","Porphyromonas","Pseudomonas","Prevotella","Lactobacillus","Lactococcus")) %>% group_by(variable) %>% arrange(variable, desc(abs(logFC))) %>% filter(!duplicated(Genus)) %>% dplyr::select(variable, varlevel, Genus, logFC, FDR) options(scipen = 999) df_important_genera$FDR %<>% format.pval(eps = .0001, digits=1) df_important_genera$logFC %<>% round(1) df_important_genera %<>% mutate(abundance=ifelse(logFC>0, "greater","less") %>% factor(levels=c("greater","less"))) df_important_genera$result <- paste0(df_important_genera$Genus, " (logFC ", df_important_genera$logFC, ", FDR ", df_important_genera$FDR, ")") df_important_genera %<>% dplyr::select(-FDR,-Genus) %>% arrange(variable, abundance) df_important_genera %<>% ungroup #filtering by logFC >= 2 df_important_filter <- df_important_genera[df_important_genera$logFC >= 2, ] make_sentence <- function(x) { sentence = x[1] if(length(x) > 1) { if(length(x) > 2) { for(i in 2:(length(x)-1)) { sentence <- paste0(sentence, ", ", x[i]) } } sentence = paste0(sentence, ", and ", x[length(x)]) } sentence } df_sentences <- df_important_filter %>% group_by(variable, varlevel, abundance) %>% summarize(sent = make_sentence(unique(result))) %>% mutate(sent = paste0(abundance, " abundance of ", sent))
As numerous associations were present at FDR<0.01, we focus here on findings where the logFC was 2 or greater. Compared to individuals aged 20-34, individuals aged 65 and over displayed r df_sentences$sent[df_sentences$varlevel=="65 and over"]
. Compared to individuals with a college degree or greater, those with high school diploma or less showed r df_sentences$sent[df_sentences$varlevel=="High School Diploma or Less"]
, and those with some college or associate's degree showed r df_sentences$sent[df_sentences$varlevel=="Some College or Associate's Degree"]
. Individuals with annual family incomes between \$30,000 and \$60,000 had r df_sentences$sent[df_sentences$varlevel=="$30,000 - $60,000"]
, compared to those making less than \$30,000. Marital status showed a large number of strong associations: compared to being married, those living with a partner showed r df_sentences$sent[df_sentences$varlevel=="Living with partner"]
, those separated showed r df_sentences$sent[df_sentences$varlevel=="Separated"]
, and those who were widowed, r df_sentences$sent[df_sentences$varlevel=="Widowed"]
. Compared to non-Hispanic whites, non-Hispanic Blacks had r df_sentences$sent[df_sentences$varlevel=="Non-Hispanic Black"]
.
Figure 3 displays the distribution of logFCs for both crude and adjusted, including all OTUs with FDR <0.01 in crude models. Adjusting for smoking, mouthwash use, age and gender, had a minor effect on crude estimates; however, adjustment for smoking exerts the largest effect on findings for age, income and education.
Analyzing oligotypes of Neisseria, Prevotella, and Streptococcus revealed associations not apparent in the OTU analysis, whereas some associations present in OTU analysis were not apparent in oligotypes (Figure 2). New associations were revealed between Prevotella and gender, Streptococcus and gender, race/ethnicity and nativity, and Neisseria and gender, age, education, marital status, race/ethnicity and nativity. Associations present in OTUs but not in oligotyping were age, education and income in Prevotella, and income in Neisseria.
Oligotype associations within Neisseria for marital status, race/ethnicity, and nativity are each for a mutually exclusive set of taxa, and associations with gender in Neisseria, Prevotella, and Streptococcus are all in separate taxa from the associations with other sociodemographic variables. Age group and education had unidirectional associations in OTU analysis in Streptococcus but bidirectional differential abundance in oligotypes. In Prevotella, race/ethnicity had unidirectional associations in OTU analysis but bidirectional associations in oligotypes.
Beta Diversity and Clustering
bet <- function(variable) { paste0("(p=", round(perm_results["Education (4 cat)","p_value"], 3), " $r^2$=", round(perm_results["Education (4 cat)","r_squared"],3), ")") } us_p<- round(perm_results["US-born","p_value"], 3) us_r2 <- round(perm_results["US-born","r_squared"],3) age_p <- round(perm_results["Age (5 cat)","p_value"], 3) age_r2 <- round(perm_results["Age (5 cat)","r_squared"],3) edu_r2 <- round(perm_results["Education (4 cat)","r_squared"],3)
Figure 4 illustrates between-versus within-group weighted UniFrac distances by each sociodemographic variable. We observed overall shifts in composition by age group r bet("Age (5 cat)")
, education r bet("Education (4 cat)")
, and nativity r bet("US-born")
, with no other variables showing greater between- than within-group variation. Plots of the first two principal coordinates based on weighted UniFrac distances showed little patterning by any variable (not shown). Clustering scores were sensitive to the distance metric used, with Bray-Curtis indicating moderate support for 2 clusters (PS=0.86), and all other measures providing little support for clustering.
In a diverse population-based sample, we found that a large number of bacterial taxa were differentially abundant by age group, race/ethnicity, family income, education, nativity, and gender. Notably, we found a greater number of associations with SES variables (r n_sig["INC3C"]
by family income, r n_sig["EDU3CAT"]
by education) than with gender, marital status or nativity. There were more associations with SES than mouthwash use (r n_sig["OHQ_5_3CAT"]
) or gum disease (r n_sig["OHQ_3"]
), and a similar number of associations were found with sugar-sweetened beverage use (r n_sig["DBQ_10_3CAT"]
). Sociodemographic associations were not appreciably diminished by adjustment for these factors. We also found that differential abundance by sociodemographic characteristics differed in oligotyping vs. OTUs, especially for Neisseria. Alpha diversity was similar across groups, and beta diversity explained only a small proportion of variance by age (r age_r2*100
%), education r edu_r2*100
%), and nativity r us_r2*100
%), and less by other variables. We found poor support for clustering of samples by OTUs, and that, similarly to Koren et al. (2013) (52), clustering findings were sensitive to the distance metric employed.
sessionInfo()
alt_smokers_cigarettes <- sample_data(NYC_HANES) %>% data.frame() %>% dplyr::filter(smokingstatus == 'Alternative smoker') %>% dplyr::filter(CIGARETTES == 'Yes') %>% dplyr::select(Burklab_ID) %>% t NYC_HANES <- prune_samples(!(sample_names(NYC_HANES) %in% alt_smokers_cigarettes), NYC_HANES) sample_data(NYC_HANES)$smokingstatus <- relevel(sample_data(NYC_HANES)$smokingstatus,"Never smoker") #edgeR da.univ <- get_edgeR_results(~ smokingstatus, coef = 2, alph = 2, filtering = TRUE, method = "BH") da.univ <- da.univ[sort(da.univ %>% rownames),] #DESeq2 threshold <- 0.05 dds <- DESeq2::DESeq(phyloseq_to_deseq2(NYC_HANES, design = ~ smokingstatus), parallel = TRUE) ## here "Current smoker" is the numerator, "Never smoker" is the denominator in fold-change calculation res <- DESeq2::results(dds, contrast = c("smokingstatus","Cigarette","Never smoker")) res.filtered <- res[!is.na(res$padj),] res.filtered <- res.filtered[res.filtered$padj < threshold,]
When I run edgeR comparing current cigarette vs. never smokers, I get r sum(da.univ$table$FDR < 0.05)
significant OTUs at FDR < 0.05, and when I run DESeq2, I get r nrow(res.filtered)
.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.