knitr::opts_chunk$set(echo = TRUE)
suppressPackageStartupMessages({
library(vegan)
library(RColorBrewer)
library(phyloseq)
library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
library(DESeq2)
library(edgeR)
library(reshape2)
library(tableone) 
library(knitr)
library(phyloseq)
library(GSEABase)
library(magrittr)
library(GSVA)
library(SummarizedExperiment)})
source("utils.R")
source("summaryPlots.R")
source("edgeR_functions.R")

Conversion of HDF-5 QIIME's OTU table to JSON + addition of metadata

```{bash, render=FALSE}

biom add-metadata -i /CM/projects/fbeghini_oralMicrobiome/NYDH_smoking_pilot_fastq/otus97/otu_table_mc10_w_tax.biom -o /CM/projects/fbeghini_oralMicrobiome/NYDH_smoking_pilot_fastq/otus97/otu_table_mc10_w_tax_w_metadata.biom -m /CM/projects/fbeghini_oralMicrobiome/NYDH_smoking_pilot_fastq/validatedMappingFile/mappingfile_corrected.txt

biom convert -i /CM/projects/fbeghini_oralMicrobiome/NYDH_smoking_pilot_fastq/otus97/otu_table_mc10_w_tax_w_metadata.biom -o /CM/projects/fbeghini_oralMicrobiome/NYDH_smoking_pilot_fastq/otus97/otu_table_mc10_w_tax_w_metadata.txt --to-json --table-type "OTU table"

```r
NYC_HANES <- loadQiimeData() %>% annotateFactors(.)

Table 1: Demographics & Descriptive Statistics

metadata <- data.frame(sample_data(NYC_HANES))
source("prettytable_functions.R")

variablesToLook <- c("GENDER","RACE","EDU4CAT","SPAGE", "AGEGRP5C","BMI","DBTS_NEW","SR_ACTIVE","SMQ_7","INC25KMOD","POVGROUP6_0812CT","COTININE","MERCURYU","OHQ_5")
factorVars <- c("AGEGRP5C", "RACE","EDU4CAT","DBTS_NEW","SR_ACTIVE","INC25KMOD","POVGROUP6_0812CT", "GENDER")
table1 <- CreateTableOne(vars = variablesToLook,data = metadata, strata = "smokingstatus", factorVars = factorVars, includeNA = T,test = T)


table1_data <- formatMetadata(NYC_HANES)

table1 <- prettytable_autoindent(vars = names(table1_data),data = table1_data, 
                         includeNA = TRUE,test = FALSE)
NYC_HANES.genus <- tax_glom(NYC_HANES,taxrank = "Genus")
NYC_HANES.genus.relab <- transform_sample_counts(NYC_HANES.genus, function(x) x/sum(x))
NYC_HANES.relab <- transform_sample_counts(NYC_HANES, function(x) x/sum(x))
NYC_HANES.genus.relab <- merge_taxa(NYC_HANES.genus, taxa_names(filter_taxa(NYC_HANES.genus.relab, function(x) mean(x) < 2e-4, T)))
tax_table(NYC_HANES.genus.relab)[is.na(tax_table(NYC_HANES.genus.relab)[,"Genus"]),"Genus"] <- "Other"
NYC_HANES.genus.relab <- tax_glom(transform_sample_counts(NYC_HANES.genus.relab, function(x) x/sum(x)),taxrank = "Genus")

NYC_HANES.phylum <- tax_glom(NYC_HANES,taxrank = "Phylum")
NYC_HANES.phylum.relab <- transform_sample_counts(NYC_HANES.phylum, function(x) x/sum(x))

Microbial composition (8 most abundant genera)

plot_abundance(NYC_HANES, top_n = 8, prop="total",taxrank = "Genus") + labs(x="296 oral rinse samples") -> taxa
taxa
dir.create("out_img")
png("out_img/taxa_genus.png",800,600, res = 150, bg = "transparent")
taxa + 
  theme(axis.title = element_text(colour = "white"),
        axis.text = element_text(colour = "white"),
        title = element_text(colour = "white"),
        legend.text = element_text(colour = "white"),
        axis.ticks = element_line(colour = "white"),
        # panel.border = element_rect(colour = "white"),
        # panel.background = element_rect(fill = "transparent",colour = NA),
        # plot.background =  element_rect(fill = "transparent",colour = NA),
        legend.background = element_rect(fill = "transparent",colour = NA),
        legend.key = element_rect(fill = "transparent",colour = NA),
        panel.grid = element_blank()
        )
dev.off()

Alpha diversity

alphadiv <- estimate_richness(NYC_HANES, measures = c("Observed","Chao1","Shannon", "Simpson"))[,-3]
alphadiv <- cbind(alphadiv, sample_data(NYC_HANES)$smokingstatus)
colnames(alphadiv)[5] <- c("smokingstatus")

alphadiv.melted <- melt(alphadiv, id.vars = c("smokingstatus"))

alpha.ss <- ggplot(alphadiv.melted,aes(smokingstatus,value)) +
  geom_boxplot(aes(fill=smokingstatus), color="white")+
  facet_grid(variable ~., scales = "free", switch = 'y') +
  scale_x_discrete(labels = c("Cigarette\nsmoker\nn=111","Never\nsmoker\nn=43","Former\nsmoker\nn=45","Alternative\nsmoker\nn=58","Secondhand\nsmoke\nn=39"))+
  scale_y_continuous(position = 'right') +
  scale_fill_manual(values = scale_palette) +
  guides(colour="none") +
  theme_bw() +
  theme(strip.background = element_rect(colour = "transparent", fill = "transparent"),
        strip.text.y = element_text(angle = 180))
aovSMOKINGSTATUS<-aov(Shannon ~ smokingstatus,alphadiv)
summary(aovSMOKINGSTATUS)

aovSMOKINGSTATUS<-aov(Observed ~ smokingstatus,alphadiv)
summary(aovSMOKINGSTATUS)

aovSMOKINGSTATUS<-aov(Chao1 ~ smokingstatus,alphadiv)
summary(aovSMOKINGSTATUS)

alpha.ss
dir.create("out_img/")
png("out_img/alpha_smokingstatus.png",res = 120, height = 800, width = 1000, bg = 'transparent')
alpha.ss +
  theme_transparent +
  ggtitle("Smoking categories")+
  theme(axis.text = element_text(size = 13),
        strip.text = element_text(size = 13, colour = "white"),
        axis.title.x = element_blank(),
        axis.title = element_text(size = 15)
  )
dev.off()

Beta Diversity

distwu <- phyloseq::distance(NYC_HANES, "wunifrac")
ordwu <-  ordinate(NYC_HANES, method = "MDS", distance = distwu, weighted = TRUE)

smoker.wu.data <- plot_ordination(NYC_HANES, ordwu,justDF = T) %>% arrange()

smoker.wu.data$weeksquit <- as.numeric(smoker.wu.data[,"SMQ_4"]) * as.numeric(smoker.wu.data[,"SMQ_4UNIT"])
smoker.wu.data[!smoker.wu.data$SMOKER3CAT %in% "Former smoker","weeksquit"] <- 5
smoker.wu.data$CIGARETTES[is.na(smoker.wu.data$CIGARETTES)] <- "No"

smoker.wu.data %<>% bind_cols(otu_table(NYC_HANES.phylum.relab) %>% t %>% data.frame)
proteobacteria <- tax_table(NYC_HANES.phylum.relab)[tax_table(NYC_HANES.phylum.relab)[,"Phylum"]=="Proteobacteria"] %>% rownames
betadiv.proteobacteria <- ggplot(subset(smoker.wu.data,smokingstatus  %in% c("Cigarette","Never smoker"))) + 
  geom_point(aes(Axis.1,Axis.2, color=New.ReferenceOTU194), size=3) +
  theme_bw() +
  scale_color_gradient(name='Proteobacteria\nrelative\nabundance', low = "blue", high = "red") +
  scale_y_continuous(limits = c(-0.12,0.1)) +
  scale_x_continuous(limits = c(-0.17,0.14))

smoker.wu.noformer <- ggplot(subset(smoker.wu.data,smokingstatus  %in% c("Cigarette","Never smoker"))) + 
  geom_point(aes(Axis.1,Axis.2, color=smokingstatus), size=3, alpha=0.7) +
  theme_bw() +
  scale_color_manual(name='Smoking status', values = scale_palette[1:2]) +
  scale_y_continuous(limits = c(-0.12,0.1)) +
  scale_x_continuous(limits = c(-0.17,0.14))

smoker.wu.former.never <- ggplot(subset(smoker.wu.data,smokingstatus  %in% c("Cigarette","Never smoker","Former smoker"))) + 
  geom_point(aes(Axis.1,Axis.2, color=smokingstatus, size=weeksquit), alpha=0.7) +
  theme_bw() +
  scale_color_manual(name='Smoking status', values = scale_palette[c(1:3)]) +
  scale_y_continuous(limits = c(-0.12,0.1)) +
  scale_x_continuous(limits = c(-0.17,0.14)) +
  scale_size(range = c(2,8))

smoker.wu.second <- ggplot(subset(smoker.wu.data,smokingstatus  %in% c("Cigarette","Never smoker","Former smoker","Secondhand"))) + 
  geom_point(aes(Axis.1,Axis.2, color=smokingstatus, size=weeksquit), alpha=0.7) +
  theme_bw() +
  scale_color_manual(name='Smoking status', values = scale_palette[c(1:4)]) +
  scale_y_continuous(limits = c(-0.12,0.1)) +
  scale_x_continuous(limits = c(-0.17,0.14)) +
  scale_size(range = c(2,8))

smoker.wu <- ggplot(smoker.wu.data) + 
  geom_point(aes(Axis.1,Axis.2, color=smokingstatus, size=weeksquit), alpha=0.7) +
  theme_bw() +
  scale_color_manual(name='Smoking status', values = scale_palette) +
  scale_y_continuous(limits = c(-0.12,0.1)) +
  scale_x_continuous(limits = c(-0.17,0.14)) +
  scale_size(range = c(2,8))

smoker.wu.alternative.cigarettes <- ggplot(subset(smoker.wu.data,smokingstatus  %in% c("Cigarette","Alternative smoker"))) + 
  geom_point(aes(Axis.1,Axis.2, color=smokingstatus, shape=CIGARETTES), alpha=0.7, size=3) +
  theme_bw() +
  scale_color_manual(name='Smoking status', values = scale_palette[c(1,5)]) +
  scale_shape_discrete(name='Alongside alternative ways,\ndo you also smoke cigarettes?') +
  scale_y_continuous(limits = c(-0.12,0.1)) +
  scale_x_continuous(limits = c(-0.17,0.14))

cotinine.gradient <- ggplot() +
  xlab("Axis.1")+
  ylab("Axis.2")+
  theme_bw() +
  geom_point(data=smoker.wu.data[smoker.wu.data$COTININE<1,], aes(Axis.1, Axis.2), size=3, alpha=1, color="black") +
  geom_point(data=smoker.wu.data[smoker.wu.data$COTININE>1,], aes(Axis.1, Axis.2, color=COTININE), size=3, alpha=0.7) +
  scale_y_continuous(limits = c(-0.12,0.1)) +
  scale_x_continuous(limits = c(-0.17,0.14)) +
  scale_color_gradient(low = "blue", high = "red", name = "Cotinine (ng/ml)") +
  scale_size(range = c(2,8))

Cigarette smokers vs Never smokers

smoker.wu.noformer

Test group with PERMANOVA (adonis, vegan package)

Betadisper results are not significant, meaning we cannot reject the null hypothesis that our groups have the same dispersions.

metadata <- as(sample_data(NYC_HANES),"data.frame")
permanova.res <- data.frame()

for(s in combn(levels(metadata$smokingstatus),2, simplify = FALSE)){
  metadata_2cat <- metadata[metadata$smokingstatus %in% s,]
  distwu_2cat <- phyloseq::distance(subset_samples(NYC_HANES, smokingstatus %in% s), "wunifrac")
  permanova <- adonis(distwu_2cat ~ smokingstatus + RACE + GENDER + AGEGRP4C + SR_ACTIVE + EDU3CAT + DBTS_NEW, metadata_2cat, parallel = 5)
  beta_2cat <- betadisper(distwu_2cat, metadata_2cat$smokingstatus, type = "centroid")
  permetest <- permutest(beta_2cat,parallel = T)
  permanova.res %<>% bind_rows(data.frame(contrast = stringr::str_c(s, collapse = " vs "), r2=permanova$aov.tab[1,5],pvalue=permanova$aov.tab[1,6], betadisp=permetest$tab$`Pr(>F)`[1])) %>% arrange(pvalue)
}

knitr::kable(permanova.res)

Three categories

metadata_3cat <- metadata[metadata$smokingstatus %in% c("Cigarette","Never smoker", "Former smoker"),]
distwu_3cat <- phyloseq::distance(subset_samples(NYC_HANES, smokingstatus %in% c("Cigarette","Never smoker", "Former smoker")), "wunifrac")
adonis(distwu_3cat ~ smokingstatus + RACE + GENDER + AGEGRP4C + SR_ACTIVE + EDU3CAT + DBTS_NEW, metadata_3cat)

All categories

adonis(distwu ~ smokingstatus, metadata)

Cigarette smokers + Never smokers + Former smokers

smoker.wu.former.never

Cigarette smokers + Never smokers + Former smokers + Secondhand smokers

smoker.wu.second

All smoking statuses

smoker.wu

Cigarette smokers vs Alternative smokers

smoker.wu.alternative.cigarettes

Discrete serum blood Cotinine levels

cotinine.gradient
png("out_img/beta-proteobacteria.png",1000,800, res = 120, bg = "transparent")
betadiv.proteobacteria + theme_transparent
dev.off()

png("out_img/beta-SMOKER3CAT-sns.png",1000,800, res = 120, bg = "transparent")
smoker.wu.noformer +theme_transparent
dev.off()

png("out_img/beta-SMOKER3CAT.former.png",1000,800, res = 120, bg = "transparent")
smoker.wu.former.never + theme_transparent
dev.off()

png("out_img/beta-SMOKER3CAT.secondhand.png",1000,800, res = 120, bg = "transparent")
smoker.wu.second 
dev.off()

png("out_img/beta-SMOKER3CAT.allcat.png",1000,800, res = 120, bg = "transparent")
smoker.wu
dev.off()

png("out_img/beta-SMOKER3CAT.cigarette_alt.png",1000,800, res = 120, bg = "transparent")
smoker.wu.alternative.cigarettes
dev.off()

png("out_img/beta-COTININE-wu.png",1080,800, res = 120, bg = "transparent")
cotinine.gradient
dev.off()

Differential analysis

Genera reported in previous studies

| Up in smokers | Down in smokers | | ------------- | --------------- | | Sterptococcus | Neisseria | | Eubacterium | Porphyromonas | | Megasphera | Gemella | | Veilonella | Capnocytophaga | | Eikenella | Fusobacterium | | Actinomyces | Corynebacterium | | Rothia mucilaginosa | | | Bifidobacterium longum | | | Atopobium spp | | | Haemophilus | |

DESeq2

Current smokers vs. never smokers univariate

threshold <- 0.05
dds <- DESeq(phyloseq_to_deseq2(NYC_HANES, design = ~ smokingstatus), parallel = TRUE)

## here "Current smoker" is the numerator, "Never smoker" is the denominator in fold-change calculation
res <- DESeq2::results(dds, contrast = c("smokingstatus","Cigarette","Never smoker"))
res.filtered <- res[!is.na(res$padj),]
res.filtered <- res.filtered[res.filtered$padj < threshold,]

## Create a table with the taxonomy of the significant OTUs 
res.tax <- cbind(as.data.frame(res.filtered), as.matrix(tax_table(NYC_HANES)[rownames(res.filtered),]))
dds.taxa <- rownames(res.tax)
res.maxfold <- tapply(res.tax$log2FoldChange, res.tax$Genus, function(x) max(x))
res.maxfold <- sort(res.maxfold, T)
# res.tax["New.ReferenceOTU179","Genus"] <- res.tax["New.ReferenceOTU179","Phylum"] %>% as.character
res.tax$Genus <- factor(as.character(res.tax$Genus), levels = names(res.maxfold))
res.tax$type <- "Current/Never"
res.tax <- res.tax[order(res.tax$log2FoldChange),] 
biosis <- read.csv("biosis.tsv",header = T, sep = '\t')
taxp <- ggplot(right_join(biosis, res.tax, by = c("X1" = "Genus")), aes(log2FoldChange, X1)) + 
  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
  geom_point(size=3, aes(color = X2)) + 
  facet_grid(Phylum ~ ., scales = "free", space = "free_y",switch = "y",as.table = T)+
  theme_bw() +
  scale_y_discrete(position = "right") +
  scale_color_discrete(name = "Oxygen requirement")+
  # guides(color=FALSE) + 
  theme(axis.text = element_text(hjust = 0, vjust=0.5, size = 11),
        strip.text = element_text(size = 9),
        axis.title = element_text(size = 9),
        axis.title.y = element_blank(),
        strip.background = element_rect(colour = "transparent", fill = "transparent"),
        strip.text.y = element_text(angle = 180)
  )
taxp

Current smokers vs. never smokers adjusting for confounding

## Counfounders added in the design's formula
dds.conf <- DESeq(phyloseq_to_deseq2(NYC_HANES, design = ~ RACE + GENDER + AGEGRP4C + SR_ACTIVE + EDU3CAT + DBTS_NEW + smokingstatus),parallel = TRUE)
##Use two out of three categories in the contrast variable
res.conf <- results(dds.conf, contrast = c("smokingstatus","Cigarette","Never smoker"), pAdjustMethod = "BH")
res.filtered.conf <- res.conf[!is.na(res.conf$padj),]
res.filtered.conf <- res.filtered.conf[res.filtered.conf$padj < threshold,]

res.tax.conf <- cbind(as.data.frame(res.filtered.conf), as.matrix(tax_table(NYC_HANES)[rownames(res.filtered.conf),]))
res.tax.conf <- res.tax.conf[!is.na(res.tax.conf$Family),]
dds.taxa.conf <- rownames(res.tax.conf)
res.maxfold.conf <- tapply(res.tax.conf$log2FoldChange, res.tax.conf$Genus, function(x) max(x))
res.maxfold.conf <- sort(res.maxfold.conf, T)
res.tax.conf["New.ReferenceOTU148","Genus"] <- "Neisseriaceae"
res.tax.conf["New.ReferenceOTU137","Genus"] <- "Neisseriaceae"
res.tax.conf["New.ReferenceOTU192","Genus"] <- "Pasteurellaceae"

res.tax.conf$Genus <- factor(as.character(res.tax.conf$Genus), levels = names(res.maxfold.conf))
res.tax.conf$type <- "Cigarette/Never + conf"
source("edgeR_functions.R")
covs <- c("RACE","GENDER","AGEGRP4C","SR_ACTIVE","EDU3CAT","DBTS_NEW")
res1 <- list()
for (i in c(1:6)) {
  form <- formula(paste("~", "smokingstatus+", str_c(covs[1:i], collapse = "+")))
  dds1.conf <- get_edgeR_results(form , coef = 2, alph = 1)
  res1[[as.character(form)[2]]] <- data.frame(dds1.conf[rownames(dds1.conf$table) %>% sort,])
}

res1.delta <- data.frame(res1[[1]]%>%rownames)
for(i in c(2:6)){
  res1.delta %<>% cbind(100*(res1[[i]]$logFC - res1[[i-1]]$logFC)/res1[[i]]$logFC %>% as.numeric() )
}
lapply(res1.delta[,2:6], median)
taxp.conf <- ggplot(right_join(biosis, res.tax.conf, by = c("X1" = "Genus")), aes(log2FoldChange, X1)) + 
  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
  geom_point(size=3, aes(color = X2)) + 
  facet_grid(Phylum~., scales = "free", space = "free_y",switch = "y",as.table = T) +
  theme_bw() +
  # guides(color=FALSE) + 
  facet_grid(Phylum~., scales = "free", space = "free_y",switch = "y",as.table = T)+
  scale_y_discrete( position = "right") +
 scale_color_discrete(name = "Oxygen requirement")+
  # guides(color=FALSE) + 
  theme(axis.text = element_text(hjust = 0, vjust=0.5, size = 11),
        strip.text = element_text(size = 9),
        axis.title = element_text(size = 9),
        axis.title.y = element_blank(),
        strip.background = element_rect(colour = "transparent", fill = "transparent"),
        strip.text.y = element_text(angle = 180)
  )

taxp.conf

Plot univariate vs. multivariate from DESeq2

compardat <- data.frame(univariate = res$log2FoldChange, pvalue = res$pvalue, padj = res$padj, multivariate = res.conf$log2FoldChange)
compardat$padj[is.na(compardat$padj)] <- 1
compare.deseq2   <- ggplot(compardat) +
  geom_point(aes(univariate, multivariate, color=padj < 0.05), size=2) +
  geom_abline()+#color="white") +
  scale_colour_manual(values = setNames(c('#DCDC29','grey'),c(TRUE, FALSE)), guide=FALSE) +
  scale_x_continuous(limits = c(-2,2))+
  scale_y_continuous(limits = c(-2,2))+
  theme_bw() +
  theme( axis.text = element_text(hjust = 0, vjust=0.5, size = 17),
         strip.text = element_text(size = 13),
         axis.title = element_text(size = 15))
compare.deseq2

Secondhand vs Cigarette/Never

NYC_HANEScot <- subset_samples(NYC_HANES, smokingstatus %in% c("Secondhand"))
ddscot <- DESeq(phyloseq_to_deseq2(NYC_HANEScot, design = ~ COTININE), parallel = T)
##Use two out of three categories in the contrast variable
rescot <- results(ddscot, alpha = 0.05)
cor.test(res$log2FoldChange, 
         rescot$log2FoldChange,
         use="pairwise.complete.obs")

corr <- cor.test(res$log2FoldChange[res$padj < 0.05], 
                 rescot$log2FoldChange[res$padj < 0.05], 
                 use="pairwise.complete.obs")
corr

cor.data <- data.frame(smoks=res$log2FoldChange[res$padj < 0.05], cot=rescot$log2FoldChange[res$padj < 0.05])

smokers_secondhand <- ggplot(cor.data,aes(smoks, cot)) + 
  theme_bw() +
  xlab("Univariate Smokers vs. Non-smokers") +
  ylab("Univariate Cotinine Levels") +
  geom_text(aes(0,0.11,label = sprintf("p-value: %s\ncor: %s",signif(corr$p.value,2),signif(corr$estimate,2))), color="white") +
  geom_point(size=4, color = '#DCDC29') +
  geom_smooth(method=lm, se = FALSE,color="red") +
  # scale_y_continuous(limits = c(-2,2))+
  scale_x_continuous(limits = c(-1.7,1.7)) +
# theme_transparent +
theme(axis.text = element_text(hjust = 0, vjust=0.5, size = 13),
      strip.text = element_text(size = 13),
      axis.title = element_text(size = 15))
smokers_secondhand
dir.create("out_img")

png("out_img/deseq_smokerno.png",bg = "transparent", res = 300, units = "in", width = 6, height = 6)
taxp + theme_transparent
dev.off()

png("out_img/deseq_smokerno_wconf.png",bg = "transparent", res = 300, units = "in", width = 6, height = 6)
taxp.conf + theme_transparent
dev.off()

png("out_img/deseq_compare_univ_multiv.png",bg = "transparent", res = 300, units = "in", width = 6, height = 6)
compare.deseq2
dev.off()

png("out_img/deseq_compare_smokers_secondhand.png",650,550,bg = "transparent", res = 120)
smokers_secondhand + theme_transparent
dev.off()

edgeR

Cigarette smoker vs Never smoker: univariate

source("edgeR_functions.R")
sample_data(NYC_HANES)$smokingstatus <- relevel(sample_data(NYC_HANES)$smokingstatus,"Never smoker")
da.univ <- get_edgeR_results(~ smokingstatus, coef = 2, alph = 2, filtering = F, method = "BH")
da.univ <- da.univ[sort(da.univ %>% rownames),]
plot_edgeR(~ smokingstatus, pseq = NYC_HANES, coef=2, alph = 0.05, filtering = T, method = "BH", color = "Phylum", sortby = "Phylum") -> edger.univ

Cigarette smoker vs Never smoker: Multivariate

da.multiv <- get_edgeR_results(~ smokingstatus +RACE + GENDER + AGEGRP4C + SR_ACTIVE + EDU3CAT + DBTS_NEW , coef = 2, alph = 2, filtering = F)
da.multiv <- da.multiv[sort(da.multiv %>% rownames),]
plot_edgeR(~ smokingstatus + RACE + GENDER + AGEGRP4C + SR_ACTIVE + EDU3CAT + DBTS_NEW , coef = 2, pseq = NYC_HANES, alph = 0.05, color = "Phylum", filtering=TRUE, sortby = "Phylum")-> edger.multiv

Plot univariate vs multivariate beta coefficients

compardat <- data.frame(univariate = da.univ$table$logFC, padj = da.univ$table$PValue, multivariate = da.multiv$table[da.univ$table %>% rownames,]$logFC)
compare.edger <- ggplot(compardat) +
  geom_abline(color="black", alpha=0.5, size=1) +
  geom_point(aes(univariate, multivariate, color=padj < 0.05), size=2) +
  scale_colour_manual(values = setNames(c('#e41a1c','grey'),c(TRUE, FALSE)), guide=FALSE) +
  scale_x_continuous(limits = c(-2,2))+
  scale_y_continuous(limits = c(-2,2))+
  xlab("Univariate Log fold change") +
  ylab("Multivariate Log fold change") +
  theme_bw() +
  theme( axis.text = element_text(hjust = 0, vjust=0.5, size = 15),
         strip.text = element_text(size = 13),
         axis.title = element_text(size = 14))
compare.edger

Secondhand vs Cigarette/Never

da.cotinine <- get_edgeR_results(~ COTININE, pseq = subset_samples(NYC_HANES, smokingstatus %in% c("Secondhand")), alph = 2, filtering = TRUE, method = "BH")
da.cotinine <- da.cotinine[sort(da.cotinine %>% rownames),]
edger.smoker_secondhand <- data.frame(smokers = da.univ$table[rownames(da.cotinine$table),"logFC"], secondhand = da.cotinine$table$logFC, pvalue = da.univ$table[rownames(da.cotinine$table),"FDR"])

edger.smoker_secondhand<-edger.smoker_secondhand[edger.smoker_secondhand$pvalue<0.05,]

corr <- cor.test(edger.smoker_secondhand$secondhand, edger.smoker_secondhand$smokers,use="pairwise.complete.obs")

corr
secondhand.compare <- ggplot(edger.smoker_secondhand) + 
  theme_bw() +
  xlab("Univariate Smokers vs. Non-smokers") +
  ylab("Univariate Cotinine Levels") +
  geom_text(aes(1,0.4,label = sprintf("p-value: %s\ncor: %s",signif(corr$p.value,2),signif(corr$estimate,2)))) +
  geom_point(aes(smokers, secondhand, color=pvalue <= 0.05), size=2) +
  scale_colour_manual(values = setNames(c('#e41a1c','grey'),c(TRUE, FALSE)), guide=FALSE) +
  geom_smooth(aes(smokers, secondhand), method=lm, se = FALSE,color="red") +
  theme(axis.text = element_text(hjust = 0, vjust=0.5, size = 13),
        strip.text = element_text(size = 13),
        axis.title = element_text(size = 15))
secondhand.compare

Alternative smokers vs Never smoker: univariate

sample_data(NYC_HANES)$smokingstatus <- relevel(sample_data(NYC_HANES)$smokingstatus,"Never smoker")
da.univ <- get_edgeR_results(~ smokingstatus, coef = 4, alph = 2, filtering = F, method = "BH")
da.univ <- da.univ[sort(da.univ %>% rownames),]
plot_edgeR(~ smokingstatus, pseq = NYC_HANES, coef=4, alph = 0.05, filtering = T, method = "BH", color = "Phylum", sortby = "Phylum") -> edger.univ
dir.create("out_img")

png("out_img/edgeR_smokerno.png",bg = "transparent", res = 300, units = "in", width = 6, height = 6)
edger.univ + theme_transparent
dev.off()

png("out_img/edgeR_smokerno_wconf.png",bg = "transparent", res = 300, units = "in", width = 6, height = 6)
edger.multiv + theme_transparent
dev.off()

png("out_img/edgeR_compare_univ_multiv.png",bg = "transparent", res = 300, units = "in", width = 6, height = 6)
compare.edger
dev.off()

svg("out_img/edgeR_compare_univ_multiv.svg",bg = "transparent")
compare.edger
dev.off()


png("out_img/edgeR_compare_smokers_secondhand.png",bg = "transparent", res = 300, units = "in", width = 6, height = 6)
smokers_secondhand
dev.off()

Analysis on biosis of bacteria

Odds ratio calculated on aerobic/anaerobic vs enrichment in smokers/never smokers Anaerobic because degradation of aeromatic compounds

Odds ratio smokers vs never smokers

*Odds ratio are now calculated using all logFCs and not only on the ones statically significant

x <- cbind(as.data.frame(rescot), as.matrix(tax_table(NYC_HANES)[rownames(rescot),])) %>% left_join(biosis, by = c("Genus"="X1")) %>% dplyr::select(X2,log2FoldChange) %>% dplyr::mutate(log2FoldChange > 0 ) %>% dplyr::select(-log2FoldChange) 
x$`log2FoldChange > 0` <- factor(x$`log2FoldChange > 0`,labels = c("Non Smokers","Smokers"))
x.table <- table(x$X2, x$`log2FoldChange > 0`)
OR_aer <- (x.table[1,1]/x.table[2,1])/(x.table[1,2]/x.table[2,2])
OR_fan <- (x.table[1,1]/x.table[3,1])/(x.table[1,2]/x.table[3,2])
"OR Aerobic vs anaerobic"
OR_aer
"OR F anaerobic vs anaerobic"
OR_fan
se_aer <- sqrt(1/x.table[1,1]+1/x.table[2,1]+1/x.table[1,2]+1/x.table[2,2])
se_fan <- sqrt(1/x.table[1,1]+1/x.table[3,1]+1/x.table[1,2]+1/x.table[3,2])
CI_aer <- c(exp(log(OR_aer)-1.96*se_aer),exp(log(OR_aer)+1.96*se_aer))
CI_fan <- c(exp(log(OR_fan)-1.96*se_fan),exp(log(OR_fan)+1.96*se_fan))
fisher.test(x.table[1:2,])
fisher.test(x.table[c(1,3),])

GSEA cigarette smokers vs never smokers

NYC_HANES.sns <- NYC_HANES %>% subset_samples(., smokingstatus %in% c("Cigarette","Never smoker"))
NYC_HANES.sns %<>% prune_taxa((NYC_HANES.sns %>% otu_table  %>% rowSums())>30,.)
meta <- sample_data(NYC_HANES.sns)
pdata <- new("AnnotatedDataFrame",meta)

group = sample_data(NYC_HANES.sns)$smokingstatus
design = model.matrix(~smokingstatus, data=data.frame(sample_data(NYC_HANES.sns)))

d <- edgeR::DGEList(otu_table(NYC_HANES.sns), group = sample_data(NYC_HANES.sns)$smokingstatus)
d <- edgeR::calcNormFactors(d)
class(d$counts) <- "matrix"
otu_table.voom <- limma::voom(d, design = design, plot = FALSE)

eset <- ExpressionSet(assayData = otu_table.voom$E, phenoData = pdata)
eset$GROUP <- ifelse(eset$smokingstatus == "Cigarette",0,1)
de.eset <- EnrichmentBrowser::de.ana(eset)

par(mfrow=c(1,2))
EnrichmentBrowser::volcano(rowData(de.eset)$FC, rowData(de.eset)$ADJ.PVAL) 
eset.none <- EnrichmentBrowser::de.ana(de.eset, padj.method="none")
EnrichmentBrowser::pdistr(rowData(eset.none)$ADJ.PVAL)
biosis <- read.csv("biosis.tsv",header = T, sep = '\t')
ds <- tax_table(NYC_HANES.sns) %>% as.data.frame %>% bind_cols(data.frame(OTU=rownames(tax_table(NYC_HANES.sns)))) %>% filter(Domain != "Unassigned") %>% left_join(biosis, by = c("Genus"="X1"))
sAero <- (ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
sAnae <- (ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
sFana <- (ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
my.gs <- list(aero=sAero, anae=sAnae, fana=sFana)

EnrichmentBrowser::config.ebrowser("OUTDIR.DEFAULT","./results")
gsea.res <- EnrichmentBrowser::sbea("gsea", de.eset, my.gs)
EnrichmentBrowser::gs.ranking(gsea.res, signif.only=FALSE)

Most significative OTU

ind <- which.min(rowData(de.eset)$ADJ.PVAL)
otuname <- rownames(de.eset)[ind]
smoker.exprs <- assay(de.eset)[ind,de.eset$GROUP==1]
nsmoker.exprs <- assay(de.eset)[ind,de.eset$GROUP==0]
boxplot(nsmoker.exprs, smoker.exprs)
title(paste(otuname, tax_table(NYC_HANES.sns)[otuname,'Genus']))

ORA cigarette smokers vs never smokers

ora.res <- EnrichmentBrowser::sbea("ora", de.eset, my.gs, perm=0)
EnrichmentBrowser::gs.ranking(ora.res, signif.only=FALSE)

GSEA secondhand smokers

NYC_HANES.sh <- NYC_HANES %>% subset_samples(., smokingstatus %in% c("Secondhand"))
NYC_HANES.sh %<>% prune_taxa((NYC_HANES.sh %>% otu_table  %>% rowSums())>30,.)

meta <- sample_data(NYC_HANES.sh)
meta$catcotinine <- ifelse(meta$COTININE<3,'low','high')
pdata <- new("AnnotatedDataFrame",meta)

group = meta$catcotinine
design = model.matrix(~catcotinine, data=data.frame(meta))

d = edgeR::DGEList(otu_table(NYC_HANES.sh), group = meta$catcotinine)
d = edgeR::calcNormFactors(d)
otu_table.voom <- voom(d, design = design, plot = FALSE)

eset <- ExpressionSet(assayData = otu_table.voom$E, phenoData = pdata)
eset$GROUP <- ifelse(meta$COTININE < 1.9, 0,ifelse(meta$COTININE>4.35,1,NA))
subeset <- eset[,!is.na(eset$GROUP)]
##TERTILES DIVISION HERE
# quantile(meta$COTININE,c(0.33,0.66))
de.eset <- EnrichmentBrowser::de.ana(subeset)

biosis <- read.csv("biosis.tsv",header = T, sep = '\t')
ds <- tax_table(NYC_HANES.sh) %>% as.data.frame %>% bind_cols(data.frame(OTU=rownames(tax_table(NYC_HANES.sh)))) %>% filter(Domain != "Unassigned") %>% left_join(biosis, by = c("Genus"="X1"))
sAero <- (ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
sAnae <- (ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
sFana <- (ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
my.gs <- list(aero=sAero, anae=sAnae, fana=sFana)


EnrichmentBrowser::config.ebrowser("OUTDIR.DEFAULT","./results")
gsea.res <- EnrichmentBrowser::sbea("gsea", de.eset, my.gs)
EnrichmentBrowser::gs.ranking(gsea.res, signif.only=FALSE)

ORA secondhand smokers

ora.res <- EnrichmentBrowser::sbea("ora", de.eset, my.gs, perm=0)
EnrichmentBrowser::gs.ranking(ora.res, signif.only=FALSE)

GSVA continuous cotinine on secondhand smokers

NYC_HANES.sh <- NYC_HANES %>% subset_samples(., smokingstatus %in% c("Secondhand"))
NYC_HANES.sh %<>% prune_taxa((NYC_HANES.sh %>% otu_table  %>% rowSums())>30,.)

meta <- sample_data(NYC_HANES.sh)
pdata <- new("AnnotatedDataFrame",meta)
group = meta$COTININE
design = model.matrix(~COTININE, data=data.frame(meta))

d = edgeR::DGEList(otu_table(NYC_HANES.sh), group = meta$COTININE)
d = edgeR::calcNormFactors(d)
otu_table.voom <- voom(d, design = design, plot = FALSE)

eset <- ExpressionSet(assayData = otu_table.voom$E, phenoData = pdata)


biosis <- read.csv("biosis.tsv",header = T, sep = '\t')
ds <- tax_table(NYC_HANES.sh) %>% as.data.frame %>% bind_cols(data.frame(OTU=rownames(tax_table(NYC_HANES.sh)))) %>% filter(Domain != "Unassigned") %>% left_join(biosis, by = c("Genus"="X1"))
sAero <- (ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
sAnae <- (ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
sFana <- (ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
my.gs <- list(aero=sAero, anae=sAnae, fana=sFana)

my.gsva <- gsva(eset, my.gs, parallel.sz = 1)

fit <- limma::lmFit(my.gsva, my.gsva$COTININE) %>% eBayes()
topTable(fit)

GSVA continuous cotinine on cigarette vs never smokers

NYC_HANES.sns <- NYC_HANES %>% subset_samples(., smokingstatus %in% c("Cigarette","Never smoker") & !is.na(COTININE))
NYC_HANES.sns %<>% prune_taxa((NYC_HANES.sns %>% otu_table  %>% rowSums())>30,.)

meta <- sample_data(NYC_HANES.sns)
pdata <- new("AnnotatedDataFrame",meta)

design = model.matrix(~COTININE, data=data.frame(meta))

d = edgeR::DGEList(otu_table(NYC_HANES.sns), group = meta$COTININE)
d = edgeR::calcNormFactors(d)
otu_table.voom <- voom(d, design = design)

eset <- ExpressionSet(assayData = otu_table.voom$E, phenoData = pdata)


biosis <- read.csv("biosis.tsv",header = T, sep = '\t')
ds <- tax_table(NYC_HANES.sns) %>% as.data.frame %>% bind_cols(data.frame(OTU=rownames(tax_table(NYC_HANES.sns)))) %>% filter(Domain != "Unassigned") %>% left_join(biosis, by = c("Genus"="X1"))
sAero <- (ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
sAnae <- (ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
sFana <- (ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
my.gs <- list(aero=sAero, anae=sAnae, fana=sFana)

my.gsva <- gsva(eset, my.gs, parallel.sz = 1)

fit <- limma::lmFit(my.gsva, my.gsva$COTININE) %>% eBayes()
topTable(fit)


audreyrenson/nychanesmicrobiome documentation built on May 3, 2019, 8:32 p.m.