knitr::opts_chunk$set(
    cache = TRUE,
    message = FALSE,
    warning = FALSE
)
library(nychanesmicrobiome)
library(dplyr)

scale_palette <- c("#e41a1c","#377eb8","#4daf4a","#984ea3","#ff7f00")
theme_transparent <- ggplot2::theme(
  axis.title = ggplot2::element_text(colour = "white"),
  axis.text = ggplot2::element_text(colour = "white"),
  title = ggplot2::element_text(colour = "white"),
  legend.text = ggplot2::element_text(colour = "white"),
  axis.ticks = ggplot2::element_line(colour = "white"),
  panel.border = ggplot2::element_rect(colour = "white"),
  panel.background = ggplot2::element_rect(fill = "transparent",colour = NA),
  plot.background =  ggplot2::element_rect(fill = "transparent",colour = NA),
  legend.background = ggplot2::element_rect(fill = "transparent",colour = NA),
  legend.key = ggplot2::element_rect(fill = "transparent",colour = NA),
  panel.grid = ggplot2::element_blank()
)
NYC_HANES <- loadQiimeData() %>% 
  annotateFactors()
# Extract alternative smokers who also smoke cigarettes and remove them
alt_smokers_cigarettes <- NYC_HANES |> 
  sample_data() |> 
  data.frame() |> 
  dplyr::filter(smokingstatus == 'Alternative smoker') |> 
  dplyr::filter(CIGARETTES == 'Yes') |> 
  dplyr::select(Burklab_ID) |> 
  t()
NYC_HANES <- prune_samples(!(sample_names(NYC_HANES) %in% alt_smokers_cigarettes), NYC_HANES)
sample_data(NYC_HANES)$smokingstatus <- relevel(sample_data(NYC_HANES)$smokingstatus,"Never smoker")

# Subset alternative smokers
NYC_HANES.alt <- subset_samples(NYC_HANES, smokingstatus %in% c("Never smoker", "Alternative smoker"))
levels(sample_data(NYC_HANES.alt)$smokingstatus) <- c('Never','Alternative')

biosis.tsv <- system.file("extdata","biosis.tsv", package="nychanesmicrobiome", mustWork = TRUE)
biosis <- read.csv(biosis.tsv,header = TRUE, sep = '\t')

Table 1: Demographics & Descriptive Statistics

metadata <- data.frame(sample_data(NYC_HANES))

variablesToLook <- c("GENDER","RACE","EDU4CAT","SPAGE", "AGEGRP5C","DBTS_NEW","SR_ACTIVE","INC25KMOD","COTININE","OHQ_3")
factorVars <- c("AGEGRP5C", "RACE","EDU4CAT","DBTS_NEW","SR_ACTIVE","INC25KMOD", "GENDER")
table1 <- tableone::CreateTableOne(vars = variablesToLook,
                                   data = metadata, 
                                   factorVars = factorVars,
                                   strata = "smokingstatus",
                                   includeNA = TRUE,
                                   test = FALSE)

alternative_habits <-
  metadata[metadata$smokingstatus == 'Alternative smoker', c('E_CIGARETTES', 'HOOKAH_PIPE', 'CIGARS_CIGARILLOS')] %>%
  dplyr::mutate_all(dplyr::funs(as.numeric)) %>% 
  colSums(na.rm = TRUE)

x <- print(table1, explain = TRUE, printToggle = FALSE,nonnormal = c('COTININE'))
knitr::kable(x)
NYC_HANES.genus <- tax_glom(NYC_HANES, taxrank = "Genus")
NYC_HANES.genus.relab <-
  transform_sample_counts(NYC_HANES.genus, function(x)
    x / sum(x))
NYC_HANES.relab <-
  transform_sample_counts(NYC_HANES, function(x)
    x / sum(x))

NYC_HANES.genus.relab <- merge_taxa(NYC_HANES.genus,
                                    taxa_names(filter_taxa(NYC_HANES.genus.relab, function(x)
                                      mean(x) < 2e-4, TRUE)))
tax_table(NYC_HANES.genus.relab)[is.na(tax_table(NYC_HANES.genus.relab)[, "Genus"]), "Genus"] <-
  "Other"
NYC_HANES.genus.relab <-
  tax_glom(transform_sample_counts(NYC_HANES.genus.relab, function(x)
    x / sum(x)),
    taxrank = "Genus")

NYC_HANES.phylum <- tax_glom(NYC_HANES, taxrank = "Phylum")
NYC_HANES.phylum.relab <-
  transform_sample_counts(NYC_HANES.phylum, function(x)
    x / sum(x))

Microbial composition (8 most abundant genera)

plot(
  plot_abundance(
    NYC_HANES,
    top_n = 8,
    prop = "total",
    taxrank = "Genus"
  ) + labs(x = "296 oral rinse samples")
)

Alpha diversity

alphadiv <-
  estimate_richness(NYC_HANES,
                    measures = c("Observed", "Chao1", "Shannon", "Simpson"))[, -3]
alphadiv <- cbind(alphadiv, sample_data(NYC_HANES)$smokingstatus)
colnames(alphadiv)[5] <- c("smokingstatus")

alphadiv.melted <-
  reshape2::melt(alphadiv, id.vars = c("smokingstatus"))
ggplot(alphadiv.melted, aes(smokingstatus,value)) +
  geom_boxplot(aes(fill=smokingstatus))+
  facet_grid(variable ~., scales = "free", switch = 'y') +
  scale_x_discrete(labels = c("Never\nsmokers","Cigarette\nsmokers","Former\nsmokers","Alternative\nsmokers","Secondhand\nsmoke"))+
  scale_y_continuous(position = 'right') +
  scale_fill_manual(values = scale_palette) +
  guides(colour="none") +
  ggtitle("Smoking categories") +
  theme_bw() +
  theme(strip.background = element_rect(colour = "transparent", fill = "transparent"),
        strip.text.y = element_text(angle = 180),
        axis.text = element_text(size = 13),
        axis.title = element_text(size = 12),
        legend.text = element_text(size=12),
        legend.title = element_text(size=12),
        legend.position = 'bottom')
aovSMOKINGSTATUS <- aov(Shannon ~ smokingstatus, alphadiv)
summary(aovSMOKINGSTATUS)

aovSMOKINGSTATUS <- aov(Observed ~ smokingstatus, alphadiv)
summary(aovSMOKINGSTATUS)

aovSMOKINGSTATUS <- aov(Chao1 ~ smokingstatus, alphadiv)
summary(aovSMOKINGSTATUS)

Serum cotinine vs Smoking status

ggplot(metadata, aes(smokingstatus, COTININE, fill = smokingstatus)) + 
  stat_boxplot() +
  scale_fill_manual(values = scale_palette) + 
  theme_bw() +
  scale_x_discrete(labels = c("Never\nsmokers","Cigarette\nsmokers", "Former\nsmokers","Alternative\nsmokers","Secondhand\nsmoke")) +
  xlab('Reported smoking status') +
  ylab('Serum Cotinine (ng/ml)') +
  guides(fill = FALSE) +
  theme(strip.background = element_rect(colour = "transparent", fill = "transparent"),
        strip.text.y = element_text(angle = 180),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 12),
        legend.text = element_text(size=12),
        legend.title = element_text(size=12),
        legend.position = 'bottom')

Beta Diversity

distwu <- phyloseq::distance(NYC_HANES, "wunifrac")
ordwu <- ordinate(NYC_HANES,
           method = "MDS",
           distance = distwu,
           weighted = TRUE)

smoker.wu.data <- plot_ordination(NYC_HANES, ordwu, justDF = TRUE) %>% dplyr::arrange()

smoker.wu.data$weeksquit <-
  as.numeric(smoker.wu.data[, "SMQ_4"]) * as.numeric(smoker.wu.data[, "SMQ_4UNIT"])
smoker.wu.data[!smoker.wu.data$SMOKER3CAT %in% "Former smoker", "weeksquit"] <- 5
smoker.wu.data$CIGARETTES[is.na(smoker.wu.data$CIGARETTES)] <- "No"

smoker.wu.data %<>% bind_cols(otu_table(NYC_HANES.phylum.relab) %>% t %>% data.frame)
proteobacteria <-
  tax_table(NYC_HANES.phylum.relab)[tax_table(NYC_HANES.phylum.relab)[, "Phylum"] ==
                                      "Proteobacteria"] %>% rownames

Cigarette smokers vs Never smokers

plot(
  ggplot(subset(
    smoker.wu.data,
    smokingstatus  %in% c("Cigarette", "Never smoker")
  )) +
    geom_point(
      aes(Axis.1, Axis.2, color = smokingstatus),
      size = 3,
      alpha = 0.7
    ) +
    stat_ellipse(aes(Axis.1, Axis.2, color = smokingstatus), type = 't') +
    theme_bw() +
    theme(
      axis.text = element_text(size = 13),
      axis.title = element_text(size = 12),
      legend.text = element_text(size = 12),
      legend.title = element_text(size = 12),
      legend.position = 'bottom'
    ) +
    scale_color_manual(name = 'Smoking status', values = scale_palette[1:2]) +
    scale_y_continuous(limits = c(-0.14, 0.13)) +
    scale_x_continuous(limits = c(-0.18, 0.20))
)

Cigarette smokers + Never smokers + Former smokers

plot(
  ggplot(subset(
    smoker.wu.data,
    smokingstatus  %in% c("Cigarette", "Never smoker", "Former smoker")
  )) +
    geom_point(
      aes(Axis.1, Axis.2, color = smokingstatus, size = weeksquit),
      alpha = 0.7
    ) +
    theme_bw() +
    theme(
      axis.text = element_text(size = 13),
      axis.title = element_text(size = 12),
      legend.text = element_text(size = 12),
      legend.title = element_text(size = 12),
      legend.position = 'bottom',
      legend.box = 'vertical',
      legend.margin = margin(-10, 0, 0, 0)
    ) +
    scale_color_manual(name = 'Smoking status', values = scale_palette[c(1:3)]) +
    scale_y_continuous(limits = c(-0.14, 0.13)) +
    scale_x_continuous(limits = c(-0.18, 0.20)) +
    scale_size(range = c(2, 8))
)

Cigarette smokers + Never smokers + Former smokers + Secondhand smokers

plot(
  ggplot(subset(
    smoker.wu.data,
    smokingstatus  %in% c("Cigarette", "Never smoker", "Former smoker", "Secondhand")
  )) +
    geom_point(
      aes(Axis.1, Axis.2, color = smokingstatus, size = weeksquit),
      alpha = 0.7
    ) +
    theme_bw() +
    scale_color_manual(name = 'Smoking status', values = scale_palette[c(1:4)]) +
    scale_y_continuous(limits = c(-0.14, 0.13)) +
    scale_x_continuous(limits = c(-0.18, 0.20)) +
    scale_size(range = c(2, 8))
)

All smoking statuses

plot(
  ggplot(smoker.wu.data) +
    geom_point(
      aes(Axis.1, Axis.2, color = smokingstatus, size = weeksquit),
      alpha = 0.7
    ) +
    theme_bw() +
    scale_color_manual(name = 'Smoking status', values = scale_palette) +
    scale_y_continuous(limits = c(-0.14, 0.13)) +
    scale_x_continuous(limits = c(-0.18, 0.20)) +
    scale_size(range = c(2, 8))
)

Discrete serum blood Cotinine levels

plot(
  ggplot() +
    xlab("Axis.1") +
    ylab("Axis.2") +
    theme_bw() +
    geom_point(
      data = smoker.wu.data[smoker.wu.data$COTININE < 1, ],
      aes(Axis.1, Axis.2),
      size = 3,
      alpha = 1,
      color = "black"
    ) +
    geom_point(
      data = smoker.wu.data[smoker.wu.data$COTININE > 1, ],
      aes(Axis.1, Axis.2, color = COTININE),
      size = 3,
      alpha = 0.7
    ) +
    scale_y_continuous(limits = c(-0.14, 0.13)) +
    scale_x_continuous(limits = c(-0.18, 0.20)) +
    scale_color_gradient(low = "blue", high = "red", name = "Cotinine (ng/ml)") +
    scale_size(range = c(2, 8))
)

Test group with PERMANOVA (adonis, vegan package)

metadata <- as(sample_data(NYC_HANES), "data.frame")
permanova.res <- data.frame()
for (s in combn(levels(metadata$smokingstatus), 2, simplify = FALSE)) {
  metadata_2cat <- metadata[metadata$smokingstatus %in% s, ]
  distwu_2cat <- phyloseq::distance(
    subset_samples(NYC_HANES, smokingstatus %in% s), "wunifrac"
  )
  permanova <- vegan::adonis2(
      formula = distwu_2cat ~ smokingstatus +  OHQ_3 + RACE + GENDER + AGEGRP4C + SR_ACTIVE + EDU3CAT + DBTS_NEW,
      data = metadata_2cat
  )
  permanova.res %<>% bind_rows(
    data.frame(
      contrast = stringr::str_c(s, collapse = " vs "),
      r2 = permanova[1, 'R2'],
      pvalue = permanova[1, 'Pr(>F)']
    )
  ) %>% arrange(pvalue)
}
knitr::kable(permanova.res)
s <- c("Cigarette", "Never smoker")
metadata <- as(sample_data(NYC_HANES), 'data.frame')
metadata_2cat <- metadata[metadata$smokingstatus %in% s, ]
distwu_2cat <-
  phyloseq::distance(subset_samples(NYC_HANES, smokingstatus %in% s), "wunifrac")
permanova <-
  vegan::adonis2(data = metadata_2cat, distwu_2cat ~ COTININE)
permanova

Three categories

metadata_3cat <-
  metadata[metadata$smokingstatus %in% c("Cigarette", "Never smoker", "Former smoker"), ]
distwu_3cat <-
  phyloseq::distance(subset_samples(
    NYC_HANES,
    smokingstatus %in% c("Cigarette", "Never smoker", "Former smoker")
  ), "wunifrac")
vegan::adonis2(
  distwu_3cat ~ smokingstatus + RACE + GENDER + AGEGRP4C + SR_ACTIVE + EDU3CAT + DBTS_NEW,
  metadata_3cat
)

All categories

vegan::adonis2(distwu ~ smokingstatus, metadata)

Differential analysis

DESeq2

DESeq2 is presented as sensitivity analysis. Main results are calculated by edgeR.

Current smokers vs. never smokers crude

threshold <- 0.05
dds <-
  DESeq2::DESeq(phyloseq_to_deseq2(NYC_HANES, design = ~ smokingstatus),
                parallel = TRUE)

## here "Current smoker" is the numerator, "Never smoker" is the denominator in fold-change calculation
res.crude <-
  DESeq2::results(dds, contrast = c("smokingstatus", "Cigarette", "Never smoker"))
res.filtered <- res.crude[!is.na(res.crude$padj), ]
res.filtered <- res.filtered[res.filtered$padj < threshold, ]

## Create a table with the taxonomy of the significant OTUs
res.tax <-
  cbind(as.data.frame(res.filtered), as.matrix(tax_table(NYC_HANES)[rownames(res.filtered), ]))
res.maxfold <-
  tapply(res.tax$log2FoldChange, res.tax$Genus, function(x)
    max(x))

res.maxfold <- sort(res.maxfold, T)
# res.tax["New.ReferenceOTU179","Genus"] <- res.tax["New.ReferenceOTU179","Phylum"] %>% as.character
res.tax$Genus <-
  factor(as.character(res.tax$Genus), levels = names(res.maxfold))
res.tax$type <- "Current/Never"
res.tax <- res.tax[order(res.tax$log2FoldChange), ] 
plot(
  ggplot(right_join(biosis, res.tax, by = c("X1" = "Genus")), aes(log2FoldChange, X1)) +
    geom_vline(
      xintercept = 0.0,
      color = "gray",
      size = 0.5
    ) +
    geom_point(size = 3, aes(color = X2)) +
    facet_grid(
      Phylum ~ .,
      scales = "free",
      space = "free_y",
      switch = "y",
      as.table = TRUE
    ) +
    theme_bw() +
    scale_y_discrete(position = "right") +
    scale_color_discrete(name = "Oxygen requirement") +
    # guides(color=FALSE) +
    theme(
      axis.text = element_text(
        hjust = 0,
        vjust = 0.5,
        size = 11
      ),
      strip.text = element_text(size = 9),
      axis.title = element_text(size = 9),
      axis.title.y = element_blank(),
      strip.background = element_rect(colour = "transparent", fill = "transparent"),
      strip.text.y = element_text(angle = 180)
    )
)

Current smokers vs. never smokers adjusting for confounding

## Counfounders added in the design's formula
dds.conf <-
  DESeq2::DESeq(
    phyloseq_to_deseq2(
      NYC_HANES,
      design = ~ AGEGRP4C + GENDER + RACE + SR_ACTIVE + EDU3CAT + DBTS_NEW + smokingstatus
    ) ,
    parallel = TRUE,
    quiet = TRUE
  )
##Use two out of three categories in the contrast variable
res.conf <-
  DESeq2::results(
    dds.conf,
    contrast = c("smokingstatus", "Cigarette", "Never smoker"),
    pAdjustMethod = "BH"
  )
res.filtered.conf <- res.conf[!is.na(res.conf$padj), ]
res.filtered.conf <-
  res.filtered.conf[res.filtered.conf$padj < threshold, ]

#Create result table with DA OTUs
tt <- tax_table(NYC_HANES)[rownames(res.filtered.conf), ]
if (all.equal(rownames(res.filtered.conf), rownames(tt))) {
  res.tax.conf <-
    cbind(as.data.frame(res.filtered.conf), as.matrix(tt))
} else{
  stop("rowname mismatch for res.tax.conf")
}
res.tax.conf <- res.tax.conf[!is.na(res.tax.conf$Family), ]
res.maxfold.conf <-
  tapply(res.tax.conf$log2FoldChange, res.tax.conf$Genus, function(x)
    max(x))
res.maxfold.conf <- sort(res.maxfold.conf, T)
res.tax.conf$Genus <- as.character(res.tax.conf$Genus)
res.tax.conf["New.ReferenceOTU148", "Genus"] <- "Neisseriaceae"
res.tax.conf["New.ReferenceOTU137", "Genus"] <- "Neisseriaceae"
res.tax.conf["New.ReferenceOTU192", "Genus"] <- "Pasteurellaceae"

res.tax.conf$Genus <-
  factor(as.character(res.tax.conf$Genus), levels = names(res.maxfold.conf))
res.tax.conf$type <- "Cigarette/Never + conf"
plot(
  ggplot(
    right_join(biosis, res.tax.conf, by = c("X1" = "Genus")),
    aes(log2FoldChange, X1)
  ) +
    geom_vline(
      xintercept = 0.0,
      color = "gray",
      size = 0.5
    ) +
    geom_point(size = 3, aes(color = X2)) +
    facet_grid(
      Phylum ~ .,
      scales = "free",
      space = "free_y",
      switch = "y",
      as.table = TRUE
    ) +
    theme_bw() +
    # guides(color=FALSE) +
    facet_grid(
      Phylum ~ .,
      scales = "free",
      space = "free_y",
      switch = "y",
      as.table = TRUE
    ) +
    scale_y_discrete(position = "right") +
    scale_color_discrete(name = "Oxygen requirement") +
    # guides(color=FALSE) +
    theme(
      axis.text = element_text(
        hjust = 0,
        vjust = 0.5,
        size = 11
      ),
      strip.text = element_text(size = 9),
      axis.title = element_text(size = 9),
      axis.title.y = element_blank(),
      strip.background = element_rect(colour = "transparent", fill = "transparent"),
      strip.text.y = element_text(angle = 180)
    )
)

Plot crude vs. adjusted from DESeq2

#Take log2fc from crude and adjusted analysis. The pvalue and FDR are from the crude analysis
stopifnot(all.equal(rownames(res.crude), rownames(res.conf)))
compardat <- data.frame(
  crude = res.crude$log2FoldChange,
  pvalue = res.crude$pvalue,
  padj = res.crude$padj,
  adjusted = res.conf$log2FoldChange
)
#Set pvalues with NAs at 1
compardat$padj[is.na(compardat$padj)] <- 1
ggplot(compardat) +
  geom_point(aes(crude, adjusted, color = padj < 0.05), size = 2) +
  geom_abline() +
  scale_colour_manual(values = setNames(c('black', 'grey'), c(TRUE, FALSE)), guide =
                        FALSE) +
  #  scale_x_continuous(limits = c(-2,2))+
  #  scale_y_continuous(limits = c(-2,2))+
  theme_bw() +
  theme(
    axis.text = element_text(
      hjust = 0,
      vjust = 0.5,
      size = 17
    ),
    strip.text = element_text(size = 13),
    axis.title = element_text(size = 15)
  )

Secondhand vs Cigarette/Never

#Extract all samples marked as SHS
NYC_HANEScot <-
  subset_samples(NYC_HANES, smokingstatus %in% c("Secondhand"))
#NYC_HANEScot <- subset_samples(NYC_HANEScot, smokingstatus %in% c("Secondhand") & COTININE < 3)
#Build DESeq2 object using continuous cotinine variable
ddscot <-
  DESeq2::DESeq(phyloseq_to_deseq2(NYC_HANEScot, design = ~ COTININE),
                parallel = TRUE)
rescot.crude <- DESeq2::results(ddscot, alpha = 0.05)

#Select OTUs significant in the crude smokers vs non-smokers model.
#Correlation is calculated by taking the log2FC from the crude smokers vs non-smokers model and the crude cotinine model
(
  corr <- cor.test(
    res.crude$log2FoldChange[res.crude$padj < 0.05],
    rescot.crude$log2FoldChange[res.crude$padj < 0.05],
    use = "pairwise.complete.obs"
  )
)

if (identical(all.equal(rownames(rescot.crude), rownames(res.crude)), TRUE)) {
  cor.data <-
    data.frame(smoks = res.crude$log2FoldChange[res.crude$padj < 0.05],
               cot = rescot.crude$log2FoldChange[res.crude$padj < 0.05])
} else{
  stop("mismatch in cotinine/smoking rownames")
}
plot(
  ggplot(cor.data, aes(smoks, cot)) +
    theme_bw() +
    xlab("Crude Smokers vs. Non-smokers") +
    ylab("Crude Cotinine Levels") +
    geom_text(aes(
      0.5,
      0.25,
      label = stringr::str_c(
        'p-value: ',
        signif(corr$p.value, 2),
        '\nCorr: ',
        signif(corr$estimate, 2)
      )
    ), color = "black") +
    geom_point(size = 4, color = '#e41a1c') +
    geom_smooth(method = lm, se = FALSE, color = "red") +
    #  scale_x_continuous(limits = c(-1.7,1.7)) +
    #  scale_y_continuous(limits = c(-0.17,0.3)) +
    theme(
      axis.text = element_text(
        hjust = 0,
        vjust = 0.5,
        size = 13
      ),
      strip.text = element_text(size = 13),
      axis.title = element_text(size = 15)
    )
)

edgeR

Cigarette smoker vs Never smoker: crude

#Set Never smokers as first level
sample_data(NYC_HANES)$smokingstatus <-
  relevel(sample_data(NYC_HANES)$smokingstatus, "Never smoker")
dasmoking.crude <-
  get_edgeR_results( ~ smokingstatus,
                     pseq=NYC_HANES,
                     coef = 2,
                     alph = 0.05,
                     method = "BH")
dasmoking.crude$table <-
  dasmoking.crude$table[sort(dasmoking.crude$table %>% rownames), ]
dasmoking.crude$table$Genus <-
  as.character(dasmoking.crude$table$Genus)
dasmoking.crude$table[c("New.CleanUp.ReferenceOTU8965", "New.ReferenceOTU179"), 'Genus'] <-
  'uncultured'
dasmoking.crude$table['New.ReferenceOTU72', 'Genus'] <-
  'Family XIII'
dasmoking.crude$table$Genus[is.na(dasmoking.crude$table$Genus)] <-
  'unclassified'
ord <-
  order(dasmoking.crude$table[['Phylum']],
        dasmoking.crude$table$logFC,
        decreasing = TRUE)
dasmoking.crude$table$Genus <-
  factor(dasmoking.crude$table$Genus,
         levels = unique(dasmoking.crude$table$Genus[ord]))
phyla_colors <-
  c(
    '#00B2FFFF',
    '#FF9900FF',
    '#FF0000FF',
    '#0DFF00FF',
    '#00FF9FFF',
    '#A600FFFF',
    '#B9FF00FF',
    '#FF00ACFF',
    '#0006FFFF',
    '#ababab'
  )


p <-
  ggplot2::ggplot(dasmoking.crude$table,
                  ggplot2::aes(x = logFC, y = Genus, color = Phylum)) +
  ggplot2::geom_vline(xintercept = 0.0,
                      color = "gray",
                      size = 0.5) +
  ggplot2::geom_point(size = 3) +
  ggplot2::guides(color = ggplot2::guide_legend(title = 'Phylum')) +
  ggplot2::theme_light() +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(
      angle = -90,
      hjust = 0,
      vjust = 0.5
    ),
    axis.text.y = ggplot2::element_text(face = "italic"),
    axis.text = ggplot2::element_text(size = 12),
    legend.text = ggplot2::element_text(size = 12),
    legend.box.margin = margin(-27, 0, -27, -27),
    legend.title = ggplot2::element_text(size = 14),
    title = ggplot2::element_text(size = 15),
    plot.title = ggplot2::element_text(margin = ggplot2::margin(b =
                                                                  26))
  ) +
  ggplot2::scale_y_discrete(position = "right") +
  ggplot2::labs(x = "Log2-fold change") +
  ggplot2::xlim(-3.1, 3.1) +
  ggplot2::ggtitle('Cigarette vs. Never smoker') +
  ggplot2::scale_color_manual(values = phyla_colors)

p1 <- p +
  ggplot2::annotation_custom(
    grob = grid::textGrob('Lower in smokers'),
    xmin = -1.8,
    xmax = -1.8,
    ymin = 29.7,
    ymax = 29.7
  ) +
  ggplot2::annotation_custom(
    grob = grid::textGrob('Higher in smokers'),
    xmin = 1.8,
    xmax = 1.8,
    ymin = 29.7,
    ymax = 29.7
  )

gg_table <- ggplot_gtable(ggplot_build(p1))
gg_table$layout$clip[gg_table$layout$name == "panel"] <- "off"
grid::grid.draw(gg_table)

Cigarette smoker vs Never smoker: crude phylum level

sample_data(NYC_HANES.phylum)$smokingstatus <-
  relevel(sample_data(NYC_HANES.phylum)$smokingstatus, "Never smoker")
DGELRT <-
  get_edgeR_results(
    ~ smokingstatus,
    pseq = NYC_HANES.phylum,
    coef = 2,
    alph = 0.05,
    method = "BH"
  )

p <-
  ggplot2::ggplot(DGELRT$table, ggplot2::aes(x = logFC, y = Phylum, color =
                                               Phylum)) +
  ggplot2::geom_vline(xintercept = 0.0,
                      color = "gray",
                      size = 0.5) +
  ggplot2::geom_point(size = 3) +
  ggplot2::guides(color = ggplot2::guide_legend(title = 'Phylum')) +
  ggplot2::theme_light() +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(
      angle = -90,
      hjust = 0,
      vjust = 0.5
    ),
    axis.text.y = ggplot2::element_text(face = "italic"),
    axis.text = ggplot2::element_text(size = 12),
    legend.text = ggplot2::element_text(size = 12),
    legend.box.margin = margin(-27, 0, -27, -27),
    legend.title = ggplot2::element_text(size = 14),
    title = ggplot2::element_text(size = 15),
    plot.title = ggplot2::element_text(margin = ggplot2::margin(b =
                                                                  26))
  ) +
  ggplot2::scale_y_discrete(position = "right") +
  ggplot2::labs(x = "Log2-fold change") +
  ggplot2::xlim(-3.1, 3.1) +
  ggplot2::ggtitle('Cigarette vs. Never smoker') +
  ggplot2::scale_color_manual(values = phyla_colors)

p1 <- p +
  ggplot2::annotation_custom(
    grob = grid::textGrob('Lower in smokers', gp = grid::gpar(col = "black", fontsize =
                                                                11)),
    xmin = -1.8,
    xmax = -1.8,
    ymin = 5.8,
    ymax = 5.8
  ) +
  ggplot2::annotation_custom(
    grob = grid::textGrob('Higher in smokers', gp = grid::gpar(col = "black", fontsize =
                                                                 11)),
    xmin = 1.8,
    xmax = 1.8,
    ymin = 5.8,
    ymax = 5.8
  )


gg_table <- ggplot_gtable(ggplot_build(p1))
gg_table$layout$clip[gg_table$layout$name == "panel"] <- "off"
grid::grid.draw(gg_table)

Cigarette smoker vs Never smoker: Adjusted

#Adjusted model with race, sex, age, physical activity, education level, diabetes status (based on serum HbA1c) and self-reported gum disease
da.adj <-
  get_edgeR_results(
    ~ smokingstatus + AGEGRP4C + GENDER + RACE + SR_ACTIVE + EDU3CAT + DBTS_NEW,
    coef = 2,
    alph = 0.05
  )
da.adj <- da.adj[sort(da.adj %>% rownames), ]
da.adj$table$Genus <- as.character(da.adj$table$Genus)
da.adj$table['New.ReferenceOTU72', 'Genus'] <- 'Family XIII'
da.adj$table$Genus[is.na(da.adj$table$Genus)] <- 'unclassified'
ord <-
  order(da.adj$table[['Phylum']], da.adj$table$logFC, decreasing = TRUE)
da.adj$table$Genus <-
  factor(da.adj$table$Genus, levels = unique(da.adj$table$Genus[ord]))
phyla_colors <-
  c('#FF9900FF',
    '#0DFF00FF',
    '#00FF9FFF',
    '#A600FFFF',
    '#FF00ACFF')


p <-
  ggplot2::ggplot(da.adj$table, ggplot2::aes(x = logFC, y = Genus, color =
                                               Phylum)) +
  ggplot2::geom_vline(xintercept = 0.0,
                      color = "gray",
                      size = 0.5) +
  ggplot2::geom_point(size = 3) +
  ggplot2::guides(color = ggplot2::guide_legend(title = 'Phylum')) +
  ggplot2::theme_light() +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(
      angle = -90,
      hjust = 0,
      vjust = 0.5
    ),
    # axis.title.x = ggplot2::element_text(margin = ggplot2::margin(t=25)),
    axis.text.y = ggplot2::element_text(face = "italic"),
    axis.text = ggplot2::element_text(size = 12),
    legend.text = ggplot2::element_text(size = 12),
    legend.title = ggplot2::element_text(size = 14),
    title = ggplot2::element_text(size = 15),
    plot.title = ggplot2::element_text(margin = ggplot2::margin(b =
                                                                  26))
  ) +
  ggplot2::scale_y_discrete(position = "right") +
  ggplot2::xlim(-2.5, 2.5) +
  ggplot2::labs(x = "Log2-fold change") +
  ggplot2::ggtitle('Cigarette vs. Never smoker') +
  ggplot2::scale_color_manual(values = phyla_colors)
p

p1 <- p +
  ggplot2::annotation_custom(
    grob = grid::textGrob('Lower in smokers'),
    xmin = -1.4,
    xmax = -1.4,
    ymin = 17.2,
    ymax = 17.2
  ) +
  ggplot2::annotation_custom(
    grob = grid::textGrob('Higher in smokers'),
    xmin = 1.4,
    xmax = 1.4,
    ymin = 17.2,
    ymax = 17.2
  )

gg_table <- ggplot_gtable(ggplot_build(p1))
gg_table$layout$clip[gg_table$layout$name == "panel"] <- "off"
grid::grid.draw(gg_table)

Plot crude vs adjusted beta coefficients

#Get crude edgeR results without filtering for FDR
dasmoking.crude <-
  get_edgeR_results( ~ smokingstatus,
                     coef = 2,
                     alph = 2,
                     method = "BH")
dasmoking.crude$table <-
  dasmoking.crude$table[sort(dasmoking.crude$table %>% rownames), ]
#Get adjusted edgeR results without filtering for FDR
da.adj <-
  get_edgeR_results(
    ~ smokingstatus + RACE + GENDER + AGEGRP4C + SR_ACTIVE + EDU3CAT + DBTS_NEW + OHQ_3,
    coef = 2,
    alph = 2
  )
da.adj <- da.adj[sort(da.adj %>% rownames), ]

# Build result table by taking log2FC from crude and adjusted models and FDR from the crude one.
if (identical(all.equal(rownames(da.adj), rownames(dasmoking.crude)), TRUE)) {
  compardat <- data.frame(
    crude = dasmoking.crude$table$logFC,
    padj = dasmoking.crude$table$FDR,
    adjusted = da.adj$table$logFC
  )
} else{
  stop("rownames mismatch between da.adj and dasmoking.crude")
}
plot(
  ggplot(compardat) +
    geom_abline(
      color = "black",
      alpha = 0.5,
      size = 1
    ) +
    geom_point(aes(crude, adjusted, alpha = padj < 0.05), size = 2) +
    scale_colour_manual(values = setNames(c('grey', 'black'), c(FALSE, TRUE)), guide =
                          FALSE) +
    scale_x_continuous(limits = c(-2, 2)) +
    scale_y_continuous(limits = c(-2, 2)) +
    xlab("Crude log2 fold change") +
    ylab("Adjusted log2 fold change") +
    theme_bw() +
    theme(
      axis.text = element_text(
        hjust = 0,
        vjust = 0.5,
        size = 15
      ),
      strip.text = element_text(size = 13),
      axis.title = element_text(size = 14),
      legend.position = 'bottom'
    )
)

Secondhand vs Cigarette/Never: only significant OTUs

A function for merging smoking and cotinine edgeR results:

mergeResults <- function(obj.smoking, obj.cotinine) {
  intersect.filters <-
    intersect(rownames(obj.smoking), rownames(obj.cotinine))
  intersect.filters <- sort(intersect.filters)
  obj.smoking <- obj.smoking[intersect.filters, ]
  obj.cotinine <- obj.cotinine[intersect.filters, ]
  stopifnot(all.equal(rownames(obj.smoking), rownames(obj.cotinine)))
  ##
  res <-
    data.frame(
      logFC_smoking = obj.smoking$table$logFC,
      logFC_cotinine = obj.cotinine$table$logFC,
      FDR_smoking = obj.smoking$table$FDR,
      FDR_cotinine = obj.cotinine$table$FDR,
      OTU = rownames(obj.smoking$table)
    )
  res <- cbind(res, obj.smoking[, c("Phylum", "Class", "Order", "Family", "Genus", "Species")])
  return(res)
}

And a plotting function:

plotSmokingcotinineComparison <-
  function(obj.smoking,
           obj.cotinine,
           labtext = "Crude",
           FDRcutoff = 1) {
    dataset <- mergeResults(obj.smoking, obj.cotinine)
    dataset <- dataset[dataset$FDR_smoking < FDRcutoff,]
    corr.result <- with(dataset,
                        cor.test(logFC_smoking, logFC_cotinine))
    p <- ggplot(dataset) +
      theme_bw() +
      xlab(paste(labtext, "Smokers vs. Non-smokers")) +
      ylab(paste(labtext, "Cotinine Levels (ng/ml)")) +
      geom_text(aes(1,-0.3, label = sprintf(
        "p-value: %s\ncor: %s \nFDR<0.05: %s",
        signif(corr.result$p.value, 2),
        signif(corr.result$estimate, 2),
        sum(dataset$FDR_smoking < 0.05)
      )), size = 5) +
      geom_point(aes(logFC_smoking, logFC_cotinine, color = FDR_smoking <= 0.05),
                 size = 3) +
      scale_colour_manual(values = setNames(c('black', 'grey'), c(TRUE, FALSE)), guide =
                            FALSE) +
      geom_smooth(
        aes(logFC_smoking, logFC_cotinine),
        method = lm,
        se = FALSE,
        color = "red"
      ) +
      theme(
        axis.text = element_text(
          hjust = 0,
          vjust = 0.5,
          size = 13
        ),
        strip.text = element_text(size = 13),
        axis.title = element_text(size = 15)
      )
    return(p)
  }

Two thresholds for serum cotinine:

NYC_HANES.secondhand.original <- subset_samples(NYC_HANES, smokingstatus %in% c("Secondhand"))
NYC_HANES.secondhand.strict <- subset_samples(NYC_HANES.secondhand.original, COTININE < 10)
sort(sample_data(NYC_HANES.secondhand.original)$COTININE)
summary(sample_data(NYC_HANES.secondhand.original)$COTININE > 10)
#Get crude edgeR results for model built with continuous cotinin without filtering for FDR 
dacot.crude <- get_edgeR_results(~ COTININE, 
                                 pseq = NYC_HANES.secondhand.original, 
                                 filtering=TRUE, # pre-filtering
                                 alph = 2, #don't apply alpha cutoff
                                 method = "BH")
#Get crude edgeR results using strict definition of second-hand smokers
dacot.crude.strict <- get_edgeR_results(~ COTININE, 
                                 pseq = NYC_HANES.secondhand.strict, 
                                 filtering=TRUE, # pre-filtering
                                 alph = 2, #don't apply alpha cutoff
                                 method = "BH")
#Get crude edgeR results for model smokers vs non-smokers without filtering
dasmoking.crude <- get_edgeR_results(~ smokingstatus, 
                                     pseq=NYC_HANES,
                                     filtering=TRUE, # pre-filtering
                                     alph = 2, #don't apply alpha cutoff
                                     coef = 2, #use non-smokers as reference
                                     method = "BH")
edger.smoker_secondhand_crude <- mergeResults(dasmoking.crude, dacot.crude)
edger.smoker_secondhand_crude.strict <- mergeResults(dasmoking.crude, dacot.crude.strict)

Number of OTUs with FDR < 0.05 and total that passed non-specific screens in both analysis:

summary(edger.smoker_secondhand_crude$FDR_smoking < 0.05)
# note that the different total number for strict cotinine cutoff is 
# due to non-specific filtering in edgeR
summary(edger.smoker_secondhand_crude.strict$FDR_smoking < 0.05) 
corarray <- array(dim=c(2, 2, 2), dimnames=list(secondhand=c("cot14", "cot10"),
                                                counfounding=c("crude", "adjusted"),
                                                FDRcutoff=c("no", "yes")))
parray <- array(dim=c(2, 2, 2), dimnames=list(secondhand=c("cot14", "cot10"),
                                                counfounding=c("crude", "adjusted"),
                                                FDRcutoff=c("no", "yes")))

Correlation between crude smoking and cotinine coefficients, all OTUs:

corr <- with(edger.smoker_secondhand_crude, 
     cor.test(logFC_smoking, logFC_cotinine))
parray["cot14", "crude", "no"] <- corr$p.value
corarray["cot14", "crude", "no"] <- corr$estimate

Strict definition of second-hand smoke:

corr <- with(edger.smoker_secondhand_crude.strict, 
     cor.test(logFC_smoking, logFC_cotinine))
parray["cot10", "crude", "no"] <- corr$p.value
corarray["cot10", "crude", "no"] <- corr$estimate

Correlation between crude smoking and cotinine coefficients, FDR < 0.05 in smoking:

corr <- with(filter(edger.smoker_secondhand_crude, FDR_smoking < 0.05),
  cor.test(logFC_smoking, logFC_cotinine))
corarray["cot10", "crude", "yes"] <- corr$p.value
parray["cot10", "crude", "yes"] <- corr$estimate

All OTUs, black for FDR < 0.05 on smoking coefficients, grey for FDR >= 0.05

p <-
  plotSmokingcotinineComparison(dasmoking.crude,
                                dacot.crude,
                                labtext = "Crude",
                                FDR = 1)
p

Repeat, using the strict definition of second-hand smokers (cotinine < 10)

p <-
  plotSmokingcotinineComparison(dasmoking.crude,
                                dacot.crude.strict,
                                labtext = "Crude",
                                FDR = 1)
p

Adjusted Secondhand vs Adjusted Cigarette/Never: only significant OTUs

dacot.adjusted <- get_edgeR_results(~ COTININE + AGEGRP4C + EDU3CAT,
    pseq = NYC_HANES.secondhand.original,
    alph = 2,
    filtering = TRUE,
    method = "BH"
  )

dacot.adjusted.strict <- get_edgeR_results(~ COTININE + AGEGRP4C + EDU3CAT,
    pseq = NYC_HANES.secondhand.strict,
    alph = 2,
    filtering = TRUE,
    method = "BH"
  )

dasmoking.adjusted <- get_edgeR_results(~ smokingstatus + AGEGRP4C + EDU3CAT,
    coef = 2,
    alph = 2,
    filtering = TRUE
  )

edger.smoker_secondhand_adjusted <- mergeResults(dasmoking.adjusted, dacot.adjusted)
edger.smoker_secondhand_adjusted.strict <- mergeResults(dasmoking.adjusted, dacot.adjusted.strict)

Number of differentially abundant OTUs, and total

nrow(edger.smoker_secondhand_adjusted)
summary(edger.smoker_secondhand_adjusted$logFC_smoking < 0.05)
summary(edger.smoker_secondhand_adjusted$logFC_cotinine < 0.05)

Correlation between adjusted smoking and cotinine coefficients, all OTUs:

corr <- with(edger.smoker_secondhand_adjusted, 
     cor.test(logFC_smoking, logFC_cotinine))
parray["cot14", "adjusted", "no"] <- corr$p.value
corarray["cot14", "adjusted", "no"] <- corr$estimate

As above, using strict serum cotinine cutoff (<10)

corr <- with(edger.smoker_secondhand_adjusted.strict, 
     cor.test(logFC_smoking, logFC_cotinine))
parray["cot10", "adjusted", "no"] <- corr$p.value
corarray["cot10", "adjusted", "no"] <- corr$estimate

Correlation between adjusted smoking and cotinine coefficients, FDR < 0.05 in smoking:

with(filter(edger.smoker_secondhand_adjusted, FDR_smoking < 0.05),
  cor.test(logFC_smoking, logFC_cotinine))
parray["cot14", "adjusted", "yes"] <- corr$p.value
corarray["cot14", "adjusted", "yes"] <- corr$estimate

Sensitivity analysis:

round(corarray, 2)
signif(parray, 1)
p <-
  plotSmokingcotinineComparison(
    obj.smoking = dasmoking.adjusted,
    obj.cotinine = dacot.adjusted,
    labtext = "Adjusted",
    FDRcutoff = 1
  )
p

Repeat, using the strict definition of second-hand smokers (cotinine < 10)

p <-
  plotSmokingcotinineComparison(
    obj.smoking = dasmoking.adjusted,
    obj.cotinine = dacot.adjusted.strict,
    labtext = "Adjusted",
    FDRcutoff = 1
  )
p

Write supplemental file

stopifnot(all.equal(rownames(edger.smoker_secondhand_adjusted), 
                    rownames(edger.smoker_secondhand_crude)))
crude <- edger.smoker_secondhand_crude[, 1:4]
adjusted <- edger.smoker_secondhand_adjusted[, 1:4]
colnames(crude) <- paste0(colnames(crude), "_crude")
colnames(adjusted) <- paste0(colnames(adjusted), "_adjusted")
sfile <- cbind(edger.smoker_secondhand_crude[, -1:-4], crude, adjusted)
readr::write_excel_csv(sfile, path="SupplementalFile1.csv")

Alternative smokers vs Never smoker: crude

sample_data(NYC_HANES)$smokingstatus <- relevel(sample_data(NYC_HANES)$smokingstatus,"Never smoker")
plot_edgeR(~ smokingstatus, pseq = NYC_HANES, coef=4, alph = 0.05, filtering = TRUE, method = "BH", color = "Phylum", sortby = "Phylum",invisbl = FALSE)

E-cigarette vs never smokers

sample_data(NYC_HANES)$E_CIGARETTES <- factor(sample_data(NYC_HANES)$E_CIGARETTES,labels = c('Yes','No',NaN))
plot_edgeR(~ E_CIGARETTES, pseq = NYC_HANES, alph = 0.05, filtering = TRUE, method = "BH", sortby = "Phylum") -> edger.univ

Hookah pipe vs never smokers

sample_data(NYC_HANES)$HOOKAH_PIPE <- factor(sample_data(NYC_HANES)$HOOKAH_PIPE, labels = c('Yes','No',NaN))
plot_edgeR(~ HOOKAH_PIPE, pseq = NYC_HANES, alph = 0.05, filtering = TRUE, method = "BH", sortby = "Phylum", color = 'Phylum',invisbl = FALSE)

Cigars and cigarillos vs never smokers

sample_data(NYC_HANES)$CIGARS_CIGARILLOS <- factor(sample_data(NYC_HANES)$CIGARS_CIGARILLOS,labels = c('Yes','No',NaN))
plot_edgeR(~ CIGARS_CIGARILLOS, pseq = NYC_HANES, alph = 0.05, filtering = TRUE, method = "BH", sortby = "Phylum", invisbl = FALSE)

Analysis on biosis of bacteria

Odds ratio smokers vs never smokers

R.utils::gcDLLs()
dasmoking.crude <- get_edgeR_results(~ smokingstatus, coef = 2, alph = 2, filtering = FALSE, method = "BH")
dasmoking.crude$table %<>% left_join(biosis, by = c("Genus"="X1"))
dasmoking.crude <- dasmoking.crude$table
dasmoking.crude$direction <- factor(ifelse(dasmoking.crude$logFC >0, 'Enriched in smokers', 'Depleted in smokers'))
dasmoking.crude$direction <- factor(dasmoking.crude$direction, levels(dasmoking.crude$direction)[c(2,1)])

direction <- c('enriched in','depleted in;')

x<- table(dasmoking.crude$X2 == 'Aerobic', dasmoking.crude$direction)[c(2,1),]
rownames(x) <- c('OTU is aerobic', 'OTU is not aerobic')
epitools::epitab(x)

x<- table(dasmoking.crude$X2 == 'Anaerobic', dasmoking.crude$direction)[c(2,1),]
rownames(x) <- c('OTU is anaerobic', 'OTU is not anaerobic')
epitools::epitab(x)

x<- table(dasmoking.crude$X2 == 'F Anaerobic', dasmoking.crude$direction)[c(2,1),]
rownames(x) <- c('OTU is F Anaerobic', 'OTU is not F Anaerobic')
epitools::epitab(x)

Cigarette smokers vs Never smokers

GSEA

full_eset <- ExpressionSet(
  assayData = microbiome::abundances(NYC_HANES),
  phenoData = new("AnnotatedDataFrame", sample_data(NYC_HANES))
)
full_eset$CATCOTININE <- ifelse(full_eset$COTININE < 3,'low','high')
full_annotated_ds <- tax_table(NYC_HANES) %>% as.data.frame %>% bind_cols(data.frame(OTU=rownames(tax_table(NYC_HANES)))) %>% filter(Domain != "Unassigned") %>% left_join(biosis, by = c("Genus"="X1"))

alt_full_eset <- full_eset[,(full_eset$smokingstatus %in% c("Never smoker","Alternative smoker"))]
alt_full_eset$smokingstatus <- droplevels(alt_full_eset$smokingstatus)
levels(alt_full_eset$smokingstatus) <- c('Never','Alternative')
alt_full_eset$GROUP <- ifelse(alt_full_eset$smokingstatus == "Alternative",1,0)

EnrichmentBrowser::configEBrowser("OUTDIR.DEFAULT","./results")
sns.eset <- full_eset[,full_eset$smokingstatus %in%  c("Cigarette","Never smoker")]
NYC_HANES.sns <- NYC_HANES %>% subset_samples(., smokingstatus %in% c("Cigarette","Never smoker"))
sns.eset$GROUP <- ifelse(sns.eset$smokingstatus == "Cigarette",1,0)
de.sns_eset <- EnrichmentBrowser::deAna(sns.eset, de.method = 'edgeR', filter.by.expr = FALSE)

design = model.matrix(~smokingstatus, data=data.frame(sample_data(NYC_HANES.sns)))

otu_table.voom <- limma::voom(SummarizedExperiment::assay(de.sns_eset), design = design, plot = FALSE)
SummarizedExperiment::assay(de.sns_eset) <- otu_table.voom$E
class(SummarizedExperiment::assay(de.sns_eset)) <- "matrix"

sns_ds <- full_annotated_ds %>% filter(OTU %in% taxa_names(NYC_HANES.sns))
sns_sAero <- (sns_ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
sns_sAnae <- (sns_ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
sns_sFana <- (sns_ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
sns_my.gs <- list(aero=sns_sAero, anae=sns_sAnae, fana=sns_sFana)

sns.gsea.res <- EnrichmentBrowser::sbea("gsea", de.sns_eset, sns_my.gs)
EnrichmentBrowser::gsRanking(sns.gsea.res, signif.only=FALSE)
expr = SummarizedExperiment::assay(de.sns_eset)
expr = expr[SummarizedExperiment::rowData(de.sns_eset)$ADJ.PVAL <= 0.05,]
grp = factor(de.sns_eset$GROUP, labels = c('Never smoker','Cigarette'))

expr <- t(scale(t(expr)))
# expr <- expr[rownames(expr) %in% unlist(sns.gsea.res$gs$aero), ]
expr <- expr[rownames(expr) %in% unlist(sns.gsea.res$gs), ]

coll <- c("#B62A84", "#2AB68C")
names(coll) <- levels(grp)
coll <- list(Group = coll)
df <- data.frame(Group = grp)
ha <- ComplexHeatmap::HeatmapAnnotation(df = df, col=coll)

rownames(expr) <- paste(data.frame(tax_table(NYC_HANES)[rownames(expr),])$Genus, '')

ComplexHeatmap::Heatmap(expr, name = "Expression", 
                        top_annotation = ha,
                        show_row_names = (nrow(expr) < 41),
                        show_column_names = (ncol(expr) < 41),
                        column_title = "Samples", 
                        row_title = "OTUs",
                        cluster_rows = TRUE
                        # row_order = unlist(gsea.res$gs)
                       )

ORA

sns_ora.res <- EnrichmentBrowser::sbea("ora", de.sns_eset, sns_my.gs, perm=1000)
EnrichmentBrowser::gsRanking(sns_ora.res, signif.only=FALSE)

Secondhand smokers

GSEA

NYC_HANES.sh <- NYC_HANES %>% subset_samples(., smokingstatus %in% c("Secondhand"))
sh_eset <-  full_eset[,full_eset$smokingstatus %in%  c("Secondhand")]

cotinine_quantiles <- quantile(sh_eset$COTININE ,c(0.33,0.66))
sh_eset$GROUP <- ifelse(sh_eset$COTININE < cotinine_quantiles[[1]], 0,ifelse(sh_eset$COTININE > cotinine_quantiles[[2]],1,NA))

subeset <- sh_eset[,!is.na(sh_eset$GROUP)]
sh_design = model.matrix(~CATCOTININE, data = data.frame(pData(subeset)))
de.sh_eset <- EnrichmentBrowser::deAna(subeset, de.method = 'edgeR', filter.by.expr = FALSE)

otu_table.voom <- limma::voom(SummarizedExperiment::assay(de.sh_eset), design = sh_design, plot = FALSE)
SummarizedExperiment::assay(de.sh_eset) <- otu_table.voom$E
class(SummarizedExperiment::assay(de.sh_eset)) <- "matrix"

sh_ds <- full_annotated_ds %>% dplyr::filter(OTU %in% taxa_names(NYC_HANES.sh))
sh_sAero <- (sh_ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
sh_sAnae <- (sh_ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
sh_sFana <- (sh_ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
sh_my.gs <- list(aero=sh_sAero, anae=sh_sAnae, fana=sh_sFana)

sh_gsea.res <- EnrichmentBrowser::sbea("gsea", de.sh_eset, sh_my.gs)
EnrichmentBrowser::gsRanking(sh_gsea.res, signif.only=FALSE)

ORA

sh_ora.res <- EnrichmentBrowser::sbea("ora", de.sh_eset, sh_my.gs)
EnrichmentBrowser::gsRanking(sh_ora.res, signif.only=FALSE)

Continuous cotinine

GSVA continuous cotinine on secondhand smokers

sh.gsva <- GSVA::gsva(sh_eset, sh_my.gs, parallel.sz = 1)
sh_fit <- limma::lmFit(sh.gsva, sh.gsva$COTININE) %>% eBayes()
topTable(sh_fit)

GSVA continuous cotinine on cigarette vs never smokers

NYC_HANES.sns.nc <- NYC_HANES %>% subset_samples(., smokingstatus %in% c("Cigarette","Never smoker") & !is.na(COTININE))

sns.nc.eset <- full_eset[,full_eset$smokingstatus %in%  c("Cigarette","Never smoker") & !is.na(full_eset$COTININE)]

sns.nc_ds <- full_annotated_ds %>% dplyr::filter(OTU %in% taxa_names(NYC_HANES.sns.nc))
sns.nc_sAero <- (sns.nc_ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
sns.nc_sAnae <- (sns.nc_ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
sns.nc_sFana <- (sns.nc_ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
sns.nc_my.gs <- list(aero=sns.nc_sAero, anae=sns.nc_sAnae, fana=sns.nc_sFana)

sns.nc.my.gsva <- GSVA::gsva(sns.nc.eset, sns.nc_my.gs, parallel.sz = 1)

sns.nc.fit <- limma::lmFit(sns.nc.my.gsva, sns.nc.my.gsva$COTININE) %>% eBayes()
topTable(sns.nc.fit)
NYC_HANES <- loadQiimeData() |> 
  annotateFactors()
alt_smokers_cigarettes <- NYC_HANES |> 
  sample_data() |> 
  data.frame() |> 
  dplyr::filter(smokingstatus == 'Alternative smoker') |> 
  dplyr::filter(CIGARETTES == 'Yes') |>  
  dplyr::select(Burklab_ID) |> 
  t()

NYC_HANES <- prune_samples(!(sample_names(NYC_HANES) %in% alt_smokers_cigarettes), NYC_HANES)
sample_data(NYC_HANES)$smokingstatus <- relevel(sample_data(NYC_HANES)$smokingstatus,"Never smoker")

NYC_HANES.alt <- subset_samples(NYC_HANES, smokingstatus %in% c("Never smoker", "Alternative smoker"))
levels(sample_data(NYC_HANES.alt)$smokingstatus) <- c('Never','Alternative')

full_eset <- ExpressionSet(assayData = microbiome::abundances(NYC_HANES), phenoData = new("AnnotatedDataFrame", sample_data(NYC_HANES)))
full_eset$CATCOTININE <- ifelse(full_eset$COTININE < 3,'low','high')
full_annotated_ds <- tax_table(NYC_HANES) %>% as.data.frame %>% bind_cols(data.frame(OTU=rownames(tax_table(NYC_HANES)))) %>% filter(Domain != "Unassigned") %>% left_join(biosis, by = c("Genus"="X1"))

alt_full_eset <- full_eset[,(full_eset$smokingstatus %in% c("Never smoker","Alternative smoker"))]
alt_full_eset$smokingstatus <- droplevels(alt_full_eset$smokingstatus)
levels(alt_full_eset$smokingstatus) <- c('Never','Alternative')
alt_full_eset$GROUP <- ifelse(alt_full_eset$smokingstatus == "Alternative",1,0)

Hookah smokers vs Never smokers

GSEA

NYC_HANES.hookah <- subset_samples(NYC_HANES.alt, ((smokingstatus %in% c("Never")) | (HOOKAH_PIPE == 1)))

hookah_eset <- alt_full_eset[,(alt_full_eset$smokingstatus %in% c("Never") | tidyr::replace_na(alt_full_eset$HOOKAH_PIPE == 1, FALSE))]
write.csv(hookah_eset$smokingstatus, '/tmp/a')
de.hookah_eset <- EnrichmentBrowser::deAna(hookah_eset, de.method = 'edgeR', filter.by.expr = FALSE)

hookah_design <- model.matrix(~smokingstatus, data = data.frame(pData(hookah_eset)))
otu_table.voom <- limma::voom(SummarizedExperiment::assay(de.hookah_eset), design = hookah_design , plot = FALSE)
SummarizedExperiment::assay(de.hookah_eset) <- otu_table.voom$E
class(SummarizedExperiment::assay(de.hookah_eset)) <- "matrix"

hookah_ds <- full_annotated_ds %>% dplyr::filter(OTU %in% taxa_names(NYC_HANES.hookah))
hookah_sAero <- (hookah_ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
hookah_sAnae <- (hookah_ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
hookah_sFana <- (hookah_ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
hookah_my.gs <- list(aero=hookah_sAero, anae=hookah_sAnae, fana=hookah_sFana)

hookah_gsea.res <- EnrichmentBrowser::sbea("gsea", de.hookah_eset, hookah_my.gs)
EnrichmentBrowser::gsRanking(hookah_gsea.res, signif.only=FALSE)

ORA

hookah_ora.res <- EnrichmentBrowser::sbea("ora", de.hookah_eset, hookah_my.gs, perm=0)
EnrichmentBrowser::gsRanking(hookah_ora.res, signif.only=FALSE)

E-cigarette smokers vs Never smokers

GSEA

NYC_HANES.ecig <- NYC_HANES.alt %>% subset_samples(., smokingstatus %in% c("Never") | E_CIGARETTES == 1)

ecig_eset <- alt_full_eset[,(alt_full_eset$smokingstatus %in% c("Never") | tidyr::replace_na(alt_full_eset$E_CIGARETTES == 1, FALSE))]

de.ecig_eset <- EnrichmentBrowser::deAna(ecig_eset, de.method = 'edgeR', filter.by.expr = FALSE)

ecig_design <- model.matrix(~smokingstatus, data = data.frame(pData(ecig_eset)))
otu_table.voom <- limma::voom(SummarizedExperiment::assay(de.ecig_eset), design = ecig_design, plot = FALSE)
SummarizedExperiment::assay(de.ecig_eset) <- otu_table.voom$E
class(SummarizedExperiment::assay(de.ecig_eset)) <- "matrix"

ecig_ds <- full_annotated_ds %>% dplyr::filter(OTU %in% taxa_names(NYC_HANES.ecig))
ecig_sAero <- (ecig_ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
ecig_sAnae <- (ecig_ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
ecig_sFana <- (ecig_ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
ecig_my.gs <- list(aero=ecig_sAero, anae=ecig_sAnae, fana=ecig_sFana)

ecig_gsea.res <- EnrichmentBrowser::sbea("gsea", de.ecig_eset, ecig_my.gs)
EnrichmentBrowser::gsRanking(ecig_gsea.res, signif.only=FALSE)

ORA

ecig_ora.res <- EnrichmentBrowser::sbea("ora", de.ecig_eset, ecig_my.gs, perm=0)
EnrichmentBrowser::gsRanking(ecig_ora.res, signif.only=FALSE)

Cigar/cigarillos smokers vs Never smokers

GSEA

NYC_HANES.cigars <- NYC_HANES.alt %>% subset_samples(., smokingstatus %in% c("Never") | CIGARS_CIGARILLOS == 1)

cigars_eset <- alt_full_eset[,(alt_full_eset$smokingstatus %in% c("Never") | tidyr::replace_na(alt_full_eset$CIGARS_CIGARILLOS == 1, FALSE))]

de.cigars_eset <- EnrichmentBrowser::deAna(cigars_eset, de.method = 'edgeR', filter.by.expr = FALSE)

cigars_design <- model.matrix(~smokingstatus, data = data.frame(pData(cigars_eset)))
otu_table.voom <- limma::voom(SummarizedExperiment::assay(de.cigars_eset), design = cigars_design, plot = FALSE)
SummarizedExperiment::assay(de.cigars_eset) <- otu_table.voom$E
class(SummarizedExperiment::assay(de.cigars_eset)) <- "matrix"

cigars_ds <- full_annotated_ds %>% dplyr::filter(OTU %in% taxa_names(NYC_HANES.cigars))
cigars_sAero <- (cigars_ds %>% filter(X2=="Aerobic"))$OTU %>% as.character
cigars_sAnae <- (cigars_ds %>% filter(X2=="Anaerobic"))$OTU %>% as.character
cigars_sFana <- (cigars_ds %>% filter(X2=="F Anaerobic" | X2=="Aero / Facultative Anaerobic"))$OTU %>% as.character
cigars_my.gs <- list(aero=cigars_sAero, anae=cigars_sAnae, fana=cigars_sFana)

cigars_gsea.res <- EnrichmentBrowser::sbea("gsea", de.cigars_eset, cigars_my.gs)
EnrichmentBrowser::gsRanking(cigars_gsea.res, signif.only=FALSE)

ORA

cigars_ora.res <- EnrichmentBrowser::sbea("ora", de.cigars_eset, cigars_my.gs, perm=0)
EnrichmentBrowser::gsRanking(cigars_ora.res, signif.only=FALSE)
sessionInfo()


waldronlab/nychanesmicrobiome documentation built on April 21, 2024, 8:52 a.m.