R/sarcoid_manuscript.R

# General functions -------------------------------------------------------

#' Creates a MinRank vector from an agglomerated data frame (based on
#' eclectic::tax_climber). Use inside dplyr::mutate or directly assign to col.
#' MinRank columns are the most specific taxonomic assignments (up to a
#' threshold)
#' @param agg agglomerated data frame (with taxonomic ranks)
#' @param end.rank the most specific taxonomic rank if available
#' @param rank a character vector of taxonomic ranks present in agg
#' @param ... additional arguments passed to eclectic::tax_climber
min_rank <- function(agg, end.rank, ranks=qiimer::taxonomic_ranks, ...) {
  .agg <- filter(agg, !is.na(otu))
  md <- .agg[,c(ranks, "otu")]
  md <- data.frame(distinct(md))
  rownames(md) <- md$otu
  minrank <- tax_climber(md$otu, md, end=end.rank, ranks=ranks, ...)
  left_join(select(agg, otu), data.frame(otu=md$otu, MinRank=minrank))$MinRank
}

#' Creates a counts matrix from an agglomerated data frame.
#' Optionally, metadata columns can be included (useful for some functions), in
#' which case the output is a data.frame, not a matrix.
#' @param agg agglomerated data frame
#' @param additional.columns any columns to include as columns in the matrix
#' @return default: a numeric matrix with otu as rows and samples as columns.
#' If additional columns specified, a data.frame with otus as columns and samples as rows
counts_matrix <- function(agg, additional.columns) {
  cols <- c("otu", "SampleID", "count")
  if (!missing(additional.columns)) cols <- c(cols, additional.columns)
  .mat <- agg %>% select_(.dots=cols) %>%
    spread(SampleID, count, fill=0) %>%
    as.data.frame %>%
    filter(!is.na(otu))
  if (!missing(additional.columns)) {
    return(.mat)
  } else {
    rownames(.mat) <- .mat$otu
    .mat$otu <- NULL
    .mat <- t(as.matrix(.mat))
    return(.mat)
  }
}

#' Root a tree using a random node as the root.
#' @param tree a phylogenetic tree (class "phylo")
#' @param otus vector of otus to keep
#' @return rooted tree
root_tree <- function(tree, otus) {
  # Root the tree using a random node as the root
  dropped.otus <- tree$tip.label[!(tree$tip.label %in% otus)]
  .tree <- drop.tip(tree, dropped.otus)
  .rtree <- root(.tree, sample(.tree$tip.label, 1), resolve.root = TRUE)
  stopifnot(is.rooted(.rtree))
  return(.rtree)
}

#' Non-random version of \code{\link{forcats::fct_anon}}
#' @inheritParams forcats::fct_anon
#' @export
fct_anon_deterministic <- function(f, prefix="") {
  levels <- paste0(prefix, forcats:::zero_pad(seq_len(nlevels(f))))
  lvls_revalue(f, levels)
}

lvl_toend <- function(f, level) {
  forcats:::check_factor(f)
  lvls <- levels(f)[levels(f) != level]
  lvls <- c(lvls, level)
  factor(f, levels=lvls)
}

get_ggplot_legend <- function(p) {
  gp <- ggplot2::ggplot_gtable(ggplot2::ggplot_build(p))
  legend_idx <- which(sapply(gp$grobs, function(x) x$name) == "guide-box")
  gp$grobs[[legend_idx]]
}

#' Try to use \code{cairo_pdf}, but fall back to \code{pdf} if that doesn't work.
#' @param ... arguments to cairo_pdf/pdf
#' @export
cairo_failsafe <- function(...) {
  tryCatch(cairo_pdf(...), error=pdf(...))
}
# Color themes ------------------------------------------------------------

quick_palette <- function(levels) {
  eclpalettes::named_palette(levels, eclpalettes::tol_palette(length(levels)))
}

# Heatmap functions -------------------------------------------------------


#' Prepare agglomerated dataframe for heatmap display by aggregating counts
#' by minimum rank, completing missing cases, and clustering rows
#' @param min.samples the minimum number of samples required for a taxa to appear
#' @param s sample metadata dataframe
#' @param min.rank.1 the most specific rank to aggregate by (determines y-axis)
#' Uses tax_climber to select next most specific rank if not available.
#' @param min.rank.2 (optional) higher rank to prefix min.rank.1 (e.g. Phylum)
#' Uses tax_climber to select next most specific rank if not available.
#' @param min.samples a row must appear more than this number of samples to be shown
#' @export
MakeHeatmapData <- function(agg, s, min.rank.1 = "Genus", min.rank.2 = NA, min.samples=1) {
  # Create the y-axes for the data via the desired taxonomic rank level(s)
  .agg <- agg %>% mutate(
    MinRank1 = min_rank(agg, end=min.rank.1, label=FALSE))
  if (!is.na(min.rank.2)) {
    .agg <- .agg %>% mutate(
      MinRank2 = min_rank(agg, end=min.rank.2, label=TRUE, sep="__"),
      MinRank1 = paste(MinRank2, MinRank1)
    )
  }

  .agg <- .agg %>%
    # Aggregate counts by minimum available rank
    group_by(SampleID, MinRank1) %>%
    summarize(count = sum(count)) %>%
    # Update proportions
    group_by(SampleID) %>%
    mutate(proportion=count/sum(count))

  # Convert to matrix form
  .mat <- reshape2::dcast(
    .agg, MinRank1 ~ SampleID, value.var="proportion", fill = 0) %>%
    filter(!is.na(MinRank1))
  rownames(.mat) <- .mat$MinRank1
  .mat$MinRank1 <- NULL

  # Cluster and pull out row order
  minrank_order <- (dist(.mat, method="euclidean") %>%
    hclust(method="average"))$order

  .agg %>%
    # Reorder by clustering order
    mutate(MinRank1 = factor(MinRank1, levels=rownames(.mat)[minrank_order])) %>%
    # Complete missing cases: fills missing combinations in with NA
    # instead of just omitting the row entirely (necessary to show blank cells)
    ungroup() %>%
    tidyr::complete(SampleID, nesting(MinRank1)) %>%
    select(SampleID, MinRank1, count, proportion) %>%
    distinct() %>%
    # Filter taxa that appear in fewer than required number of samples
    group_by(MinRank1) %>%
    filter(sum(proportion > 0, na.rm=TRUE) > min.samples) %>%
    # Re-add metadata
    left_join(s, by="SampleID", all.y=TRUE) %>%
    ungroup()
}

#' Plot the results of MakeHeatmapData.
#' @param heatmap.data the dataframe from MakeHeatmapData
#' @param use.reads if TRUE, plot the raw readcounts; if FALSE, use proportions
#' @param threshold at what relative proportion the color scale goes to red
#' @param x.text display labels on the x-axis
#' @export
PlotHeatmap <- function(
  heatmap.data, use.reads=TRUE, threshold=0.6, x.text=TRUE, x.axis='SampleID') {

  if (use.reads) {
    fill.var <- "count"
    scale.name <- "Reads"
    fill.scale <- saturated_rainbow_cts(threshold = threshold, name=scale.name)
  } else {
    fill.var <- "proportion"
    scale.name <- "Abundance"
    fill.scale <- saturated_rainbow_pct(threshold = threshold, name=scale.name)
  }

  p <- ggplot(heatmap.data, aes_string(x.axis, "MinRank1", fill=fill.var)) +
    geom_tile(color="grey40", size=0.4) +
    facet_grid(. ~ StudyGroup + SampleType, space="free", scales="free") +
    fill.scale +
    theme_grey() +
    theme(
      strip.text.y = element_text(angle=0, vjust=0),
      strip.text.x = element_text(angle=90, vjust=0),
      panel.border = element_blank())
  if (x.text) {
    p <- p + theme(
      axis.text.x = element_text(angle=45, hjust=1, vjust=1))
  } else{
    p <- p + theme(
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank())
  }
  p
}


# GLMM functions ----------------------------------------------------------

#' Create and test a mixed effect model between two sample types and study groups
#' (e.g. BAL and prewash, tissue and paraffin), where the interaction term for the
#' model is the coefficient of interest.
#' @param agg an agglomerated data frame; ensure the StudyGroup and SampleType
#'        levels are ordered correctly (i.e. interesting ones last)
#' @param grouping.var the column defining the sample/control grouping
#' @param tax.level the taxonomic level to test; lower levels will be summed
#' @param sparsity the proportions of zeros below which a taxa will be skipped
TestMatchedControlMixedModel <- function(agg, grouping.var="OriginatingSampleID", tax.level="Genus", sparsity=0.1) {
  # Create the data matrix from aggregated data frame, aggregating by the
  # specified taxonomy level
  .mat <- reshape2::dcast(
    agg,
    formula = formula(sprintf(
      "SampleID + %s + SampleType + StudyGroup + total ~ %s",
      grouping.var, tax.level)),
    value.var = "count",
    fill = 0,
    fun.aggregate = sum)

  # This is the testing function that builds the models
  .fitGLMM <- function(test.col) {
    # We return this default in case of error/no fit
    default <- list(null.model=NA, model=NA)

    counts <- .mat[, colnames(.mat)==test.col]
    if (sum(counts > 0)/length(counts) < sparsity) {
      message(sprintf("%s: too sparse, skipping", test.col))
      return(default)
    }

    # Build positive and negative counts for taxa under test
    test.data <- .mat[, c(1:5)]
    test.data$neg.cts <- test.data[["total"]] - counts
    test.data$pos.cts <- counts

    # The model of interest contains an interaction between StudyGroup and
    # SampleType; the null model does not. We are stricter about forcing
    # convergence for this model than the null model. Errors return NA.
    model1 <- plyr::try_default(
      lme4::glmer(
        formula=sprintf(
          "cbind(pos.cts, neg.cts) ~ StudyGroup * SampleType + (1|%s) + (1|SampleID)",
          grouping.var),
        data=test.data,
        family=binomial,
        control=glmerControl(
          optimizer="bobyqa",
          check.conv.singular = "warning",
          check.conv.grad = "warning",
          check.conv.hess = "warning")
      ), NA)

    # The null model
    model0 <- plyr::try_default(
      lme4::glmer(
        formula=sprintf(
          "cbind(pos.cts, neg.cts) ~ StudyGroup + SampleType + (1|%s) + (1|SampleID)",
          grouping.var),
        data=test.data,
        family=binomial,
        control=glmerControl(
          optimizer="bobyqa")
      ), NA)

    # Return NA if the primary model broke
    if (is.na(model1)) {
      message(sprintf("Nonconvergent: %s", test.col))
      return(default)
    } else {
      # browser()
      return(list(null.model=model0, model=model1))
    }
  }

  # Get a list of things to test:
  test.candidates <- data.frame(taxa=colnames(subset(.mat, select=-c(SampleID:total))))

  # Fit the models
  results <- test.candidates %>%
    mutate(res = purrr::map(taxa, .fitGLMM))

  # this function generates ANOVA p-values, returning NA if either of the
  # models is NA
  .safeAnova <- function(x) {
    if (!is.na(x$null.model) & !is.na(x$model)) {
      p <- anova(x$null.model, x$model)$`Pr(>Chisq)`[2]
      if (length(p) != 1 | is.null(p)) return(NA)
      else p
    } else {
      return(NA)
    }
  }

  # Calculate p-values for all taxa, clean up, and return results
  results %>%
    mutate(anova.p = purrr::map_dbl(res, .safeAnova)) %>%
    mutate(model = purrr::map(res, ~ broom::tidy(.x$model))) %>%
    dplyr::select(-res) %>%
    tidyr::unnest() %>%
    dplyr::select(-x)  # An artifact from tidy(NA)
}

#' Extracts the significant results from TestMatchedControlMixedModel and
#' adjusts for multiple testing (p-values corrected are only those for the
#' interaction terms with the correct magnitude sign)
#' @param glmm.results results from TestMatchedControlMixedModel
#' @param direction a binary function such as > or < determining coefficient relationship to 0
#' @param padj.threshold only return adjusted p-values below this value
ExtractSignificantResults <- function(glmm.results, direction = `>`, padj.threshold=0.1) {
  .results <- glmm.results
  .interaction.terms <- filter(.results, grepl(":", term))

  if (nrow(.interaction.terms) == 0) return()

  .interaction.terms %>% filter(direction(estimate, 0)) %>%
    mutate(padj = p.adjust(anova.p)) %>%
    filter(padj <= padj.threshold)
}


# DESeq2 functions --------------------------------------------------------

MakeDESeqFromAgglomerated <- function(agg, tax.level="Genus") {
  agg <- group_by(agg, SampleID) %>% filter(sum(count) > 0) %>% droplevels()
  # browser()
  mat <- reshape2::dcast(
    agg,
    formula=sprintf("SampleID + StudyGroup + SampleType ~ %s", tax.level),
    value.var="count",
    fill = 0,
    fun.aggregate = sum)
  pd <- mat[, c(1:3)]
  mat <- mat[, -c(1:3)]
  mat <- as.matrix(t(mat))
  colnames(mat) <- pd$SampleID
  DESeqDataSetFromMatrix(mat, pd, ~ StudyGroup)
}

TestDESeq <- function(ds, fitType="local") {
  # Referencing McMurdie (http://www.bioconductor.org/packages/release/bioc/vignettes/phyloseq/inst/doc/phyloseq-mixture-models.html)
  # calculate geometric means prior to estimate size factors
  gm_mean = function(x, na.rm=TRUE){
    exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
  }

  geoMeans <- apply(counts(ds), 1, gm_mean)
  ds <- estimateSizeFactors(ds, geoMeans=geoMeans)
  DESeq(ds, fitType=fitType)
}


# Macroscopic testing functions -------------------------------------------

#' Test differences between study groups using the generalized linear mixed model
#' @param ranks.to.test a vector giving the taxonomic ranks to collapse and test
#' @param agg an appropriately filtered agglomerated data frame
#' @param md the metadata dataframe
#' @param sparsity omit testing taxa that appear in fewer that this pct of samples
RunGLMM <- function(ranks.to.test, agg, md, sparsity=0.1, grouping.var="OriginatingSampleID") {
  plyr::adply(ranks.to.test, .margins=1, function(tax.level) {
    .md <- md[, 1:which(colnames(md) == tax.level)+1]
    results <- TestMatchedControlMixedModel(
      agg, tax.level=tax.level, sparsity=0.1, grouping.var = grouping.var)
    sig.results <- ExtractSignificantResults(results, padj.threshold = 1) %>%
      select(-c(term, group)) %>%
      mutate(tax.level=tax.level,
             taxa=as.character(taxa))
    sig.results[[tax.level]] <- sig.results$taxa
    left_join(sig.results, .md)
  }, .id=NULL) %>% distinct(taxa, .keep_all=TRUE)
}

#' Test differences between study groups using DESeq2.
#' @param ranks.to.test a vector giving the taxonomic ranks to collapse and test
#' @param agg an appropriately filtered agglomerated data frame
#' @param md the metadata dataframe
RunDESeq2 <- function(ranks.to.test, agg, md, sarcoid.name="sarcoid") {
  plyr::adply(ranks.to.test, .margins=1, function(tax.level) {
    .md <- md[, 1:which(colnames(md) == tax.level)+1]
    ds <- MakeDESeqFromAgglomerated(agg, tax.level=tax.level)
    ds <- TestDESeq(ds)
    ds <- ds %>% results(name=sprintf("StudyGroup%s", sarcoid.name)) %>% data.frame
    ds$taxa <- as.character(rownames(ds))
    ds[[tax.level]] <- rownames(ds)
    ds <- ds %>% filter(stat > 0) %>%
      left_join(.md) %>%
      mutate(test.rank=tax.level)
  }, .id=NULL) %>% distinct(taxa, .keep_all=TRUE)
}


# Testing output functions ----------------------------------------

SummarizeResults <- function(results, p.cutoff=0.1, taxa.level="taxa") {
  out <- filter(results, padj < p.cutoff)
  nlineages <- nrow(out)
  if (nlineages > 0) {
    .out <- out[[taxa.level]]

  } else {
    return(NA)
  }
  return(out)
}

# Case-Control Plotting ---------------------------------------------------

#' Links matched samples and (environmental) controls via the grouping.col
#' by completing any missing cases.
#' @param agg agglomerated dataframe
#' @param s sample metadata
#' @param md otu metadata
#' @return the modified agg with a new column, PlotGroup, for line drawing
LinkMatchedControls <- function(agg, s, md, grouping.col) {
  if (nrow(agg) == 0) {
    warning("Empty input")
    return()
  }

  # Complete missing cases
  agg <- ungroup(agg) %>% select_(grouping.col, "SampleType", "otu", "count") %>%
    droplevels()

  agg <- complete_(agg, list(grouping.col, "otu", "SampleType"), fill=list(count=0))
  # Re-add metadata
  agg <- left_join(agg, s) %>%
    left_join(md)
  agg$PlotGroup <- paste(agg[[grouping.col]], agg$otu)
  return(agg)
}



# Prelude stuff (to be moved) ---------------------------------------------

##
## Global defaults
##

ranks.to.test <- c("otu", "Species", "Genus", "Family")

# Dominant bacterial and fungal orders should have consistent colors between
# plots.
bacterial.orders <- sort(c(
  "Burkholderiales",
  "Bacillales",
  "Bacteroidales",
  "Clostridiales",
  "Actinomycetales",
  "Rhizobiales",
  "Lactobacillales",
  "Neisseriales",
  "Pseudomonadales",
  "Sphingomonadales",
  "Myxococcales",
  "Gemellales",
  "Rhodobacterales",
  "Flavobacteriales",
  "Enterobacteriales"
))

other.bacteria <- c(
  "Other Actinomycetales",
  "Other Bacillales",
  "Other Rhizobiales",
  "Other Burkholderiales",
  "Other Clostridiales",
  "Other Lactobacillales",
  "Other Bacteria")

fungal.orders <- rev(sort(c(
  "Agaricales",
  "Capnodiales",
  "Corticiales",
  "Eurotiales",
  "Hypocreales",
  "Pleosporales",
  "Saccharomycetales",
  "Sporidiobolales",
  "Filobasidiales",
  "Tremellales",
  "Polyporales",
  "Russulales",
  "Dothideales",
  "Malasseziales",
  "Sordariomycetes",
  "Trichosporonales"
)))

viruses <- sort(c(
  "Bacillus virus phi29",
  "Betapapillomavirus 2",
  "Dyosigmapapillomavirus 1",
  "Human herpesvirus 7",
  "Porcine stool-associated circular virus 5",
  "Torque teno virus 16",
  "Torque teno virus 19"))

primary.bacterial.colors <- eclpalettes::named_palette(
  levels=bacterial.orders,
  colors=eclpalettes::tol_palette(length(bacterial.orders))
) %>%
  eclpalettes::color_level("Other", "grey85")

other.bacterial.colors <- eclpalettes::named_palette(
  levels=other.bacteria,
  colors=grey.colors(length(other.bacteria), start=0.5)
)

#' @export
bacterial.colors <- c(primary.bacterial.colors, other.bacterial.colors)

#' @export
fungal.colors <- eclpalettes::named_palette(
  levels=fungal.orders,
  colors=eclpalettes::tol_palette(length(fungal.orders))
) %>%
  eclpalettes::color_level("Fungi (unclassified)", "grey55") %>%
  eclpalettes::color_level("Other", "grey85")

#' @export
viral.colors <- eclpalettes::named_palette(
  levels=viruses,
  colors=eclpalettes::tol_palette(length(viruses))) %>%
  eclpalettes::color_level("Other", "grey85")

# Makes LN and BAL filled circles, and paraffin and prewash open circles
# on barcharts
#' @export
ln.labels <-  c("lymph_node"="\u25CF", "paraffin"="\u25CB", "water"="\u2205")
#' @export
bal.labels <-  c("BAL"="\u25CF", "prewash"="\u25CB", "water"="\u2205")

# Consistent color/fill pairing for PCoA plots
#' @export
pcoa.colors <- c(healthy="#a6611a", sarcoidosis="#018571")
#' @export
pcoa.fills <- c(healthy="#dfc27d", sarcoidosis="#80cdc1")

# Consistent theme for horizontal barcharts
horiz.barchart.theme <- theme(
  axis.line.y=element_line(color="black", size=1, linetype =1),
  axis.text.x=element_blank(),
  axis.text.y=element_text(hjust=0, size=14),
  axis.title=element_blank(),
  # strip.text.y=element_text(size=18, angle=180),
  strip.text.y=element_blank(),
  axis.ticks.x=element_blank(),
  plot.margin=unit(c(0.4,0,0.4,0), "lines"),
  panel.grid=element_blank(),
  panel.border=element_rect(fill=NA, color="black", size=0.8, linetype=1),
  legend.key.height=unit(1, "lines"),
  legend.key.width=unit(1, "lines")
)

make_standard_barcharts <- function(plot, data, heading, ...) {
  MakeEqualSizePlots(
    plot, data, at_most=barcharts.max, ncol=barcharts.col,
    extra_text=heading, ...)
}
eclarke/sarcoid-microbiome-paper documentation built on May 29, 2019, 2:52 p.m.