R/tree_methods.R

Defines functions daOTU_plot

## Methods for representing differentially abundant (across conditions) otus on a grid or on a tree.
## Also useful for general sets of interesting otus. 

## Small multiple plot: each otu in set daOTUS has its own plot and each
## its (mean) relative abundance in different levels of condition variable
## is represented as a point. If replicates are available for a given level,
## SEM are represented as error bars.
daOTU_plot <- function(physeq, daOTUS, variable, title = NULL, plot = TRUE) {
  ## Args:
  ## - physeq: phyloseq class object
  ## - daOTUS: set of OTUS to plot
  ## - variable: x axis, used for plotting and aggregating samples
  ## - title: optional, plot title
  ##
  ## Returns,
  ## - p: invisible, ggplot object
  ##
  ## check variable and otu set
  numberOfProvidedTaxa <- length(daOTUS)
  daOTUS <- intersect(daOTUS, taxa_names(physeq))
  stopifnot(variable %in% sample_variables(physeq), length(daOTUS) > 0)
  if ( numberOfProvidedTaxa != length(daOTUS) ) {
    cat(paste(length(daOTUS), "otus out of", numberOfProvidedTaxa,
              "provided were found, use only those for plotting"), sep = "\n")
  }
  ## Get count data set
  tdf <- otu_table(physeq)
  if (!taxa_are_rows(physeq)) { tdf <- t(tdf) }
  ## Normalize counts to relative abundances
  tdf <- apply(tdf, 2, function(x) x/sum(x))
  tdf <- tdf[daOTUS, ]
  ## sort OTUs by overall abundance
  otuAbun <- sort(rowSums(tdf), TRUE)
  tdf <- data.frame(otu = rownames(tdf), tdf[daOTUS, ])
  ## Get within group mean and sem
  tdf <- melt(tdf)
  colnames(tdf)[2] <- "Sample"
  tdf <- merge(tdf,
               data.frame(Sample = sample_names(physeq), Replicate = get_variable(physeq, variable)),
               by.y = "Sample")
  meandf <- ddply(tdf, .(otu, Replicate), summarize,
                  mean = mean(value), sem = sd(value)/sqrt(length(value)))
  colnames(meandf)[ colnames(meandf) == "Replicate"] <- variable
  ## Get best taxonomic annotation
  bestAnn <- function(x) {
    x <- x[!is.na(x)]
    return(ifelse(length(x), x[length(x)], ""))
  }
  taxdf <- as(tax_table(physeq), "matrix")
  ann <- apply(taxdf, 1, bestAnn)
  taxdf <- data.frame(otu = rownames(taxdf), taxdf, tax = ann)[daOTUS, ]
  taxdf$otu2 <- paste(taxdf$otu, taxdf$tax)
  mdf <- merge(meandf, taxdf, by.x = "otu")
  ## Sort results by overall abundance
  mdf$otu2 <- factor(mdf$otu2, levels = taxdf[names(otuAbun), "otu2"])
  mdf[[variable]] <- factor(mdf[[variable]], levels = levels(get_variable(physeq, variable)))
  ## Plot results
  p <- ggplot(mdf, aes_string(x = variable, y = "mean", color = variable))
  p <- p + geom_errorbar(aes(ymin=mean-sem, ymax=mean+sem), width=.1) + geom_point()
  ## Add custom title if provided
  if (is.null(title)) {
    title.text <- paste("Differentially Abundant OTUs (", variable, ")", sep = "")
  } else {
    title.text <- title
  }
  p <- p + facet_wrap(~otu2, scales = "free_y") + ggtitle(title)
  if (plot) {
      plot(p)
  }
  return(invisible(p))
}

## subsidiary function to daOTU_tree_plot
## Add thermometer to figure
addThermo <- function(lastPP, meandf, adj.x, width.thermo = ncol(meandf)*8*strwidth(" ")) {
  ## Add Thermometer
  BOTHlabels(XX = max(lastPP$xx[1:lastPP$Ntip] + adj.x) + 0.3*8*strwidth(" ") + 0.5*width.thermo,
             YY = lastPP$yy[1:lastPP$Ntip], thermo = meandf,
             horiz = TRUE, height = 0.9, adj = c(0.5, 0.5), pch = NULL,
             width = width.thermo, piecol = gg_color_hue(ncol(meandf)),
             frame = "none", pie = NULL, color = "black")
  return(invisible(width.thermo))
}

## subsidiary function to daOTU_tree_plot
## Add Histogram to figure
addHist <- function(lastPP, meandf, adj.x, width.thermo = ncol(meandf)*8*strwidth(" "), cex = CEX) {
  ## Add squares
  nSquares <- ncol(meandf)
  XX <- max(lastPP$xx[1:lastPP$Ntip] + adj.x) + 0.3*8*strwidth(" ")
  YY = lastPP$yy[1:lastPP$Ntip] - 1/2 * strheight(" ", cex = cex)
  base.size <- width.thermo/nSquares ## baseline for histogram bin width
  height.size <- 0.9
  xl <- rep(XX + (seq(nSquares) - 1)*base.size, times = nrow(meandf))     ## left x positions
  xr <- xl + base.size ## right x positions
  yb <- rep(YY, each = ncol(meandf))     ## bottom y positions
  yt <- as.vector(yb + t(height.size*meandf))     ## top y positions
  rect(xl, yb, xr, yt, col = gg_color_hue(ncol(meandf)))
  return(invisible(width.thermo))
}

## subsidiary function to daOTU_tree_plot
## Add Rectangles to figure
addRectangle <- function(lastPP, meandf, adj.x, width.thermo = ncol(meandf)*8*strwidth(" ")) {
  ## Add rectangles
  nRectangles <- ncol(meandf)
  XX <- max(lastPP$xx[1:lastPP$Ntip] + adj.x) + 0.3*8*strwidth(" ")
  YY = lastPP$yy[1:lastPP$Ntip]
  base.size <- width.thermo/nRectangles ## baseline for rectangles side length
  height.size <- 0.9
  scaling <- sqrt(meandf)
  xcenter <- rep(XX + (seq(nRectangles) - 1/2)*base.size, times = nrow(scaling))
  ycenter <- rep(YY, each = ncol(scaling))
  rectangles <- cbind(as.vector(base.size*t(scaling)), as.vector(height.size*t(scaling)))
  symbols(xcenter, ycenter, rectangles = rectangles,
          inches = FALSE, bg = gg_color_hue(ncol(scaling)), add = TRUE)
  return(invisible(widh.thermo))
}

## subsidiary function to daOTU_tree_plot
## Add circles to figure
addCircle <- function(lastPP, meandf, adj.x, width.thermo = ncol(meandf)*8*strwidth(" ")) {
  ## Add circles
  ## Requires plotrix
  nCircles <- ncol(meandf)
  XX <- max(lastPP$xx[1:lastPP$Ntip] + adj.x) + 0.3*8*strwidth(" ")
  YY = lastPP$yy[1:lastPP$Ntip]
  ## baseline for circles radii
  ratio <- diff(lastPP$y.lim)/diff(lastPP$x.lim)
  base.size <- min(width.thermo/nCircles, 0.95/(ratio * sqrt(2))) 
  scaling <- sqrt(meandf)
  xcenter <- rep(XX + (seq(nCircles) - 1/2)*base.size, times = nrow(scaling))
  ycenter <- rep(YY, each = ncol(scaling))
  ## Area proportional to frequency, sqrt(2) used to maximize space occupation while
  ## avoiding overlap (worst case scenario is two neighbor modalities with frequencies of
  ## 0.5 and 0.5)
  circles <- 0.5 * as.vector(base.size*t(scaling))
  symbols(xcenter, ycenter, circles = circles,
          inches = FALSE, bg = gg_color_hue(ncol(scaling)), add = TRUE)
  width.thermo <- min(width.thermo, nCircles * base.size)
  return(invisible(width.thermo))
}

## subsidiary function to daOTU_tree_plot
## Add squares to figure
addSquare <- function(lastPP, meandf, adj.x, width.thermo = ncol(meandf)*8*strwidth(" ")) {
  ## Add squares
  ## Requires plotrix
  nSquares <- ncol(meandf)
  XX <- max(lastPP$xx[1:lastPP$Ntip] + adj.x) + 0.3*8*strwidth(" ")
  YY = lastPP$yy[1:lastPP$Ntip]
  ## baseline for squares radii
  ratio <- diff(lastPP$y.lim)/diff(lastPP$x.lim)
  base.size <- min(width.thermo/nSquares, 0.95/ratio) 
  scaling <- sqrt(meandf)
  xcenter <- rep(XX + (seq(nSquares) - 1/2)*base.size, times = nrow(scaling))
  ycenter <- rep(YY, each = ncol(scaling))
  squares <- as.vector(base.size*t(scaling)) / sqrt(2)
  symbols(xcenter, ycenter, squares = squares,
          inches = FALSE, bg = gg_color_hue(ncol(scaling)), add = TRUE)
  width.thermo <- min(width.thermo, nSquares * base.size)
  return(invisible(width.thermo))
}


## subsidiary function to daOTU_tree_plot
## Add abundance, automatically calibrate scale to roughly match 1:4
addAbundance <- function(otuAbun, lastPP, adj.x, width.thermo,
                         abundance.type = c("circle", "bar"),
                         add.ruler = FALSE) {
  scaling <- log(otuAbun, base = 10)
  minAbun <- floor(min(scaling))
  maxAbun <- floor(max(scaling)) ## Only to construct scale and breaks
  if (maxAbun==minAbun) {
      scale <- minAbun
  } else {
      scale <- seq(minAbun, maxAbun, by = floor((maxAbun - minAbun)/4) + 1)
  }
  breaks <- 10^scale
  maxAbun <- max(scaling) ## to properly scale all quantities
  abundance.type <- match.arg(abundance.type)
  if (abundance.type == "circle") {
      ## Maximum and minimum circle radius
      ratio <- diff(lastPP$y.lim)/diff(lastPP$x.lim)
      max.size <- 0.95 /(ratio * 2 * sqrt(2))
      min.size <- max.size / 20
      ## Scaling radius
      scaling <- sqrt( (scaling - minAbun) / (maxAbun - minAbun))
      scaling <- scaling * (max.size - min.size) + min.size
      ## Construct scale
      scale <- sqrt( (scale - minAbun) / (maxAbun - minAbun))
      scale <- scale * (max.size - min.size) + min.size
      ## Centre coordinates
      adj.radii <- max(scaling)
      xcenter <- rep(max(lastPP$xx[1:lastPP$Ntip] + adj.x) + 0.5*8*strwidth(" ") + width.thermo + adj.radii,
                     lastPP$Ntip)
      ycenter <- lastPP$yy[1:lastPP$Ntip]
      ## Plot circles or segments
      symbols(xcenter, ycenter, circles = scaling,
              inches = FALSE, bg = "grey40", fg = "grey40", add = TRUE)
  } else {
      ## Min and max widths
      if (width.thermo >= 4*8*strwidth(" ")) {
          max.size <- width.thermo / 2
      } else  {
          max.size <- width.thermo 
      }
      min.size <- max.size / 20
      ## Scaling widths
      scaling <- (scaling - minAbun) / (maxAbun - minAbun) * (max.size - min.size) + min.size
      ## Construct scale
      scale <- (scale - minAbun) / (maxAbun - minAbun) * (max.size - min.size) + min.size
      XX <- rep(max(lastPP$xx[1:lastPP$Ntip] + adj.x) + 0.5*8*strwidth(" ") + width.thermo,
                lastPP$Ntip) + scaling / 2
      YY <- lastPP$yy[1:lastPP$Ntip]
      symbols(XX, YY, inches = FALSE, rectangles = cbind(scaling, 0.5),
              add = TRUE, fg = "grey40", bg = "grey40")
      if (add.ruler) {
          offset <- max(lastPP$xx[1:lastPP$Ntip] + adj.x) + 0.5 * 8 * strwidth(" ") + width.thermo
          segments(x0 = offset + scale,
                   y0 = rep(lastPP$y.lim[1] - 0.5, length(scale)),
                   x1 = offset + scale,
                   y1 = rep(lastPP$y.lim[2] + 0.5, length(scale)), col = "grey80")
          text(x = offset + scale,
               y = lastPP$y.lim[1] - 1,
               adj = c(0, 0.5),
               labels = prettyNum(breaks),
               srt = -90, cex = 0.7)
      }
  }
  return(invisible(list(scale = scale, type = abundance.type,
                        breaks = breaks)))
}

## subsidiary function to daOTU_tree_plot
## Add legend
addLegend <- function(meandf, variable, scale, type, breaks) {
  leg <- legend(x = "topleft",
                xjust = 0,
                legend = colnames(meandf),
                fill = gg_color_hue(ncol(meandf)),
                border = NA,
                title = variable,
                bty = "n")
  leg.abundance <- legend(x = leg$rect$left,
                          y = leg$rect$top - 1.1*leg$rect$h, 
                          col = "grey40",
                          legend =  prettyNum(breaks),
                          bty = "n", 
                          title = "Relative abundance",
                          plot = TRUE)
  if (type == "circle") {
      xcenter <- leg.abundance$text$x - max(scale) - strwidth(" ")
      ycenter <- leg.abundance$text$y
      symbols(xcenter, ycenter, circles = scale,
              inches = FALSE, bg = "grey40", fg = "grey40", add = TRUE)
  } else {
      xcenter <- leg.abundance$text$x - strwidth(" ") - scale / 2
      ycenter <- leg.abundance$text$y
      symbols(xcenter, ycenter, rectangles = cbind(scale, 0.5),
              inches = FALSE, bg = "grey40", fg = "grey40", add = TRUE)
  }
}


## Tree plot with thermometer: plot a tree with all otus in set daOTUS and add
## overall abundance and distribution between levels of variable at tips of tree. 
## Samples are first agreggated by level of variable to compute relative abundance
## within this level and different levels are then agreggated (and eventually weighted)
## to determine which levels contribute most to the overall relative abundance. Overall
## abundance is also plotted (in logarithmic scale) next to the thermometer
daOTU_tree_plot <- function(physeq, daOTUS, variable, colorTip = FALSE, 
                            type = c("thermo", "circle", "square", "rectangle", "hist"),
                            abundance.type = c("bar", "circle"),
                            add.ruler = TRUE, 
                            weight = NULL, ...) {
  ## Args:
  ## - physeq: phyloseq class object
  ## - daOTUS: set of OTUS to plot
  ## - variable: x axis, used for plotting and aggregating samples
  ## - type: type of graphic to plot on leaves of tree
  ## - abundance.type: type of symbol to use for abundance plotting
  ## - add.ruler: logical, should a ruler be added to abundance (only when abundance.type is bar)
  ## - colorTip: logical, should tips be colored according to dominant condition
  ##             (defaults to FALSE)
  ## - weight: optional, contribution of each factor level of variable to the whole
  ## - ...: Additional arguments passed to ‘plot.phylo’.
  ##
  ## Returns,
  ## - plot of tree, thermometer and relative abundance
  ##
  ## check variable and otu set
  numberOfProvidedTaxa <- length(daOTUS)
  daOTUS <- intersect(daOTUS, taxa_names(physeq))
  stopifnot(variable %in% sample_variables(physeq),
            length(daOTUS) > 0)
  if ( numberOfProvidedTaxa != length(daOTUS) ) {
    cat(paste(length(daOTUS), "otus out of", numberOfProvidedTaxa,
              "provided were found, use only those for plotting"), sep = "\n")
  }
  ## Get count data frame
  tdf <- otu_table(physeq)
  if (!taxa_are_rows(physeq)) { tdf <- t(tdf) }
  ## Normalize counts to relative abundances
  tdf <- apply(tdf, 2, function(x) x/sum(x))
  tdf <- data.frame(tdf[daOTUS, ])
  ## sort OTUs by overall abundance
  otuAbun <- sort(rowMeans(tdf), TRUE)
  ## Get within group mean
  meandf <- aggregate(t(tdf), by = list(variable = get_variable(physeq, variable)), FUN = mean)
  groupLevels <- rownames(meandf) <- meandf[ , 1]
  meandf <- meandf[ , -1]
  ## Normalize by otu (use weight if necessary)
  if (is.null(weight)) {
    meandf <- t(apply(meandf, 2, function(x) x/sum(x)))
  } else {
    if (!is.null(names(weight)) && all(groupLevels %in% names(weight))) {
      w <- weight[groupLevels]
    } else {
      cat("No name or non-matching names for weight, using the first weights.",
          paste("Group names are", groupLevels), sep = "\n")
      w <- weight[length(groupLevels)]
    }
    meandf <- t(apply(meandf, 2, function(x) w*x/sum(w*x)))
  }
  ## Get best taxonomic annotation
  bestAnn <- function(x) {
    x <- x[!is.na(x)]
    return(ifelse(length(x), x[length(x)], ""))
  }
  taxdf <- as(tax_table(physeq), "matrix")
  ann <- apply(taxdf, 1, bestAnn)
  taxdf <- data.frame(otu = rownames(taxdf), tax = paste(rownames(taxdf), ann))
  rownames(taxdf) <- taxdf$otu
  ## get pruned tree
  phy <- phy_tree(physeq)
  phy <- drop.tip(phy, tip = setdiff(phy$tip.label, daOTUS))
  ## Reorder taxdf, meandf, otuAbun to match phy tips and add trailing
  ## spaces at end of phy tip labels
  taxdf <- taxdf[phy$tip.label, ]
  meandf <- meandf[phy$tip.label, ]
  otuAbun <- otuAbun[phy$tip.label]
  trailing.space <- paste(rep(" ", 8*ncol(meandf)*ifelse(ncol(meandf) >= 4, 1.5, 2)), collapse = "")
  phy$tip.label <- paste(taxdf$tax, trailing.space, sep = "")
  ## Plot results
  plot.phylo(phy, tip.color = "white", ...)
  args <- list(...)
  CEX <- ifelse("cex" %in% names(args), args$cex, par("cex"))
  if (colorTip) {
      tipColor <- gg_color_hue(ncol(meandf))[ apply(meandf, 1, which.max) ]
      tiplabels(paste("", taxdf$tax), frame = "n", adj = c(0, 0.5), col = tipColor, cex = CEX)
  } else {
      tiplabels(paste("", taxdf$tax), frame = "n", adj = c(0, 0.5), cex = CEX)
  }
  width.thermo <- strwidth(paste(rep(" ", 8*ncol(meandf)), collapse = ""))
  adj.x <- strwidth(as.character(taxdf$tax), units = "user", cex = CEX)
  lastPP <- get("last_plot.phylo", envir = .PlotPhyloEnv)
  ## Add Graphics at leaves of tree
  type <- match.arg(type)
  width.thermo <- switch(type,
                         thermo    = addThermo(lastPP, meandf, adj.x, width.thermo),
                         hist      = addHist(lastPP, meandf, adj.x, width.thermo, CEX),
                         square    = addSquare(lastPP, meandf, adj.x, width.thermo),
                         rectangle = addRectangle(lastPP, meandf, adj.x, width.thermo),
                         circle    = addCircle(lastPP, meandf, adj.x, width.thermo)
                         )
  ## Add overall average abundance
  abundance.type <- match.arg(abundance.type)
  abunScale <- addAbundance(otuAbun, lastPP, adj.x, width.thermo, abundance.type, add.ruler)
  ## Add segments
  XX <- lastPP$xx[1:lastPP$Ntip] + strwidth(as.character(taxdf$tax), cex = CEX)
  YY <- lastPP$yy[1:lastPP$Ntip] 
  segments(x0 = XX, x1 = max(XX),
           y0 = YY, y1 = YY, col = "grey80", lty = 2)
  ## Add legend
  addLegend(meandf, variable, abunScale[[1]], abunScale[[2]], abunScale[[3]])
}



#' ggplot-like implementation of plot.phylo from the ape package for \code{phyloseq} class
#' objects. Allows more representation that ggphylo and plot_tree. Returns a data frame representing
#' the tree that is useful for ggplot graphical function. 
#'
#' @title ggtree
#' @param physeq Required. An instance of a \code{phyloseq} class object that has
#'               tree slot. 
#' @param group  Optional. Either a single character string matching a variable
#'               name in the corresponding sam_data of `physeq`, or a factor with
#'               the same length as the number of taxa in `physeq`. If provided, the samples
#'               are aggregated by factor levels before being plotted
#' @param freq    Logical. Should counts be replaced by frequencies (TRUE) or kept as such (FALSE).
#'                 Defaults to TRUE. TRUE corresponding to weighting all samples equally while FALSE
#'                 weights them with their library size.
#' @param method Optional. Edge fattening method; either 'linear' or
#'                    'logarithmic' (or an abbrevation of these). Defaults to "linear"
#' @param width.lim Optional. Numeric. Minimum and maximal edge after fattening.
#'                  Defaults to c(0.1, 4). Set to c(0, x) for true linear scaling.
#' @param base      Optional. Numeric. Base used for logarithmic scaling. Defaults to 10.
#' @param edge.group  Optional. Either a single character string matching a variable
#'               name in the corresponding tax_table of `physeq`, or a factor with
#'               the same length as the number of taxa in `physeq`. Defaults to "Phylum"
#' @param edge.method Optional. Ancestral group reconstruction method; either 'majority' or
#'                    'ace' (or an abbrevation of these). Defaults to "majority"
#' @param ...    Optional. Additional arguments passed on to plot.phylo to determine
#'               the shape of the tree. 
#' @note ggtree assumes that the tree is rooted. 
#' @return A list of two data frames properly formatted for ggplot. 
ggtree <- function(physeq, group = NULL, freq = TRUE, method = "linear", base = 10,
                              edge.group = "Phylum", edge.method = "majority",
                              width.lim = c(0.1, 4), ...) {
    ## Exception handling
    if (is.null(phy_tree(physeq, FALSE))) {
        stop("Object \"physeq\" does not have a tree slot")
    } else {
        tree <- phy_tree(physeq)
    }
    if (!is.rooted(tree)) {
        stop("Tree must be rooted, consider using midpoint rooting")
    }

    ## Potentially normalize counts to frequencies
    if (freq) {
        physeq <- transform_sample_counts(physeq, function(x) x /sum(x))
    }
    
    ## Merge samples by grouping factor.
    if (!is.null(group)) {
        physeq <- merge_samples(physeq, group)
    }

    ## Get edge group factor
    if (!is.null(edge.group)) {
        x <- color_edges(physeq, edge.group, edge.method)
        edge.group <- factor(names(x$palette)[match(x$edge, x$palette)])
        tip.group <- factor(names(x$palette)[match(x$tip, x$palette)])
    }

    ## Fatten edges and compute average 
    fattened.edges <- fattenEdges(physeq, method, width.lim, base)$edge.raw.width
    mean.fattened.edges <- colMeans(fattened.edges)
    edge.names <- colnames(fattened.edges)
    fattened.edges <- data.frame(edge.name = edge.names,
                                 t(fattened.edges))
    mean.fattened.edges <- data.frame(edge.name = edge.names,
                                      mean.width = mean.fattened.edges)
    fattened.edges <- melt(fattened.edges, id.vars = "edge.name")
    colnames(fattened.edges)[2:3] <- c("Sample", "width")
    fattened.edges <- merge(fattened.edges, mean.fattened.edges)
    
    ## Call phylo.plot
    plot.phylo(phy_tree(physeq), plot = FALSE, ...)
    lastPP <- get("last_plot.phylo", envir = .PlotPhyloEnv)
    ggdata <- data.frame(edge.name = edge.names,
                         edge.x1   = lastPP$xx[ lastPP$edge[, 1]],
                         edge.x2   = lastPP$xx[ lastPP$edge[, 2]],
                         edge.y1   = lastPP$yy[ lastPP$edge[, 1]],
                         edge.y2   = lastPP$yy[ lastPP$edge[, 2]])
    ggdata$edge <- lastPP$edge
    if (!is.null(edge.group)) {
        edge.group <- data.frame(edge.name = edge.names, edge.group = edge.group)
        ggdata <- merge(ggdata, edge.group)
    }
    ggdata <- merge(ggdata, fattened.edges)
    if (lastPP == "cladogram") {
        p <- ggplot(ggdata) + geom_segment(aes(x = edge.x1, y = edge.y1, xend = edge.x2, yend = edge.y2))
    }
    if (lastPP == "phylogram") {
        p <- ggplot(ggdata) + geom_segment(aes(x = edge.x1, y = edge.y1, xend = edge.x1, yend = edge.y2))
        p <- p + geom_segment(aes(x = edge.x1, y = edge.y2, xend = edge.x2, yend = edge.y2))
    }
    if (lastPP == "radial") {
        
    }
    if (lastPP == "") {}
}
mahendra-mariadassou/phyloseq-extended documentation built on Nov. 12, 2022, 11:51 p.m.