R/distance-methods.R

Defines functions fastUniFrac phyloseq_JSD_pair

################################################################################
#' Calculate distance, dissimilarity 
#'
#' Takes a \code{\link{phyloseq-class}} object and method option, and returns
#'  a \code{\link{dist}}ance object suitable for certain 
#'  ordination methods and other distance-based analyses. 
#'  Only
#'  sample-wise distances are currently supported (the \code{type} argument),
#'  but eventually species-wise (OTU-wise)
#'  distances may be supported as well. 
#'
#'  Depending on the \code{method}
#'  argument, \code{distance()} wraps one of 
#'  \code{\link{UniFrac}},
#'  \code{\link{DPCoA}},
#'  \code{\link{JSD}},
#'  \code{\link[vegan]{vegdist}},
#'  \code{\link[vegan]{betadiver}},
#'  \code{\link[vegan]{designdist}}, or
#'  \code{\link{dist}}.
#' 
#' @param physeq (Required).  A \code{\link{phyloseq-class}} or
#'  an \code{\link{otu_table-class}} object. The latter is only appropriate
#'  for methods that do not require any additional data (one-table). 
#'  For example, the ``wunifrac'' option (\code{\link{UniFrac}}) requires
#'  \code{\link{phyloseq-class}} that contains both an \code{otu_table}
#'  and a phylogenetic tree (\code{phylo}).
#'
#' @param method (Required). A character string. 
#'  Provide one of the currently supported options. 
#'  See \code{\link{distanceMethodList}} for a detailed list 
#'  of the supported options here,
#'  and links to accompanying documentation.
#'  
#'  Note that for the common definition of \code{Jaccard} distance
#'  using the \code{vegan-package} implementation,
#'  an additional argument is needed, with the full call having the form:
#'  \code{distance(physeq, method = "jaccard", binary = TRUE)}
#'  
#'  The following methods are implemented explicitly within
#'  the \code{\link{phyloseq-package}}, 
#'  and accessed by the following \code{method} options:
#'  
#'  \describe{
#'   \item{\code{"unifrac"}}{Original (unweighted) UniFrac distance,
#'         \code{\link[phyloseq]{UniFrac}}}
#'   \item{\code{"wunifrac"}}{weighted-UniFrac distance, \code{\link[phyloseq]{UniFrac}}}
#'   \item{\code{"dpcoa"}}{
#'     sample-wise distance used in 
#'     Double Principle Coordinate Analysis, \code{\link[phyloseq]{DPCoA}}}
#'   \item{\code{"jsd"}}{Jensen-Shannon Divergence, \code{\link{JSD}}}
#'  }
#'
#'  Alternatively, you can provide
#'  a character string that defines a custom distance method, if it has the form
#'  described in \code{\link{designdist}}. 
#'
#' @param type (Optional). A character string. The type of pairwise comparisons
#'  being calculated: sample-wise or taxa-wise. The default is 
#'  \code{c("samples")}.
#'
#' @param ... Additional arguments passed on to the appropriate distance 
#'  function, determined by the \code{method} argument.
#'
#' @return An object of class ``\code{\link{dist}}'' suitable for certain 
#'  ordination methods and other distance-based analyses.
#' 
#' @seealso 
#'  \code{\link{plot_ordination}}, 
#'  \code{\link{UniFrac}},
#'  \code{\link{DPCoA}},
#'  \code{\link{JSD}},
#'  \code{\link[vegan]{vegdist}},
#'  \code{\link[vegan]{betadiver}},
#'  \code{\link[vegan]{designdist}},
#'  \code{\link{dist}}.
#'
#' @importFrom vegan betadiver
#' @importFrom vegan designdist
#' @importFrom vegan vegdist
#' @export
#' @examples 
#' data(esophagus)
#' distance(esophagus, "uunifrac") # Unweighted UniFrac
#' distance(esophagus, "wunifrac") # weighted UniFrac
#' distance(esophagus, "jaccard", binary = TRUE) # vegdist jaccard
#' distance(esophagus, "gower") # vegdist option "gower"
#' distance(esophagus, "g") # designdist method option "g"
#' distance(esophagus, "minkowski") # invokes a method from the base dist() function.
#' distance(esophagus, "(A+B-2*J)/(A+B)") # designdist custom distance
#' distanceMethodList
#' help("distance")
setGeneric("distance", function(physeq, method, type="samples", ...){
  standardGeneric("distance")
})
#' @rdname distance
setMethod("distance", c("phyloseq", "ANY"), function(physeq, method){
  stop("You must specify a `method` argument as a character string.
       \nIt was missing/NA or not a character string.
       \nSee `?distanceMethodList`")
})
#' @rdname distance
setMethod("distance", c("otu_table", "character"), function(physeq, method, type="samples", ...){
  OTU = physeq
  if( method == "jsd" ){
    return(JSD(OTU))
  }
  # Hard-coded dispatch according to certain method groups
  if( method %in% distanceMethodList$vegdist ){
    dfun <- "vegdist"
  } else if( method %in% distanceMethodList$betadiver ){
    dfun <- "betadiver"
  } else if( method %in% distanceMethodList$dist ){
    dfun <- "dist"		
  } else {
    dfun <- "designdist"
  }
  # get the extra arguments to pass to functions (this can be empty)
  extrargs <- list(...)	
  # If necessary (non phyloseq funs), enforce orientation, build function.
  # disambiguate type argument... Must be "species" for vegan integration...
  # The following should all work: "OTUs", "OTU", "otus", "Taxas", "site"
  type <- gsub("(OTU(s)?)|(taxa(s)?)|(Species)", "species", type, ignore.case = TRUE)
  # The following should all work: "SaMplE", "Samples", "site", "sites"
  type <- gsub("(Sample(s)?)|(site(s)?)", "samples", type, ignore.case = TRUE)
  # Test type, and enforce orientation accordingly	
  if( type == "species"){
    # For species-distance, species need to be rows (vegan-style)
    if( !taxa_are_rows(OTU) ){OTU <- t(OTU)}	
  } else if( type == "samples" ){
    # For sample-distance, samples need to be rows (vegan-style)
    if( taxa_are_rows(OTU) ){OTU <- t(OTU)}
  } else {
    stop("type argument must be one of \n (1) samples \n or \n (2) species")
  }
  OTU <- as(OTU, "matrix")
  fun.args <- c(list(OTU, method=method), extrargs)	
  return( do.call(dfun, fun.args) )
})
#' @rdname distance
setMethod("distance", c("phyloseq", "character"), function(physeq, method, type="samples", ...){
  # Only one method at a time.
  if(length(method) > 1){
    stop("`distance` only accepts one method at a time. ",
         "You provided ", length(method), " methods. ")
  }
  if(length(method) < 1 | is.na(method)){
    stop("You must specify a `method` argument. \nIt was missing/NA. \nSee `?distanceMethodList`")
  }
  # Regular Expression detect/convert unifrac/weighted-UniFrac args
  method <- gsub("^(u.*)*unifrac$", "unifrac", method, ignore.case = TRUE)
  method <- gsub("^w.*unifrac$", "wunifrac", method, ignore.case = TRUE)
  # Distances that require a phyloseq object
  # because they make use of additional information (e.g. a tree)
  if( method ==  "unifrac" ){ return(UniFrac(physeq, ...)) }
  if( method == "wunifrac" ){ return(UniFrac(physeq, weighted=TRUE, ...)) }
  if( method == "dpcoa"    ){
    # Remove diagnol entries from "dist" object returned in `RaoDis` slot.
    return(as.dist(DPCoA(physeq, ...)$RaoDis, diag=FALSE))
	} 
  # Else, dispatch to OTU table method
  return(distance(otu_table(physeq), method, type, ...))
})
################################################################################
#' List of distance method keys supported in \code{\link[phyloseq]{distance}}
#'
#' Distance methods should be specified by exact string match.
#' Cannot do partial matching for all options, 
#' because too many similar options in downstream method dispatch. 
#'
#' @format A list of character vectors. 
#' Every entry specifies a supported distance method.
#' Names in the list indicate which downstream function
#' is being utilized for further details.
#' Same functions are linked in the itemized list below.
#' 
#' \describe{
#'   \item{\code{unifrac}}{\code{\link[phyloseq]{UniFrac}}}
#'   \item{\code{wunifrac}}{\code{\link[phyloseq]{UniFrac}}}
#'   \item{\code{dpcoa}}{\code{\link[phyloseq]{DPCoA}}}
#'   \item{\code{jsd}}{\code{\link{JSD}}}
#'   \item{\code{manhattan}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{euclidean}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{canberra}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{bray}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{kulczynski}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{jaccard}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{gower}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{altGower}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{morisita}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{horn}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{mountford}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{raup}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{binomial}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{chao}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{cao}}{\code{\link[vegan]{vegdist}}}
#'   \item{\code{w}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{-}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{c}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{wb}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{r}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{I}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{e}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{t}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{me}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{j}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{sor}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{m}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{-}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{co}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{cc}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{g}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{-}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{l}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{hk}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{rlb}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{sim}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{gl}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{z}}{\code{\link[vegan]{betadiver}}}
#'   \item{\code{maximum}}{\code{\link[stats]{dist}}}
#'   \item{\code{binary}}{\code{\link[stats]{dist}}}
#'   \item{\code{minkowski}}{\code{\link[stats]{dist}}}
#'   \item{\code{ANY}}{\code{\link[vegan]{designdist}}}
#' }
#' 
#' @seealso 
#' \code{\link[phyloseq]{distance}}
#' 
#' @export
#' 
#' @examples 
#' distanceMethodList
distanceMethodList <- list(
  UniFrac    = c("unifrac", "wunifrac"),
  DPCoA      = "dpcoa",
  JSD        = "jsd",
  # The methods supported by vegan::vegdist function.
  vegdist    = c("manhattan", "euclidean", "canberra", "bray", 
                 "kulczynski", "jaccard", "gower", "altGower", "morisita", "horn", 
                 "mountford", "raup" , "binomial", "chao", "cao"),
  # The methods supported by vegan::betadiver function.
  betadiver  = c("w", "-1", "c", "wb", "r", "I", "e", "t", "me", "j",
                 "sor", "m", "-2", "co", "cc", "g", "-3", "l", "19", "hk", "rlb",
                 "sim", "gl", "z"),
  dist       = c("maximum", "binary", "minkowski"),
  designdist = "ANY"
)
################################################################################
# Shannon-Jensen Divergence, in R.
################################################################################
#' @keywords internal
phyloseq_JSD_pair <- function(x, y){
	# Function to compute Shannon-Jensen Divergence
	# x and y are the frequencies for the same p categories
  # Assumes relative abundance transformation already happened (for efficiency)
  
  # Define the mean point
	m <- (x+y)/2
	# Define each samples component
	P1 <- x*log(x/m)
	P2 <- y*log(y/m)
	# In the case of zeroes entries log is undefined, JSD is defined as zero
	P1[!is.finite(P1)] <- 0
	P2[!is.finite(P2)] <- 0
	d <- (P1+P2)/2
	return(sum(d, na.rm = TRUE))
}
################################################################################
#' Calculate the Jensen-Shannon Divergence (distance)
#'
#' This is a phyloseq-specific implementation of the Jensen-Shannon Divergence
#' for comparing pairs of microbial communities (samples) in an experiment. 
#' The expectation is that you have many samples (say. more than two) and you
#' want a distance matrix on which will perform further analysis. \code{JSD} is
#' intended to be ``wrapped'' by the more general \code{\link{distance}} 
#' function in phyloseq, and it can be invoked using \code{"jsd"} as the 
#' argument to the \code{method} parameter of \code{\link{distance}}.
#'
#' One of the motivations for providing JSD in phyloseq was its recent use in 
#' the analysis of the \code{\link{enterotype}} dataset. 
#' 
#' @param physeq (Required). \code{\link{phyloseq-class}}. 
#'  The phyloseq data on which to compute the 
#'  pairwise sample distance matrix.
#'
#' @return An object of class ``\code{\link{dist}}'' suitable for certain 
#'  ordination methods and other distance-based analyses.
#'  See \code{\link{distance}}.
#'  
#' @seealso
#'  \code{\link{distance}} 
#' 
#'  \code{\link{enterotype}}
#' 
#'  \url{http://en.wikipedia.org/wiki/Jensen-Shannon_divergence}
#'
#' @references
#' Jensen-Shannon Divergence and Hilbert space embedding.  
#' Bent Fuglede and Flemming Topsoe University of Copenhagen,
#' Department of Mathematics
#' \url{http://www.math.ku.dk/~topsoe/ISIT2004JSD.pdf}
#'
#' @author
#'  Susan Holmes \email{susan@@stat.stanford.edu}.
#'  Adapted for phyloseq by Paul J. McMurdie.
#'
#' @keywords internal
#' @examples
#' # library(doParallel)  # Do this and next line only if you have multi-cores
#' # registerDoParallel(cores=6)
#' # data(enterotype)
#' # # ent.jsd <- JSD(enterotype, TRUE) # internal only
#' # ent.jsd <- distance(enterotype, "jsd", parallel=TRUE)
#' # ent.PCoA <- ordinate(enterotype, "PCoA", ent.jsd) # Perform principle coordinate analysis
#' # p <- plot_ordination(enterotype, ent.PCoA, color="Enterotype", shape="SeqTech") 
#' # (p <- p + geom_point(size=5, alpha=0.5))
setGeneric("JSD", function(physeq){
  standardGeneric("JSD")
})
setMethod("JSD", "ANY", function(physeq){
  stop("JSD requires specific input classes. Check call and try again")
})
setMethod("JSD", "phyloseq", function(physeq){
  JSD(otu_table(physeq))
})
setMethod("JSD", "otu_table", function(physeq){
  # Coerce to species-as-columns
  if(taxa_are_rows(physeq)){ physeq <- t(physeq) }
  # Coerce physeq to matrix and pass on
  return(JSD(as(physeq, "matrix")))
})
# Assumes samples are rows
setMethod("JSD", "matrix", function(physeq){
  # Coerce to relative abundance by sample (row)
  physeq <- sweep(physeq, 1, rowSums(physeq), "/")
  # Parallelization not needed for this.
  # Fix at sequential (eventually update code to remove parallelization complexity)
  registerDoSEQ()
  # create N x 2 matrix of all pairwise combinations of samples.
  spn <- combn(row.names(physeq), 2, simplify=FALSE)
  # initialize DistMat with NAs
  DistMat <- matrix(NA, nrow(physeq), nrow(physeq))
  # define the rows/cols of DistMat with the sample names (rownames)    
  rownames(DistMat) <- row.names(physeq)
  colnames(DistMat) <- row.names(physeq)
  # optionally-parallel implementation with foreach
  distlist <- foreach( i = spn, .packages="phyloseq") %dopar% {
    A <- i[1]
    B <- i[2]
    return( phyloseq_JSD_pair(physeq[A, ], physeq[B, ]) )
  }
  # return(distlist)
  # This is in serial, but it is quick.
  distlist2distmat <- function(i, spn, DL){
    DistMat[ spn[[i]][2], spn[[i]][1] ] <<- DL[[i]]
  }
  junk <- sapply(1:length(spn), distlist2distmat, spn, distlist)	
  return(as.dist(DistMat))
})
##############################################################################
#' Calculate weighted or unweighted (Fast) UniFrac distance for all sample pairs.
#'
#' This function calculates the (Fast) UniFrac distance for all sample-pairs
#' in a \code{\link{phyloseq-class}} object.
#'
#' \code{UniFrac()} accesses the abundance
#' (\code{\link{otu_table-class}}) and a phylogenetic tree (\code{\link{phylo-class}})
#' data within an experiment-level (\code{\link{phyloseq-class}}) object.
#' If the tree and contingency table are separate objects, suggested solution
#' is to combine them into an experiment-level class
#' using the \code{\link{phyloseq}} function. For example, the following code
#'
#' \code{phyloseq(myotu_table, myTree)}
#'
#' returns a \code{phyloseq}-class object that has been pruned and comprises
#' the minimum arguments necessary for \code{UniFrac()}. 
#'
#' Parallelization is possible for UniFrac calculated with the \code{\link{phyloseq-package}},
#' and is encouraged in the instances of large trees, many samples, or both.
#' Parallelization has been implemented via the \code{\link{foreach-package}}.
#' This means that parallel calls need to be preceded by 2 or more commands
#' that register the parallel ``backend''. This is acheived via your choice of
#' helper packages. One of the simplest seems to be the \emph{doParallel} package.
#'
#' For more information, see the following links on registering the ``backend'':
#' 
#' \emph{foreach} package manual: 
#'
#' \url{http://cran.r-project.org/web/packages/foreach/index.html}
#'
#' Notes on parallel computing in \code{R}. Skip to the section describing 
#' the \emph{foreach Framework}. It gives off-the-shelf examples for registering
#' a parallel backend using the \emph{doMC}, \emph{doSNOW}, or \emph{doMPI} packages:
#' 
#' \url{http://trg.apbionet.org/euasiagrid/docs/parallelR.notes.pdf}
#'
#' Furthermore, as of \code{R} version \code{2.14.0} and higher, a parallel package
#' is included as part of the core installation, \code{\link{parallel-package}}, 
#' and this can be used as the parallel backend with the \code{\link{foreach-package}}
#' using the adaptor package ``doParallel''. 
#' \url{http://cran.r-project.org/web/packages/doParallel/index.html}
#'
#' See the vignette for some simple examples for using doParallel.
#' \url{http://cran.r-project.org/web/packages/doParallel/vignettes/gettingstartedParallel.pdf}
#'
#' UniFrac-specific examples for doParallel are provided in the example
#' code below.  
#'
#' @usage UniFrac(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE, fast=TRUE)
#'
#' @param physeq (Required). \code{\link{phyloseq-class}}, containing at minimum
#'  a phylogenetic tree (\code{\link{phylo-class}}) and 
#'  contingency table (\code{\link{otu_table-class}}). See
#'  examples below for coercions that might be necessary.
#'
#' @param weighted (Optional). Logical. Should use weighted-UniFrac calculation?
#'  Weighted-UniFrac takes into account the relative abundance of species/taxa
#'  shared between samples, whereas unweighted-UniFrac only considers 
#'  presence/absence. Default is \code{FALSE}, meaning the unweighted-UniFrac
#'  distance is calculated for all pairs of samples.
#'
#' @param normalized (Optional). Logical. Should the output be normalized such that values 
#'  range from 0 to 1 independent of branch length values? Default is \code{TRUE}.
#'  Note that (unweighted) \code{UniFrac} is always normalized by total branch-length,
#'  and so this value is ignored when \code{weighted == FALSE}.
#'
#' @param parallel (Optional). Logical. Should execute calculation in parallel,
#'  using multiple CPU cores simultaneously? This can dramatically hasten the
#'  computation time for this function. However, it also requires that the user
#'  has registered a parallel ``backend'' prior to calling this function. 
#'  Default is \code{FALSE}. If FALSE, UniFrac will register a serial backend
#'  so that \code{foreach::\%dopar\%} does not throw a warning.
#' 
#' @param fast (Optional). Logical. DEPRECATED.
#'  Do you want to use the ``Fast UniFrac''
#'  algorithm? Implemented natively in the \code{phyloseq-package}.
#'  \code{TRUE} is now the only supported option.
#'  There should be no difference in the output between the two algorithms.
#'  Moreover, the original UniFrac algorithm
#'  only outperforms this implementation of fast-UniFrac if the datasets are so
#'  small 
#'  (approximated by the value of \code{ntaxa(physeq) * nsamples(physeq)}) 
#'  that the difference in time is inconsequential (less than 1 second).
#'  In practice it does not appear that this parameter should 
#'  have ever been set to \code{FALSE}, and therefore 
#'  the original UniFrac implementation perhaps never should have been supported here.
#'  For legacy code support the option is now deprecated here
#'  (the implementation was an internal function, anyway)
#'  and the \code{fast} option will remain for one release cycle before
#'  being removed completely
#'  in order to avoid causing unsupported-argument errors.
#'
#' @return a sample-by-sample distance matrix, suitable for NMDS, etc.
#' 
#' @seealso
#' 
#' \code{\link{distance}}
#' 
#' \code{unifrac} in the picante package.
#'
#' @references
#' 
#' \url{http://bmf.colorado.edu/unifrac/}
#'
#' The main implementation (Fast UniFrac) is adapted from the algorithm's
#' description in:
#' 
#' Hamady, Lozupone, and Knight,
#' ``\href{http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html}{Fast UniFrac:}
#' facilitating high-throughput phylogenetic analyses of 
#' microbial communities including analysis of pyrosequencing and PhyloChip data.''
#' The ISME Journal (2010) 4, 17--27.
#'
#' See also additional descriptions of UniFrac in the following articles:
#'
#' Lozupone, Hamady and Knight, ``UniFrac - An Online Tool for Comparing Microbial 
#' Community Diversity in a Phylogenetic Context.'', BMC Bioinformatics 2006, 7:371
#'
#' Lozupone, Hamady, Kelley and Knight, ``Quantitative and qualitative (beta) 
#' diversity measures lead to different insights into factors that structure
#' microbial communities.'' Appl Environ Microbiol. 2007
#'
#' Lozupone C, Knight R. ``UniFrac: a new phylogenetic method for comparing microbial
#' communities.'' Appl Environ Microbiol. 2005 71 (12):8228-35.
#' 
#' @docType methods
#' @export
#' @import foreach
#' @rdname UniFrac-methods
#' @examples
#' ################################################################################
#' # Perform UniFrac on esophagus data
#' ################################################################################
#' data("esophagus")
#' (y <- UniFrac(esophagus, TRUE))
#' UniFrac(esophagus, TRUE, FALSE)
#' UniFrac(esophagus, FALSE)
#' # ################################################################################
#' # # Now try a parallel implementation using doParallel, which leverages the 
#' # # new 'parallel' core package in R 2.14.0+
#' # # Note that simply loading the 'doParallel' package is not enough, you must
#' # # call a function that registers the backend. In general, this is pretty easy
#' # # with the 'doParallel package' (or one of the alternative 'do*' packages)
#' # #
#' # # Also note that the esophagus example has only 3 samples, and a relatively small
#' # # tree. This is fast to calculate even sequentially and does not warrant
#' # # parallelized computation, but provides a good quick example for using UniFrac()
#' # # in a parallel fashion. The number of cores you should specify during the
#' # # backend registration, using registerDoParallel(), depends on your system and
#' # # needs. 3 is chosen here for convenience. If your system has only 2 cores, this
#' # # will probably fault or run slower than necessary.
#' # ################################################################################
#' # library(doParallel)
#' # data(esophagus)
#' # # For SNOW-like functionality (works on Windows):
#' # cl <- makeCluster(3)
#' # registerDoParallel(cl)
#' # UniFrac(esophagus, TRUE)
#' # # Force to sequential backed:
#' # registerDoSEQ()
#' # # For multicore-like functionality (will probably not work on windows),
#' # # register the backend like this:
#' # registerDoParallel(cores=3)
#' # UniFrac(esophagus, TRUE)
#' ################################################################################
setGeneric("UniFrac", function(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE, fast=TRUE){
	standardGeneric("UniFrac")
})
################################################################################
#' @aliases UniFrac,phyloseq-method
#' @rdname UniFrac-methods
#' @importFrom ape is.rooted
#' @importFrom ape root
setMethod("UniFrac", "phyloseq", function(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE, fast=TRUE){
  if(is.null(phy_tree(physeq)$edge.length)){
    stop("Tree has no branch lengths. See tree$edge.length. Cannot compute UniFrac without branch lengths")
  }  
  # Check if tree is rooted, set random root with warning if it is not.
  if( !is.rooted(phy_tree(physeq)) ){
    randoroot = sample(taxa_names(physeq), 1)
    warning("Randomly assigning root as -- ", randoroot, " -- in the phylogenetic tree in the data you provided.")
    phy_tree(physeq) <- root(phy=phy_tree(physeq), outgroup=randoroot, resolve.root=TRUE, interactive=FALSE)
    if( !is.rooted(phy_tree(physeq)) ){
      stop("Problem automatically rooting tree. Make sure your tree is rooted before attempting UniFrac calculation. See ?ape::root")
    }
  } 
	if( fast ){
		fastUniFrac(physeq, weighted, normalized, parallel)
	} else {
    warning("Option `fast=FALSE` is deprecated. Only 'fast' UniFrac is supported in phyloseq.")
	  fastUniFrac(physeq, weighted, normalized, parallel)
	}
})
################################################################################
# Fast UniFrac for R.
# Adapted from The ISME Journal (2010) 4, 17-27; doi:10.1038/ismej.2009.97;
# http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html
################################################################################
#' @importFrom ape prop.part
#' @importFrom ape reorder.phylo
#' @importFrom ape node.depth
#' @importFrom ape node.depth.edgelength
#' @keywords internal
#' @import foreach
fastUniFrac <- function(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE){
	# Access the needed components. Note, will error if missing in physeq.
	OTU  <- otu_table(physeq)
	tree <- phy_tree(physeq)
	# Some important checks.
	if( is.null(tree$edge.length) ) {
	  stop("Tree has no branch lengths, cannot compute UniFrac")
	}
	if( !is.rooted(tree) ) {
	  stop("Rooted phylogeny required for UniFrac calculation")
	}
	### Some parallel-foreach housekeeping.    
	# If user specifies not-parallel run (the default), register the sequential "back-end"
	if( !parallel ){ registerDoSEQ() }
	# create N x 2 matrix of all pairwise combinations of samples.
	spn <- combn(sample_names(physeq), 2, simplify=FALSE)
	# Make sure OTU is in species-are-rows orientation
	if( !taxa_are_rows(physeq) ){OTU <- t(OTU)}
  # Convert to standard matrix
	OTU <- as(OTU, "matrix")  
	# Enforce that tree and otu_table indices are the same order, 
	# by re-ordering OTU, if needed
	if( !all(rownames(OTU) == taxa_names(tree)) ){
	  OTU <- OTU[taxa_names(tree), ]
	}
	########################################
	# Build the requisite matrices as defined 
	# in the Fast UniFrac article.
	########################################
	## This only needs to happen once in a call to UniFrac.
	## Notice that A and B do not appear in this section.
	# Begin by building the edge descendants matrix (edge-by-sample)
  # `edge_array`
  #
	# Create a list of descendants, starting from the first internal node (root)
	ntip <- length(tree$tip.label)
	if(ntip != ntaxa(physeq)) stop("Incompatible tree and OTU table!")
	# Create a matrix that maps each internal node to its 2 descendants
	# This matrix doesn't include the tips, so must use node#-ntip to index into it
	node.desc <- matrix(tree$edge[order(tree$edge[,1]),][,2],byrow=TRUE,ncol=2)
	# Define the edge_array object
	# Right now this is a node_array object, each row is a node (including tips)
	# It will be subset and ordered to match tree$edge later
	edge_array <- matrix(0, nrow=ntip+tree$Nnode, ncol=nsamples(physeq), 
	                     dimnames=list(NULL, sample_names=sample_names(physeq)))
	# Load the tip counts in directly
	edge_array[1:ntip,] <- OTU
	# Get a list of internal nodes ordered by increasing depth
	ord.node <- order(node.depth(tree))[(ntip+1):(ntip+tree$Nnode)]
	# Loop over internal nodes, summing their descendants to get that nodes count
	for(i in ord.node){
	  edge_array[i,] <- colSums(edge_array[node.desc[i-ntip,], , drop=FALSE], na.rm = TRUE)
	}
	# Keep only those with a parental edge (drops root) and order to match tree$edge
	edge_array <- edge_array[tree$edge[,2],]
	# Remove unneeded variables.
	rm(node.desc)
	# If unweighted-UniFrac, coerce to a presence-absence contingency, occ
	if(!weighted){
		# For unweighted UniFrac, convert the edge_array to an occurrence (presence/absence binary) array
		edge_occ <- (edge_array > 0) - 0
	}
	if( weighted & normalized ){
		# This is only relevant to weighted-UniFrac.
		# For denominator in the normalized distance, we need the age of each tip.
	  # 'z' is the tree in postorder order used in calls to .C
	  # Descending order of left-hand side of edge (the ancestor to the node)
	  z = reorder.phylo(tree, order="postorder")
	  # Call phyloseq-internal function that in-turn calls ape's internal
	  # horizontal position function, in C, using the re-ordered phylo object, `z`
	  tipAges = node.depth.edgelength(tree)
	  # Keep only the tips, and add the tip labels in case `z` order differs from `tree`
	  tipAges <- tipAges[1:length(tree$tip.label)]
	  names(tipAges) <- z$tip.label	
    # Explicitly re-order tipAges to match OTU
	  tipAges <- tipAges[rownames(OTU)]
	}
	########################################	
  # optionally-parallel implementation with foreach
	########################################
	samplesums = sample_sums(physeq)
	distlist <- foreach( i = spn, .packages="phyloseq") %dopar% {
	  A  <- i[1]
	  B  <- i[2]
	  AT <- samplesums[A]
	  BT <- samplesums[B]
	  if( weighted ){
      # weighted UniFrac
	    wUF_branchweight <- abs(edge_array[, A]/AT - edge_array[, B]/BT)
	    # calculate the w-UF numerator
	    numerator <- sum({tree$edge.length * wUF_branchweight}, na.rm = TRUE)
	    # if not-normalized weighted UniFrac, just return "numerator";
	    # the u-value in the w-UniFrac description
	    if(!normalized){
	      return(numerator)
	    } else {
	      # denominator (assumes tree-indices and otu_table indices are same order)
	      denominator <- sum({tipAges * (OTU[, A]/AT + OTU[, B]/BT)}, na.rm = TRUE)
	      # return the normalized weighted UniFrac values
	      return(numerator / denominator)
	    }
	  } else {
      # Unweighted UniFrac
	    # Subset matrix to just columns A and B
	    edge_occ_AB <- edge_occ[, c(A, B)]
      # Keep only the unique branches. Sum the lengths
      edge_uni_AB_sum <- sum((tree$edge.length * edge_occ_AB)[rowSums(edge_occ_AB, na.rm=TRUE) < 2, ], na.rm=TRUE)
	    # Normalize this sum to the total branches among these two samples, A and B
	    uwUFpairdist <- edge_uni_AB_sum / sum(tree$edge.length[rowSums(edge_occ_AB, na.rm=TRUE) > 0])
	    return(uwUFpairdist)
	  }
	}
	# Initialize UniFracMat with NAs
	UniFracMat <- matrix(NA_real_, nsamples(physeq), nsamples(physeq))
	rownames(UniFracMat) <- colnames(UniFracMat) <- sample_names(physeq)
  # Matrix-assign lower-triangle of UniFracMat. Then coerce to dist and return.
  	matIndices <- do.call(rbind, spn)[, 2:1]
  	# Take care of edge case where there are two samples -> 1 pair of indices -> rbind doesn't return a matrix
  	if(!is.matrix(matIndices)) matIndices <- matrix(matIndices, ncol=2)
	UniFracMat[matIndices] <- unlist(distlist)
	return(as.dist(UniFracMat))	
}
################################################################################

Try the phyloseq package in your browser

Any scripts or data that you put into this service are public.

phyloseq documentation built on Nov. 8, 2020, 6:41 p.m.