R/read_tree.r

#' Read a newick formatted phylogenetic tree.
#' 
#' A phylogenetic tree is required for computing UniFrac distance matrices.
#' You can load a tree from a file or by providing the tree string directly. 
#' This tree must be in Newick format, also known as parenthetic format and
#' New Hampshire format.
#' 
#' @family phylogeny
#'
#' @param src   Input data as either a file path, URL, or Newick string. 
#'        Compressed (gzip or bzip2) files are also supported.
#'        
#' @return A \code{phylo} class object representing the tree.
#' 
#' @export
#' @examples
#'     library(rbiom)
#'     
#'     infile <- system.file("extdata", "newick.tre", package = "rbiom")
#'     tree <- read_tree(infile)
#'     
#'     tree <- read_tree("
#'         (t9:0.99,((t5:0.87,t2:0.89):0.51,(((t10:0.16,(t7:0.83,t4:0.96)
#'         :0.94):0.69,(t6:0.92,(t3:0.62,t1:0.85):0.54):0.23):0.74,t8:0.1
#'         2):0.43):0.67);")
#'
read_tree <- function (src) {
  
  stopifnot(is_scalar_character(src) && !is_na(src))
  
  src <- trimws(src)
  stopifnot(isTRUE(nchar(src) > 0))
  
  
  #________________________________________________________
  # Shortened src string for use in error messages.
  #________________________________________________________
  text <- src
    
  if (nchar(src) > 20)
    src <- paste0(
      substr(src, 1, 10), 
      "...", 
      substr(src, nchar(src)-10, nchar(src)) )
  
  
  #________________________________________________________
  # Get the Newick data into a string
  #________________________________________________________
  if (!startsWith(text, '('))
    text <- tryCatch(
      error = function (e) stop("Can't read file ", src, ".\n", e), 
      expr  = local({
        con <- gzfile(text)
        on.exit(close(con))
        paste0(collapse = "", readLines(con, warn = FALSE)) }))
  
  stopifnot(is_scalar_character(text) && !is_na(text))
  
  
  #________________________________________________________
  # Remove newlines, comments, and leading whitespace
  #________________________________________________________
  text <- gsub("[\ \t]*[\r\n]+[\ \t]*", "", text)
  text <- gsub("\\[.*?\\]", "",             text, perl=TRUE)
  text <- sub("^[\ \t]+",  "",              text)
  
  stopifnot(isTRUE(nchar(text) >= 2))
  stopifnot(isTRUE(startsWith(text, '(')))
  
  
  #________________________________________________________
  # Parse the Newick string into a phylo object
  #________________________________________________________
  
  tree <- rcpp_read_tree(text)
  
  if (all(nchar(tree$node.label) == 0))
    tree$node.label <- NULL
  
  if (all(tree$edge.length == 0))
    tree$edge.length <- NULL
  
  return (tree)
}



#' Create a subtree by specifying tips to keep.
#' 
#' @name tree_subset
#' 
#' @family phylogeny
#' 
#' @param tree  A phylo object, as returned from [read_tree()].
#' @param tips  A character, numeric, or logical vector of tips to keep.
#' @return A \code{phylo} object for the subtree.
#' @export
#' @examples
#'     library(rbiom)
#'     
#'     infile <- system.file("extdata", "newick.tre", package = "rbiom")
#'     tree <- read_tree(infile)
#'     tree
#'     
#'     subtree <- tree_subset(tree, tips = head(tree$tip.label))
#'     subtree
#'
tree_subset <- function (tree, tips) {
  
  validate_tree()
  params <- eval_envir(environment())
  
  
  #________________________________________________________
  # See if this result is already in the cache.
  #________________________________________________________
  cache_file <- get_cache_file('tree_subset', params)
  if (isTRUE(attr(cache_file, 'exists', exact = TRUE)))
    return (readRDS(cache_file))
  
  
  #________________________________________________________
  # Did we get a tree?
  #________________________________________________________
  
  if (!is(tree, "phylo"))
    cli_abort(c(x = "Can't convert {.type {tree}} to a `phylo` object."))
  
  nTips <- length(setdiff(tree$edge[,2], tree$edge[,1]))
  
  
  #________________________________________________________
  # Tips specified as strings c("SteSpe52", "HrbSpe65", ...)
  #________________________________________________________
  
  if (is.character(tips)) {
    
    if (length(missing <- tips[!tips %in% tree$tip.label]) > 0)
      cli_abort(('x' = sprintf("Tips missing from tree: {missing}")))
    
    tips <- which(tree$tip.label %in% tips)
  }
  
  
  #________________________________________________________
  # Tips specified as logicals c(TRUE, TRUE, FALSE, TRUE, ...)
  #________________________________________________________
  
  if (is.logical(tips)) {
    
    if (length(tips) != nTips)
      cli_abort("Logical vector `tips` must be same length as `tree$tip.label`.")
    
    tips <- which(tips)
  }
  
  
  #________________________________________________________
  # Tips specified as numeric c(5, 1, 2, 31, 3, 12, ...)
  #________________________________________________________
  
  if (is.numeric(tips)) {
    if (max(tips) > nTips)  cli_abort("There aren't that many tips in the tree.")
    if (min(tips) < 1)      cli_abort("Tips with index < 1 are not allowed.")
    if (any(tips %% 1 > 0)) cli_abort("Tip indices must be whole numbers.")
    
  } else {
    cli_abort("Tips must be given as character, logical, or numeric.")
  }
  
  
  #________________________________________________________
  # Keeping zero or all tips?
  #________________________________________________________
  
  tips <- unique(tips)
  
  if (length(tips) < 2)
    cli_abort("At least two tips must be specified.")
  
  if (length(tips) == nTips) {
    set_cache_value(cache_file, tree)
    return (tree)
  }
  
  
  #________________________________________________________
  # Prune the tree
  #________________________________________________________
  
  root      <- setdiff(tree$edge[,1], tree$edge[,2])
  childAt   <- order(c(tree$edge[,2], root))
  nChildren <- c(rep(0, nTips), unname(table(tree$edge[,1])))
  
  eLength   <- tree$edge.length
  if (is_null(eLength))
    eLength <- rep(0, nrow(tree$edge))
  
  for (tip in setdiff(1:nTips, tips)) repeat {
    
    tipIdx <- childAt[tip]
    parent <- tree$edge[tipIdx,1]
    
    nChildren[parent] <- nChildren[parent] - 1
    eLength[tipIdx] <- NA
    
    if (nChildren[parent] > 0) break
    tip <- parent
  }
  
  
  #________________________________________________________
  # Merge branches
  #________________________________________________________
  
  for (tip in tips) repeat {
    
    if (tip == root) break
    
    tipIdx <- childAt[tip]
    parent <- tree$edge[tipIdx,1]
    
    if (nChildren[parent] == 1 && parent != root) {
      
      parentIdx                   <- childAt[parent]
      tree$edge[parentIdx, 2]     <- tree$edge[tipIdx, 2]
      eLength[parentIdx] <- eLength[parentIdx] + eLength[tipIdx]
      eLength[tipIdx]    <- NA
      
      nChildren[parent] <- 0
    }
    
    tip <- parent
  }
  
  
  #________________________________________________________
  # Move the root
  #________________________________________________________
  
  if (nChildren[root] == 1)
    eLength[which(tree$edge[,1] == root)] <- NA
  
  
  #________________________________________________________
  # Discard nodes flagged with NA; make the new subtree
  #________________________________________________________
  
  tree$edge  <- tree$edge[!is.na(eLength),]
  tree$Nnode <- length(unique(tree$edge[,1]))
  
  if (!is_null(tree$edge.length)) tree$edge.length <- eLength[!is.na(eLength)]
  if (!is_null(tree$tip.label))   tree$tip.label   <- tree$tip.label[tips]
  if (!is_null(tree$node.label))  tree$node.label  <- tree$tip.label[unique(sort(tree$edge[,1]))]
  
  tree$edge <- matrix(as.numeric(factor(as.vector(tree$edge))), ncol=2)
  
  
  set_cache_value(cache_file, tree)
  return (tree)
}
cmmr/rbiom documentation built on April 28, 2024, 6:38 a.m.