R/tree_generation.R

Defines functions .PreorderTree EnforceOutgroup.character EnforceOutgroup.phylo .EnforceOutgroup EnforceOutgroup ConstrainedNJ Hamming NJTree StarTree .BalancedBit BalancedTree PectinateTree .RandomParent YuleTree RandomTree

Documented in BalancedTree ConstrainedNJ EnforceOutgroup EnforceOutgroup.character EnforceOutgroup.phylo Hamming NJTree PectinateTree .RandomParent RandomTree StarTree YuleTree

#' Generate pectinate, balanced or random trees
#'
#' `RandomTree()`, `PectinateTree()`, `BalancedTree()` and `StarTree()`
#' generate trees with the specified shapes and leaf labels.
#'
#'
#' @template tipsForTreeGeneration
#'
#' @return
#' Each function returns an unweighted binary tree of class `phylo` with
#' the specified leaf labels. Trees are rooted unless `root = FALSE`.
#'
#' @family tree generation functions
#'
#' @template MRS
#' @references \insertAllCited()
#' @name GenerateTree
NULL

#' @rdname GenerateTree
#'
#' @param root Character or integer specifying tip to use as root;
#' or `TRUE` to root the tree on a random edge;
#' or `FALSE` to return an unrooted tree.
#' @param nodes Number of nodes to generate.  The default and maximum,
#' `tips - 1`, generates a binary tree; setting a lower value will induce
#' polytomies.
#'
#' @return `RandomTree()` returns a topology drawn at random from the uniform
#' distribution (i.e. each binary tree is drawn with equal probability).
#' Trees are generated by inserting
#' each tip in term at a randomly selected edge in the tree.
#' Random numbers are generated using a Mersenne Twister.
#' If `root = FALSE`, the tree will be unrooted, with the first tip in a
#' basal position.  Otherwise, the tree will be rooted on `root`.
#'
#' @examples
#' RandomTree(LETTERS[1:10])
#'
#' data("Lobo")
#' RandomTree(Lobo.phy)
#'
#' @export
RandomTree <- function(tips, root = FALSE, nodes) {
  tips <- TipLabels(tips)
  nTips <- length(tips)
  if (length(root) > 1L) {
    root <- root[[1]]
    warning("More than one entry in `root`; using ", root)
  }
  if (is.na(root)) {
    warning("Treating `root = NA` as `FALSE`")
    root <- FALSE
  }
  unrooted <- !is.logical(root) || root == FALSE
  nodesInBinary <- nTips - ifelse(unrooted, 2L, 1L)
  if (missing(nodes)) {
    nodes <- nodesInBinary
  }
  if (nodes > nodesInBinary) {
    warning("`nodes` higher than number in binary tree. Ignoring.")
  }
  if (nodes < 1L) {
    stop("A tree must contain one or more `nodes`")
  }

  edge <- do.call(cbind,
                  RenumberEdges(.RandomParent(nTips),
                                seq_len(nTips + nTips - 2L)))
  # If root identifies a specific tip, root edge on that tip
  if (!is.logical(root) && root != 1L) {
    if (is.character(root)) {
      root <- which(tips == root)
    }
    if (length(root) == 0L) {
      stop("No match found for `root`")
    }
    if (!is.integer(root)) {
      root <- as.integer(root)
    }
    edge <- root_binary(edge, root)
  }
  tree <- structure(list(edge = edge,
                         Nnode = nTips - 1L,
                         tip.label = tips),
                    order = "preorder",
                    class = "phylo")

  # If root is simply a logical T/F, then root our tree randomly - or don't!
  if (is.logical(root)) {
    if (root) {
      tree <- root_on_node(tree, sample.int(nTips + nodes, 1))
    } else {
      tree <- UnrootTree(tree)
    }
  }

  # Collapse nodes at random to obtain desired resolution
  if (nodes < nodesInBinary) {
    tree <- CollapseNode(tree,
                         nTips + 1L + sample.int(nodesInBinary - 1L,
                                                 tree[["Nnode"]] - nodes))
  }
  
  # Return:
  tree
}

#' @rdname GenerateTree
#'
#' @param addInTurn Logical specifying whether to add leaves in the order of
#' `tips`.  If `FALSE`, leaves will be added in a random order.
#' @return `YuleTree()` returns a topology generated by the Yule process
#' \insertCite{Steel2001}{TreeTools},
#' i.e. adding leaves in turn adjacent to a randomly-chosen existing leaf.
#' @examples
#' YuleTree(LETTERS[1:10])
#'
#' @export
YuleTree <- function(tips, addInTurn = FALSE, root = TRUE) {
  tips <- TipLabels(tips)
  if (!addInTurn) {
    tips <- sample(tips)
  }
  nTips <- length(tips)
  if (any(is.na(root))) {
    warning("treating `root = NA` as `FALSE`")
    root[is.na(root)] <- FALSE
  }
  if (nTips < 1) {
    return(ZeroTaxonTree())
  }
  if (nTips == 1) {
    return(SingleTaxonTree(tips))
  }
  tree <- BalancedTree(tips[1:2])
  if (nTips > 2) for (i in 3:nTips) {
    tree <- AddTip(tree, sample.int(i - 1, 1), tips[i])
  }
  unrooted <- !is.logical(root) || root == FALSE
  # Return:
  if (unrooted) {
    UnrootTree(tree)
  } else {
    tree
  }
}

#' Random parent vector
#'
#' @param n Integer specifying number of leaves.
#' @param seed (Optional) Integer with which to seed Mersenne Twister random
#' number generator in C++.
#'
#' @return Integer vector corresponding to the "parent" entry of
#' `tree[["edge"]]`, where the "child" entry, i.e. column 2, is numbered
#' sequentially from `1:n`.
#' @template MRS
#' @keywords internal
#' @importFrom stats runif
#' @export
.RandomParent <- function(n, seed = sample.int(2147483647L, 1L)) {
  if (!is.numeric(n) || is.na(n) || n[1] < 3) {
    stop("nTip must be > 2");
  }
  random_parent(as.integer(n), as.integer(seed))
}

#' @rdname GenerateTree
#' @return `PectinateTree()` returns a pectinate (caterpillar) tree.
#' @examples
#' plot(PectinateTree(LETTERS[1:10]))
#'
#' @export
PectinateTree <- function(tips) {
  tips <- TipLabels(tips)
  nTips <- length(tips)

  nEdge <- nTips + nTips - 2L
  tipSeq <- seq_len(nTips - 1L)

  parent <- rep(seq_len(nTips - 1L) + nTips, each = 2L)

  child <- integer(nEdge)
  child[tipSeq + tipSeq - 1L] <- tipSeq
  child[tipSeq + tipSeq] <- tipSeq + nTips + 1L
  child[nEdge] <- nTips

  structure(list(
    edge = matrix(c(parent, child), ncol = 2L),
    Nnode = nTips - 1L,
    tip.label = tips
  ), order = "cladewise", class = "phylo")
}


#' @rdname GenerateTree
#'
#' @return `BalancedTree()` returns a balanced (symmetrical) tree, in preorder.
#'
#' @examples
#' plot(BalancedTree(LETTERS[1:10]))
#' @export
BalancedTree <- function(tips) {
  tips <- TipLabels(tips)
  nTip <- length(tips)
  if (nTip < 2L) {
    if (nTip == 1L) {
      SingleTaxonTree(tips)
    } else if (nTip == 0L) {
      ZeroTaxonTree()
    } else {
      NULL
    }
  } else {
    # Return:
    structure(list(edge = .BalancedBit(seq_len(nTip)), Nnode = nTip - 1L,
                         tip.label = as.character(tips)),
              order = "preorder", class = "phylo")
  }
}

#' @keywords internal
.BalancedBit <- function(tips, nTips = length(tips), rootNode = nTips + 1L) {
  if (nTips < 4L) {
    if (nTips == 2L) {
      matrix(c(rootNode, rootNode, tips), 2L, 2L)
    } else if (nTips == 3L) {
      matrix(c(rootNode + c(0L, 1L, 1L, 0L, 1L), tips), 4L, 2L)
    } else {
      tips
    }
  } else {
    # Recurse:
    firstN <- as.integer(ceiling(nTips / 2L))
    firstHalf <- seq_len(firstN)
    root2 <- rootNode + firstN
    rbind(rootNode + 0:1,
          .BalancedBit(tips[firstHalf], rootNode = rootNode + 1L),
          c(rootNode, root2),
          .BalancedBit(tips[-firstHalf], rootNode = root2))
  }
}

#' @rdname GenerateTree
#' @return `StarTree()` returns a completely unresolved (star) tree.
#' @examples
#' plot(StarTree(LETTERS[1:10]))
#'
#' @export
StarTree <- function(tips) {
  tips <- TipLabels(tips)
  nTips <- length(tips)

  parent <- rep.int(nTips + 1L, nTips)
  child <- seq_len(nTips)

  structure(list(
    edge = matrix(c(parent, child), ncol = 2L),
    Nnode = 1L,
    tip.label = tips
  ), order = "cladewise", class = "phylo")
}

#' Generate a neighbour joining tree
#'
#' `NJTree()` generates a rooted neighbour joining tree from a phylogenetic
#' dataset.
#'
#' @template datasetParam
#' @param edgeLengths Logical specifying whether to include edge lengths.
#' @param ambig,ratio Settings of `ambig` and `ratio` to be used when
#' computing [`Hamming()`] distances between sequences.
#'
#' @return `NJTree` returns an object of class \code{phylo}.
#'
#' @examples
#' data("Lobo")
#' NJTree(Lobo.phy)
#'
#' @template MRS
#' @importFrom ape nj
#' @family tree generation functions
#' @export
NJTree <- function(dataset, edgeLengths = FALSE,
                    ratio = TRUE, ambig = "mean") {
  tree <- nj(Hamming(dataset, ratio = ratio, ambig = ambig))
  tree <- RootTree(tree, names(dataset)[1])
  if (!edgeLengths) {
    tree[["edge.length"]] <- NULL
  }
  tree
}

#' Hamming distance between taxa in a phylogenetic dataset
#'
#' The Hamming distance between a pair of taxa is the number of characters
#' with a different coding, i.e. the smallest number of evolutionary steps
#' that must have occurred since their common ancestor.
#'
#' Tokens that contain the inapplicable state are treated as requiring no steps
#' to transform into any applicable token.
#'
#' @param dataset Object of class `phyDat`.
#' @param ratio Logical specifying whether to weight distance against
#' maximum possible, given that a token that is ambiguous in either of two taxa
#' cannot contribute to the total distance between the pair.
#' @param ambig Character specifying value to return when a pair of taxa
#' have a zero maximum distance (perhaps due to a preponderance of ambiguous
#' tokens).
#' "median", the default, take the median of all other distance values;
#' "mean", the mean;
#' "zero" sets to zero; "one" to one;
#' "NA" to `NA_integer_`; and "NaN" to `NaN`.
#'
#' @return `Hamming()` returns an object of class `dist` listing the Hamming
#' distance between each pair of taxa.
#'
#' @seealso
#' Used to construct neighbour joining trees in [`NJTree()`].
#'
#' `dist.hamming()` in the \pkg{phangorn} package provides an alternative
#' implementation.
#' 
#' @examples 
#' tokens <- matrix(c(0, 0, "0", 0, "?",
#'                    0, 0, "1", 0, 1,
#'                    0, 0, "1", 0, 1,
#'                    0, 0, "2", 0, 1,
#'                    1, 1, "-", "?", 0,
#'                    1, 1, "2", 1, "{01}"),
#'                    nrow = 6, ncol = 5, byrow = TRUE,
#'                    dimnames = list(
#'                      paste0("Taxon_", LETTERS[1:6]),
#'                      paste0("Char_", 1:5)))
#'
#' dataset <- MatrixToPhyDat(tokens)
#' Hamming(dataset)
#' @template MRS
#' @importFrom stats median
#' @importFrom utils combn
#' @export
Hamming <- function(dataset, ratio = TRUE,
                     ambig = c("median", "mean", "zero", "one", "na", "nan")) {
  at <- attributes(dataset)
  if (!inherits(dataset, "phyDat") || is.null(at[["contrast"]])) {
    stop("`dataset` must be a valid `phyDat` object")
  }
  contrast <- at[["contrast"]] > 0L
  if ("-" %in% colnames(contrast)) {
    # TODO This is debatable but perhaps acceptable behaviour.
    # Is the distance between {0, -} and {1} necessarily zero?
    contrast[contrast[, "-"], ] <- TRUE
  }
  weight <- at[["weight"]]
  tokens <- vapply(dataset, function(codings) contrast[codings, ],
                   matrix(NA, at[["nr"]], dim(contrast)[2]))
  hamming <- apply(combn(length(dataset), 2L), 2L, function(ij) {
    sum(weight[!apply(tokens[, , ij[1], drop = FALSE] &
                      tokens[, , ij[2], drop = FALSE], 1, any)])
  })

  if (ratio) {
    informative <- apply(!contrast, 1, any)
    nonAmbig <- .vapply(dataset, function(codings) informative[codings],
                       logical(at[["nr"]]))
    bothInformative <- apply(combn(length(dataset), 2L), 2L, function(ij) {
      sum(weight[nonAmbig[, ij[1]] & nonAmbig[, ij[2]]])
    })
    hamming <- hamming / bothInformative
    if (ambig[1] == 0) {
      ambig <- "zero"
    } else if (ambig[1] == 1) {
      ambig <- "one"
    }
    nanMethod <- pmatch(tolower(ambig[1]),
                        c("median", "mean", "zero", "one", "na", "nan"))
    if (is.na(nanMethod)) {
      stop("Invalid `ambig` value specified")
    }
    hamming[is.nan(hamming)] <- switch(nanMethod,
      median(hamming[!is.na(hamming)]),
      mean(hamming[!is.na(hamming)]),
      0L,
      1L,
      NA_integer_,
      NaN)

  }
  attributes(hamming) <- list(
    Size = length(dataset),
    Labels = names(dataset),
    Diag = FALSE,
    Upper = FALSE,
    method = "hamming",
    class = "dist")
  
  # Return:
  hamming
}

#' Constrained neighbour-joining tree
#'
#' Constructs an approximation to a neighbour-joining tree, modified in order
#' to be consistent with a constraint.  Zero-length branches are collapsed
#' at random.
#'
#' @template datasetParam
#' @template constraintParam
#' @param weight Numeric specifying degree to up-weight characters in
#' `constraint`.
#'
#' @inheritParams NJTree
#' @return `ConstrainedNJ()` returns a tree of class `phylo`.
#' @examples
#' dataset <- MatrixToPhyDat(matrix(
#'   c(0, 1, 1, 1, 0, 1,
#'     0, 1, 1, 0, 0, 1), ncol = 2,
#'   dimnames = list(letters[1:6], NULL)))
#' constraint <- MatrixToPhyDat(
#'   c(a = 0, b = 0, c = 0, d = 0, e = 1, f = 1))
#' plot(ConstrainedNJ(dataset, constraint))
#' @template MRS
#' @importFrom ape nj multi2di
#' @family tree generation functions
#' @export
ConstrainedNJ <- function(dataset, constraint, weight = 1L,
                           ratio = TRUE, ambig = "mean") {
  missing <- setdiff(names(dataset), names(constraint))
  if (length(missing)) {
    constraint <- AddUnconstrained(constraint, missing)
  }
  constraint <- .SubsetPhyDat(constraint, names(dataset))
  tree <- multi2di(nj((Hamming(constraint, ratio = ratio, ambig = ambig)
                       * weight) +
                        Hamming(dataset, ratio = ratio, ambig = ambig)))
  tree[["edge.length"]] <- NULL
  tree <- ImposeConstraint(tree, constraint)
  tree <- RootTree(tree, names(dataset)[1])

  # Return:
  tree
}

#nocov begin
#' Generate a tree with a specified outgroup
#'
#' **Deprecated.** This function will be removed in a future version of
#' \pkg{TreeTools}.
#' Use `RootTree()` instead.
#'
#' Given a tree or a list of taxa, `EnforceOutgroup()` rearranged the ingroup
#' and outgroup taxa such that the two are sister taxa across the root, without
#' changing the relationships within the ingroup or within the outgroup.
#'
#' @param tree Either a tree of class \code{phylo}; or (for `EnforceOutgroup()`)
#' a character vector listing the names of all the taxa in the tree, from which
#' a random tree will be generated.
#' @param outgroup Character vector containing the names of taxa to include in
#' the outgroup.
#'
#' @return `EnforceOutgroup()` returned a tree of class `phylo` where all
#' outgroup taxa are sister to all remaining taxa, without modifying the
#' ingroup topology.
#'
# @examples
# tree <- EnforceOutgroup(letters[1:9], letters[1:3])
# plot(tree)
#
#' @seealso For a more robust implementation, see [`RootTree()`], which will
#' eventually replace this function
#' ([#30](https://github.com/ms609/TreeTools/issues/30)).
#'
#' @template MRS
#' @family tree manipulation
#' @export
EnforceOutgroup <- function(tree, outgroup) UseMethod("EnforceOutgroup")

#' @importFrom ape bind.tree
.EnforceOutgroup <- function(tree, outgroup, taxa) {
  .Deprecated("RootTree")
  if (length(outgroup) == 1L) {
    return(RootTree(tree, outgroup))
  }

  ingroup <- taxa[!(taxa %fin% outgroup)]
  if (!all(outgroup %fin% taxa) ||
      length(ingroup) + length(outgroup) != length(taxa)) {
    stop("All outgroup taxa must occur in tree")
  }

  ingroup.branch <- DropTip(tree, outgroup)
  outgroup.branch <- DropTip(tree, ingroup)

  result <- RootTree(bind.tree(outgroup.branch, ingroup.branch, 0, 1),
                     outgroup)
  RenumberTips(Renumber(result), taxa)
}

#' @rdname EnforceOutgroup
#' @export
EnforceOutgroup.phylo <- function(tree, outgroup) {
  .EnforceOutgroup(tree, outgroup, tree[["tip.label"]])
}

#' @rdname EnforceOutgroup
#' @export
EnforceOutgroup.character <- function(tree, outgroup) {
  taxa <- tree
  .EnforceOutgroup(RandomTree(taxa, taxa[1]), outgroup, taxa)
}
#nocov end

.PreorderTree <- function(edge,
                          tip.label,
                          Nnode =  dim(edge)[1] + 1 - length(tip.label)) {
  structure(
    list(
      # Order is consistent with ape::read.tree (but not ape::rtree...)
      edge = edge,
      Nnode = Nnode,
      tip.label = tip.label
    ),
    order = "preorder",
    class = "phylo"
  )
}

Try the TreeTools package in your browser

Any scripts or data that you put into this service are public.

TreeTools documentation built on June 22, 2024, 9:27 a.m.