R/tree_generation.R

Defines functions .PreorderTree ConstrainedNJ Hamming NJTree StarTree .BalancedBit BalancedTree PectinateTree .RandomParent YuleTree RandomTree

Documented in BalancedTree ConstrainedNJ Hamming NJTree PectinateTree .RandomParent RandomTree StarTree YuleTree

#' Generate pectinate, balanced or random trees
#'
#' `RandomTree()`, `PectinateTree()`, `BalancedTree()` and `StarTree()`
#' generate trees with the specified shapes and leaf labels.
#'
#' @param tips An integer specifying the number of tips, or a character vector
#' naming the tips, or any other object from which [`TipLabels()`] can
#' extract leaf labels.
#'
#' @return
#' Each function returns an unweighted binary tree of class `phylo` with
#' the specified leaf labels. Trees are rooted unless `root = FALSE`.
#'
#' @family tree generation functions
#'
#' @template MRS
#' @references \insertAllCited()
#' @name GenerateTree
NULL

#' @rdname GenerateTree
#'
#' @param root Character or integer specifying tip to use as root;
#' or `TRUE` to root the tree on a random edge;
#' or `FALSE` to return an unrooted tree.
#' @param nodes Number of nodes to generate.  The default and maximum,
#' `tips - 1`, generates a binary tree; setting a lower value will induce
#' polytomies.
#'
#' @return `RandomTree()` returns a topology drawn at random from the uniform
#' distribution (i.e. each binary tree is drawn with equal probability).
#' Trees are generated by inserting
#' each tip in term at a randomly selected edge in the tree.
#' Random numbers are generated using a Mersenne Twister.
#' If `root = FALSE`, the tree will be unrooted, with the first tip in a
#' basal position.  Otherwise, the tree will be rooted on `root`.
#'
#' @examples
#' RandomTree(LETTERS[1:10])
#'
#' data("Lobo")
#' RandomTree(Lobo.phy)
#'
#' @export
RandomTree <- function(tips, root = FALSE, nodes, lengths = NULL) {
  tips <- TipLabels(tips)
  nTips <- length(tips)
  if (length(root) > 1L) {
    root <- root[[1]]
    warning("More than one entry in `root`; using ", root)
  }
  if (is.na(root)) {
    warning("Treating `root = NA` as `FALSE`")
    root <- FALSE
  }
  unrooted <- !is.logical(root) || root == FALSE
  nodesInBinary <- nTips - ifelse(unrooted, 2L, 1L)
  if (missing(nodes)) {
    nodes <- nodesInBinary
  }
  if (nodes > nodesInBinary) {
    warning("`nodes` higher than number in binary tree. Ignoring.")
  }
  if (nTips < 3) {
    return(BalancedTree(tips = tips, lengths = lengths))
  }
  if (nodes < 1L) {
    stop("A tree must contain one or more `nodes`")
  }

  edge <- do.call(cbind,
                  RenumberEdges(.RandomParent(nTips),
                                seq_len(nTips + nTips - 2L)))
  # If root identifies a specific tip, root edge on that tip
  if (!is.logical(root) && root != 1L) {
    if (is.character(root)) {
      root <- which(tips == root)
    }
    if (length(root) == 0L) {
      stop("No match found for `root`")
    }
    if (!is.integer(root)) {
      root <- as.integer(root)
    }
    edge <- root_binary(edge, root)
  }
  tree <- structure(list(edge = edge,
                         Nnode = nTips - 1L,
                         tip.label = tips),
                    order = "preorder",
                    class = "phylo")

  # If root is simply a logical T/F, then root our tree randomly - or don't!
  if (is.logical(root)) {
    if (root) {
      tree <- root_on_node(tree, sample.int(nTips + nodes, 1))
    } else {
      tree <- UnrootTree(tree)
    }
  }

  # Collapse nodes at random to obtain desired resolution
  if (nodes < nodesInBinary) {
    tree <- CollapseNode(tree,
                         nTips + 1L + sample.int(nodesInBinary - 1L,
                                                 tree[["Nnode"]] - nodes))
  }
  
  if (!is.null(lengths)) {
    tree[["edge.length"]] <- rep(lengths, length.out = dim(tree[["edge"]])[[1]])
  }
  
  # Return:
  tree
}

#' @rdname GenerateTree
#'
#' @param addInTurn Logical specifying whether to add leaves in the order of
#' `tips`.  If `FALSE`, leaves will be added in a random order.
#' @return `YuleTree()` returns a topology generated by the Yule process
#' \insertCite{Steel2001}{TreeTools},
#' i.e. adding leaves in turn adjacent to a randomly-chosen existing leaf.
#' @examples
#' YuleTree(LETTERS[1:10])
#'
#' @export
YuleTree <- function(tips, addInTurn = FALSE, root = TRUE, lengths = NULL) {
  tips <- TipLabels(tips)
  if (!addInTurn[[1]]) {
    tips <- sample(tips)
  }
  nTips <- length(tips)
  if (any(is.na(root))) {
    warning("treating `root = NA` as `FALSE`")
    root[is.na(root)] <- FALSE
  }
  
  if (nTips < 1) {
    return(ZeroTaxonTree())
  } else if (nTips == 1) {
    return(SingleTaxonTree(tips, lengths))
  }
  
  tree <- BalancedTree(tips[1:2])
  if (nTips > 2) for (i in 3:nTips) {
    tree <- AddTip(tree, sample.int(i - 1, 1), tips[[i]])
  }
  
  unrooted <- !is.logical(root) || root == FALSE
  if (unrooted) {
    tree <- UnrootTree(tree)
  }
  
  if (!is.null(lengths)) {
    tree[["edge.length"]] <- rep(lengths, length.out = dim(tree[["edge"]])[[1]])
  }
  
  # Return:
  tree
}

#' Random parent vector
#'
#' @param n Integer specifying number of leaves.
#' @param seed (Optional) Integer with which to seed Mersenne Twister random
#' number generator in C++.
#'
#' @return Integer vector corresponding to the "parent" entry of
#' `tree[["edge"]]`, where the "child" entry, i.e. column 2, is numbered
#' sequentially from `1:n`.
#' @template MRS
#' @keywords internal
#' @importFrom stats runif
#' @export
.RandomParent <- function(n, seed = sample.int(2147483647L, 1L)) {
  if (!is.numeric(n) || is.na(n) || n[1] < 3) {
    stop("nTip must be > 2");
  }
  random_parent(as.integer(n), as.integer(seed))
}

#' @rdname GenerateTree
#' @return `PectinateTree()` returns a pectinate (caterpillar) tree.
#' @examples
#' plot(PectinateTree(LETTERS[1:10]))
#'
#' @export
PectinateTree <- function(tips, lengths = NULL) {
  tips <- TipLabels(tips)
  nTip <- length(tips)
  if (nTip == 0) {
    ZeroTaxonTree()
  } else if (nTip == 1) {
    SingleTaxonTree(tips, lengths)
  } else {
  
    nEdge <- nTip + nTip - 2L
    tipSeq <- seq_len(nTip - 1L)
  
    parent <- rep(seq_len(nTip - 1L) + nTip, each = 2L)
  
    child <- integer(nEdge)
    child[tipSeq + tipSeq - 1L] <- tipSeq
    child[tipSeq + tipSeq] <- tipSeq + nTip + 1L
    child[nEdge] <- nTip
  
    tr <- list(
        edge = matrix(c(parent, child), ncol = 2L),
        Nnode = nTip - 1L,
        tip.label = tips
      )
    if (!is.null(lengths) && nTip > 1) {
      tr[["edge.length"]] <- rep(lengths, length.out = 2 * (nTip - 1))
    }
    structure(tr, order = "cladewise", class = "phylo")
  }
}


#' @rdname GenerateTree
#'
#' @inheritParams SingleTaxonTree
#' @return `BalancedTree()` returns a balanced (symmetrical) tree, in preorder.
#'
#' @examples
#' plot(BalancedTree(LETTERS[1:10]))
#' @export
BalancedTree <- function(tips, lengths = NULL) {
  tips <- TipLabels(tips)
  nTip <- length(tips)
  if (nTip < 2L) {
    if (nTip == 1L) {
      SingleTaxonTree(tips, lengths)
    } else if (nTip == 0L) {
      ZeroTaxonTree()
    } else {
      NULL
    }
  } else {
    tr <- list(edge = .BalancedBit(seq_len(nTip), nTips = nTip),
               Nnode = nTip - 1L,
               tip.label = as.character(tips))
    if (!is.null(lengths)) {
      tr[["edge.length"]] <- rep(lengths, length.out = 2 * (nTip - 1))
    }
    # Return:
    structure(tr, order = "preorder", class = "phylo")
  }
}

#' @keywords internal
.BalancedBit <- function(tips, nTips = length(tips), rootNode = nTips + 1L) {
  if (nTips < 4L) {
    if (nTips == 2L) {
      matrix(c(rootNode, rootNode, tips), 2L, 2L)
    } else if (nTips == 3L) {
      matrix(c(rootNode + c(0L, 1L, 1L, 0L, 1L), tips), 4L, 2L)
    } else {
      tips
    }
  } else {
    # Recurse:
    firstN <- as.integer(ceiling(nTips / 2L))
    firstHalf <- seq_len(firstN)
    root2 <- rootNode + firstN
    rbind(rootNode + 0:1,
          .BalancedBit(tips[firstHalf], rootNode = rootNode + 1L),
          c(rootNode, root2),
          .BalancedBit(tips[-firstHalf], rootNode = root2))
  }
}

#' @rdname GenerateTree
#' @return `StarTree()` returns a completely unresolved (star) tree.
#' @examples
#' plot(StarTree(LETTERS[1:10]))
#'
#' @export
StarTree <- function(tips, lengths = NULL) {
  tips <- TipLabels(tips)
  nTip <- length(tips)

  parent <- rep.int(nTip + 1L, nTip)
  child <- seq_len(nTip)

  tr <- if (is.null(lengths)) {
    list(
      edge = matrix(c(parent, child), ncol = 2L),
      Nnode = 1L,
      tip.label = tips
    )
  } else {
    list(
      edge = matrix(c(parent, child), ncol = 2L),
      Nnode = 1L,
      tip.label = tips,
      edge.length =  rep(lengths, length.out = nTip)
    )
  }
  structure(tr, order = "cladewise", class = "phylo")
}

#' Generate a neighbour joining tree
#'
#' `NJTree()` generates a rooted neighbour joining tree from a phylogenetic
#' dataset.
#'
#' @template datasetParam
#' @param edgeLengths Logical specifying whether to include edge lengths.
#' @param ambig,ratio Settings of `ambig` and `ratio` to be used when
#' computing [`Hamming()`] distances between sequences.
#'
#' @return `NJTree` returns an object of class \code{phylo}.
#'
#' @examples
#' data("Lobo")
#' NJTree(Lobo.phy)
#'
#' @template MRS
#' @importFrom ape nj
#' @family tree generation functions
#' @export
NJTree <- function(dataset, edgeLengths = FALSE,
                    ratio = TRUE, ambig = "mean") {
  tree <- nj(Hamming(dataset, ratio = ratio, ambig = ambig))
  tree <- RootTree(tree, names(dataset)[[1]])
  if (!edgeLengths) {
    tree[["edge.length"]] <- NULL
  }
  tree
}

#' Hamming distance between taxa in a phylogenetic dataset
#'
#' The Hamming distance between a pair of taxa is the number of characters
#' with a different coding, i.e. the smallest number of evolutionary steps
#' that must have occurred since their common ancestor.
#'
#' Tokens that contain the inapplicable state are treated as requiring no steps
#' to transform into any applicable token.
#'
#' @param dataset Object of class `phyDat`.
#' @param ratio Logical specifying whether to weight distance against
#' maximum possible, given that a token that is ambiguous in either of two taxa
#' cannot contribute to the total distance between the pair.
#' @param ambig Character specifying value to return when a pair of taxa
#' have a zero maximum distance (perhaps due to a preponderance of ambiguous
#' tokens).
#' "median", the default, take the median of all other distance values;
#' "mean", the mean;
#' "zero" sets to zero; "one" to one;
#' "NA" to `NA_integer_`; and "NaN" to `NaN`.
#'
#' @return `Hamming()` returns an object of class `dist` listing the Hamming
#' distance between each pair of taxa.
#'
#' @seealso
#' Used to construct neighbour joining trees in [`NJTree()`].
#'
#' `dist.hamming()` in the \pkg{phangorn} package provides an alternative
#' implementation.
#' 
#' @examples 
#' tokens <- matrix(c(0, 0, "0", 0, "?",
#'                    0, 0, "1", 0, 1,
#'                    0, 0, "1", 0, 1,
#'                    0, 0, "2", 0, 1,
#'                    1, 1, "-", "?", 0,
#'                    1, 1, "2", 1, "{01}"),
#'                    nrow = 6, ncol = 5, byrow = TRUE,
#'                    dimnames = list(
#'                      paste0("Taxon_", LETTERS[1:6]),
#'                      paste0("Char_", 1:5)))
#'
#' dataset <- MatrixToPhyDat(tokens)
#' Hamming(dataset)
#' @template MRS
#' @family utility functions
#' @importFrom stats median
#' @importFrom utils combn
#' @export
Hamming <- function(dataset, ratio = TRUE,
                     ambig = c("median", "mean", "zero", "one", "na", "nan")) {
  at <- attributes(dataset)
  if (!inherits(dataset, "phyDat") || is.null(at[["contrast"]])) {
    stop("`dataset` must be a valid `phyDat` object")
  }
  contrast <- at[["contrast"]] > 0L
  if ("-" %in% colnames(contrast)) {
    # TODO This is debatable but perhaps acceptable behaviour.
    # Is the distance between {0, -} and {1} necessarily zero?
    contrast[contrast[, "-"], ] <- TRUE
  }
  weight <- at[["weight"]]
  tokens <- vapply(dataset, function(codings) contrast[codings, ],
                   matrix(NA, at[["nr"]], dim(contrast)[2]))
  hamming <- apply(combn(length(dataset), 2L), 2L, function(ij) {
    sum(weight[!apply(tokens[, , ij[1], drop = FALSE] &
                      tokens[, , ij[2], drop = FALSE], 1, any)])
  })

  if (ratio) {
    informative <- apply(!contrast, 1, any)
    nonAmbig <- .vapply(dataset, function(codings) informative[codings],
                       logical(at[["nr"]]))
    bothInformative <- apply(combn(length(dataset), 2L), 2L, function(ij) {
      sum(weight[nonAmbig[, ij[1]] & nonAmbig[, ij[2]]])
    })
    hamming <- hamming / bothInformative
    if (ambig[1] == 0) {
      ambig <- "zero"
    } else if (ambig[1] == 1) {
      ambig <- "one"
    }
    nanMethod <- pmatch(tolower(ambig[1]),
                        c("median", "mean", "zero", "one", "na", "nan"))
    if (is.na(nanMethod)) {
      stop("Invalid `ambig` value specified")
    }
    hamming[is.nan(hamming)] <- switch(nanMethod,
      median(hamming[!is.na(hamming)]),
      mean(hamming[!is.na(hamming)]),
      0L,
      1L,
      NA_integer_,
      NaN)

  }
  attributes(hamming) <- list(
    Size = length(dataset),
    Labels = names(dataset),
    Diag = FALSE,
    Upper = FALSE,
    method = "hamming",
    class = "dist")
  
  # Return:
  hamming
}

#' Constrained neighbour-joining tree
#'
#' Constructs an approximation to a neighbour-joining tree, modified in order
#' to be consistent with a constraint.  Zero-length branches are collapsed
#' at random.
#'
#' @template datasetParam
#' @template constraintParam
#' @param weight Numeric specifying degree to up-weight characters in
#' `constraint`.
#'
#' @inheritParams NJTree
#' @return `ConstrainedNJ()` returns a tree of class `phylo`.
#' @examples
#' dataset <- MatrixToPhyDat(matrix(
#'   c(0, 1, 1, 1, 0, 1,
#'     0, 1, 1, 0, 0, 1), ncol = 2,
#'   dimnames = list(letters[1:6], NULL)))
#' constraint <- MatrixToPhyDat(
#'   c(a = 0, b = 0, c = 0, d = 0, e = 1, f = 1))
#' plot(ConstrainedNJ(dataset, constraint))
#' @template MRS
#' @importFrom ape nj multi2di
#' @family tree generation functions
#' @export
ConstrainedNJ <- function(dataset, constraint, weight = 1L,
                           ratio = TRUE, ambig = "mean") {
  missing <- setdiff(names(dataset), names(constraint))
  if (length(missing)) {
    constraint <- AddUnconstrained(constraint, missing)
  }
  constraint <- .SubsetPhyDat(constraint, names(dataset))
  tree <- multi2di(nj((Hamming(constraint, ratio = ratio, ambig = ambig)
                       * weight) +
                        Hamming(dataset, ratio = ratio, ambig = ambig)))
  tree[["edge.length"]] <- NULL
  tree <- ImposeConstraint(tree, constraint)
  tree <- RootTree(tree, names(dataset)[1])

  # Return:
  tree
}

# Wrap an edge matrix as a tree in preorder
.PreorderTree <- function(edge,
                          tip.label,
                          Nnode =  dim(edge)[[1]] + 1 - length(tip.label)) {
  structure(
    list(
      # Order is consistent with ape::read.tree (but not ape::rtree...)
      edge = edge,
      Nnode = Nnode,
      tip.label = tip.label
    ),
    order = "preorder",
    class = "phylo"
  )
}

Try the TreeTools package in your browser

Any scripts or data that you put into this service are public.

TreeTools documentation built on April 23, 2026, 5:06 p.m.