R/RUtreebalance.R
In TreeTools: Create, Modify and Analyse Phylogenetic Trees

Documented in J1Index

# This file is adapted from code by Rob Noble, doi:10.5281/zenodo.5873857
# https://github.com/robjohnnoble/RUtreebalance

# MIT License
# 
# Copyright (c) 2021 Rob Noble
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#   
#   The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# Find the parent of a node.
# 
# Example:
# edges1 <- data.frame(Parent = c(1, 1, 1, 3, 3), Identity = 2:6)
# .MoveUp(edges1, 3)
.MoveUp <- function(edges, identity) {
  if (!(identity %in% edges[["Identity"]]) &&
      !(identity %in% edges[["Parent"]])) {
    stop("Invalid identity.")
  }
  parent <- edges[edges[["Identity"]] == identity, "Parent"]
  if (length(parent) == 0) {
    # if identity is the root then don't move
    identity
  } else if (is.factor(parent)) {
    levels(parent)[parent]
  } else {
    parent 
  }
}

# Find the root node of a tree.
# 
# Example:
# edges1 <- data.frame(Parent = c(1,1,1,3,3), Identity = 2:6)
# .FindRoot(edges1)
.FindRoot <- function(edges) {
  start <- edges[["Parent"]][[1]] # reasonable guess
  if (is.factor(start)) {
    start <- levels(start)[start]
  }
  repeat {
    if (.MoveUp(edges, start) == start) break
    start <- .MoveUp(edges, start)
  }
  return(start)
}

# Get a list of all subtree sizes via depth-first search.
# Optionally provide root node i and adjacency list Adj if known.
# 
# Example:
# tree1 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4), 
#                     Identity = 1:7, 
#                     Population = c(1, rep(5, 6)))
# .GetSubtreeSizes(tree1)
.GetSubtreeSizes <- function(
    tree,
    i = which(tree[["Identity"]] == .FindRoot(tree[, 1:2])),
    Adj = .GetAdjacency(tree),
    Col = NULL, Cumul = NULL, is_leaf =  NULL) {
  n <- length(tree[["Identity"]])
  has_pops <- "Population" %in% colnames(tree)
  
  uniqueID <- unique(tree[["Identity"]])
  if (is.null(Col)) {
    Col <- setNames(rep("w", n), uniqueID)
  }
  if (is.null(Cumul)) {
    Cumul <- setNames(rep(NA, n), uniqueID)
  }
  if (is.null(is_leaf)) {
    is_leaf <- setNames(logical(n), uniqueID)
  }
  if (is.null(Adj[[i]])) {
    is_leaf[[i]] <- TRUE
  }
  for (j in Adj[[i]]) {
    if (Col[[j]] == "w"){
      L <- .GetSubtreeSizes(tree, j, Adj, Col, Cumul, is_leaf)
      Col<- L[["colour"]]
      Cumul <- L[["cumulative"]]
      is_leaf <- L[["is_leaf"]]
    }
  }
  Col[[i]] <- "b"
  Cumul[[i]] <- if (has_pops) {
    tree[["Population"]][[i]] + sum(Cumul[Adj[[i]]])
  } else {
    ifelse(is_leaf[[i]], 1, 0) + sum(Cumul[Adj[[i]]])
  }
  # Return:
  list("colour" = Col,"cumulative" = Cumul, "is_leaf" = is_leaf)
}

# Get adjacency list of a tree.
# 
# Example:
# tree1 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4), 
#                     Identity = 1:7, 
#                     Population = c(1, rep(5, 6)))
# .GetAdjacency(tree1)
.GetAdjacency <- function(tree) {
  n <- length(tree[["Identity"]])
  Adj <- vector(mode = "list", length = n)
  for (i in seq_len(n)) {
    if(tree$Parent[[i]] != tree$Identity[[i]]) {
      p <- match(tree$Parent[[i]], tree$Identity)
      Adj[[p]] <- c(Adj[[p]], i)
    }
  }
  
  # Return:
  Adj
}

# Add a row to the edges list to represent the root node (if not already present)
.AddRootRow <- function(tree) {
  start <- setdiff(tree[["Parent"]], tree[["Identity"]])
  if (length(start) > 1) {
    stop("Input dataframe is missing one or more rows")
  }
  if (length(start) > 0) { # add row for root node
    root_row <- if ("Population" %in% colnames(tree)) {
      data.frame(Parent = start, Identity = start, Population = 0)
    } else {
      data.frame(Parent = start, Identity = start)
    }
    tree <- rbind(root_row, tree)
  }
  
  # Return:
  tree
}

#' Robust universal tree balance index
#' 
#' Calculate tree balance index \ifelse{html}{\out{J<sup>1</sup>}}{\eqn{J^1}}
#' (when `nonRootDominance = FALSE`) or
#' \ifelse{html}{\out{J<sup>1c</sup>}}{\eqn{J^{1c}}}
#' (when `nonRootDominance = TRUE`) from \insertRef{Lemant2022}{TreeTools}.
#' 
#' If population sizes are not provided, then the function assigns size 0 to
#' internal nodes, and size 1 to leaves.
#' 
#' @param tree Either an object of class 'phylo', or a dataframe with column 
#' names Parent, Identity and (optionally) Population.
#' The latter is similar to `tree$edge`, where tree is an object of class
#' 'phylo'; the differences are in class (data.frame versus matrix) and column
#' names.
#' The dataframe may (but isn't required to) include a row for the root node.
#' If population sizes are omitted then internal nodes will be assigned
#' population size zero and leaves will be assigned population size one.
#' @param q Numeric between zero and one specifying sensitivity to type
#' frequencies.  If `q < 1`, the \ifelse{html}{\out{J<sup>q</sup>}}{\eqn{J^q}}
#' index - based on generalized entropy - will be returned; see 
#' \insertCite{Lemant2022;textual}{TreeTools}, page 1223.
#' @param nonRootDominance Logical specifying whether to use non-root dominance
#' factor.
#' 
#' @examples
#' # Using phylo object as input:
#' phylo_tree <- read.tree(text="((a:0.1)A:0.5,(b1:0.2,b2:0.1)B:0.2);")
#' J1Index(phylo_tree)
#' phylo_tree2 <- read.tree(text='((A, B), ((C, D), (E, F)));')
#' J1Index(phylo_tree2)
#' 
#' # Using edges lists as input:
#' tree1 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4),
#'                     Identity = 1:7,
#'                     Population = c(1, rep(5, 6)))
#' J1Index(tree1)
#' tree2 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4),
#'                     Identity = 1:7,
#'                     Population = c(rep(0, 4), rep(1, 3)))
#' J1Index(tree2)
#' tree3 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4),
#'                     Identity = 1:7,
#'                     Population = c(0, rep(1, 3), rep(0, 3)))
#' J1Index(tree3)
#' cat_tree <- data.frame(Parent = c(1, 1:14, 1:15, 15),
#'                        Identity = 1:31,
#'                        Population = c(rep(0, 15), rep(1, 16)))
#' J1Index(cat_tree)
#'
#' # If population sizes are omitted then internal nodes are assigned population
#' # size zero and leaves are assigned population size one:
#' sym_tree1 <- data.frame(Parent = c(1, rep(1:15, each = 2)),
#'                        Identity = 1:31,
#'                        Population = c(rep(0, 15), rep(1, 16)))
#' # Equivalently:                        
#' sym_tree2 <- data.frame(Parent = c(1, rep(1:15, each = 2)),
#'                        Identity = 1:31)
#' J1Index(sym_tree1)
#' J1Index(sym_tree2)
#' @author Rob Noble, adapted by Martin R. Smith
#' @references \insertAllCited{}
#' @family tree characterization functions
#' @importFrom stats na.omit
#' @export
J1Index <- function(tree, q = 1, nonRootDominance = FALSE) {
  
  if (!is.na(tree)[[1]]) {
    if(inherits(tree, "phylo")) {
      # convert from phylo object to data frame
      tree <- as.data.frame(tree[["edge"]])
      colnames(tree) <- c("Parent", "Identity")
    }
    tree <- na.omit(tree) # remove any rows containing NA
    if(is.factor(tree[["Parent"]])) {
      tree[["Parent"]] <- levels(tree[["Parent"]])[tree[["Parent"]]]
    }
    if (is.factor(tree[["Identity"]])) {
      tree[["Identity"]] <- levels(tree[["Identity"]])[tree[["Identity"]]]
    }
    tree <- .AddRootRow(tree)
  }
  
  n <- length(tree[["Identity"]])
  if (n <= 1) {
    return(0)
  }
  
  # Adjacency list
  Adj <- .GetAdjacency(tree)
  
  # Get the list of all subtree sizes
  subtree_sizes <- .GetSubtreeSizes(tree, Adj = Adj)
  
  # Subtree sizes, including the root
  Cumul <- subtree_sizes[["cumulative"]]
  
  # Vector of internal nodes
  eff_int_nodes <- which(!subtree_sizes[["is_leaf"]])
  
  # Vector of leaves
  leaves <- which(subtree_sizes[["is_leaf"]])
  
  # If population sizes are missing then assign size 0 to internal nodes,
  # and size 1 to leaves:
  if(!("Population" %in% colnames(tree))) {
    tree[["Population"]] <- double(n)
    tree[["Population"]][leaves] <- 1
  }
  if (sum(tree$Population) <= 0) {
    stop("At least one node must have Population > 0")
  }
  J <- 0
  # Subtree sizes, excluding the root
  Star <- Cumul - tree[["Population"]]
  for (i in seq_len(n)){ # Loop over all nodes
    if (Star[[i]] > 0){ # Node has at least one child with non-zero size
      K <- 0
      if(length(Adj[[i]]) > 1) { # Otherwise i has only one child and its 
                                 # balance score is 0
        eff_children <- 0 # Number of children with non-zero size
        for (j in Adj[[i]]) {
          if (Cumul[[j]] > 0){ # Otherwise child j has a 0-sized subtree and 
                               # does not count
            eff_children <- eff_children + 1
            
            # p is the ratio of the child subtree size including the root
            # (root = the child) 
            # to the parent subtree size excluding the root
            p <- Cumul[[j]] / Star[[i]]
            
            # K is the sum of the node balance scores
            K <- K + if (q == 1) {-p * log(p)} else {p ^ q}
          }
        }
        # Non-root dominance factor:
        if (isTRUE(nonRootDominance)) {
          h_factor <- Star[[i]] / Cumul[[i]]
        } else {
          h_factor <- 1
        }
        
        # Normalize the sum of balance scores, adjust for non-root dominance, 
        # and then add the result to the index
        if (eff_children > 1) { # Exclude nodes that have only one child with 
                                # size greater than zero
          J_increment <- h_factor * Star[[i]] * if (q == 1) {
             K / log(eff_children)
          } else {
            (1 - K) * eff_children ^ (q - 1) / (eff_children ^ (q - 1) - 1)
          }
          J <- J + J_increment
        }
      }
    }
  }
  # normalize the index by dividing by the sum of all subtree sizes:
  if (length(eff_int_nodes) > 0) {
    J <- J / sum(Star[eff_int_nodes])
  }
  
  # Return:
  as.numeric(J)
}

#' @rdname J1Index
#' @export
JQIndex <- J1Index