R/recpd_calc.R

Defines functions prune_state branch_state_assign node_ident5 node_state_ace node_state_mpr node_state_nn nn_nodes recpd_calc

Documented in recpd_calc

#recpd_calc() - main wrapper function for performing RecPD calculations and
#branch state identification, and other cladistic and evolutionary distance
#based metrics.

#' Calculate the RecPD of a feature and other associated measures.
#'
#' See \href{https://www.biorxiv.org/content/10.1101/2021.10.01.462747v1}{Bundalovic-Torma, Guttman; (2021).}
#'
#' @param tree A phylogenetic tree, in phylo format.
#' @param tab A feature(s) presence/absence array, matrix, or data.frame:
#'   columns should be named according to the tip labels of the tree.
#' @param option Ancestral state reconstruction method to apply, either 'nn',
#'   'mpr', or 'ace'.
#' @param params Additional paramaters to pass to ancestral reconstruction
#'   methods.
#' @param calc logical. If TRUE, calculate additional measures: span,
#'   clustering, longevity, and lability.
#' @param prune logical. If TRUE and option == 'nn', will provide a conservative
#'   reconstruction of feature lineages.
#'
#' @return A named list including metrics and ancestral state annotations for
#'   tree nodes and branches for each feature in tab.
#' @export
#'
#' @examples
#' library(ape)
#' library(dplyr)
#'
#' ##Test case for a randomly generated tree (10 tips) and 10 randomly generated
#' ##feature presence/absence (1, 0) data.frame.
#'
#' #Generate a random phylogenetic tree:
#' n_tip <- 10
#' tree <- rtree(n_tip)
#'
#' #Generate 10 randomly distributed features (rows = features, columns =
#' #phylogenetic tree tips):
#' tab <- replicate(10,
#'                  sapply(10, function(x) sample(c(0,1), x, replace=TRUE)),
#'                  simplify='matrix') %>%
#'   t() %>%
#'   data.frame(check.names=FALSE) %>%
#'   rename_with(function(x) tree$tip.label[as.numeric(x)])
#'
#' #Note: columns should be labelled by tip.label of the tree. This also means that
#' #all of the tips in the tree should have an associated feature presence absence
#' #state. If tips lack this information, the tree should be parsed (see
#' #ape::keep.tip()).
#'
#'
#' #Run recpd_calc() without calculating additional measures (span, cluster,
#' #longevity, lability):
#' recpd_calc(tree, tab, calc=FALSE)
#'
#' #Run recpd_calc() with additional measures calculated:
#' recpd_calc(tree, tab, calc=TRUE)
#'
#' @author Cedoljub Bundalovic-Torma
#'
#' @references Insert reference with Rdpack.
#'
recpd_calc <- function(tree,
                       tab,
                       option = c('nn', 'mpr', 'ace'),
                       params = list(nn_select=c('min', 'median'), model_type=c('ER', 'ARD'), kappa=1, lik_cut=0.99),
                       calc = FALSE,
                       prune = TRUE){

  #Input arguments:

  #tree - the species tree tab - a presence absence matrix in dataframe format:
  #rows are a given trait/locus distribution, columns are the corresponding
  #species names matching the tip labels of the tree #Note: all species in the
  #presence/absence matrix (t) should be in the tree


  #method - the method for assigning internal node states - 3 options:
  #1) 'nn' : nearest-neighbours approach (by default)
  #2) 'mpr' : Most Parsimonious Reconstruction (using the MPR() function from the ape package)
  #3) 'ace' : Maximum Likelihood Ancestral Character Estimation (using the ace() function
  ##from the ape package).

  #Note that the ACE state ancestral likelihood measurements (marginal likelihoods)
  #are referred to as empirical bayesian posterior probabilities (see:
  #https://lukejharmon.github.io/ilhabela/instruction/2015/07/03/ancestral-states-2/)

  #In the future: 4) a bayesian approach? anc.Bayes() from the phytools package?
  #This only works for continuous features.


  #Additional parameters are supplied using the params list:
  #If option = 'nn':
  #nn_select = choice of evolutionary distance cutoff to select nearest-neighbour nodes - 'min' for minimum pairwise distance, 'median' for median pairwise distance.
  ##If option = 'ace':
  #model_type -  can choose between rates equal ('ER') or all rates different ('ARD') models.
  #kappa - exponent for transforming tree branch-lengths.

  #calc - TRUE/FALSE if you want to perform topology or evolutionary metric calculations.



  #If calc == TRUE, get the maximum spanning distributions here, to prevent
  #rerunning multiple times for multiple trait distributions - majority of time,
  #is spent on this step, especially for larger species trees.
  if(calc == TRUE) max_span <- get_max_span(tree)


  #Output is a list, for now...:
  res <- list()

  #An alternative: results is an object (of class 'recpd' - ?) which stores a
  #data.frame of RecPD and associated measures (measures) calculated for each feature, with
  #the following attributes:

  #1) anc_new - list of arrays storing finalized trait lineage phylogenetic tree
  #tip/node/branch state mappings.

  #2) anc_old - list of arrays storing preliminary trait phylogenetic tree
  #tip/node/branch state mappings.

  #3) tree - the user-provided phylogenetic tree phylo object.

  #Initialize the measures data.frame:
  measures <- data.frame()

  #Initialize anc_old and anc_new lists:
  anc_new <- list()
  anc_old <- list()



  #If tab supplied is a named vector (nrow = NULL), convert it to a matrix format:
  if(is.null(nrow(tab))){
    tab <- matrix(tab, nrow=1, ncol=length(tab), dimnames=list(make.names(1), names(tab)))
  }


  #Iterate through the rows/features of tab,
  #perform ancestral state reconstruction and identify feature lineages:
  for(i in 1:nrow(tab)){
    #Make sure tips in the presence absence matrix are ordered according to the
    #tip ordering in the phylogenetic tree: Names of columns in t must match the
    #tree! Have to have some error checking to make sure that names match...
    #Resulting array should have strain names....
    t <- tab[i,tree$tip.label]

    #Initialize an array to store the states of terminal tip and internal nodes
    node_state <- NULL

    #Assign states based on nearest neighbour descendant tips of a given node

    #Option 1: Nearest neighbour descendants:
    if(option[1] == 'nn'){
      #Only generate the species-tree interal-node nearest-neighbour tip list once!
      if(i == 1){
        #Check if any input parameters have been changed:
        nn_select <- 'min'
        if(!is.null(params$nn_select[1])) nn_select <- params$nn_select[1]

        node_sp <- nn_nodes(tree, option=nn_select)
      }
      node_state <- node_state_nn(tree, t, node_sp)

    }
    #Option 2: Most Parsimonious Reconstruction (from ape package)
    else if(option[1] == 'mpr'){
      node_state <- node_state_mpr(tree, t)

    }
    #Option 3: Maximum Likelihood Ancestral Character Estimation *from ape package)
    else if(option[1] == 'ace'){

      #Update input parameters for ace, if provided
      if(i == 1){
        #defaul parameters:
        model_type <- 'ER'
        kappa <- 1
        lik_cut <- 0.99

        #Check if any of the model paramaters have been changed:
        if(!is.null(params$model_type[1])) model_type <- params$model_type[1]
        if(!is.null(params$kappa)) kappa <- params$kappa
        if(!is.null(params$lik_cut)) lik_cut <- params$lik_cut
      }

      #ace will only work if there is at least one presence or absence state on the tips:
      if(sum(t) < length(t)){

        node_state <- node_state_ace(tree, t, model_type, kappa, lik_cut)
      }
      else{
        node_state <- array(rep(1, ape::Ntip(tree) + ape::Nnode(tree)),
                            dimnames=list(1:(ape::Ntip(tree) + ape::Nnode(tree))))
      }
    }

    #Assign branch states based on ancestral node state reconstruction:
    branch_state <- branch_state_assign(tree, node_state)

    #Merge descendant and ancestral nsode states together to identify putative
    #gain and loss nodes: also output all of the updated state annotations
    #(gain, loss, absence) for tips, branches, and internal nodes, for both
    #plotting and additional calculations.
    node_state_merge <- node_ident5(tree, node_state, branch_state)

    #Update node, tip, and branch state annotations, pruning excessive internal
    #gain lineage branches resulting from 'nn':
    if(option[1] == 'nn' & prune == TRUE) node_state_merge <- prune_state(tree, node_state_merge)


    #Calculate RecPD and nRecPD:
    recpd <- sum(tree$edge.length*branch_state)/sum(tree$edge.length)

    nrecpd <- nrecpd_calc(recpd, tree, t)


    ##Perform other calculations?:

    span <- NA
    cluster <- NA
    evo <- NA
    longevity <- NA
    lability <- NA

    if(calc){

      #Topology:
      ##Span:
      span <- span_calc(tree, t, max_span)

      ##Clustering (has to take the entire set of annotated internal node states):
      cluster <- cluster_calc(tree, t, node_state, branch_state)[[2]]

      #Evolutionary:
      ##Longevity, lability, gain and loss times.
      evo <- longevity_calc(tree, node_state_merge)
      longevity <- stats::median(evo$longevity)
      lability <- evo$lability3

    }

    #Old:
    # res[[rownames(tab)[i]]] <- list('metrics' = list(recpd=recpd,
    #                                                  span=span,
    #                                                  cluster=cluster[[2]],
    #                                                  #Note, longevity is taken
    #                                                  #as the median of trait
    #                                                  #lineage branches, leading
    #                                                  #from gain nodes to both
    #                                                  #presence state tips and
    #                                                  #descendant loss nodes (if
    #                                                  #present).
    #                                                  longevity=stats::median(evo$longevity),
    #                                                  lability=evo$lability3),
    # 'node_state' = node_state_merge,
    # 'ns_old' = list(tip_state=node_state[(1:ape::Ntip(tree))],
    #                 branch_state=branch_state,
    #                 node_state=node_state[-(1:ape::Ntip(tree))]))


    #Update measures, anc_old, and anc_new:
    measures <- rbind.data.frame(measures,
                                 data.frame(feature = rownames(tab)[i],
                                            prevalence = sum(tab[i,]),
                                            recpd = signif(recpd, 3),
                                            nrecpd = signif(nrecpd, 3),
                                            span = signif(span, 3),
                                            cluster = signif(cluster, 3),
                                            longevity = signif(longevity, 3),
                                            lability = signif(lability, 3))
                                 )


    anc_old[[rownames(tab)[i]]] <- list(tip_state=node_state[(1:ape::Ntip(tree))],
                                        branch_state=branch_state,
                                        node_state=node_state[-(1:ape::Ntip(tree))])

    anc_new[[rownames(tab)[i]]] <- node_state_merge
  }

  #Remove uncalculated measures from the measures data.frame:
  if(calc == FALSE) measures <- measures[, 1:4]

  #Assign anc_old, anc_new, and tree as attributes of the measures data.frame:
  measures <- structure(measures,
                        anc_old = anc_old,
                        anc_new = anc_new,
                        tree = tree)

  return(measures)
}


####
##The following functions are helpers called by recpd_calc().
####

##nn_nodes() - Identify Nearest Neighbour Tips Descended From Each Internal Node
#of the Phylogenetic Tree. The Preliminary Step For Nearest-Neighbours Ancestral
#State Reconstruction (required by the node_state_nn()).

#For each internal node of the tree, identifying the subset of tips of each
#child-node descendant, then ompare the pairwise phylogenetic distances between
#each tip subset, and select those tips which are: 1) Equal to the minimum
#pairwise distance, or 2) <= median pairwise distance.

nn_nodes <- function(tree, option=c('min', 'median')){
  #Get the pairwise tip distance matrix
  #dist <- cophenetic.phylo(tree)

  #Get the distances between tips and nodes:
  dist <- ape::dist.nodes(tree)

  #Get internal node indices
  nodes <- ape::Ntip(tree) + seq(1, ape::Nnode(tree))
  node_sp <- list()

  for(n in nodes){
    #Get descendant children nodes for each internal node
    descend <- phangorn::Descendants(tree, n, type='children')

    i <- which(descend <= ape::Ntip(tree))

    #Check if descendant nodes are tips:
    #If nodes include 2 tips:
    if(length(i) > 1){

      t1 <- descend[1]
      t2 <- descend[2]

      close_t <- descend

    }
    #If only one descendant is a tip, get the descendant tips of the other child
    #internal node
    else if(length(i) == 1){
      #If nodes include 1 tip:
      t1 <- descend[i]

      #Get the descendant tips of the other child descendant node:
      t2 <- unlist(phangorn::Descendants(tree, descend[-i], type='tips'))

      #Get their pairwise evolutionary distances:
      d_sub <- dist[t1, t2]

      #Find the closest tip to the child descendant node:
      if(option[1] == 'min'){
        close_t <- c(t1, as.numeric(names(d_sub)[which(d_sub == min(d_sub))]))
      }
      #Or, find the pairs of tips <= the 25%-tile (?median) pairwise distance:
      else{
        close_t <- c(t1, as.numeric(names(d_sub)[which(d_sub <= stats::median(d_sub))]))
      }

    }
    #If both descendant nodes are internal-nodes, get the descendant tips for
    #each:
    else{
      #If nodes are internal, find the nearest neighbour tips between the
      #descendant nodes:
      t1 <- unlist(phangorn::Descendants(tree, descend[1], type='tips'))
      t2 <- unlist(phangorn::Descendants(tree, descend[2], type='tips'))

      #Get their parwise evolutionary distances:
      d_sub <- dist[t1, t2]

      #Take the closest related tips:
      if(1){

        #Find the closest tip pair to the child descendant node:
        if(option[1] == 'min'){
          m_ind <- which(d_sub == min(d_sub), arr.ind=TRUE)
          #close_t <- as.numeric(c(rownames(d_sub)[m_ind[1,1]], colnames(d_sub)[m_ind[1,2]]))
        }
        #Or, find the pairs of tips <= the 25%-tile (?median) pairwise distance:
        else{
          m_ind <- which(d_sub < stats::median(d_sub), arr.ind=TRUE)
          #close_t <- as.numeric(c(rownames(d_sub)[unique(m_ind[,1])], colnames(d_sub)[unique(m_ind[,2])]))

        }
        close_t <- as.numeric(c(rownames(d_sub)[unique(m_ind[,1])], colnames(d_sub)[unique(m_ind[,2])]))
      }
      else{
        #Or choose the closest related pairs of strains between descendant nodes.
        m_ind <- apply(d_sub, 1, function(x) which(x == min(x)))
        close_t <- as.numeric(c(rownames(d_sub)), unique(as.numeric(colnames(d_sub)[m_ind])))
      }
    }

    #Order the pair of tips based on increasing distance to the given node:
    #Sort tips by increasing distance to their parent node:
    d <- dist[n, close_t]

    #Store closest related descendant species of a node:
    node_sp[[as.character(n)]] <- tree$tip.label[close_t[order(d)]] #list(close_t, tree$tip.label[close_t[order(d)]], sort(d))
  }

  return(node_sp)
}


####
#Ancestral Reconstruction Functions of Internal Node Trait States, Used For
#Branch Selection to Calculate RecPD and Associated Metrics:
# node_state_nn() - The Default / Original Implemenatation Based on Nearest-Neighbours:
####

node_state_nn <- function(tree, t, node_sp){

  #Define a function to assign states to internal nodes:
  assign_state <- function(j, node_sp, presabs){
    close_sp <- as.character(unlist(node_sp[j]))

    #Assign split states based on the states of nearest neighbour descendant
    #nodes:

    #Version 1: look at the closest related pair of strains descended
    #from each node, ensuring that they come from different child nodes. If they
    #both have the same state, then assign the parent node to that state,
    #otherwise annotate the parent node as a split (potential gain/loss event).

    #Version 2: only look at the first, closest descendant of the parent node.

    s <- sum(presabs[1,close_sp])

    state <- 0
    #If all tips are present state:
    if(s == length(close_sp)){
      state <- 1
    }
    else if(s != 0){
      state <- 2
    }

    return(state)
  }

  #Presence / absence state of locus/trait
  presabs <- ifelse(t != 0, 1, 0)

  #return(presabs)

  #An array to store the states of internal nodes and tips
  node_state <- NULL

  n <- 1:(ape::Ntip(tree) + ape::Nnode(tree))

  for(i in n){
    if(i <= ape::Ntip(tree)){
      node_state <- append(node_state, presabs[1,tree$tip.label[i]])
    }
    else{
      node_state <- append(node_state, assign_state(as.character(i), node_sp, presabs))
    }
  }
  names(node_state) <- n
  return(node_state)

}



#node_state_mpr() - Ancestral Node State Reconstruction Using Most Parsimonious
#Reconstruction: *requires the MPR() function from the 'ape' package.
node_state_mpr <- function(tree, t){
  #Presence / absence state of locus/trait
  c <- ifelse(t[tree$tip.label] != 0, 1, 0)
  names(c) <- 1:ape::Ntip(tree)

  #Add the internal node labels to the tree, replacing


  #Use the Most Parsimonious Reconstruction (MPR) function in the ape package to
  #assign ancestral states for a given T3SE to all nodes in the tree. The MPR
  #algorithm will produce a matrix of nodes (in their order of appearance in the
  #tree?) and their corresponding lower and upper value character state
  #prediction for each node. Therefore, in the case of a binary discrete
  #character, e.g. presence = 1, absence = 0:

  #- Nodes with [1,1] indicate that the ancestral state of the is predicted to be present.
  #- Nodes with [0,0] indicate the ancestral state of the trait is predicted to be absent.
  #- Nodes with [0,1] indicate an ambiguous ancestral state.

  #Note: the tree must be unrooted.

  #The states assigned according to the appearance of parent nodes in the edge
  #matrix of the unrooted tree: Note, have to supply an unrooted tree but also
  #designate a tip to use as a root. Given that midpoint rooted and chronograms
  #(usually) do not have a tip to designate as a root, so the resulting
  #node-state mappings have to be remapped to the original tree to be visualized
  #properly. To make this process easier by preserving the original clade
  #structuring of the tree, employ the following trick:

  #Replace node labels (usually bootstrap supports) with node IDs,
  #So output from mpr() is intelligible.
  tree$node.label <- (ape::Ntip(tree)+1):(ape::Ntip(tree) + ape::Nnode(tree))

  #Add an artificial outgroup, attaching it to the root node of the original
  #tree, then root the new tree on it, which will preserve the :
  r_tree <- phytools::bind.tip(tree, 'outgroup', edge.length=1)
  r_tree <- ape::root(r_tree, 'outgroup')

  r <- 'outgroup'

  #Make all branch lengths the same length.
  r_tree$edge.length <- rep(1, length(r_tree$edge.length))

  #Add the outgroup to the original tip presence/absence state array:
  t2 <- c(t, `outgroup`=0)

  #Now perform MPR:
  parsim <- ape::MPR(ifelse(t2[r_tree$tip.label] != 0, 1, 0), ape::unroot(r_tree), r)

  #Assign node labels to the node parsimony state array according to their
  #appearance in the rooted tree (note: in some imported trees, the original
  #node labels assigned by MPR are the node bootstrap support values, but
  #generally they will be in the order of the node numbers of the tree).

  #Convert the table into an array, assigning states based on correspondence of
  #lower and upper state predictions.
  node_state <- NULL
  node_state <- apply(parsim, 1, function(x) if(x[1] == x[2]){return(x[1])}else{return(2)})

  #Assign internal node IDs to the node_state array
  names(node_state) <- rownames(parsim)

  #Also append the presence/absence states of the tips:
  node_state <- append(c, node_state)

  #Check if there are any presence state tips without ancestral nodes
  #annotated: will occur for trait distributions of low prevalence,
  #or if the trait appears in a clade where states are largely absent.
  #If there are, assign as a split (= 2).

  a <- phangorn::Ancestors(tree, which(c == 1), type='parent')

  i <- which(node_state[as.character(a)] == 0)

  if(length(i) != 0) node_state[as.character(a)[i]] <- 2

  return(node_state)
}



#node_state_ace() -  Ancestral Node State Reconstruction Using ML (Ancestral
#Character Estimation): *Requires the ace() function from the 'ape' package.
node_state_ace <- function(tree, t, model_type='ER', kappa=1, lik_cut=0.9){
  #lik_cut - a cutoff for selecting states based on the scaled state likelihood
  #of each node.

  #This will effect the corresponding node state assignments, if it is too high,
  #all nodes will be splits.

  #Presence / absence state of locus/trait
  c <- ifelse(t[tree$tip.label] != 0, 1, 0)
  names(c) <- 1:ape::Ntip(tree)

  #ace will throw an error if there are 0 or negative length branches.
  #Convert all of these edges to a minimum edge length. Let's say 1e-10
  if(length(which(tree$edge.length == 0)) != 0){
    tree$edge.length[which(tree$edge.length == 0)] <- 1e-9
  }

  #Results will be quite different if branch lengths are all set to 1, and RecPD
  #appears to be close to the other approaches
  #tree$edge.length <- rep(1,length(tree$edge.length))

  #Equal Rates assumed for state transitions?
  if(model_type == 'ER'){
    ace_res <- ape::ace(ifelse(t[tree$tip.label] != 0, 1, 0),
                        tree,
                        type='discrete',
                        method="ML",
                        model='ER',
                        marginal=TRUE,
                        kappa=kappa)
  }
  else if(model_type == 'ARD'){
    #Unequal Rates?

    #The 'model' argument allows you to specify the number of
    #different rate categories (integer) to describe transitions between states.
    #Rates are estimated using ML using the best fit-model.

    #Note that given the size of the tree and locus/state tip distribution, some
    #rates (e.g. gains) might not be estimated. Changing the scaling of the
    #branch-lengths (kappa) will result in a multiple rates being estimated,
    #although this means that the estimated rates have to be rescaled. This
    #rescaling also changes the resulting likelihood estimates as well.... The
    #intuition is that by raising the branch-lengths of a tree by a certain
    #exponent to approximate speciational change leads to better ancestral
    #character estimates (This applies also to the ER model).

    ace_res <- ape::ace(ifelse(t[tree$tip.label] != 0, 1, 0),
                        tree,
                        type='discrete',
                        method="ML",
                        model='ARD',
                        marginal=TRUE,
                        kappa=kappa)
  }


  #Ancestral state scaled likelihoods:
  anc <- ace_res$lik.anc

  #An array for storing node states
  node_state <- NULL

  #Assign presence, absence, or split state to nodes if their corresponding
  #state likelihoods are >= lik_cut, <= 1-lik_cut, or not, respectively.
  for(i in 1:nrow(anc)){
    if(anc[i,1] >= lik_cut){
      node_state <- append(node_state, 0)
    }
    else if(anc[i,1] <= 1-lik_cut){
      node_state <- append(node_state, 1)
    }
    else{
      node_state <- append(node_state, 2)
    }
  }

  #Nodes in sequence:
  n_ord <- seq(ape::Ntip(tree) + 1, ape::Ntip(tree) + ape::Nnode(tree))

  names(node_state) <- n_ord

  #Add the tip node states
  node_state <- append(c, node_state)

  return(node_state)
}





#node_ident5() - consolidation of previously generated phylogenetic tree
#ancestral state node state assignments.

#Assigns split state nodes to gain or loss events, if they correspond to
#the following ancestor -> descendant branch state changes:

#E.g. : branch state ancestor -> branch state descendant = node state:
#absent -> present = gain node,
#present -> absent = loss node.

#Also merges any loss or gain nodes of idential state which occurr within the
#same phylogenetic tree lineages.

node_ident5 <- function(tree, node_state, branch_state){
  #Get the tip states:
  tip_state <- node_state[1:ape::Ntip(tree)]

  #Get the internal node states:
  node_state <- node_state[-(1:ape::Ntip(tree))]

  #Get the edge matrix of the tree:
  edge <- tree$edge


  #Identify state change nodes based on changes of ancestral and descendant
  #branches of each node:

  #Go through the edge matrix, for each parent node, get its ancestral edge, and
  #check if the branch states change, if they do then keep that node as a state
  #change.

  #Define an array for storing updated node states:
  ns_merge <- array(rep(NA, ape::Nnode(tree)),
                    dimnames=list((ape::Ntip(tree) + 1):(ape::Ntip(tree) + ape::Nnode(tree))))


  for(i in 1:nrow(edge)){
    #Get parent and child nodes for each edge:
    p <- edge[i,1]
    c <- edge[i,2]
    #Exclude the root:
    if(p != ape::Ntip(tree) + 1){
      #Get the ancestral node:
      a <- phangorn::Ancestors(tree, p, 'parent')

      #Find the ancestral edge:
      j <- which(edge[,1] == a & edge[,2] == p)

      #Compare branch states of the ancestral and decendant branches of the parent node:

      #If the state changes, then assign the node state to:

      #Node = Gain if ancestral branch is absent (0), and descendent branch is
      #present (1) = gain lingeage branches

      #Node = Loss if ancestral branch is present (1), and descendent branch is
      #absent (0) = loss lineage branches

      if(branch_state[i] == 1 & branch_state[j] == 0) ns_merge[as.character(p)] <- 1
      if(branch_state[i] == 0 & branch_state[j] == 1) ns_merge[as.character(p)] <- 0
    }
  }

  #Keep only the nodes where states change:
  rm <- which(is.na(ns_merge))
  ns_merge <- ns_merge[-rm]

  #Also check if the root node is assigned to an absence state. Then check
  #descendant nodepaths from the root which are all absent and assign to -1
  #state, i.e. absent lineages.

  r <- which(edge[,1] == ape::Ntip(tree) + 1)

  if(branch_state[r[1]] == 0 | branch_state[r[2]] == 0){
    #Get all nodepaths:
    np <- ape::nodepath(tree)

    #Traverse the nodepaths from the root, and identify any nodes on absent
    #lineages, assign them to -1:
    bs_abs <- list()

    for(i in 1:length(np)){
      x <- np[[i]]
      #Get the edge indices for consecutive nodes in each nodepath:
      j <- which(edge[,1] %in% x[-length(x)] & edge[,2] %in% x[-1])

      bs <- branch_state[j]
      #Find the consecutive absent state edge indices,
      #i.e. where does the branch state change to a gain?
      k <- which(bs[-length(bs)] != bs[-1])

      if(bs[1] == 0){
        if(length(k) != 0){
          bs_abs[[i]] <- j[1:k[1]]
        }
        else{
          bs_abs[[i]] <- j
        }
      }

      #Also assign any tips to absent states if they appear on a absent lineage
      #Descended directly from the root:
      if(bs[length(j)] == 0 & length(k) == 0){
        tip_state[as.character(i)] <- -1
      }
    }

    #Assign the identified branches to absent state:
    branch_state[unique(unlist(bs_abs))] <- -1
  }

  #Add the root node if at least one descendant branch has been assigned as presence:
  r_d <- which(edge[,1] == ape::Ntip(tree) + 1)

  if(length(which(branch_state[r_d] == 1)) != 0){
    ns_merge <- append(array(1, dimnames=list(ape::Ntip(tree) + 1)), ns_merge)
  }

  return(list(tip_state=tip_state,
              branch_state=branch_state,
              node_state=ns_merge)
  )
}





#branch_state_assign() - branch state annotation function based on the ancestral
#state reconstruction of neighbouring internal nodes of the species phylogenetic
#tree.

branch_state_assign <- function(tree, node_state){
  branch_state <- NULL

  for(j in 1:nrow(tree$edge)){
    e <- tree$edge[j,]
    e1 <- e[1]
    e2 <- e[2]

    s1 <- node_state[e1]
    s2 <- node_state[e2]

    #If the branch ancestor node is the root, only include it if its node state
    #is present (=1)?

    #Call branches present if ancestor and child nodes are both assigned as
    #presence, split, or a combination of presence/split:

    #Call branches absent if ancestor and child nodes are both assigned as
    #absence, split->absence, or absence->split.

    #Only keep branches leading to split nodes if its parent is a gain, and vice
    #versa?

    if(s1 != 2 & s1 == s2){
      branch_state <- append(branch_state, as.numeric(s1))
    }
    # else if(s1 != 2 && s2 == 2){
    #   branch_state <- append(branch_state, 1)
    # }
    else if(s1 != 0 && s2 != 0){
      branch_state <- append(branch_state, 1)
    }
    else{
      branch_state <- append(branch_state, 0)
    }
  }

  return(branch_state)
}




#prune_state() - prune excessive internal gain lineage branches resulting from
#nearest-neighbours annotation:

#1) Get the gain nodes, and identify their children descendant branch states:
#2) Remove them if the descendant branches are split between loss and gain
#lineages;
#3) Stop at the last split state node before a gain (both children descendant
#branches present state), or if one of the children nodes is a presence-state
#tip;
#4) Also, reannotate any internal branches which are descended from an absence
#state ancestral node.

prune_state <- function(tree, states){

  #Input: the recpd_res[[i]]$node_state list

  #Extract the previously defined node, branch, and tip states:
  ns <- states$node_state
  bs <- states$branch_state
  ts <- states$tip_state

  edge <- tree$edge

  #Define variables for updated node, branch, and tip states:
  ns_new <- ns
  bs_new <- bs
  ts_new <- ts


  #Get the gain nodes:
  gain_n <- as.numeric(names(ns)[ns == 1])

  #Exclude any gain nodes which have only a pair of tip descendants:
  nt_d <- unlist(lapply(phangorn::Children(tree, gain_n),
                        function(x) length(which(x <= ape::Ntip(tree)))))

  #Condition for checking single gain nodes:
  if(length(gain_n) == 1 & length(nt_d) == 2) gain_n <- NULL else gain_n <- gain_n[which(nt_d != 2)]

  #Need a condition for pruning lineages descended from the root...
  #For now, just remove them:
  #if((Ntip(tree) + 1) %in% gain_n) gain_n <- gain_n[-which(gain_n == Ntip(tree) + 1)]


  #An array for storing any modified/pruned gain nodes:
  rm_nodes <- NULL


  if(length(gain_n) != 0){
    for(n in gain_n){
      while(1){
        #Get the children nodes and branch states of the present gain node:
        c <- phangorn::Children(tree, n)
        c_branch <- which(edge[,1] %in% n & edge[,2] %in% c)

        #Find which child node exists on the gain lineage based on its ancestral
        #branch state:
        branch_pres <- which(bs[c_branch] == 1)

        #Get the ancestral node and its branch state leading to the current
        #node:
        a <- phangorn::Ancestors(tree, n, 'parent')
        a_branch <- which(edge[,1] %in% a & edge[,2] %in% n)


        #If the current node is a split (one child node is connected with a
        #present, the other absent branch), then remove the given node, and reassign
        #its branch state based on the anestral branch-state leading to it:
        if(length(branch_pres) == 1){
          #Remove the current node:
          rm_n <- which(names(ns_new) == as.character(n))
          if(length(rm_n) != 0) ns_new <- ns_new[-rm_n]

          #Store the initially annotated gain nodes which will be removed to
          #later update their descendant branches:
          if(n %in% gain_n) rm_nodes <- append(rm_nodes, n)

          #Reassign ancestral branch states:
          #If the current node is the root, then use the node state of the other
          #child node:
          if(n == ape::Ntip(tree) + 1){
            bs_new[c_branch[branch_pres]] <- bs_new[c_branch[-branch_pres]]

          }
          #Otherwise use the branch-state leading to its ancestral node:
          else{
            bs_new[c_branch[branch_pres]] <- bs_new[a_branch]

          }

          #Update the current node to the present state child:
          n <- edge[c_branch[branch_pres], 2]
        }
        #If the children descendant nodes are both on the present state
        #lineages, remove the node, then assign its ancestral node as a gain and
        #add its branch as present (if not root node):
        else{
          if(n != (ape::Ntip(tree) + 1)){
            #Add the ancestral node and branch leading to the current node:
            ns_new[as.character(a)] <- 1
            bs_new[a_branch] <- 1
            ns_new <- ns_new[order(as.numeric(names(ns_new)))]

            #Remove the current node:
            ns_new <- ns_new[!names(ns_new) %in% as.character(n)]
          }
          break
        }
      }
    }


    #Now that excess present state branches have been trimmed, check if any
    #previously inferred loss lineages should be changed to absent state.

    #Define a function to update branch state annotations on the tree by
    #checking all loss state branches and changing them to the state of their
    #ancestral branch.
    state_change <- function(n){
      #Get the nodepath of the node to the root of the tree:
      np <- ape::nodepath(tree, n, ape::Ntip(tree) + 1)

      #Get the edge indicies corresponding to parent - child node pairs from the
      #root to the given node:
      e <- which(edge[,2] %in% np[1:(length(np) - 1)] &
                   edge[,1] %in% np[2:length(np)])

      #Check if a branch state change occurs between the node to root:
      i <- which(bs_new[length(e):2] != bs_new[(length(e) - 1):1])

      #If so, then update the parent and child descendant branch states of the
      #current node to absent:
      if(length(i) == 0){

        c <- phangorn::Children(tree, n)

        if(length(c) != 0){
          e <- which(edge[,1] %in% n & edge[,2] %in% c)

          e <- e[which(bs_new[e] != 1)]

          if(length(e) != 0){
            #Update branch states
            bs_new[e] <<- -1

            #Also update tip states if descendants are tips:
            t <- which(edge[e,2] <= ape::Ntip(tree))

            #Note, have convert the edge-tip indicies to their appropriate tip numbers;
            if(length(t) != 0) ts_new[as.character(edge[e[t],2])] <<- -1

            for(n in c){
              state_change(n)
            }
            #print(c)
          }
        }
      }
    }

    #First check if the root state is absent:
    r_state <- bs_new[which(edge[,1] == ape::Ntip(tree) + 1)]

    #Reassign the branches for the pruned internal gain-state nodes previously
    #identified to the corresponding state of their ancestral branches:
    if(length(which(r_state == -1)) != 0){
      for(n in rm_nodes){
        if(!n %in% ape::Ntip(tree) + 1) state_change(n)
      }
    }
  }



  #Do a bit of cleaning up: After pruning feature gain lineages, sometimes an
  #orphaned loss lineage may be left behind:
  #Identify any loss state tips which do not have an associated ancestral loss event node,
  #If so, change their associated tip and branch states to absent.

  #Or can replace this with a check for removed loss nodes in the code above...

  if(length(which(bs_new == 0)) != 0){

    #Get the loss nodes
    loss_n <- as.numeric(names(ns_new[which(ns_new == 0)]))

    #If loss state branches exist, but no loss state nodes,
    #then assign these branches to absent:
    if(length(loss_n) == 0){
      bs_new[which(bs_new == 0)] <- -1
    }
    #Otherwise, if any loss state nodes exist:
    else{
      #Check that all loss branch states parent-child nodes are found in the
      #descendant nodes of the loss lineages. If any aren't, set their states to
      #absent:

      #Merge loss nodes as well as all their descendant nodes together:
      l_d <- c(loss_n, unlist(phangorn::Descendants(tree, loss_n, 'all')))

      #Get loss state branches:
      b_l <- which(bs_new == 0)


      #Check if each parent - child node pair of the loss state branches is found
      #in the array of loss node descendants:
      rm_e <- which(sapply(edge[b_l,],
                          function(x){
                            length(which(x %in% l_d))
                          }) == 0)

      if(length(rm_e) != 0){
        bs_new[b_l[rm_e]] <- -1

        #Also get tip descendants:
        rm_t <- which(edge[b_l[rm_e],2] <= ape::Ntip(tree))

        ts_new[as.character(edge[b_l[rm_e][rm_t], 2])] <- -1
      }
    }
  }

  #Old:
  #Remove any of the remaining gain nodes which have a pair of present state
  #child descendant branches:

  #if(length(rm_n)) ns_new <- ns_new[!names(ns_new) %in% rm_n]

  #if(length(rm_nodes) != 0) ns_new <- ns_new[-which(names(ns_new) %in% rm_nodes)]
  #print(rm_nodes)

  return(list(tip_state = ts_new,
              node_state = ns_new,
              branch_state = bs_new)
  )
}
cedatorma/recpd documentation built on April 25, 2022, 10:14 p.m.