R/tree_utils.R

Defines functions plot_structured_tree build_coal_tree.structured build_coal_tree

Documented in build_coal_tree build_coal_tree.structured plot_structured_tree

#' Generate a Newick string representation of a tree corresponding to a realisation of a coalescent process.
#' Note: the topology is RANDOM, If you wish for the topology to be consistent, use set.seed before calling this function.
#' @param sampling_times times of leaves.
#' @param coalescent_times times of coalescent events / internal nodes.
#' @return a Newick string corresponding to the tree.
#' @export

build_coal_tree <- function(sampling_times, coalescent_times, leaf_names=NULL,node_name_prefix="N", terminate_string = TRUE) {
  sam_ord <- order(-sampling_times)
  times_desc <- sampling_times[sam_ord]

  if (!is.null(leaf_names)) {
    leaf_names <- leaf_names[sam_ord]
  }

  if (is.null(leaf_names)) {
    tree_nodes <- seq(1, length(sampling_times))
    tree_nodes <- sapply(tree_nodes, function (x) return (paste0("S", x)))
  } else {
    tree_nodes <- leaf_names
  }
  extant_entries <- c(1)
  extant_times <- c(times_desc[1])
  
  coal_idx <- 1
  s_idx <- 1
  t <- times_desc[1]
  if (length(coalescent_times) > 0) {
    coal_times_desc <- coalescent_times[order(-coalescent_times)]
    while (coal_idx <= length(coalescent_times)) {
      if (s_idx < length(times_desc) &&
          (times_desc[s_idx + 1] > coal_times_desc[coal_idx])) {
        s_idx <- s_idx + 1
        t <- times_desc[s_idx]
        extant_entries <- c(extant_entries, s_idx)
        extant_times <- c(extant_times, t)
      } else {
        t <- coal_times_desc[coal_idx]
        coal_node1_idx <-
          trunc(runif(1, 1, length(extant_entries) + 1))
        
        coal_node1 <- extant_entries[coal_node1_idx]
        ct1 <- extant_times[coal_node1_idx]
        
        extant_times <- extant_times[-coal_node1_idx]
        extant_entries <- extant_entries[-coal_node1_idx]
        
        br_len_1 <- ct1 - t
        entry1 <- tree_nodes[coal_node1]
        
        coal_node2_idx <-
          trunc(runif(1, 1, length(extant_entries) + 1))
        coal_node2 <- extant_entries[coal_node2_idx]
        ct2 <- extant_times[coal_node2_idx]
        br_len_2 <- ct2 - t
        entry2 <- tree_nodes[coal_node2]
        
        tree_nodes[coal_node2] <- paste0("(", entry1,
                                         ":", br_len_1,
                                         ",",
                                         entry2,
                                         ":", br_len_2, ")",node_name_prefix,coal_idx)
        extant_times[coal_node2_idx] <- t
        coal_idx <- coal_idx + 1
      }
    }
  }


  if (terminate_string) {
    tree_str <- paste0(tree_nodes[extant_entries[1]], ";")
  } else{
     tree_str <- tree_nodes[extant_entries[1]]
  }

  return(tree_str)
}

#' Generate a Newick string representation of a tree corresponding to a realisation of a structured,linage-based coalescent process.
#' Note: the topology is RANDOM, If you wish for the topology to be consistent, use set.seed before calling this function.
#'
#' @param sampling_times times of leaves.
#' @param coalescent_times times of coalescent events / internal nodes.
#' @param leaf_colours leaf colouring
#' @param coalescent_colours a numeric vector where number at j-th position indicates that the j-th coalescent event corresponds to j-th lineage.
#' @param div_times times of lineage divergence events. 
#' @param div_events a numeric vector where number at j-th position indicates that the j-th divergence time corresponds to j-th lineage
#' @param div_from numeric vector identifying the parent of the j-th diverging lineage
#' @return a list with the Newick string corresponding to the tree, and a list of leaf colouring assignments
#' @export

build_coal_tree.structured <- function(sampling_times, coalescent_times, leaf_colours, coalescent_colours, div_times, div_events, div_from) {

  sam_ord <- order(-sampling_times)
  coal_ord <- order(-coalescent_times)

  coal_times_desc <- coalescent_times[coal_ord]
  times_desc <- sampling_times[sam_ord]

  leaf_colours_desc <- leaf_colours[sam_ord]
  coalescent_colours_desc <- coalescent_colours[coal_ord]

  subtrees <- rep("", length(div_times))

  leaf_subs <- lapply(c(1:length(div_times)), function(x) which(leaf_colours==x))
  node_subs <- lapply(c(1:length(div_times)), function(x) which(coalescent_colours==x))


  ### First generate appropriate subtrees.
  for (i in c(1:length(div_times))) {
    
    ### Subset leafs and coalescent times

    sam_subs <- sampling_times[leaf_subs[[i]]]
    coal_subs <- coalescent_times[node_subs[[i]]]

    leaf_names <- sapply(c(1:length(leaf_subs[[i]])), function (x) paste0("S_",LETTERS[i],x)) 
    node_name_prefix <- paste0("N_", LETTERS[i])

    ### add any divergence event times as sampling times to this lineage, mark them with D[j] where j is the number of lineage diverging
    if(length(div_from) > 0) idx_set <- c(1:length(div_from)) else idx_set <- c()
    for (j in idx_set) {
      if (div_from[j] == i) {

        sam_subs <- c(sam_subs, div_times[j])
        leaf_names <- c(leaf_names, paste0("#D_", j))
      } 
    }

    tree <- build_coal_tree(sam_subs, coal_subs, leaf_names=leaf_names, node_name_prefix=node_name_prefix, terminate_string = FALSE)

    if (div_times[i] > -Inf) {
        if(length(coal_subs) > 0) {
          branch_len <- min(coal_subs)-div_times[i]
        } else {
          if (length(sam_subs) > 1) warning("Illegal state encountered")
          branch_len <- sam_subs[1]
        }
        tree <- paste0("(",tree,":",branch_len,")","X_",LETTERS[div_from[i]],LETTERS[i])
    }
    subtrees[i] <- tree
  }
  
  subtrees.ret <- sapply(subtrees, function (x) paste0(x,";"))

  ### Next build the combined tree
  if(length(div_events) > 1) idx_set <- c(1:(length(div_events)-1)) else idx_set <- c()
  for(i in idx_set){
    child <- div_events[i]
    parent <- div_from[i]

    diverging_tree <- subtrees[child]
    parent_tree <- subtrees[parent]

    subtrees[parent] <- gsub(paste0("#D_", i), diverging_tree, parent_tree)
  }


  tree_str <- paste0(subtrees[length(subtrees)], ";")
  return(list(full=tree_str, subtrees=subtrees.ret))
}

#'Plots a structured tree with different lineages distinguished by different colours, and divergence events highlighted.
#'
#' @param sampling_times a tree including divergence events
#' @param n_lineages number of lineages
#' @return a list with the Newick string corresponding to the tree, and a list of leaf colouring assignments
#' @export
plot_structured_tree <- function(tree, n_lineages){

    if (n_lineages > 1) exp_lins <- c(1:(n_lineages-1)) else exp_lins <- c()

    labs <- c(tree$node.label, tree$tip.label)

    lineages <- lapply(c(1:n_lineages), function (x) labs[grep(paste0("[N,X,S]_",LETTERS[x]), labs)])

    lin_names <- c(sapply(exp_lins, function (x) paste0("expansion ",x)), "neutral")

    lineage_labs <- unlist(lineages)
    membership <- unlist(lapply(c(1:length(lineages)), function (x) rep(lin_names[x], length(lineages[[x]]))))
    type <- sapply(lineage_labs, 
        function (x) if (length(grep("X_", x, value="FALSE"))>0) "Divergence" else if(length(grep("N_", x, value="FALSE"))>0) "Coalescent" else "Sampling")

    ldf <- data.frame(node = nodeid(tree, lineage_labs), clade = membership, type=type)
    ldf$node <- as.numeric(ldf$node)
    ldf$clade <- as.factor(ldf$clade)
    ldf$type <- as.factor(ldf$type)

    tree.full <- full_join(tree, ldf, by = 'node')

    plt<-ggtree(tree.full, aes(color=clade), ladderize=T, size=0.75) +
                    geom_point(aes(shape=type, size=type)) +
                    scale_size_manual(values=c(1.5,3,1.5)) +
                    scale_shape_manual(values=c(1,8,2)) +
                    theme_tree2()
    return(plt)
}
dhelekal/CaveDive documentation built on June 11, 2024, 4:32 p.m.