#Input a phylogenetic tree in the Newick format.
#If a 'leaf' node has no siblings, then the parent of the leaf is a useless 'singleton' internal node.
#Then, merge the branches together to remove the node, until leaves have siblings.
RemoveSingletonTips <- function(Tree) {
Iterativetree <- Tree
while (TRUE) {
#Find and merge one singleton
singletoninfo <- unlist(
strsplit(
sub("^.*([(]([-A-Za-z0-9_<> ]+):([.0-9]+)[)]:([.0-9]+)).*$",
"\\2 \\3 \\4 \\1", Iterativetree)
,
" ")
#Matches ... (Xyz_0:3.43):1.235 ...
#So that the 3.43 and 1.235 are then combined
)
#Note: singletoninfo has the structure
#3.43; 1.235; ...; Xyz_0
if (length(singletoninfo) != 4)
{break}
#New branch length is a simple addition of the branch lengths before and after the internal node to be removed
newbranchlength <-
as.numeric(singletoninfo[2]) + as.numeric(singletoninfo[3])
newbranch <- paste0(singletoninfo[1], ":", newbranchlength)
oldbranch <- singletoninfo[4]
Iterativetree <- sub(oldbranch, newbranch, Iterativetree, fixed = T)
}
return (Iterativetree)
}
#Removes deep internal singletons, such that all subtrees have siblings.
RemoveSingletons <- function(Tree) {
i <- 1
subtrees <- NULL
replacements <- NULL
subtreepattern <- "^.*([(][-A-Za-z0-9_<> ]+:[.0-9]+,[-A-Za-z0-9_,.:<>]+[)]).*$"
#First round of removing singleton tips, in case there are no subtrees at all
Iterativetree <- RemoveSingletonTips(Tree)
while(grepl(pattern = subtreepattern, x = Iterativetree)) {
#Subtree substitution
while(grepl(pattern = subtreepattern, x = Iterativetree)) {
subtree_i <- sub(subtreepattern, "\\1", Iterativetree)
replacement_i <- paste0("<", i, ">")
subtrees <- c(subtrees, subtree_i)
replacements <- c(replacements, replacement_i)
Iterativetree <- sub(subtree_i, replacement_i, Iterativetree, fixed = TRUE)
i <- i + 1
}
#Collapse the next layer
Iterativetree <- RemoveSingletonTips(Iterativetree)
}
#End of the double loop of replacing subtrees and collapsing singletons.
#Now, restore the subtrees back to its original form
if (length(replacements) > 0) {
for (i in length(replacements):1) {
Iterativetree <- sub(pattern = replacements[i],
replacement = subtrees[i],
x = Iterativetree)
}
}
return(Iterativetree)
}
#' Subset a newick tree
#'
#' Simplifies a newick tree by keeping only the specified subset of tips and the internal nodes joining them. Useless internal branches are merged by simple addition.
#' @param tipnames A character vector with the names of the tips to keep, which must be a subset of the tip names in 'newicktree'
#' @param newicktree The newick-format tree. Tip names (leaf node names) must not contain punctuation marks besides dashes or underscores
#' @export
SubsetNewick <- function(tipnames, newicktree) {
tree_protected <- newicktree
#1.
#Get rid of any tips that are not specified in tipnames
#Protect wanted tips
for (i in 1:length(tipnames)) {
header_i <- tipnames[[i]]
protection_i <- paste0("<", i, ">")
tree_protected <-
sub(pattern = header_i, replacement = protection_i,
x = tree_protected, fixed = T)
}
#Remove all non-protected tips
tree_purged <- gsub("[-A-Za-z0-9_ ]+:[.0-9]+,?", "", tree_protected)
#Remove hanging commas
tree_purged <- gsub(",+)", ")", tree_purged)
#Restore headers
for (i in 1:length(tipnames)) {
header_i <- tipnames[[i]]
protection_i <- paste0("<", i, ">")
tree_purged <- sub(pattern = protection_i,
replacement = header_i,
x = tree_purged,
fixed = T)
}
#2.
#Remove exposed internal nodes
uselessinternalnode <- "[(][)][:][.0-9]+,?"
while(grepl(pattern = uselessinternalnode, x = tree_purged)) {
#Removes "():0.123456," (regardless of presence of comma)
tree_purged <- gsub(pattern = uselessinternalnode,
replacement = "",
x = tree_purged)
#Remove hanging commas
tree_purged <- gsub(pattern = ",+)",
replacement = ")",
x = tree_purged)
}
#3.
#If a branch only has one child, then merge the branches together
#to remove the "singleton" internal node.
return(RemoveSingletons(tree_purged))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.