The following document presents some results from testing the new ape::read.tree function from the ape package version r packageVersion("ape"). The first test consists basically on reading a tree with a singleton, which is a new feature of the package. The second tests compares the performance in terms of time that ape::read.tree takes relative to rncl::read_newick_phylo which has show to be significantly faster.

Data

The data used here is the PANTHER database version 11.1 (which you can get here). You can download a more recent version from the pantherdb website.

Setup

We use the microbenchmark R package together with the ape and rncl R packages

library(microbenchmark)
knitr::opts_chunk$set(echo=TRUE, message = FALSE, warning = FALSE)
# Listing panther trees
tree.files <- list.files(
  "~/PANTHER11.1/books/", pattern = "tree.tree",
  recursive=TRUE, full.names = TRUE)

Reading trees with singletons

This is a tree that I know for sure that has a singleton. Previous version of ape (before 4.1-0.14) returned with an error.

a_tree_with_singleton <- "
((((((((((((((AN14:0.000,AN15:0.000):0.000[&&NHX:Ev=0>1:S=Homo-Pan:ID=AN13],(AN17:0.000,AN18:0.000):0.000[&&NHX:Ev=0>1:S=Murinae:ID=AN16]):0.000[&&NHX:Ev=0>1:S=Euarchontoglires:ID=AN12],(AN20:0.000,AN21:0.000):0.000[&&NHX:Ev=0>1:S=Laurasiatheria:ID=AN19]):0.000[&&NHX:Ev=0>1:S=Eutheria:ID=AN11],AN22:0.004):0.003[&&NHX:Ev=0>1:S=Mammalia:ID=AN10],AN23:0.000):0.014[&&NHX:Ev=0>1:S=Amniota:ID=AN9],AN24:0.017):0.021[&&NHX:Ev=0>1:S=Tetrapoda:ID=AN8],((AN27:0.066,AN28:0.052):0.010[&&NHX:Ev=1>0:ID=AN26],(AN30:0.031,AN31:0.014):0.003[&&NHX:Ev=1>0:ID=AN29]):0.021[&&NHX:Ev=0>1:S=Teleostei:ID=AN25]):0.147[&&NHX:Ev=0>1:S=Osteichthyes:ID=AN7],AN32:0.194):0.104[&&NHX:Ev=0>1:S=Deuterostomia:ID=AN6],((AN35:0.020,AN36:0.038):0.416[&&NHX:Ev=0>1:S=Caenorhabditis:ID=AN34],((AN39:0.119,AN40:0.131):0.154[&&NHX:Ev=0>1:S=Insecta:ID=AN38],AN41:0.317):0.104[&&NHX:Ev=0>1:S=Arthropoda:ID=AN37]):0.118[&&NHX:Ev=0>1:S=Ecdysozoa:ID=AN33],AN42:0.381):0.594[&&NHX:Ev=0>1:S=Bilateria:ID=AN5],AN43:2.000):0.414[&&NHX:Ev=0>1:S=Eumetazoa:ID=AN4],((((((((AN52:0.484,AN53:0.427):0.418[&&NHX:Ev=0>1:S=Saccharomycetaceae:ID=AN51],AN54:0.733):0.320[&&NHX:Ev=0>1:S=Saccharomycetaceae-Candida:ID=AN50],AN55:0.808):0.290[&&NHX:Ev=0>1:S=Saccharomycetales:ID=AN49],(AN57:0.793,((AN60:0.206,AN61:0.186):0.143[&&NHX:Ev=0>1:S=Sordariomyceta:ID=AN59],AN62:0.331):0.332[&&NHX:Ev=0>1:S=Sordariomycetes-Leotiomycetes:ID=AN58]):0.467[&&NHX:Ev=0>1:S=Pezizomycotina:ID=AN56]):0.432[&&NHX:Ev=0>1:S=Pezizomycotina-Saccharomycotina:ID=AN48],AN63:0.864):0.223[&&NHX:Ev=0>1:S=Ascomycota:ID=AN47],(AN65:0.613,AN66:0.930,AN67:0.755):0.332[&&NHX:Ev=0>1:S=Basidiomycota:ID=AN64]):0.618[&&NHX:Ev=0>1:S=Dikarya:ID=AN46],((((((AN74:0.337,AN75:0.406):0.298[&&NHX:Ev=0>1:S=Saccharomycetaceae:ID=AN73],AN76:0.578):0.338[&&NHX:Ev=0>1:S=Saccharomycetaceae-Candida:ID=AN72],AN77:0.567):0.304[&&NHX:Ev=0>1:S=Saccharomycetales:ID=AN71],(AN79:0.379,((AN82:0.398,AN83:0.403):0.191[&&NHX:Ev=0>1:S=Sordariomyceta:ID=AN81],AN84:0.366):0.140[&&NHX:Ev=0>1:S=Sordariomycetes-Leotiomycetes:ID=AN80]):0.457[&&NHX:Ev=0>1:S=Pezizomycotina:ID=AN78]):0.331[&&NHX:Ev=0>1:S=Pezizomycotina-Saccharomycotina:ID=AN70],AN85:0.737):0.128[&&NHX:Ev=0>1:S=Ascomycota:ID=AN69],(AN87:0.599,AN88:0.698):0.161[&&NHX:Ev=0>1:S=Basidiomycota:ID=AN86]):0.398[&&NHX:Ev=0>1:S=Dikarya:ID=AN68]):0.117[&&NHX:Ev=1>0:ID=AN45],(AN90:0.984,AN91:0.893):0.158[&&NHX:Ev=1>0:ID=AN89]):0.262[&&NHX:Ev=0>1:S=Fungi:ID=AN44]):0.394[&&NHX:Ev=0>1:S=Opisthokonts:ID=AN3],(AN93:0.457,AN94:0.462):0.860[&&NHX:Ev=0>1:S=Dictyostelium:ID=AN92]):0.301[&&NHX:Ev=0>1:S=Unikonts:ID=AN2],AN95:1.440,AN96:2.000):2.000[&&NHX:Ev=0>1:S=Eukaryota:ID=AN1])[&&NHX:Ev=1>0:ID=AN0];
"

# Reading using the ape 4.1-0.14 -----------------------------------------------
ans_ape  <- ape::read.tree(text=a_tree_with_singleton)

# Reading using rncl 0.8.2 -----------------------------------------------------

# For some weird reason, rncl::read_newick_phylo prints a progress bar when
# the document is been knitted.
tmptree <- tempfile()
cat(a_tree_with_singleton, file=tmptree)
invisible(capture.output(
  ans_rncl <- rncl::read_newick_phylo(tmptree)
  ))
file.remove(tmptree)

# APE should have an extra node
ans_ape
ans_rncl

# Which is a singleton!
ape::has.singles(ans_ape)
ape::has.singles(ans_rncl)

Speed benchmark

For this benchmark, we will read all r length(tree.files) phylogenetic trees available in PANTHER 11.1

# A function to (partially) read PANTHER trees
read_panther <- function(x, f) {
  x       <- readLines(x, n=1L)
  tmptree <- tempfile()
  cat(x, file=tmptree)
  ans <- f(tmptree)
  file.remove(tmptree)
  ans
}

# Checking how it works with 100 samples
set.seed(1984)
n            <- length(tree.files) # 2000
tree.samples <- sample(tree.files, n)

library(parallel)
cl <- makeForkCluster(10)

ans <- parLapply(cl, tree.samples, function(tree) {
  microbenchmark(
    ape  = read_panther(tree, ape::read.tree),
    rncl = read_panther(tree, rncl::read_newick_phylo),
    times = 1, unit = "ms"
  )
})
stopCluster(cl)

tab <- lapply(ans, function(x) {
  # A bad way of reshaping the data...
  with(x, data.frame(
    ape  = time[expr == "ape"],
    nrcl = time[expr == "rncl"]
    ))
})

tab <- do.call(rbind, tab)
oldpar <- par(no.readonly = TRUE)
par(las = 2)
boxplot(tab/1e6, log = "y", ylab = "Time in Milliseconds (log scale)",
        main = "Distribution of reading times reading\nall the trees from PANTHER 11.1",
        sub = sprintf("Each boxplot represents elapsed times on reading %i trees.", length(tree.files))
        )
par(oldpar)

While ape seems to be faster, it is not significantly faster than rncl.

Session info

devtools::session_info()


USCbiostats/phylogenetic documentation built on Oct. 28, 2023, 7:23 a.m.