In Dmirandae/blepd: Branch Length Evaluation in Phylogenetic Diversity Analysis

I am investigating whether branch lengths in the phylogenetic tree have any effect on the area(s) chosen for conservation. To do this, I will conduct a Phylogenetic Diversity (PD) analysis for the Rhynoclemmys genus. The topology I will use corresponds to a Total Evidence analysis from @alarcon2020a, and the distribution will be modified from @le2008.

We read the distribution and the tree

## For reproductibility purposes

set.seed(121)


options(warn=-1)
library(ggplot2)
suppressMessages(library(ggtree))
#~ library(ape)
suppressMessages(library(blepd))

#~ library(ggtree) 

## Just in case, to install ggtree: 
## https://bioconductor.org/packages/release/bioc/html/ggtree.html
####
#  if (!require("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("ggtree")

## Version

cat("Analyses made with blepd version:",unlist(packageVersion("blepd")))



## Create an object to place the distribution and the tree

RhinoclemmysData <- list()


## Read data

## distribution is a labeled csv file, areas by terminals 


#setwd("./csv/")

## getwd()


csvFile <- list.files(pattern="csv")

##csvFile


### the functions use a matrix object for the distributions

RhinoclemmysData$distribution <-  as.matrix(read.table(csvFile,
                                  stringsAsFactors=FALSE,
                                  header=TRUE,
                                  row.names=1,
                                  sep=",")
                                  )


print(t(RhinoclemmysData$distribution))



## tree(s) in nexus or newick format

##setwd("../tree/")

treeFiles <- list.files(pattern=".tre")

##treeFiles


RhinoclemmysData$tree  <-    read.tree(treeFiles)


## name of tree(s)

treeFiles <- gsub(".tre","",treeFiles)





## Plotting


#par(mfrow=c(2,1))


## The tree

## using ggtree

# RhinoclemmysData$tree <- reorder(RhinoclemmysData$tree, order = "cladewise")

plotTree <-  ggtree(RhinoclemmysData$tree, ladderize=TRUE,
                    color="black", size=0.51, linetype="solid") +
             geom_tiplab(size=4, color="black") +
             xlim(0,0.030) +
             theme_tree2() +
             ggtitle(treeFiles[1])



##
print(plotTree)


#~ Alternatively, we can plot the trees using APE
##
## 
plot.phylo(RhinoclemmysData$tree)
##
####nodelabels()
####tiplabels()



## to plot the distribution, we must transform it into a data.frame 
## object 


## distXY <- matrix2XY(RhinoclemmysData$distribution)


## We could reorder the data.frame following the names on the trees

## terminals <- colnames(RhinoclemmysData$distribution)

## realOrder <- match(terminals,RhinoclemmysData$tree$tip.label)

## equivalencias <- data.frame(terminals,realOrder)

## dXY2 <-  distXY

## for(cambiar in terminals){
##    dXY2$Terminal[distXY$Terminal == cambiar]     <-
##    equivalencias$realOrder[equivalencias$terminals==cambiar]
##}

## distGraficar <- distXY

#~ distGraficar <- dXY2


## plot using ggplot

# plotDistrib <- ggplot(distGraficar,
#                       aes(x= Area, y=Terminal), size =30) +
#                geom_point(shape=19, fill="white", 
#                           color="darkgrey", 
#                           size=4) +
#                labs(title = "Distribution",
#                     y = "",
#                     x = "Area") +
#                theme(axis.line=element_blank(),
#                       # axis.text.y=element_blank(), 
#                    axis.ticks=element_blank(), 
#                       # axis.title.y=element_blank(), 
#                    legend.position="none", 
#                    panel.background=element_blank(), 
#                    panel.border=element_blank(), 
#                    panel.grid.major=element_blank(), 
#                    panel.grid.minor=element_blank(), 
#                    plot.background=element_blank()
#                    ) 
# 



#~ 

## cowplot::plot_grid(plotTree,plotDistrib, ncol=1)

#~ 
plot(plotTree)

#~ plot(plotDistrib)

#~ 
print(t(RhinoclemmysData$distribution))


##dev.off()

Now we plot the branch lengths

library(ggplot2)

##par(mfrow=c(2,1))

datos <- RhinoclemmysData$tree$edge.length
data <- as.data.frame(datos)

a <- ggplot2::ggplot(data, aes(x=datos)) +
     geom_histogram(bins = 12) +
     labs(title = treeFiles, x = "Branch length: internals and terminals", y = "Frequency") +
     xlim(c(0.001, 0.025)) +
     ylim(c(0, 5))



terminals <- RhinoclemmysData$tree$edge[,2] < 1 + length(RhinoclemmysData$tree$tip.label)

datos <- RhinoclemmysData$tree$edge.length[terminals]
data <- as.data.frame(datos)

b <- ggplot(data, aes(x=datos)) +
  geom_histogram(bins = 12) +
  labs(x = "Branch length: terminals", y = "Frequency") +
  xlim(c(0.001, 0.025)) +
  ylim(c(0, 5))



datos <- RhinoclemmysData$tree$edge.length[!terminals]
data <- as.data.frame(datos)

c <- ggplot(data, aes(x=datos)) +
  geom_histogram(bins = 12) +
  labs(x = "Branch length: internals", y = "Frequency") +
  xlim(c(0.001, 0.025)) +
  ylim(c(0, 5))


  ## 

cowplot::plot_grid(a, b, c, nrow=3)

##
#
#cat("Terminals, with BL larger than 0.02 = ",findTerminalgivenLength(RhinoclemmysData$tree,0.02))
#
##


###dev.off()

The branch length histograms and the tree plot show that the internal length branches are similar among each other, and different to terminals'; there are two longer branches (larger than 0.02), R. nasuta (inhabiting Cho) and R. rubida (Pl), while the areas Al and Cho are the richest.

PD values

First, I calculate the PD value for the areas, given this tree.

RhinoclemmysData$tablePD <- PDindex(RhinoclemmysData$tree,
                              distribution=RhinoclemmysData$distribution,
                                    root=TRUE,
                                    percentual = TRUE)



RhinoclemmysData$matrixPD <- as.data.frame(
                                  matrix(
                                      unlist(RhinoclemmysData$tablePD),
                                      nrow= length(treeFiles),
                                      byrow=TRUE))



colnames(RhinoclemmysData$matrixPD) <- row.names(RhinoclemmysData$distribution)

row.names(RhinoclemmysData$matrixPD) <- "PD value"

 new <- (t(RhinoclemmysData$matrixPD))

 ## officially I am an Idiot

values <-  new[order(new[,1])]
names  <-  rownames(new)[order(new[,1])]

print(as.data.frame(paste(names,values)))


## cat("Max value is:",max(t(RhinoclemmysData$matrixPD)),"Area:",colnames(RhinoclemmysData$matrixPD)[which.max(t(RhinoclemmysData$matrixPD))])

The highest PD is for area Cho, followed by Al, the two richest areas, and the difference in PD value is not given by the richness but the species inhabiting each area.

Effect of branch lengths in the PD

We test the effect of the branch length on the PD values by swapping terminal and/or internal branch lengths, using the three available models.

 for( modelo in c("simpleswap","allswap","uniform") ){

        for( rama in c("terminals","internals","all") ){

            val <- swapBL(RhinoclemmysData$tree,
                          RhinoclemmysData$distribution,
                          model = modelo,
                         branch = rama
               )

         cat( "\n\t Tree=",treeFiles,"\n\t Model=",modelo,
             "\n\t Branchs swapped=",rama,"\n" )

        printBlepd(val)

         cat( "\n\n\n" )

     }

  }

The terminal branch length is critical in the decision taken, if we swap all terminal branch lengths (model= "allswap"), or if we replace them with a uniform distribution (model= "uniform"), the area selected might change from Cho to Al.

As the terminal branch lengths are distributed unequally, we might suspect that the results could depend on the longest branches that inhabit the areas Cho/Al.

But first, we test if the number of replicates has any effect.

for(repetir in 1:2){

            val <- swapBL( RhinoclemmysData$tree,
                           RhinoclemmysData$distribution,
                           model = "allswap",
                           branch = "terminals",
                           nTimes = 10**repetir
                         )

            printBlepd(val)

}

Roughly speaking, from 1000 on the results are alike, and the largest difference is using 10 or 100 replicates. As a rule of thumb, we must use at least 100 replicates, but 1000 will be better.

Now, let us see if the difference in results could be assigned to the longest branches (or not).

## we could use a single command (*evalBranch*)

AllTerminals <- evalBranch(tree=RhinoclemmysData$tree,
                           distribution=RhinoclemmysData$distribution,
                           branchToEval="terminals", 
                           approach="all")

print.evalBranchAll(AllTerminals)


AllInternals <- evalBranch(tree=RhinoclemmysData$tree,
                           distribution=RhinoclemmysData$distribution,
                           branchToEval="internals", 
                           approach="all")

print.evalBranchAll(AllInternals)

The area selected depends on the branch length. While the internal terminals had almost no impact, if the terminal branch length of R. nasuta is -47.18\% shorter, or if the terminal branch length is 84.95% larger for R. areolata, the area selected will change from Cho to Al, and if the terminal branch length of R. rubida is 61.32\% larger, the area selected will change from Cho to Pl.