Nothing
################################################################################
## ##
## fit_threshold (v1.0) ##
## ##
## Created by Julien Clavel - 01-09-2025 ##
## (julien.clavel@hotmail.fr/ julien.clavel@univ-lyon1.fr) ##
## ##
## ##
################################################################################
## -------------------------------------------------------------------------------------- ##
# Environmental Threshold Model for binary character - Julien Clavel 2023 - julien.clavel@univ-lyon1.fr/julien.clavel@cnrs.fr
# Uses Hiscott et al. 2016 (GBE) pruning algorithm with numerical integration to approximate the log-likelihood (adaptated from Mathlab code).
# The algorithm is used to compute ML estimate of Felsenstein' threshold model with the climatic model described in Clavel & Morlon 2017 (PNAS), the BM, EB, OU and lambda models
# "phylo" is an object of class phylo (see ?ape)
# data is vector of a discrete (binary) trait (order should match the order in the tree)
# N = is the number of knots used to approximate the integral (through numerical integration) in the calculation of the partial likelihoods. The higher, the better, at the expense of computation time.
# env_data is an environmental curve object. See ?fit_t_env in RPANDA package for further details
## -------------------------------------------------------------------------------------- ##
# NOTE: code is quite slow, should implement the numerical integration in C or find some alternatives
# Workhorse function for the various Threshold models
fit_t_thresh <- function(phylo, data, model=c("BM","OU","EB","lambda","Clim"), N=200, env_data=NULL, ...){
# Reorder the tree => tree to phylo !!!
if(!inherits(phylo,"postorder")) phylo<-reorder.phylo(phylo,"postorder")
# reorder the trait vector according to the tree
if(is.null(names(data))){
stop("You should provide a named vector for \"data\" ")
}else{
data<-data[phylo$tip.label]
}
# TODO: check data type, should be a two-state character
# check model
model = match.arg(model[1],c("BM","OU","EB","lambda","Clim"))
## Use ellipsis for param arguments
para<-list(...)
tot_time = 0; # only used for the Clim model
# switch between methods
switch(model,
"BM"={
fun_to_optim <- function(par){
ll <- thresholdLikelihoodGauss(phylo, data, N, par[1])
return(-ll)
}
# parameter to optimize: only the threshold value
start_guess = 0
lower = -Inf
upper = Inf
},
"EB"={
fun_to_optim <- function(par){
if(abs(par[1])>.Machine$double.eps) phy2 <- transform_EB(phylo, beta=par[1], sigmasq=1) else phy2<-phylo
ll <- thresholdLikelihoodGauss(phy2, data, N, par[2])
if(!is.finite(ll)) return(1e6)
return(-ll)
}
maxHeight <- max(node.depth.edgelength(phylo))
start_guess <- c(-log(2)/(maxHeight/1.5), 0)
lower <- c(log(10^-5)/maxHeight, -Inf)
upper <- c(0, Inf)
},
"OU"={
if(!is.ultrametric(phylo)) stop("This model is currently not suported for non-ultrametric trees")
if(is.null(para[["alpha"]])) start_guess = log(2)/(max(node.depth.edgelength(phylo))/1.5) else start_guess = para$alpha
fun_to_optim <- function(par){
phy2 <- .phyOU(phylo, par[1])
ll <- thresholdLikelihoodGauss(phy2, data, N, par[2])
if(!is.finite(ll)) return(1e6)
return(-ll)
}
# define default upper bound for alpha parameter search (~25 phylogenetic half-lives)
start_guess <- c(start_guess, 0)
upper <- c(log(2)/(max(node.depth.edgelength(phylo))/25), Inf)
lower <- c(1e-10,-Inf)
},
"lambda"={
fun_to_optim <- function(par){
phy2 <- lambda_transform(phylo, par=par[1])
ll <- thresholdLikelihoodGauss(phy2, data, N, par[2])
return(-ll)
}
# define the bounds for the parameter search
upper <- c(1, Inf)
lower <- c(1e-6, -Inf)
# start_guess
start_guess = c(0.5, 0)
},
"Clim"={
# Control for the integrate function: number of subdivisions
if(is.null(para[["subdivisions"]])){
subdivisions<-200L
}else{
subdivisions<-para$subdivisions
}
# Max difference between tips and present day
if(is.null(para[["maxdiff"]])){
maxdiff<-0
}else{
maxdiff<-para$maxdiff
}
# Compute the branching times
if(is.ultrametric(phylo)){
times<-branching.times(phylo)
}else{
# Use "phytools" called by mvMORPH
times<-max(nodeHeights(phylo))-nodeHeights(phylo)[match(1:phylo$Nnode+Ntip(phylo),phylo$edge[,1]),1]
names(times)<-1:(phylo$Nnode+Ntip(phylo))
}
# Root age
tot_time<-max(times)
# Set the root to zero
times<-tot_time-times
# Check if the climatic function is provided
if(is.null(env_data)){
stop("Please provide a time-function or a time-serie dataset for the environmental changes; see ?fit_t_env")
}else if(!is.function(env_data)){
# We check first that the climatic curve matches the phylogeny
if(max(nodeHeights(phylo)) > max(env_data[,1])) stop("The environmental data does not cover the time span of the phylogeny: either enter data that covers the full time span or run analyses on younger clades")
# env_data is a dataframe with two columns: 1) is time; 2) is datapoints
if(is.null(para[["df"]])){
para$df <- smooth.spline(env_data[,1], env_data[,2])$df
}
spline_result <- sm.spline(env_data[,1],env_data[,2], df=para$df)
env_func <- function(t){predict(spline_result,t)}
# if we don't provide a time step in par we take the time steps of the dataset?
t<-unique(env_data[,1])
# If the ClimLin model is used we should standardize the environmental curve: i.e. sigma is the maximum rate value.
# the user can choose by specifying it in the "par" list
if(is.null(para[["scale"]])){
para$scale <- FALSE
}
# We build the interpolated smoothing spline function
if(para$scale==FALSE){
env_data<-splinefun(t,env_func(t))
}else{
curve_int<-env_func(t)
curve_scaled=scale(curve_int,min(curve_int,na.rm=T),max(curve_int, na.rm=T)-min(curve_int,na.rm=T))
env_data<-splinefun(t,curve_scaled)
}
# Otherwise we assume that the environnemental function is given
}
# wrapper to the main Clim-transform function
transClim <- function(tree, beta, sigma=1){
transformClim_thresh(tree, param=c(sigma,beta), fun=env_data, times=times, check=FALSE, mtot=tot_time, subdivisions=subdivisions, maxdiff=maxdiff, model="EnvExp")
}
# Function to optimize
fun_to_optim <- function(par){
phy2 <- transClim(phylo, beta=par[1])
if(any(!is.finite(phy2$edge.length))) return(1e6)
ll <- thresholdLikelihoodGauss(phy2, data, N, par[2])
if(!is.finite(ll)) return(1e6)
return(-ll)
}
# define bounds for parameters search
upper = c(Inf,Inf)
lower = c(-Inf,-Inf)
start_guess = c(0., 0)
})
# Optimization of the threshold model log-likelihood
opt <- optim(par=start_guess, fn = fun_to_optim, method="L-BFGS-B", lower=lower, upper=upper)
# results
nparam = length(opt$par)
nobs = length(data)
AIC = 2*opt$value+2*nparam
AICc = AIC+((2*nparam*(nparam+1))/(nobs-nparam-1))
# estimated parameters
switch(model,
"BM"={parameters <- data.frame(sigma=1, mu=opt$par[1])},
"lambda"={parameters <- data.frame(lambda=opt$par[1], mu=opt$par[2])},
"OU"={parameters <- data.frame(alpha=opt$par[1], mu=opt$par[2])},
"EB"={parameters <- data.frame(a=opt$par[1], mu=opt$par[2])},
"Clim"={parameters <- data.frame(beta=opt$par[1], mu=opt$par[2])})
# return the results
results <- list(LH=-opt$value,
free.parameters=nparam,
param=parameters,
aic=AIC, aicc=AICc, opt=opt, model=model, convergence=opt$convergence, env_func=env_data, tot_time=tot_time)
class(results) = "threshML"
return(results)
}
# pruning algorithm with numerical integration
thresholdLikelihoodGauss <- function(phylo, data, N=200, mu=0){
if(!inherits(phylo,"postorder")) phylo = reorder.phylo(phylo, "postorder")
# parameters
n = Ntip(phylo)
nnodes = 2*n-1
nchar = length(unique(data))
C = model.matrix(~as.factor(data)+0)
# vecteurs, matrices
L = numeric(nnodes)
U = numeric(nnodes)
X = matrix(0, nrow=nnodes, ncol=N+1)
x = (0:N)/N # for linear interpolation
# truncation error for each individual integral
alpha = n*N^(-4)/(2*(n-1))
# populate X value for the root? given or fixed
X[n+1,] = mu
# populate L, U and X
for(i in (n+2):nnodes){
L[i] = qnorm(alpha, mu, sqrt(sum(phylo$edge.length[hit_tip(i,phylo)])))
U[i] = qnorm(1-alpha, mu, sqrt(sum(phylo$edge.length[hit_tip(i,phylo)])))
X[i,] = (U[i] - L[i])*x+L[i]
}
# parameters
logL = 0 # log of the product of the likelihoods
like = numeric(nchar)
pair = seq(from=1, to=(2*n-4), by=2) # to (2*N-2) but (2*N-4) avoid updating the ancestral state
edge1 = phylo$edge[,1]
edge2 = phylo$edge[,2]
for(ch in 1:nchar){
z = C[,ch] # traits values observed at the leaf for characters in C
F = matrix(1, nnodes, N+1) #F(i,x) = likelihood of the data at or directly
#below node i, given a liability of x at i's
#parent node.
# tips values
for(i in 1:n){
tip_i = which(edge2==i)
j = edge1[tip_i] # parent of i
if(z[i]==1){
# integrates over positive liabilities
F[i,] = 1 - pnorm(0, X[j,], sqrt(phylo$edge.length[tip_i]))
}else if(z[i]==0){
# integrates over negative liabilities
F[i,] = pnorm(0, X[j,], sqrt(phylo$edge.length[tip_i]))
}
}
# pruning down the tree (all internal nodes excluding the root)
for(h in pair){
i = edge1[h] # node "i"
i_index = which(edge2==i) # index of node "i"
j = edge1[i_index] # parent of node "i"
u = edge2[h] # left descendent of node "i"
v = edge2[h+1] # right descendent of node "i"
if(phylo$edge.length[i_index]<=.Machine$double.eps){
F[i,] = F[u,]*F[v,]
}else{
# calculate the integral using a kernel (e.g., simpson, gaussian)
weights = buildWeightsSet(X[i,],X[j,], phylo$edge.length[i_index])
F[i,] <- tcrossprod(F[u,]*F[v,], weights) # that is: weights%*%(F[u,]*F[v,]); must check if it's faster than using 'for' loops
}
}
# log-likelihood at the root (assumes it's a fixed value)
u = which(edge1==n+1)
# We have L_r = U_r, so X_r[k] is constant in k.
like[ch] = F[edge2[u[1]],1]*F[edge2[u[2]],1]
# check first that the value is safe?
if(!is.finite(like[ch])){
warning("NaN were produced - there were an issue in the likelihood computation")
return(-Inf)
}
if(like[ch]>0){
logL = logL + log(like[ch])
}else{
logL = -Inf
}
}
return(logL)
}
# recover the path from root to tips
hit_tip <- function(tip, tree){
# parameters
N <- Ntip(tree)
edge1 = tree$edge[,1]
edge2 = tree$edge[,2]
i=2
list_ind <- list()
list_ind[[1]] <- tip
while(tip!=(N+1)){
list_ind[i] <- tip <- edge1[tree$edge[,2]==tip]
i = i+1
}
node_path<-unlist(list_ind)
index = which(tree$edge[,2]%in%node_path[-length(node_path)])
return(index)
}
# compute weights for the Gaussian kernel quadrature
buildWeightsSet <- function(X, mu, sig2){
#weights calculated for approximating \int phi(x,mu,sig2) f(x) dx
N = length(X)-1;
w = matrix(0,length(X),length(X)); #weight array
h = X[2] - X[1]; # spacing between two consecutive points.
f = matrix(0, nrow=3, ncol=(N/2)+1); #matrix of antiderivatives used to calculate the weights.
sig = sqrt(sig2);
index = seq(from=1,to=(N+1),by=2)
x = matrix(X[index], nrow=N+1, ncol=N/2+1, byrow = TRUE)
MU = matrix(t(mu), ncol=N/2+1, nrow = length(X))
#f(1,i) = antiderivative of phi(x,mu,sig2) at x = X(2*i - 1)
f1 = .5*erf((x-MU)/(sqrt(2*sig2)));
#f(2,i) = antiderivative of x*phi(x,mu,sig2) at x = X(2*i - 1)
f2 = -sig2*dnorm(x, MU, sig) + MU*f1;
#f(3,i) = antiderivative of x^2*phi(x,mu,sig2) at x = X(2*i - 1)
f3 = (-2*sig*((x+MU)*exp(-((x-MU)^2)/(2*sig2))) + 2*sqrt(2*pi)*(MU^2 + sig2)*f1)/(2*sqrt(2*pi));
x = x[,-ncol(x)];
x2 = x^2;
F1 = f1[,-1] - f1[,-ncol(f1)]# Note that \int f(x) dx [a, b] = F(b) - F(a) where F is the antiderivative of f
F2 = f2[,-1] - f2[,-ncol(f2)]
F3 = f3[,-1] - f3[,-ncol(f3)]
z = matrix(0, ncol=1, nrow=N+1)
w[,seq(from=1, to=ncol(w), by=2)] = (.5*(((2*h^2)+3*h*cbind(x,0)+cbind(x2,0))*cbind(F1,0) + (cbind(0,x)*(cbind(0,x+h))*cbind(0,F1))) - .5*(((3*h)+(2*cbind(x,0)))*cbind(F2,0) + ((2*cbind(0,x))+h)*cbind(0,F2)) + .5*(cbind(F3,0) + cbind(0,F3)))/(h^2);
w[,seq(from=2, to=ncol(w), by=2)] = (-((x2+2*h*x)*F1)+(2*(x+h)*F2)-F3)/(h^2);
return(w)
}
# the so-called 'error function'
erf <- function(x) 2 * pnorm(x * sqrt(2)) - 1
# Function for the Pagel's lambda model. Stretches the branch lengths
lambda_transform <- function(phylo, par){
n <- Ntip(phylo)
parent <- phylo$edge[,1]
descendent <- phylo$edge[,2]
extern <- (descendent <= n)
# Pagel's lambda tree transformation
if(par!=1) {
root2tipDist <- node.depth.edgelength(phylo)[1:n] # for non-ultrametric trees. The 'up' limit should be exactly 1 to avoid singularity issues
phylo$edge.length <- phylo$edge.length * par
phylo$edge.length[extern] <- phylo$edge.length[extern] + (root2tipDist * (1-par))
}
return(phylo)
}
# Wrapper to the .CLIMtransform function used in fit_t_env
transformClim_thresh<-function(phylo, model=c("EnvExp", "EnvLin"), ...){
# Use mvMORPH for computing the log-likelihood
#require(mvMORPH)
## Parameterization
par<-list(...)
# Default model
if(!is.function(model)){
model<-model[1]
}
# Number of tips
tips <- length(phylo$tip.label)
# Check if the climatic function is provided
if(is.null(par[["fun"]])){
stop("Please provide a time-function")
}else if(!is.function(par$fun)){
# env_data is a dataframe with two columns: 1) is time; 2) is datapoints
if(is.null(par[["df"]])){
par$df <- smooth.spline(par$fun[,1], par$fun[,2])$df
}
spline_result <- sm.spline(par$fun[,1],par$fun[,2], df=par$df)
env_func <- function(t){predict(spline_result,t)}
# if we don't provide a time step in par we take the time steps of the dataset?
t<-unique(par$fun[,1])
# If the ClimLin model is used we should standardize the environmental curve: i.e. sigma is the maximum rate value.
# the user can choose by specifying it in the "par" list
if(is.null(par[["scale"]])){
# We build the interpolated smoothing spline function
par$fun<-splinefun(t,env_func(t))
}else{
curve_int<-env_func(t)
curve_scaled=scale(curve_int,min(curve_int,na.rm=T),max(curve_int, na.rm=T)-min(curve_int,na.rm=T))
par$fun<-splinefun(t,curve_scaled)
}
# Otherwise we assume that the environnemental function is given
}
# Check if the branching time is provided
if(is.null(par[["times"]])){
warning("The branching time for the \"phylo\" object was not provided by the user")
par$times<-branching.times(phylo)
# root age
mtot<-max(par$times)
# Set the root to zero
par$times<-max(par$times)-par$times
}else{
# root age
mtot<-par$mtot
}
# Check if the difference between tip and present day.
if(is.null(par[["maxdiff"]])){
maxdiff<-0
}else{
maxdiff<-par$maxdiff
}
# Check if the tree is in prunning-wise order
if(is.null(par[["check"]])){
par$check<-TRUE
}
# Check if there is polytomies
if (phylo$Nnode != tips - 1) {
stop("You can't use this function with polytomies, transform the tree using \"multi2di\" function first")
}
# Control for the integrate function: number of subdivisions
if(is.null(par[["subdivisions"]])){
subdivisions<-200L
}else{
subdivisions<-par$subdivisions
}
## Transform the tree and return the log-likelihood
# Check the parameters
if(is.null(par[["param"]])) {
stop("Please provide parameters values for \"sig2\" and \"beta\" ")
}
# Sigma is not provided but analytically computed instead
phylo <- .CLIMtransform(phylo, param=par$param, mtot=mtot, times=par$times, funEnv=par$fun, model=model, tips=tips, subdivisions=subdivisions, maxdiff=maxdiff)
return(phylo)
}
# Print the results
print.threshML<-function(x,...){
cat("Threshold model",x$model,"\n")
cat("AIC :",x$aic,"\n")
cat("AICc:",x$aicc,"\n")
cat("Log-Likelihood:",x$LH,"\n")
cat("______________________","\n")
cat("Parameters:","\n")
print(unlist(x$param), digits=3)
cat("______________________","\n")
if(x$opt$convergence==0){cat("Succesful convergence","\n")}else{cat("Convergence has not been reached","\n")}
#if(x$hess.value==0 & x$opt$convergence==0){cat("A reliable solution has been reached","\n")}else{cat("Unreliable solution (Likelihood at a saddle point)","\n")}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.