R/fit_bm_model.R
In castor: Efficient Phylogenetics on Large Trees

Documented in fit_bm_model

# Fit a Brownian Motion model of continuous trait evolution for one or more traits
# If more than one trait is modelled, then this function estimates the full diffusivity matrix D (=sigma^2/2)
# 	D is a non-negative definite symmetric matrix of size Ntraits x Ntraits, 
# 	such that exp(-X^T*D^{-1}*X/(4*L))/sqrt(det(2*pi*D)) is the probability density for the multidimensional trait vector X after phylogenetic distance L, if initially located at the origin.
# The main inputs are a list of trees (thought to be generated by the same BM process) and the corresponding numerical tip states
fit_bm_model = function(trees, 					# either a single tree in phylo format, or a list of trees
						tip_states,				# Numerical trait values for the tips of each tree. If trees[] is a single tree, then tip_states[] should be either a 1D vector of size Ntips or a 2D matrix of size Ntips x Ntraits. If trees[] is a list of trees, then tip_states should either be a list of 1D vectors or a 2D matrix of size Ntips x Ntraits.
						isotropic	= FALSE,	# logical, specifying whether diffusivity is assumed to be isotropic (i.e., independent of the direction). Hence, if isotropic=TRUE, then the diffusivity matrix is forced to be diagonal, with all entries being equal.
						Nbootstraps	= 0,		# integer, number of bootstraps to perform for estimating confidence intervals. Set to <=0 for no bootstrapping.
						check_input	= TRUE){
	# basic input checking
	if("phylo" %in% class(trees)){
		# trees[] is actually a single tree
		trees = list(trees)
		Ntrees = 1
		if(!(("list" %in% class(tip_states)) && (length(tip_states)==1))){
			tip_states = list(tip_states)
		}
	}else if("list" %in% class(trees)){
		# trees[] is a list of trees
		Ntrees = length(trees)
		if("list" %in% class(tip_states)){
			if(length(tip_states)!=Ntrees) return(list(success=FALSE,error=sprintf("Input list of tip_states has length %d, but should be of length %d (number of trees)",length(tip_states),Ntrees)))
		}else if("numeric" %in% class(tip_states)){
			if(Ntrees!=1) return(list(success=FALSE,error=sprintf("Input tip_states was given as a single vector, but expected a list of %d vectors (number of trees)",Ntrees)))
			if(length(tip_states)!=length(trees[[1]]$tip.label)) return(list(success=FALSE,error=sprintf("Input tip_states was given as a single vector of length %d, but expected length %d (number of tips in the input tree)",length(tip_states),length(trees[[1]]$tip.label))))
			tip_states = list(tip_states)
		}else if("matrix" %in% class(tip_states)){
			if(Ntrees!=1) return(list(success=FALSE,error=sprintf("Input tip_states was given as a single matrix, but expected a list of %d matrixes (number of trees)",Ntrees)))
			if(nrow(tip_states)!=length(trees[[1]]$tip.label)) return(list(success=FALSE,error=sprintf("Input tip_states was given as a matrix with %d rows, but expected %d rows (number of tips in the input tree)",nrow(tip_states),length(trees[[1]]$tip.label))))
			tip_states = list(tip_states)
		}
	}else{
		return(list(success=FALSE,error=sprintf("Unknown data format '%s' for input trees[]: Expected a list of phylo trees or a single phylo tree",class(trees)[1])))
	}
	if(is.null(Nbootstraps) || is.na(Nbootstraps) || (Nbootstraps<0)) Nbootstraps = 0;
	scalar = is.vector(tip_states[[1]])
	Ntraits	= (if(scalar) 1 else ncol(tip_states[[1]]))
	
	if(check_input){
		# check tree quality
		for(i in seq_len(Ntrees)){
			if(any(trees[[i]]$edge.length<0)) return(list(success=FALSE,error=sprintf("Tree #%d includes negative edge lengths",i)))
		}
	}
	
	# loop through the trees and extract independet contrasts
	# X will be a matrix of size Ncontrasts x Ntraits, each row of which lists a single independent contrast, i.e. geodistance divided by the square root of the corresponding phylodistance
	phylodistances 	= numeric()
	X				= matrix(nrow=0, ncol=Ntraits)
	for(i in seq_len(Ntrees)){
		pic_results 	= get_independent_contrasts(trees[[i]], tip_states[[i]], scaled = TRUE, only_bifurcations = FALSE, include_zero_phylodistances=FALSE, check_input = check_input);
		phylodistances 	= c(phylodistances, pic_results$distances)
		if(scalar){
			X = rbind(X, matrix(pic_results$PICs,ncol=1))		
		}else{
			X = rbind(X, pic_results$PICs)
		}
	}
	Ncontrasts = length(phylodistances)
			
	# estimate diffusivity based on vectorial increments
	if(scalar){
		diffusivity 	= 0.5 * mean(X**2)
		loglikelihood	= -0.25 * sum((X**2)/diffusivity) - 0.5*Ncontrasts*log(2*pi) - 0.5*sum(log(2*phylodistances)) - 0.5*Ncontrasts*log(diffusivity)
		Nparams			= 1
	}else{
		if(isotropic){
			# fit a single diffusivity for all directions
			diffusivity = (0.5/Ntraits) * mean(sapply(1:Ncontrasts, FUN=function(k) sum(X[k,]**2)), na.rm=TRUE)
			diffusivity = diag(diffusivity, Ntraits, Ntraits)
			Nparams = 1
		}else{
			# fit an arbitrary diffusivity matrix (constrained to be symmetric and non-negative definite)
			# maximum-likelihood estimator on the intrinsic geometry of positive-definite matrices
			diffusivity = matrix(0, ncol=Ntraits, nrow=Ntraits);
			for(t1 in 1:Ntraits){
				for(t2 in t1:Ntraits){
					diffusivity[t1,t2] = 0.5 * mean(X[,t1]*X[,t2])
					diffusivity[t2,t1] = diffusivity[t1,t2]; 
				}
			}
			Nparams = (Ntraits*(Ntraits+1))/2
		}
		inverse_diffusivity = solve(diffusivity)
		loglikelihood = -0.25 * sum(sapply(1:Ncontrasts, function(p) X[p,,drop=FALSE] %*% inverse_diffusivity %*% t(X[p,,drop=FALSE]))) - 0.5*Ntraits*Ncontrasts*log(2*pi) - 0.5*Ntraits*sum(log(2*phylodistances)) - 0.5*Ncontrasts*determinant(diffusivity,logarithm=TRUE)$modulus
	}
	
	#######################################################################
	# estimate confidence intervals if needed, via parametric bootstrapping
	
	if(Nbootstraps>0){
		bootstrap_Ds 	= matrix(NA,nrow=Nbootstraps,ncol=Ntraits*Ntraits)
		bootstrap_LLs	= rep(NA,times=Nbootstraps)
		for(b in 1:Nbootstraps){
			# simulate model with fitted diffusivity, along each tree
			bootstrap_tip_states = vector(mode="list", Ntrees)
			for(i in 1:Ntrees){
				bootstrap_tip_states[[i]] = castor::simulate_bm_model(	tree			= trees[[i]], 
																		diffusivity		= diffusivity,
																		include_tips	= TRUE, 
																		include_nodes	= FALSE, 
																		root_states 	= NULL,
																		Nsimulations	= 1,
																		drop_dims		= TRUE)$tip_states
			}

			# fit diffusivity using bootstrap-simulation
			fit = fit_bm_model(	trees, 
								tip_states	= bootstrap_tip_states,
								Nbootstraps	= 0,
								check_input = FALSE)
			if(fit$success){
				bootstrap_Ds[b,] = as.vector(fit$diffusivity)
				bootstrap_LLs[b] = fit$loglikelihood
			}
		}
		# calculate standard errors and confidence intervals from distribution of bootstrapped parameters
		standard_errors = sqrt(pmax(0, colMeans(bootstrap_Ds^2, na.rm=TRUE) - colMeans(bootstrap_Ds, na.rm=TRUE)^2))
		quantiles 		= sapply(1:ncol(bootstrap_Ds), FUN=function(p) quantile(bootstrap_Ds[,p], probs=c(0.25, 0.75, 0.025, 0.975), na.rm=TRUE, type=8))
		CI50lower 		= matrix(quantiles[1,],ncol=Ntraits)
		CI50upper 		= matrix(quantiles[2,],ncol=Ntraits)
		CI95lower 		= matrix(quantiles[3,],ncol=Ntraits)
		CI95upper 		= matrix(quantiles[4,],ncol=Ntraits)
		mean_BLL		= mean(bootstrap_LLs,na.rm=TRUE)
		consistency 	= sum(abs(bootstrap_LLs-mean_BLL)>=abs(loglikelihood-mean_BLL),na.rm=TRUE)/sum(!is.nan(bootstrap_LLs))
	}
	
	# return results
	results = list(	success			= TRUE,
					diffusivity		= diffusivity, 
					loglikelihood	= loglikelihood,
					AIC				= 2*Nparams - 2*loglikelihood,
					BIC				= Nparams * log(Ncontrasts) - 2*loglikelihood,
					Ncontrasts		= Ncontrasts,
					phylodistances	= phylodistances)
	if(Nbootstraps>0){
		results$standard_errors	= standard_errors
		results$CI50lower		= CI50lower
		results$CI50upper		= CI50upper
		results$CI95lower		= CI95lower
		results$CI95upper		= CI95upper
		results$consistency		= consistency
	}
	return(results)
}