R/model_adequacy_hbd.R
In castor: Efficient Phylogenetics on Large Trees

Documented in model_adequacy_hbd

# assess the adequacy of a (presumably fitted) homogenous-birth-death model to explaining a given (presumably real) timetree
# Each provided model should be a named list with the following elements:
#	ages	: numeric vector of size NG, listing ages in ascending order, covering at least 0 until the root
#	PSR		: numeric vector of size NG, listing the pulled speciation rate at each age-grid point
model_adequacy_hbd = function(	tree, 						# ultrametric timetree of class "phylo"
								models,						# named list specifying a single HBD model, or a list of such lists specifying multiple HBD models. Models are sampled randomly during bootstrapping.
								splines_degree	= 1,		# integer
								extrapolate		= FALSE,	# boolean, specifying whether to extrapolate model variables beyond their original time grid all the way to root_age and present-day if needed (as constants).
								Nbootstraps 	= 1000,		# integer, number of bootstraps to perform e.g. for checking statistical significances
								Nthreads		= 1){		# integer, number of parallel threads to use
	Nbootstraps 	= max(2,Nbootstraps)
	Nthreads 		= pmax(1,Nthreads)
	tree			= castor::multifurcations_to_bifurcations(tree)$tree # make sure tree is bifurcating, so that node & edge counts are the same as for the bootstraps
	Ntips			= length(tree$tip.label)
	Nnodes			= tree$Nnode
	root_age 		= get_tree_span(tree)$max_distance
	
	# check models
	if("list" %in% class(models[[1]])){
		Nmodels = length(models)
	}else{
		models  = list(models)
		Nmodels = 1
	}
	for(m in seq_len(Nmodels)){
		if(!is.null(models[[m]]$ages)){
			if(any(diff(models[[m]]$ages)<0)) return(list(success=FALSE, error=sprintf("Age grid  of model %d is not in increasing order",m)))
			if(models[[m]]$ages[1]>0){
				# the model's age grid does not cover the present-day
				if(extrapolate){
					models[[m]]$ages = c(0,models[[m]]$ages)
					models[[m]]$PSR  = (if(is.null(models[[m]]$PSR)) NULL else c(models[[m]]$PSR[1],models[[m]]$PSR))
				}else{
					return(list(success=FALSE, error=sprintf("Age grid must cover the present-day (age 0), but only reaches %g in model %d. Consider setting extrapolate=TRUE.",models[[m]]$ages[1],m)))
				}
			}
			if(tail(models[[m]]$ages,1)<root_age){
				# the model's age grid does not cover the root
				if(extrapolate){
 					models[[m]]$ages = c(models[[m]]$ages,root_age*1.001)
 					models[[m]]$PSR  = (if(is.null(models[[m]]$PSR)) NULL else c(models[[m]]$PSR,1.001*tail(models[[m]]$PSR,1)))
				}else{
					return(list(success=FALSE, error=sprintf("Age grid must cover the root age (%g), but only reaches %g in model %d. Consider setting extrapolate=TRUE.",root_age,tail(models[[m]]$ages,1),m)))
				}
			}
		}
	}	
	if(!(splines_degree %in% c(0,1,2,3))) return(list(success = FALSE, error = sprintf("Invalid splines_degree (%d): Expected one of 0,1,2,3.",splines_degree)))

	# calculate various statistics for the provided tree
	tree_gamma 		= gamma_statistic(tree)
	tree_Colless	= tree_imbalance(tree, type="Colless")
	tree_Sackin		= tree_imbalance(tree, type="Sackin")
	tree_LTT		= count_lineages_through_time(tree,Ntimes=10*log2(Ntips));
	tree_LTT$ages 	= pmax(0,root_age-tree_LTT$times)
	tree_node_ages	= root_age - get_all_distances_to_root(tree)[(1+Ntips):(2*Ntips-1)]
	tree_node_sizes	= as.numeric(count_tips_per_node(tree))
	tree_Blum		= sum(log(tree_node_sizes))
		
	# simulate bootstrap trees
	# for every unique model, perform all replicate simulations at once for efficiency
	bootstrap_trees = vector(mode="list", Nbootstraps)
	bootstrap2model = sample.int(n=Nmodels, size=Nbootstraps, replace=TRUE)
	for(m in seq_len(Nmodels)){
		bootstraps_this_model = which(bootstrap2model==m)
		if(length(bootstraps_this_model)==0) next
		sims = generate_tree_hbd_reverse(	Ntips			= Ntips,
											crown_age		= root_age,
											age_grid		= models[[m]]$ages,
											PSR				= models[[m]]$PSR,
											splines_degree	= splines_degree,
											Ntrees			= length(bootstraps_this_model))
		if(!sims$success) return(list(success=FALSE, error=sprintf("Failed to simulate bootstrap trees%s: %s",(if(Nmodels==1) "" else sprintf(" (model %d)",m)),sims$error)))
		bootstrap_trees[bootstraps_this_model] = sims$trees
	}
	
		
	# analyze bootstrap trees in parallel										
	analyze_bootstrap = function(b){
		bootstrap_tree = bootstrap_trees[[b]]	
		node_sizes = count_tips_per_node(bootstrap_tree)
		return(list(edge_lengths	= bootstrap_tree$edge.length,
					node_ages		= root_age - get_all_distances_to_root(bootstrap_tree)[(1+Ntips):(2*Ntips-1)],
					node_sizes		= as.numeric(node_sizes),
					Blum			= sum(log(node_sizes)), # Blum and Francois (2006). Which random processes describe the Tree of Life? A large-scale study of phylogenetic tree imbalance. Systematic Biology. 55:685-691.
					gamma 			= gamma_statistic(bootstrap_tree),
					Colless			= tree_imbalance(bootstrap_tree, type="Colless"),
					Sackin			= tree_imbalance(bootstrap_tree, type="Sackin"),
					LTT				= count_lineages_through_time(bootstrap_tree,times=tree_LTT$times)$lineages))
	}
	if((Nbootstraps>1) && (Nthreads>1) && (.Platform$OS.type!="windows")){
		# run trials in parallel using multiple forks
		# Note: Forks (and hence shared memory) are not available on Windows
		bootstraps = parallel::mclapply(seq_len(Nbootstraps), 
										FUN = analyze_bootstrap, 
										mc.cores = min(Nthreads, Nbootstraps), 
										mc.preschedule = TRUE, 
										mc.cleanup = TRUE)
	}else{
		# run in serial mode
		bootstraps = vector(mode="list", Nbootstraps)
		for(b in seq_len(Nbootstraps)){
			bootstraps[[b]] = analyze_bootstrap(b)
		}
	}
	if(is.null(bootstraps)) return(list(success=FALSE, error="An unknown error caused all bootstraps to fail"))
	
	# calculate P-value of gamma statistic
	bootstrap_gammas		= sapply(seq_len(Nbootstraps), FUN=function(b) bootstraps[[b]]$gamma)
	bootstrap_mean_gamma 	= mean(bootstrap_gammas, na.rm=TRUE)
	bootstrap_std_gamma 	= sd(bootstrap_gammas, na.rm=TRUE)
	Pgamma 					= mean(abs(bootstrap_gammas-bootstrap_mean_gamma)>=abs(tree_gamma-bootstrap_mean_gamma),na.rm=TRUE)
	RESgamma				= (tree_gamma-bootstrap_mean_gamma)/abs(bootstrap_mean_gamma) # relative effect size of gamma statistic
	SESgamma				= (tree_gamma-bootstrap_mean_gamma)/bootstrap_std_gamma # standardized effect size of gamma statistic
	
	# calculate P-value of Colless imbalance statistic
	bootstrap_Colless		= sapply(seq_len(Nbootstraps), FUN=function(b) bootstraps[[b]]$Colless)
	bootstrap_mean_Colless	= mean(bootstrap_Colless, na.rm=TRUE)
	bootstrap_std_Colless	= sd(bootstrap_Colless, na.rm=TRUE)
	PColless				= mean(abs(bootstrap_Colless-bootstrap_mean_Colless)>=abs(tree_Colless-bootstrap_mean_Colless),na.rm=TRUE)
	RESColless				= (tree_Colless-bootstrap_mean_Colless)/abs(bootstrap_mean_Colless) # relative effect size of Colless statistic
	SESColless				= (tree_Colless-bootstrap_mean_Colless)/bootstrap_std_Colless # standardized effect size of Colless statistic

	# calculate P-value of Sackin imbalance statistic
	bootstrap_Sackin		= sapply(seq_len(Nbootstraps), FUN=function(b) bootstraps[[b]]$Sackin)
	bootstrap_mean_Sackin	= mean(bootstrap_Sackin, na.rm=TRUE)
	bootstrap_std_Sackin	= sd(bootstrap_Sackin, na.rm=TRUE)
	PSackin					= mean(abs(bootstrap_Sackin-bootstrap_mean_Sackin)>=abs(tree_Sackin-bootstrap_mean_Sackin),na.rm=TRUE)
	RESSackin				= (tree_Sackin-bootstrap_mean_Sackin)/abs(bootstrap_mean_Sackin) # relative effect size of Sackin statistic
	SESSackin				= (tree_Sackin-bootstrap_mean_Sackin)/bootstrap_std_Sackin # standardized effect size of Sackin statistic

	# calculate P-value of Blum imbalance statistic
	bootstrap_Blum		= sapply(seq_len(Nbootstraps), FUN=function(b) bootstraps[[b]]$Blum)
	bootstrap_mean_Blum	= mean(bootstrap_Blum, na.rm=TRUE)
	bootstrap_std_Blum	= sd(bootstrap_Blum, na.rm=TRUE)
	PBlum				= mean(abs(bootstrap_Blum-bootstrap_mean_Blum)>=abs(tree_Blum-bootstrap_mean_Blum),na.rm=TRUE)
	RESBlum				= (tree_Blum-bootstrap_mean_Blum)/abs(bootstrap_mean_Blum) # relative effect size of Blum statistic
	SESBlum				= (tree_Blum-bootstrap_mean_Blum)/bootstrap_std_Blum # standardized effect size of Blum statistic
	
	# calculate confidence intervals of bootstrap LTTs
	bootstrap_LTTs		= t(sapply(seq_len(Nbootstraps), FUN=function(b) bootstraps[[b]]$LTT))
	bootstrap_LTT_CIs 	= calculate_equal_tailed_CIs_of_curves(bootstrap_LTTs)
	fraction_LTT_in_CI95= mean((tree_LTT$lineages<=bootstrap_LTT_CIs$CI95upper) & (tree_LTT$lineages>=bootstrap_LTT_CIs$CI95lower)) # fraction of the tree's LTT contained in the bootstrap CI95 interval
	
	# compare edge lengths of tree to bootstraps using Kolmogorov-Smirnov test
	bootstrap_edge_lengths 	= lapply(seq_len(Nbootstraps), FUN=function(b) bootstraps[[b]]$edge_lengths)
	edge_Kolmogorov_Smirnov = bootstrap_Kolmogorov_Smirnov_test(bootstraps=bootstrap_edge_lengths, empirical=tree$edge.length)

	# compare node ages of tree to bootstraps using Kolmogorov-Smirnov test
	# this is essentially a comparison of LTTs, since the LTT is the (non-normalized) cumulative distribution function of node ages
	bootstrap_node_ages 	= lapply(seq_len(Nbootstraps), FUN=function(b) bootstraps[[b]]$node_ages)
	node_Kolmogorov_Smirnov = bootstrap_Kolmogorov_Smirnov_test(bootstraps=bootstrap_node_ages, empirical=tree_node_ages)

	# compare node sizes (Ntips per node) of tree to bootstraps using Kolmogorov-Smirnov test
	bootstrap_node_sizes 	= lapply(seq_len(Nbootstraps), FUN=function(b) bootstraps[[b]]$node_sizes)
	size_Kolmogorov_Smirnov = bootstrap_Kolmogorov_Smirnov_test(bootstraps=bootstrap_node_sizes, empirical=tree_node_sizes)
	
	# wrap all results into a "report" list
	return(list(success						= TRUE,
				Nbootstraps					= Nbootstraps,
				tree_gamma					= tree_gamma,
				bootstrap_mean_gamma 		= bootstrap_mean_gamma,
				bootstrap_std_gamma 		= bootstrap_std_gamma,
				Pgamma						= Pgamma,
				RESgamma					= RESgamma,
				SESgamma					= SESgamma,
				tree_Colless				= tree_Colless,
				bootstrap_mean_Colless 		= bootstrap_mean_Colless,
				bootstrap_std_Colless 		= bootstrap_std_Colless,
				PColless					= PColless,
				RESColless					= RESColless,
				SESColless					= SESColless,
				tree_Sackin					= tree_Sackin,
				bootstrap_mean_Sackin 		= bootstrap_mean_Sackin,
				bootstrap_std_Sackin 		= bootstrap_std_Sackin,
				PSackin						= PSackin,
				RESSackin					= RESSackin,
				SESSackin					= SESSackin,
				tree_Blum					= tree_Blum,
				bootstrap_mean_Blum 		= bootstrap_mean_Blum,
				PBlum						= PBlum,
				RESBlum						= RESBlum,
				SESBlum						= SESBlum,
				tree_edgeKS					= edge_Kolmogorov_Smirnov$empirical_KS,
				bootstrap_mean_edgeKS 		= edge_Kolmogorov_Smirnov$mean_bootstrap_KS,
				bootstrap_std_edgeKS 		= edge_Kolmogorov_Smirnov$std_bootstrap_KS,
				PedgeKS						= edge_Kolmogorov_Smirnov$Pvalue,
				RESedgeKS					= edge_Kolmogorov_Smirnov$RES,
				SESedgeKS					= edge_Kolmogorov_Smirnov$SES,
				tree_nodeKS					= node_Kolmogorov_Smirnov$empirical_KS,
				bootstrap_mean_nodeKS 		= node_Kolmogorov_Smirnov$mean_bootstrap_KS,
				bootstrap_std_nodeKS 		= node_Kolmogorov_Smirnov$std_bootstrap_KS,
				PnodeKS						= node_Kolmogorov_Smirnov$Pvalue,
				RESnodeKS					= node_Kolmogorov_Smirnov$RES,
				SESnodeKS					= node_Kolmogorov_Smirnov$SES,
				tree_sizeKS					= size_Kolmogorov_Smirnov$empirical_KS,
				bootstrap_mean_sizeKS 		= size_Kolmogorov_Smirnov$mean_bootstrap_KS,
				bootstrap_std_sizeKS 		= size_Kolmogorov_Smirnov$std_bootstrap_KS,
				PsizeKS						= size_Kolmogorov_Smirnov$Pvalue,
				RESsizeKS					= size_Kolmogorov_Smirnov$RES,
				SESsizeKS					= size_Kolmogorov_Smirnov$SES,
				statistical_tests			= data.frame(	statistic			= c("gamma", "Colless", "Sackin", "Blum", "edge-lengths Kolmogorov-Smirnov", "node-ages Kolmogorov-Smirnov", "node-sizes Kolmogorov-Smirnov"),
															tree_value	 		= c(tree_gamma, tree_Colless, tree_Sackin, tree_Blum, edge_Kolmogorov_Smirnov$empirical_KS, node_Kolmogorov_Smirnov$empirical_KS, size_Kolmogorov_Smirnov$empirical_KS),
															bootstrap_mean		= c(bootstrap_mean_gamma, bootstrap_mean_Colless, bootstrap_mean_Sackin, bootstrap_mean_Blum, edge_Kolmogorov_Smirnov$mean_bootstrap_KS, node_Kolmogorov_Smirnov$mean_bootstrap_KS, size_Kolmogorov_Smirnov$mean_bootstrap_KS),
															bootstrap_std		= c(bootstrap_std_gamma, bootstrap_std_Colless, bootstrap_std_Sackin, bootstrap_std_Blum, edge_Kolmogorov_Smirnov$std_bootstrap_KS, node_Kolmogorov_Smirnov$std_bootstrap_KS, size_Kolmogorov_Smirnov$std_bootstrap_KS),
															Pvalue	 			= c(Pgamma, PColless, PSackin, PBlum, edge_Kolmogorov_Smirnov$Pvalue, node_Kolmogorov_Smirnov$Pvalue, size_Kolmogorov_Smirnov$Pvalue),
															RES		 			= c(RESgamma, RESColless, RESSackin, RESBlum, edge_Kolmogorov_Smirnov$RES, node_Kolmogorov_Smirnov$RES, size_Kolmogorov_Smirnov$RES),
															SES		 			= c(SESgamma, SESColless, SESSackin, SESBlum, edge_Kolmogorov_Smirnov$SES, node_Kolmogorov_Smirnov$SES, size_Kolmogorov_Smirnov$SES)),
				LTT_ages				= tree_LTT$ages,
				tree_LTT				= tree_LTT$lineages,
				bootstrap_LTT_CI		= bootstrap_LTT_CIs,
				fraction_LTT_in_CI95	= fraction_LTT_in_CI95))
}