# R/auxiliary_routines.R


# convert a list to a vector using unlist, after converting NULLs to NAs
# this function may be needed if your list contains NULLs, in which case the standard unlist removes those entries from the returned vector
unlist_with_nulls = function(L){
	L[sapply(L, is.null)] = NA
	return(unlist(L))
}
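
# Example (illustrative, not executed):
#   unlist_with_nulls(list(a=1, b=NULL, c=3))  # returns c(a=1, b=NA, c=3), whereas plain unlist() would drop 'b'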


# given a numeric vector, determine the indices of the N smallest non-NaN values
get_smallest_items = function(values, N, check_nan=TRUE){
	if(check_nan){
		valids = which(!is.nan(values))
	}else{
		valids = seq_len(length(values))
	}
	smallest = valids[order(values[valids])][seq_len(min(N,length(valids)))] # seq_len avoids the degenerate 1:0 sequence when no valid values exist
	return(smallest)
}
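
# Example (illustrative, not executed):
#   get_smallest_items(c(5, NaN, 2, 9, 1), N=2)  # returns c(5, 3), the indices of the two smallest non-NaN values (1 and 2)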


get_adjacent_edges_per_edge = function(tree){
	Nedges = nrow(tree$edge)
	adjacents = get_adjacent_edges_per_edge_CPP(	Ntips 		= length(tree$tip.label),
													Nnodes		= tree$Nnode,
													Nedges		= Nedges,
													tree_edge	= as.vector(t(tree$edge)) - 1);
	
	# update indices from 0-based to 1-based
	return(lapply(1:Nedges,FUN = function(edge){ adjacents[[edge]]+1 }))
}



get_outgoing_edges_per_clade = function(tree){
	Nclades = length(tree$tip.label) + tree$Nnode;
	outgoing_edges = get_outgoing_edges_per_clade_CPP(	Ntips 		= length(tree$tip.label),
														Nnodes		= tree$Nnode,
														Nedges		= nrow(tree$edge),
														tree_edge	= as.vector(t(tree$edge)) - 1);
	
	# update indices from 0-based to 1-based
	return(lapply(1:Nclades,FUN = function(clade){ outgoing_edges[[clade]]+1 }))
}


get_incoming_edges_per_clade = function(tree){
	Nclades = length(tree$tip.label) + tree$Nnode;
	incoming_edges = get_incoming_edges_per_clade_CPP(	Ntips 		= length(tree$tip.label),
														Nnodes		= tree$Nnode,
														Nedges		= nrow(tree$edge),
														tree_edge	= as.vector(t(tree$edge)) - 1);
	
	# update indices from 0-based to 1-based
	return(lapply(1:Nclades,FUN = function(clade){ incoming_edges[[clade]]+1 }))
}



get_paths_root_to_tips = function(tree){
	Ntips = length(tree$tip.label)
	paths = get_paths_root_to_tips_CPP(	Ntips 		= Ntips,
										Nnodes		= tree$Nnode,
										Nedges		= nrow(tree$edge),
										tree_edge	= as.vector(t(tree$edge)) - 1);
	
	# update indices from 0-based to 1-based
	return(lapply(1:Ntips,FUN = function(tip){ paths[[tip]]+1 }))
}


# determine the parent nodes of one or more clades (tips or nodes) in a rooted tree
# clades may be specified as integer indices or labels; in the latter case, if clades include nodes then the tree must include node labels.
# The root's parent node does not exist, so 0 is returned in that case
parent_nodes = function(tree, clades){
	return(get_ancestral_nodes(tree, descendants=clades, Nsplits=1))
	# OLD CODE
# 	Ntips = length(tree$tip.label)
# 	clades = map_tip_or_node_names_to_indices(tree, A=clades, type="both", list_title="clade", check_input=TRUE)
# 	if(length(clades)==1){
# 		# optimized code for the special case of 1 clade
# 		edge = which(tree$edge[,2]==clades)
# 		if(length(edge)==0) return(0)
# 		else return(tree$edge[edge,1]-Ntips)
# 	}else{
# 		# general code for multiple clades
# 		clade2parent = rep(0,Ntips+tree$Nnode)
# 		clade2parent[tree$edge[,2]] = tree$edge[,1]-Ntips
# 		return(clade2parent[clades])
# 	}
}
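
# Example (illustrative, not executed; assumes the 'ape' package for generating a random test tree):
#   tree = ape::rtree(10)
#   parent_nodes(tree, clades=c(1, 2))  # node indices (1..Nnodes) of the parents of tips 1 & 2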



# convert a name list (A) to a list of tip or node indices, or retain the original index list if already in integer form
# type can be 'tip', 'node' or 'both'
map_tip_or_node_names_to_indices = function(tree, A, type, list_title, check_input=TRUE){
	Ntips = length(tree$tip.label)
	if(type=='tip'){
		item_title 	= 'tip'
		Nmax_title 	= 'Ntips'
		Nmax 		= Ntips
		if(is.character(A)) name_pool = tree$tip.label;
	}else if(type=='node'){
		item_title 	= 'node';
		Nmax_title 	= 'Nnodes'
		Nmax 		= tree$Nnode;
		if(is.character(A)){
			if(is.null(tree$node.label)) stop(sprintf("ERROR: Tree must include node labels, for mapping %s to node indices",list_title))
			name_pool = tree$node.label
		}
	}else{
		item_title 	= 'tip or node'
		Nmax_title 	= 'Ntips+Nnodes'
		Nmax 		= Ntips+tree$Nnode;
		if(is.character(A)) name_pool = c(tree$tip.label,tree$node.label);
	}
	if((!is.character(A)) && (!is.numeric(A)) && (!is.integer(A))) stop(sprintf("ERROR: %s must be a character or integer vector",list_title))
	if(is.character(A)){
		name2index = 1:Nmax;
		names(name2index) = name_pool;
		Ai = name2index[A]; 
		if(check_input && any(is.na(Ai))) stop(sprintf("ERROR: Unknown %s name '%s'",item_title,A[which(is.na(Ai))[1]]));
		A = Ai;
	}else if(check_input && (length(A)>0)){
		minA = min(A); maxA = max(A);
		if(minA<1 || maxA>Nmax) stop(sprintf("ERROR: %s must contain values between 1 and %s (%d); instead found values from %d to %d",list_title,Nmax_title,Nmax,minA,maxA));
	}
	return(A);
}
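
# Example (illustrative, not executed; 'tipA' & 'tipB' are hypothetical tip labels):
#   map_tip_or_node_names_to_indices(tree, A=c("tipA","tipB"), type="tip", list_title="focal tips")
#   # returns the integer tip indices corresponding to the two labels, or stops with an error if a label is unknown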




# given a Markov transition rate matrix, calculate the transition probability matrix conditional upon a single transition occurring
# input: Q[i,j] is probability rate of transition i-->j
# output: P[i,j] will be the probability of transition i-->j, provided that a single transition i-->* occurred
get_conditional_transition_probabilities = function(Q){
	S = rowSums(Q)-diag(Q)
	S[S<=0] = 1
	P = Q/S
	diag(P) = 0
	return(P)
}
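
# Worked example (illustrative, not executed):
#   Q = matrix(c(-3, 1, 2,
#                 2,-5, 3,
#                 1, 4,-5), nrow=3, byrow=TRUE)
#   get_conditional_transition_probabilities(Q)
#   # first row becomes c(0, 1/3, 2/3), i.e. P[1,j] = Q[1,j]/(Q[1,2]+Q[1,3])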



find_edge_splitting_tree = function(tree, target_tips, is_rooted=FALSE){
	Ntips 	= length(tree$tip.label)
	Nnodes 	= tree$Nnode
	if(is.character(target_tips)){
		# target tips are given as names, not indices
		indices	= match(target_tips, tree$tip.label)
		if(any(is.na(indices))) stop(sprintf("ERROR: Some target_tips (e.g. '%s') were not found in the tree",target_tips[which(is.na(indices))[1]]))
		target_tips = indices
	}
	
	results = find_edge_splitting_tree_CPP(	Ntips				= Ntips,
											Nnodes				= tree$Nnode,
											Nedges				= nrow(tree$edge),
											tree_edge			= as.vector(t(tree$edge)) - 1,	# flatten in row-major format and adjust clade indices to 0-based
											is_rooted			= is_rooted,
											target_tips			= target_tips - 1L,
											include_misplaced 	= TRUE)
													
	return(list(edge					= (if(results$edge<0) NA else as.integer(results$edge+1)),
				Nmisplaced_targets		= results$Nmisplaced_targets,
				Nmisplaced_nontargets	= results$Nmisplaced_nontargets,
				Ntargets_upstream 		= results$Ntargets_upstream,
				Ntargets_downstream 	= results$Ntargets_downstream,
				misplaced_targets		= results$misplaced_targets+1L,
				misplaced_nontargets	= results$misplaced_nontargets+1L));
				
}




get_subtree_with_clades = function(	tree, 
									clades_to_keep = NULL, 	# integer vector listing tip/node indices to keep
									collapse_monofurcations = TRUE, 
									force_keep_root = FALSE, 
									keep_all_children_of_explicit_clades_to_keep = FALSE, 
									keep_all_tips_of_explicit_clades_to_keep = FALSE){ 
	Ntips  = length(tree$tip.label);
	Nnodes = tree$Nnode;
	
	results = get_subtree_with_specific_clades_CPP(	Ntips 					= Ntips,
													Nnodes 					= Nnodes,
													Nedges					= nrow(tree$edge),
													tree_edge				= as.vector(t(tree$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
													edge_length				= (if(is.null(tree$edge.length)) numeric() else tree$edge.length),
													clades_to_keep			= clades_to_keep-1,
													collapse_monofurcations = collapse_monofurcations,
													force_keep_root			= force_keep_root,
													keep_all_children_of_explicit_clades_to_keep 	= keep_all_children_of_explicit_clades_to_keep,
													keep_all_tips_of_explicit_clades_to_keep 		= keep_all_tips_of_explicit_clades_to_keep)
	Ntips_kept  	= results$Ntips_kept
	Nnodes_kept 	= results$Nnodes_kept
	new2old_clade 	= results$new2old_clade + 1L # switch to 1-based indices
	subtree = list(	Nnode 		= Nnodes_kept,
					tip.label 	= (if(is.null(tree$tip.label)) NULL else tree$tip.label[new2old_clade[1:Ntips_kept]]),
					node.label 	= (if(is.null(tree$node.label)) NULL else tree$node.label[new2old_clade[(Ntips_kept+1):(Ntips_kept+Nnodes_kept)]-Ntips]),
					edge 		= matrix(as.integer(results$new_tree_edge),ncol=2,byrow=TRUE) + 1L,
					edge.length = results$new_edge_length,
					root 		= results$new_root+1L,
					root.edge	= (if(force_keep_root && (!is.null(tree$root.edge))) tree$root.edge else NULL));
	class(subtree) = "phylo";
	attr(subtree,"order") = NULL
	
	return(list(tree 			= subtree,
				root_shift		= results$root_shift, # distance between old & new root (will always be non-negative)
				new2old_tip		= new2old_clade[1:Ntips_kept], 
				new2old_node	= new2old_clade[(Ntips_kept+1):(Ntips_kept+Nnodes_kept)]-Ntips));
}
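
# Example (illustrative, not executed; assumes the 'ape' package for generating a random test tree):
#   tree   = ape::rtree(10)
#   pruned = get_subtree_with_clades(tree, clades_to_keep=c(1,2,3))
#   pruned$tree        # subtree spanning tips 1-3 and the nodes connecting them
#   pruned$new2old_tip # mapping of new tip indices to old tip indices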



# calculate the geometric placement of tips & nodes for plotting a tree as a phylogram (with the root on the left and tips on the right end, edges extend horizontally left to right)
get_phylogram_geometry = function(tree){
	results = get_phylogram_geometry_CPP(	Ntips 					= length(tree$tip.label),
											Nnodes 					= tree$Nnode,
											Nedges					= nrow(tree$edge),
											tree_edge				= as.vector(t(tree$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
											edge_length				= (if(is.null(tree$edge.length)) numeric() else tree$edge.length));
	return(list(clade_x	= results$clade_x,		# x-coordinates of tips & nodes
				clade_y	= results$clade_y,		# y-coordinates of tips & nodes
				root_y	= results$root_y,		# the y-coordinate of the root
				min_x	= results$min_x,		# the minimum x-coordinate of any clade (normally this is 0)
				max_x	= results$max_x,		# the maximum x-coordinate of any tip
				min_y	= results$min_y,		# the minimum y-coordinate of any tip
				max_y	= results$max_y));		# the maximum y-coordinate of any tip
}




# assign tips & nodes of a tree to groups, such that each group is monophyletic (a "taxon") and represented by exactly one of given representative tips
# each representative tip is taken to represent a different taxon
# tip2taxon[n] or node2taxon[n] will be 0 (i.e., -1 from the CPP routine, shifted to 1-based indexing) if the tip/node could not be unambiguously assigned to a taxon (e.g., it contains multiple descending representatives)
assign_clades_to_taxa = function(tree, representative_tips){
	Ntips  = length(tree$tip.label);
	Nnodes = tree$Nnode;
	results = assign_clades_to_taxa_CPP(	Ntips 			= Ntips,
											Nnodes 			= tree$Nnode,
											Nedges			= nrow(tree$edge),
											tree_edge		= as.vector(t(tree$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
											representatives	= representative_tips-1);
	return(list(tip2taxon 	= results$clade2taxon[1:Ntips]+1L,
				node2taxon	= results$clade2taxon[(Ntips+1):(Ntips+Nnodes)]+1L));
}




# congruify trees (map nodes in the target tree to "equivalent" nodes in the reference tree)
# [Eastman et al (2013). Congruification: support for time scaling large phylogenetic trees. Methods in Ecology and Evolution. 4:688-691]
# mapping must be one of the following:
#	A 2D integer array of size NM x 2 (with NM<=TNtips), listing Ttips mapped to Rtips (mapping[m,1] --> mapping[m,2])
#	A 2D character array of size NM x 2 (with NM<=TNtips), listing Ttip names mapped to Rtip names (mapping[m,1] --> mapping[m,2])
#	A data frame of size NM x 1, whose row names are target tip labels and whose entries are either integers (R tip indices) or strings (R tip labels). This is the format used by geiger::congruify.phylo
#	A vector of size NM, whose names are target tip labels and whose entries are either integers (R tip indices) or strings (R tip labels).
congruify_trees = function(reference_tree, target_tree, mapping){
	TNtips = length(target_tree$tip.label)
	RNtips = length(reference_tree$tip.label)

	# re-format mapping if needed
	if(is.data.frame(mapping)){
		mapped_Ttips = rownames(mapping)
		mapped_Rtips = (if(is.numeric(mapping[,1])) mapping[,1] else as.character(mapping[,1]))
	}else if(is.vector(mapping)){
		mapped_Ttips = names(mapping)
		mapped_Rtips = (if(is.numeric(mapping)) mapping else as.character(mapping))
	}else{
		# assuming mapping is a 2D array of size NM x 2
		mapped_Ttips = mapping[,1]
		mapped_Rtips = mapping[,2]
	}
	if(is.character(mapped_Ttips)){
		# mapped target tips given as names, not indices
		indices = match(mapped_Ttips, target_tree$tip.label)
		if(any(is.na(indices))) stop(sprintf("ERROR: Some mapped target tips (e.g. '%s') were not found in the target tree",mapped_Ttips[which(is.na(indices))[1]]))
		mapped_Ttips = indices
	}
	if(is.character(mapped_Rtips)){
		# mapped reference tips given as names, not integers
		indices = match(mapped_Rtips, reference_tree$tip.label)
		if(any(is.na(indices))) stop(sprintf("ERROR: Some mapped reference tips (e.g. '%s') were not found in the reference tree",mapped_Rtips[which(is.na(indices))[1]]))
		mapped_Rtips = indices
	}
	mapping = matrix(c(mapped_Ttips,mapped_Rtips),ncol=2,byrow=FALSE)
		
	# congruify
	results = congruify_trees_CPP(	RNtips		= RNtips,
									RNnodes		= reference_tree$Nnode,
									RNedges		= nrow(reference_tree$edge),
									Rtree_edge	= as.vector(t(reference_tree$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
									TNtips		= TNtips,
									TNnodes		= target_tree$Nnode,
									TNedges		= nrow(target_tree$edge),
									Ttree_edge	= as.vector(t(target_tree$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
									mapping		= as.vector(t(mapping))-1) # flatten in row-major format and adjust tip indices to 0-based
	return(list(reference_nodes = results$mapped_Rnodes+1L,
				target_nodes 	= results$mapped_Tnodes+1L))
}
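
# Example (illustrative, not executed; tip labels are hypothetical):
#   mapping = c(target_tip_1="ref_tip_3", target_tip_2="ref_tip_7")  # named vector: target tip labels --> reference tip labels
#   congruify_trees(reference_tree, target_tree, mapping)$reference_nodes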



# map nodes in tree A to nodes in tree B, assuming both trees have the same topologies (but are potentially indexed differently)
# if tipsA2B is NULL, tips are matched by name
# This function returns an error (success=FALSE) if the trees don't have equivalent topologies, so it can also be used as a simple equivalence test
match_tree_nodes = function(treeA, treeB, tipsA2B=NULL){
	Ntips  = length(treeA$tip.label)
	Nnodes = treeA$Nnode
	if((Ntips!=length(treeB$tip.label)) || (Nnodes!=treeB$Nnode)) return(list(success=FALSE, error=sprintf("Tree sizes don't match: TreeA has %d tips and %d nodes, treeB has %d tips and %d nodes",Ntips,Nnodes,length(treeB$tip.label),treeB$Nnode)))
	if(is.null(tipsA2B)){
		tipsA2B = match(treeA$tip.label, treeB$tip.label)
		if(any(is.na(tipsA2B))) return(list(success=FALSE, error=sprintf("Tip labels in treeA don't match tip labels in treeB")))
	}
	results = match_tree_nodes_CPP(	Ntips		= Ntips,
									Nnodes		= Nnodes,
									Nedges		= nrow(treeA$edge),
									tree_edgeA	= as.vector(t(treeA$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
									tree_edgeB	= as.vector(t(treeB$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
									tipsA2B		= tipsA2B-1);
	if(!results$success) return(list(success=FALSE, error=results$error));
	return(list(success 	= TRUE,
				rootA		= results$rootA,
				rootB		= results$rootB,
				nodesA2B	= results$nodesA2B+1L));
}


get_complement = function(N, indices){
	include 		 = rep(TRUE, times=N)
	include[indices] = FALSE
	return(which(include))
}

get_intersection = function(N, indicesA, indicesB){
	includeA 		 	= rep(FALSE, times=N)
	includeB 		 	= rep(FALSE, times=N)
	includeA[indicesA] 	= TRUE
	includeB[indicesB] 	= TRUE
	return(which(includeA & includeB))
}
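
# Examples (illustrative, not executed):
#   get_complement(5, c(2,4))               # returns c(1,3,5)
#   get_intersection(6, c(1,2,3), c(2,3,5)) # returns c(2,3)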

# extract the values of independent rates from a transition_matrix, based on a provided index_matrix
# The index matrix should be as generated by get_transition_index_matrix, i.e. a 2D matrix of size Nstates x Nstates, and values in 1,..,Nrates, where Nrates is the number of independent rate variables
extract_independent_rates_from_transition_matrix = function(transition_matrix, index_matrix){
	flattened_index_matrix 	= as.vector(index_matrix)
	flattened_transition_matrix = as.vector(transition_matrix)
	independents 			= seq_along(flattened_index_matrix)[!duplicated(flattened_index_matrix)]
	independents 			= independents[flattened_index_matrix[independents]>0] # omit any zeros from index_matrix
	independent_rates 		= rep(NA,length(independents))
	independent_rates[flattened_index_matrix[independents]] = flattened_transition_matrix[independents]
	return(independent_rates)
}
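
# Worked example (illustrative, not executed): for a 2-state ER model with
#   index_matrix      = matrix(c(0,1,1,0), nrow=2)  # zeros on the diagonal are omitted
#   transition_matrix = matrix(c(-0.3,0.3,0.3,-0.3), nrow=2)
# this function returns the single independent rate 0.3.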



# get a 1D lookup vector of size Nstates, mapping per-state rates (e.g. birth rates) to indices of a subset of independent rates
# rate_model can be "ER" or "ARD" or a custom index_vector as if it was generated by this function (in which case it is merely used to determine Nrates)
# This function is analogous to get_transition_index_matrix(..), but for 1D vectors
get_rate_index_vector = function(Nstates, rate_model){
	if (is.character(rate_model)) {
		if(rate_model == "ER"){
			Nrates = 1;
			index_vector = rep(1,Nstates)
		}else if(rate_model == "ARD"){
			Nrates = Nstates;
			index_vector = 1:Nrates;		
		}else{
			stop(sprintf("ERROR: Unknown rate_model '%s'",rate_model))
		}
	}else{
		if(length(rate_model)!=Nstates) stop(sprintf("ERROR: Wrong number of elements in rate model (expected %d, found %d)",Nstates,length(rate_model)));
		index_vector = rate_model
		Nrates = max(rate_model)
	}
	return(list(index_vector=index_vector, Nrates=Nrates))
}
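
# Examples (illustrative, not executed):
#   get_rate_index_vector(3, "ER")   # list(index_vector=c(1,1,1), Nrates=1)
#   get_rate_index_vector(3, "ARD")  # list(index_vector=c(1,2,3), Nrates=3)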


extract_independent_rates_from_vector = function(rates, index_vector){
	independents 			= seq_along(index_vector)[!duplicated(index_vector)]
	independent_rates 		= rep(NA,length(independents))
	independent_rates[index_vector[independents]] = rates[independents]
	return(independent_rates)
}
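
# Example (illustrative, not executed):
#   extract_independent_rates_from_vector(rates=c(0.5,0.5,1.2), index_vector=c(1,1,2))  # returns c(0.5, 1.2)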




# guesstimate an Mk transition matrix Q based on transitions along edges, as inferred via max-parsimony ASR
# Convention: Q[i,j] will be an estimate for the probability rate of the transition i-->j
# at least one of tip_states[] or tip_priors[] must be given; tip_states[] is given priority
guesstimate_Mk_transition_rates_via_max_parsimony_ASR = function(	tree, 
																	tip_states			= NULL,	# 1D vector of size Ntips, or NULL
																	tip_priors			= NULL,	# 2D array of size Ntips x Nstates, or NULL 
																	Nstates			 	= NULL, 
																	transition_costs 	= "all_equal",
																	allow_ties			= FALSE){	# logical, specifying whether to include tips with ties in the tip priors, i.e. tips that have the same tip_prior for various states. If TRUE, then ties are broken at random. If FALSE, then tips with ties are omitted from the analysis.
	# basic error checking & input formatting
	if(is.null(tip_states)){
		if(is.null(tip_priors)) return(list(success=FALSE, error="Missing tip_states or tip_priors"));
		if(allow_ties){
			tip_states = max.col(tip_priors, ties.method="random")
		}else{
			tip_states  = max.col(tip_priors, ties.method="first")
			tip_states2 = max.col(tip_priors, ties.method="last")
			tip_states[tip_states!=tip_states2] = NA # in case of ambiguity, assign NA
		}
	}
	if(length(tip_states)!=length(tree$tip.label)) return(list(success=FALSE, error=sprintf("Number of provided tip states (%d) does not match number of tips in the tree (%d)",length(tip_states),length(tree$tip.label))))
	
	# only consider subtree with known tip states
	known_tips = which(!is.na(tip_states));
	if(length(known_tips)<=1) return(list(success=FALSE, error=sprintf("All or almost all tips have unknown or ambiguous state")))
	if(length(known_tips)<length(tip_states)){
		extraction	= get_subtree_with_tips(tree, only_tips=known_tips, omit_tips=FALSE, collapse_monofurcations=TRUE, force_keep_root=TRUE);
		tree		= extraction$subtree;
		tip_states	= tip_states[extraction$new2old_tip]
	}
	Ntips  = length(tree$tip.label)
	Nedges = nrow(tree$edge)
	
	# perform ASR max-parsimony on known subtree
	asr = asr_max_parsimony(	tree				= tree, 
								tip_states			= tip_states, 		
								Nstates				= Nstates, 
								transition_costs	= transition_costs);
	if(!asr$success) return(list(success=FALSE, error="ASR max-parsimony failed"))
	Nstates = ncol(asr$ancestral_likelihoods);
								
	# determine most likely node states
	node_states 	 	= max.col(asr$ancestral_likelihoods, ties.method="first")
	clade_states 		= c(tip_states, node_states)
	state_transitions 	= cbind(clade_states[tree$edge[,1]],clade_states[tree$edge[,2]]) # state_transitions[e,1]-->state_transitions[e,2] is the transition along edge e.
	
	# determine entries Q[i,j] based on transitions i-->j among all edges
	Q 			 = matrix(0,nrow=Nstates,ncol=Nstates)
	Ntransitions = matrix(0,nrow=Nstates,ncol=Nstates)
	edge_lengths = (if(is.null(tree$edge.length)) rep(1.0,Nedges) else tree$edge.length)
	for(i in 1:Nstates){
		for(j in 1:Nstates){
			transitions = which((state_transitions[,1]==i) & (state_transitions[,2]==j));
			Ntransitions[i,j] = length(transitions)
			if(i!=j){ # keep diagonal zero, for correct normalization afterwards
				if(length(transitions)>0){
					mean_transition_time = mean(edge_lengths[transitions])
					Q[i,j] = 1.0/mean_transition_time
				}else{
					Q[i,j] = 0;
				}
			}
		}
	}
	
	# make sure Q has zero sum in each row
	diag(Q) = -rowSums(Q, na.rm = TRUE);
	return(list(success=TRUE, Q=Q, Ntransitions=Ntransitions));
}
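
# Example (illustrative, not executed; assumes the 'ape' package for generating a random test tree):
#   tree       = ape::rtree(50)
#   tip_states = sample.int(2, size=50, replace=TRUE)
#   guesstimate_Mk_transition_rates_via_max_parsimony_ASR(tree, tip_states=tip_states, Nstates=2)$Q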



# guesstimate the scale of Mk transition rates based on transitions along independent contrasts
# Attention: Strictly speaking, this function can only estimate the single rate of an "ER" Mk model
# at least one of tip_states[] or tip_priors[] must be given; tip_states[] is given priority
guesstimate_Mk_rate_via_independent_contrasts = function(	tree, 					# rooted phylogenetic tree of class "phylo"
															Nstates,
															tip_states	= NULL,		# 1D vector of size Ntips, or NULL
															tip_priors	= NULL,		# 2D array of size Ntips x Nstates, or NULL 
															allow_ties	= FALSE){	# logical, specifying whether to include tips with ties in the tip priors, i.e. tips that have the same tip_prior for various states. If TRUE, then ties are broken at random. If FALSE, then tips with ties are omitted from the analysis.
	# basic error checking & input formatting
	if(is.null(tip_states)){
		if(is.null(tip_priors)) return(list(success=FALSE, error="Missing tip_states or tip_priors"));
		if(allow_ties){
			tip_states = max.col(tip_priors, ties.method="random")
		}else{
			tip_states  = max.col(tip_priors, ties.method="first")
			tip_states2 = max.col(tip_priors, ties.method="last")
			tip_states[tip_states!=tip_states2] = NA # in case of ambiguity, assign NA
		}
	}
	if(length(tip_states)!=length(tree$tip.label)) return(list(success=FALSE, error=sprintf("Number of provided tip states (%d) does not match number of tips in the tree (%d)",length(tip_states),length(tree$tip.label))))
	
	# only consider subtree with known tip states
	known_tips = which(!is.na(tip_states));
	if(length(known_tips)<=1) return(list(success=FALSE, error=sprintf("All or almost all tips have unknown or ambiguous state")))
	if(length(known_tips)<length(tip_states)){
		extraction	= get_subtree_with_tips(tree, only_tips=known_tips, omit_tips=FALSE, collapse_monofurcations=TRUE, force_keep_root=TRUE);
		tree		= extraction$subtree;
		tip_states	= tip_states[extraction$new2old_tip]
	}
	Ntips  = length(tree$tip.label)
	Nedges = nrow(tree$edge)
	
	# fit a symmetric Mk model to the data
	fit = fit_symmetric_mk(tree, Nstates = Nstates, tip_states	= tip_states, rate_model = "ER", Ntrials = 1)
	if(fit$success){	
		rate = fit$transition_matrix[1,2]
	}else if(!is.null(fit$guess_rate)){
		# fitting failed, so use first-guess rate
		rate = fit$guess_rate
	}else{
		return(list(success=FALSE, error=sprintf("Unable to even guess the Mk rate: %s",fit$error)))
	}
	
	return(list(success=TRUE, rate=rate));
}



# return the ages of all branching events in an ultrametric timetree, i.e. all nodes accounting for multifurcations
# if the tree is purely bifurcating, this is the same as getting all node ages
# However, if the tree includes multifurcations, these are counted multiple times (since they represent multiple nearby bifurcations)
# Monofurcations are not returned
# Assumes that the tree is ultrametric
get_all_branching_ages = function(tree){
	Ntips  = length(tree$tip.label)
	Nnodes = tree$Nnode;
	depths = get_mean_depth_per_node_CPP(	Ntips			= Ntips,
											Nnodes			= Nnodes,
											Nedges			= nrow(tree$edge),
											tree_edge		= as.vector(t(tree$edge))-1,	# flatten in row-major format and make indices 0-based
											edge_length		= (if(is.null(tree$edge.length)) numeric() else tree$edge.length));
	Nchildren = get_child_count_per_node_CPP(	Ntips			= Ntips,
												Nnodes			= Nnodes,
												Nedges			= nrow(tree$edge),
												tree_edge		= as.vector(t(tree$edge))-1);
	branching_ages = rep(depths,times=Nchildren-1);
	return(branching_ages);
}
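
# Example (illustrative, not executed; assumes the 'ape' package for generating a random coalescent tree):
#   tree = ape::rcoal(10)          # ultrametric, bifurcating
#   get_all_branching_ages(tree)   # 9 branching ages, one per node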


get_child_count_per_node = function(tree){
	Nchildren = get_child_count_per_node_CPP(	Ntips			= length(tree$tip.label),
												Nnodes			= tree$Nnode,
												Nedges			= nrow(tree$edge),
												tree_edge		= as.vector(t(tree$edge))-1);
	return(Nchildren)
}


# given a piecewise polynomial (splines) function f(x), defined as a time series on some x-grid, calculate its antiderivative A(x):=\int_{Xstart}^x f(u) du for an arbitrary number of target x values
# this function is most efficient when the requested target x-values are monotonically increasing or decreasing
get_antiderivative_of_splines_function = function(	Xgrid,			# numeric vector of size NG, listing x-values in ascending order
													Xstart,			# numeric, lower end of the integration, i.e. x-value where antiderivative is set to zero
													Ygrid,			# numeric vector of size NG, listing y-values along Xgrid
													splines_degree,	# integer, either 0,1,2 or 3, specifying the splines degree assumed for Y between grid points
													Xtarget){		# numeric vector of size N, specifying the target x values on which to evaluate the antiderivative. The function is most efficient if Xtarget are in ascending or descending order.
	A = get_antiderivative_CPP(	Xgrid, Xstart, Ygrid, splines_degree, Xtarget);
	return(A);
}
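
# Sanity-check example (illustrative, not executed): for f(x)=x on a piecewise-linear (splines_degree=1) grid,
# the antiderivative from 0 is x^2/2:
#   get_antiderivative_of_splines_function(Xgrid=c(0,1,2), Xstart=0, Ygrid=c(0,1,2), splines_degree=1, Xtarget=c(1,2))
#   # expected to return approximately c(0.5, 2)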


# given a piecewise polynomial (splines) function f(x), defined as a time series on some x-grid, calculate its first derivative D(x):=df(x)/dx at an arbitrary number of target x values
# this function is most efficient when the requested target x-values are monotonically increasing or decreasing
get_derivative_of_splines_function = function(	Xgrid,			# numeric vector of size NG, listing x-values in ascending order
												Ygrid,			# numeric vector of size NG, listing y-values along Xgrid
												splines_degree,	# integer, either 0,1,2 or 3, specifying the splines degree assumed for Y between grid points
												Xtarget){		# numeric vector of size N, specifying the target x values on which to evaluate the derivative. The function is most efficient if Xtarget are in ascending or descending order.
	D = get_derivative_CPP(Xgrid, Ygrid, splines_degree, Xtarget);
	return(D);
}




# given a lineages-through-time curve, defined as a time series on some discrete age grid, extract the branching ages that would have generated that LTT
# ages[] should be a 1D vector of ages (time before present) in ascending order
# LTT[] should be a 1D vector of the same size as ages[], listing the number of lineages at each age
# the LTT is assumed to be linear between adjacent age grid points
# branching points will be associated with those times where the LTT passes through an integer value
get_branching_ages_from_LTT = function(ages, LTT){
	results = get_branching_ages_from_LTT_CPP(ages, LTT);
	if(!results$success){
		return(list(success=FALSE, error=results$error));
	}else{
		return(list(success=TRUE, branching_ages = results$branching_ages));
	}
}
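
# Example (illustrative, not executed): extract branching ages from an exponentially growing clade
#   ages = seq(from=0, to=10, length.out=101)  # ages in ascending order
#   LTT  = 100*exp(-0.2*ages)                  # lineage count decreases with increasing age
#   get_branching_ages_from_LTT(ages, LTT)$branching_ages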


# given some density curve on an X-interval, define a non-uniform X-grid on that interval so that the density of grid points reflects the requested density
# this can be used for example to define an age grid, with the grid density reflecting the number of lineages in a timetree at any given age, e.g. for fitting purposes
# the density curve is specified as a piecewise linear function. The density must be non-negative, and have non-zero total area under the curve.
get_inhomogeneous_grid_1D = function(	Xstart,
										Xend, 
										Ngrid, 		# integer, number of grid points to return, including the edges Xstart & Xend
										densityX, 	# numeric vector of size ND, listing X-values for defining the density, in ascending order
										densityY,	# numeric vector of size ND, listing density values at densityX. 
										extrapolate = FALSE){	# extrapolate density grid as needed, to cover Xstart & Xend. The density will be extrapolated as a constant.
	if(Ngrid<2){
		stop(sprintf("Ngrid must be at least 2"));
	}else if(densityX[1]>=tail(densityX,1)){
		stop(sprintf("Values in densityX must be strictly increasing"))
	}
	if(Xstart<densityX[1]){
		if(extrapolate){
			densityX = c(Xstart,densityX);
			densityY = c(densityY[1], densityY);
		}else{
			stop(sprintf("Xstart (%g) is not covered by the density grid (which starts at %g). Consider setting extrapolate=TRUE.",Xstart,densityX[1]));
		}
	}else if(Xend>tail(densityX,1)){
		if(extrapolate){
			densityX = c(densityX,Xend);
			densityY = c(densityY,tail(densityY,1));
		}else{
			stop(sprintf("Xend (%g) is not covered by the density grid (which ends at %g). Consider setting extrapolate=TRUE.",Xend,tail(densityX,1)));
		}
	}
	return(get_inhomogeneous_grid_1D_CPP(	Xstart		= Xstart, 
											Xend		= Xend, 
											Ngrid		= Ngrid, 
											densityX	= densityX, 
											densityY	= densityY, 
											xepsilon	= 0.0000001*(Xend-Xstart)/Ngrid));
}
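
# Example (illustrative, not executed): a 20-point grid on [0,10], denser toward x=10 where the density is higher
#   get_inhomogeneous_grid_1D(Xstart=0, Xend=10, Ngrid=20, densityX=c(0,10), densityY=c(1,5))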



# given some distribution of numbers, define a non-uniform X-grid so that the density of grid points reflects the density of the provided random numbers
# This can be used for example to define an age grid, with the grid density reflecting the number of nodes in a timetree at any given age interval, e.g. for fitting purposes
# The returned Xgrid is guaranteed to include Xstart and Xend, to have size Ngrid, and to be non-decreasing.
# Note that if randomX does not cover the full requested interval [Xstart,Xend], then the first and last grid cells might deviate from the intended density
# Also note that if the randomX data has Dirac-distribution peaks (i.e., many values are equal), then the returned grid might also include multiple grid points at the same location (i.e., the grid is not strictly monotonic)
get_inhomogeneous_grid_1D_from_samples = function(	Xstart,
													Xend, 
													Ngrid, 		# integer, number of grid points to return, including the edges Xstart & Xend
													randomX){ 	# numeric vector, listing random values defining the target density. Must be sorted in ascending order. Typically randomX should roughly cover the full requested interval [Xstart,Xend]
	if(Ngrid<2){
		return(list(success=FALSE, error=sprintf("Ngrid must be at least 2")))
	}else if(randomX[1]>=tail(randomX,1)){
		return(list(success=FALSE, error=sprintf("Values in randomX must be strictly increasing")))
	}
	randomX 		= randomX[(randomX>=Xstart) & (randomX<=Xend)] # ignore samples outside of the requested range
	if(length(randomX)==0) return(list(success=FALSE, error=sprintf("No randomX values found within requested interval")))
	Nrandom			= length(randomX)
	unique_randoms	= 1 + Nrandom - rev(c(1,1+which(diff(rev(randomX))<0))) # indices of unique values in randomX, keeping always the last instance of each value. Assuming that randomX is monotonically increasing.
	randomCDF_X 	= randomX[unique_randoms]
	randomCDF_P		= unique_randoms/Nrandom
	if(Xstart<randomCDF_X[1]){
		randomCDF_X = c(Xstart,randomCDF_X)
		randomCDF_P = c(0,randomCDF_P)
	}
	if(Xend>randomCDF_X[length(randomCDF_X)]){
		randomCDF_X = c(randomCDF_X,Xend)
		randomCDF_P = c(randomCDF_P,1.0001)
	}
	Xgrid = approx(x=randomCDF_P, y=randomCDF_X, xout=seq(from=0,to=1,length.out=Ngrid), yleft=Xstart, yright=Xend)$y
	return(list(success=TRUE,grid=Xgrid))
}
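
# Example (illustrative, not executed): grid density follows an exponential sample distribution
#   samples = sort(rexp(n=1000, rate=0.5))
#   get_inhomogeneous_grid_1D_from_samples(Xstart=0, Xend=10, Ngrid=20, randomX=samples)$grid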



# calculate the pulled speciation rate (PSR) of an HBD congruence class for a given pulled diversification rate (PDR) and product rho*lambda(0)
get_PSR_from_PDR_HBD = function(oldest_age,
								age_grid,				# numeric vector of size NG, listing grid ages in ascending order. Must cover at least age0 and oldest_age.
								PDR				= 0,	# numeric vector of size NG, listing PDRs on the corresponding grid points
								age0			= 0,	# non-negative numeric, specifying the age at which rholambda0 is given, i.e. rholambda0=rho(age0)*lambda(age0)
								rholambda0		= 1,	# positive numeric, product rho(age0)*lambda(age0), where rho is the sampling fraction and lambda is the speciation rate
								splines_degree	= 1,	# either 1, 2 or 3, specifying the degree of the splines defined by the PDR on the age grid.
								relative_dt		= 1e-3,	# numeric, maximum relative time step allowed for integration. Smaller values increase integration accuracy. Typical values are 0.0001-0.001.
								include_nLTT0	= FALSE){	# (logical) whether to also calculate the ratio nLTT0:=LTT(age0)/LTT(present-day)
	# basic error checking
	if(is.null(PDR)) stop("Missing PDR")
	if(is.null(age_grid) || (length(age_grid)<=1)){
		if((!is.null(PDR)) && (length(PDR)!=1)) return(list(success=FALSE, error=sprintf("Invalid number of PDR values; since no age grid was provided, you must provide a single (constant) PDR")))
		# create dummy age grid
		NG 			= 2;
		age_grid	= seq(from=0,to=oldest_age,length.out=NG)
		if(!is.null(PDR)) PDR = rep(PDR,times=NG);
	}else{
		NG = length(age_grid);
		if((age_grid[1]>oldest_age) || (age_grid[NG]<oldest_age)) return(list(success=FALSE, error=sprintf("Age grid must cover the entire requested age interval, including oldest_age (%g)",oldest_age)))
		if((age_grid[1]>age0) || (age_grid[NG]<age0)) return(list(success=FALSE, error=sprintf("Age grid must cover the entire requested age interval, including age0 (%g)",age0)))
		if(include_nLTT0 && ((age_grid[1]>0) || (age_grid[NG]<0))) return(list(success=FALSE, error=sprintf("Age grid must cover the present-day age (0) in order to calculate nLTT0"))) # parentheses needed, since && binds tighter than ||
		if((!is.null(PDR)) && (length(PDR)!=1) && (length(PDR)!=NG)) return(list(success=FALSE, error=sprintf("Invalid number of PDR values; since an age grid of size %d was provided, you must either provide one or %d PDR",NG,NG)))
		if((!is.null(PDR)) && (length(PDR)==1)) PDR = rep(PDR,times=NG);
	}
	if(rholambda0<=0) return(list(success=FALSE, error=sprintf("rholambda0 must be strictly positive; instead, got %g",rholambda0)))
	if(!(splines_degree %in% c(0,1,2,3))) return(list(success=FALSE, error=sprintf("Invalid splines_degree (%d): Expected one of 0,1,2,3.",splines_degree)))
	if(age_grid[1]>tail(age_grid,1)) return(list(success=FALSE, error=sprintf("Values in age_grid must be strictly increasing")))

	# calculate PSR
	results = get_PSR_from_PDR_HBD_CPP(	age0			= age0,
										oldest_age 		= oldest_age,
										age_grid		= age_grid,
										PDR				= PDR,
										rholambda0		= rholambda0,
										splines_degree	= splines_degree,
										relative_dt		= relative_dt,
										include_nLTT0	= include_nLTT0)
	if(results$success){
		return(list(success	= TRUE, 
					ages	= results$refined_age_grid,	# numeric vector listing (potentially refined) grid ages, spanning [max(0,age_grid[1]), oldest_age]
					PSR		= results$PSR, 				# numeric vector of the same size as ages[], listing the PSR on the refined grid
					nLTT0	= (if(include_nLTT0) results$nLTT0 else NULL)))
	}else{
		return(list(success=FALSE, error=results$error))
	}
}
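
# Example (illustrative, not executed): PSR for a constant PDR of 0.1 and rholambda0=1
#   get_PSR_from_PDR_HBD(oldest_age=10, age_grid=c(0,10), PDR=c(0.1,0.1), rholambda0=1)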





# calculate the pulled speciation rate (PSR) of an HBD model for a given speciation rate (lambda), extinction rate (mu) and the sampling fraction (rho0) at some age0>=0
get_PSR_of_HBD_model = function(oldest_age,					# oldest age until which to calculate the PSR
								age_grid		= NULL,		# numeric vector of size NG, listing grid ages in ascending order. Must cover at least age0 and oldest_age. Can also be NULL, in which case the same lambda & mu are assumed everywhere.
								lambda			= 0,		# numeric vector of size NG, listing speciation rates on the corresponding grid points. Can also be a single constant.
								mu				= 0,		# numeric vector of size NG, listing extinction rates on the corresponding grid points. Can also be a single constant.
								age0			= 0,		# numeric, age (time before present) at which the sampling fraction (rho) is specified
								rho0			= 1,		# positive numeric, sampling fraction at age0
								splines_degree	= 1,		# either 1, 2 or 3, specifying the degree of the splines defined by lambda & mu on the age grid.
								relative_dt		= 1e-3){	# numeric, maximum relative time step allowed for integration. Smaller values increase integration accuracy. Typical values are 0.0001-0.001.
	# basic error checking
	if(rho0<=0) return(list(success=FALSE, error=sprintf("rho must be strictly positive; instead, got %g",rho0)))
	if(!(splines_degree %in% c(0,1,2,3))) return(list(success=FALSE, error=sprintf("Invalid splines_degree (%d): Expected one of 0,1,2,3.",splines_degree)))
	if(is.null(age_grid) || (length(age_grid)<=1)){
		# this model is time-independent, i.e. lambda & mu are constant over time
		if(length(lambda)!=1) return(list(success = FALSE, error = sprintf("Invalid number of lambda values (%d); since no age grid was provided, you must either provide a single (constant) lambda or none",length(lambda))))
		if(length(mu)!=1) return(list(success = FALSE, error = sprintf("Invalid number of mu values (%d); since no age grid was provided, you must provide a single (constant) mu",length(mu))))
		NG = 1
		constant_rates = TRUE
	}else{
		NG = length(age_grid);
		if((age_grid[1]>oldest_age) || (age_grid[NG]<oldest_age)) return(list(success = FALSE, error = sprintf("Age grid must cover the entire requested age interval, including oldest_age (%g)",oldest_age)))
		if((age_grid[1]>age0) || (age_grid[NG]<age0)) return(list(success = FALSE, error = sprintf("Age grid must cover the entire requested age interval, including age0 (%g)",age0)))
		if((length(lambda)!=1) && (length(lambda)!=NG)) return(list(success = FALSE, error = sprintf("Invalid number of lambda values (%d); since an age grid of size %d was provided, you must either provide one or %d lambdas",length(lambda),NG,NG)))
		if((length(mu)!=1) && (length(mu)!=NG)) return(list(success = FALSE, error = sprintf("Invalid number of mu values (%d); since an age grid of size %d was provided, you must either provide one or %d mus",length(mu),NG,NG)))
		constant_rates = ((length(lambda)==1) || all(diff(lambda)==0)) && ((length(mu)==1) || all(diff(mu)==0)) # whether this model is in fact a constant-rates model
		if((length(lambda)==1)) lambda = rep(lambda,times=NG)
		if((length(mu)==1)) mu = rep(mu,times=NG)
		if(age_grid[1]>tail(age_grid,1)) return(list(success=FALSE, error=sprintf("Values in age_grid must be strictly increasing")))
	}

	# calculate PSR from lambda, mu & rho0=rho(age0)
	if(constant_rates){
		results = get_PSR_of_CR_HBD_model_CPP(	age0			= age0,
												oldest_age 		= oldest_age,
												lambda			= lambda[1],
												mu				= mu[1],
												rho0			= rho0,
												relative_dt		= relative_dt)
	}else{
		results = get_PSR_of_HBD_model_CPP(	age0			= age0,
											oldest_age 		= oldest_age,
											age_grid		= age_grid,
											lambda			= lambda,
											mu				= mu,
											rho0			= rho0,
											splines_degree	= splines_degree,
											relative_dt		= relative_dt)
		
	}
	if(results$success){
		return(list(success	= TRUE, 
					ages	= results$ages, # numeric vector listing (potentially refined) grid ages in ascending order, spanning [max(0,age_grid[1]), oldest_age]
					PSR		= results$PSR))	# numeric vector of the same size as ages[], listing the PSR on the refined grid
	}else{
		return(list(success=FALSE, error=results$error))
	}
}
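
# Example (illustrative, not executed): constant-rates model with lambda=1, mu=0.5 and complete sampling at present
#   get_PSR_of_HBD_model(oldest_age=10, lambda=1, mu=0.5, rho0=1)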




# generate a random variable bounded from below but not from above
# used to pick random start params for fitting trials
random_semiboxed_left = function(lower_bound, default, typical_scale, orders_of_magnitude){
	if((default==0) && (lower_bound==0)){
		if(typical_scale==0){
			return(runif(n=1,min=0,max=1))
		}else{
			return(abs(typical_scale) * 10**runif(n=1,min=0,max=orders_of_magnitude))
		}
	}else if((default>lower_bound) && (default>0)){
		return(if(rbinom(n=1,size=1,prob=0.5)==1) (default * 10**runif(n=1,min=0,max=orders_of_magnitude)) else (lower_bound + (default-lower_bound)*runif(n=1,min=0,max=1)))
	}else if(default>lower_bound){
		return(lower_bound + (default-lower_bound) * 10**runif(n=1,min=-orders_of_magnitude/2,max=orders_of_magnitude/2))
	}else{
		return(lower_bound + (0-lower_bound) * 10**runif(n=1,min=-orders_of_magnitude/2,max=orders_of_magnitude/2))
	}
}


# choose random parameter values within boxed constraints
# each lower & upper bound may be Inf
# defaults[], lower_bounds[], upper_bounds[] and scales[] must be 1D numeric vectors of the same length, and must not include NaN or NA 
# lower_bounds[] and upper_bounds[] may include +Inf and -Inf
get_random_params = function(defaults, lower_bounds, upper_bounds, scales, orders_of_magnitude){
	start_values = defaults
	boxed_left	 = which((!is.infinite(lower_bounds)) & is.infinite(upper_bounds))
	boxed_right	 = which((!is.infinite(upper_bounds)) & is.infinite(lower_bounds))
	boxed_dual   = which(!(is.infinite(lower_bounds) | is.infinite(upper_bounds))); # determine fitted params that are boxed, i.e. constrained to within finite lower & upper bounds
	unboxed 	 = which(is.infinite(lower_bounds) & is.infinite(upper_bounds))
	if(length(boxed_dual)>0) 	start_values[boxed_dual] = lower_bounds[boxed_dual] + (upper_bounds[boxed_dual]-lower_bounds[boxed_dual]) * runif(n=length(boxed_dual),min=0,max=1)
	if(length(unboxed)>0) 	 	start_values[unboxed]	 = 10**runif(n=length(unboxed), min=-orders_of_magnitude/2.0, max=orders_of_magnitude/2.0) * start_values[unboxed]
	if(length(boxed_left)>0) 	start_values[boxed_left] = sapply(boxed_left, FUN=function(fp){ random_semiboxed_left(lower_bound=lower_bounds[fp], default=start_values[fp], typical_scale=scales[fp], orders_of_magnitude=orders_of_magnitude) })
	if(length(boxed_right)>0) 	start_values[boxed_right]= sapply(boxed_right, FUN=function(fp){ -random_semiboxed_left(lower_bound=-upper_bounds[fp], default=-start_values[fp], typical_scale=scales[fp], orders_of_magnitude=orders_of_magnitude) })
	start_values = pmax(lower_bounds,pmin(upper_bounds,start_values))
	return(start_values)
}
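
# Example (illustrative, not executed): param 1 is boxed in [0,2], param 2 is only bounded below
#   get_random_params(defaults=c(1,0.5), lower_bounds=c(0,0), upper_bounds=c(2,Inf), scales=c(1,1), orders_of_magnitude=2)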


# check validity and sanitize (standardize format) of model parameters for fitting
# used by various fitting routines
sanitize_parameters_for_fitting = function(	param_values,				# numeric vector of size NP, specifying fixed values for some or all parameters. For fitted (i.e. non-fixed) parameters, use NaN or NA.
											param_guess		= NULL,		# numeric vector of size NP, listing an initial guess for each parameter. For fixed parameters, guess values are ignored.
											param_min		= -Inf,		# numeric vector of size NP, specifying lower bounds for the model parameters. For fixed parameters, bounds are ignored. May also be a single scalar, in which case the same lower bound is assumed for all params.
											param_max		= +Inf,		# numeric vector of size NP, specifying upper bounds for the model parameters. For fixed parameters, bounds are ignored. May also be a single scalar, in which case the same upper bound is assumed for all params.
											param_scale		= NULL){	# numeric vector of size NP, specifying typical scales for the model parameters. For fixed parameters, scales are ignored. If NULL, scales are automatically estimated from other information (such as provided guess and bounds). May also be a single scalar, in which case the same scale is assumed for all params.
	NP 			= length(param_values);
	param_names	= names(param_values);
	if(is.null(param_guess)){
		if(any(!is.finite(param_values))){
			return(list(success=FALSE, error=sprintf("Missing guessed parameter values")))
		}else{
			param_guess = rep(NA, times=NP);
		}
	}
	if(length(param_guess)!=NP){
		return(list(success=FALSE, error=sprintf("Number of guessed parameters (%d) differs from number of model parameters (%d)",length(param_guess),NP)))
	}else if(!is.null(param_names)){
		names(param_guess) = param_names;
	}
	if((!is.null(param_names)) && (length(param_names)!=NP)){
		return(list(success=FALSE, error=sprintf("Number of parameter names (%d) differs from number of model parameters (%d)",length(param_names),NP)))
	}
	if(is.null(param_min)){
		param_min = rep(-Inf,times=NP);
	}else if(length(param_min)==1){
		param_min = rep(param_min,times=NP);
	}else if(length(param_min)!=NP){
		return(list(success=FALSE, error=sprintf("Length of param_min[] (%d) differs from number of model parameters (%d)",length(param_min),NP)))
	}
	if(is.null(param_max)){
		param_max = rep(+Inf,times=NP);
	}else if(length(param_max)==1){
		param_max = rep(param_max,times=NP);
	}else if(length(param_max)!=NP){
		return(list(success=FALSE, error=sprintf("Length of param_max[] (%d) differs from number of model parameters (%d)",length(param_max),NP)))
	}
	if(is.null(param_scale)){
		param_scale = rep(NA,times=NP);
	}else if(length(param_scale)==1){
		param_scale = rep(param_scale,times=NP);
	}else if(length(param_scale)!=NP){
		return(list(success=FALSE, error=sprintf("Length of param_scale[] (%d) differs from number of model parameters (%d)",length(param_scale),NP)))
	}
	param_values[is.nan(param_values)] = NA # standardize representation of non-fixed params
	param_scale[is.nan(param_scale)] = NA	# standardize representation of unknown param scales		
		
	# determine which parameters are to be fitted
	fitted_params	= which(is.na(param_values))
	fixed_params	= which(!is.na(param_values))
	NFP				= length(fitted_params);
	param_guess[fixed_params] = param_values[fixed_params] # make sure guessed param values are consistent with fixed param values

	if(any(!is.finite(param_guess))) return(list(success=FALSE, error=sprintf("Some guessed parameter values are NA or NaN or Inf; you must specify a valid guess for each non-fixed model parameter")));
	if(any((!is.na(param_scale)) & (param_scale==0))) return(list(success=FALSE, error=sprintf("Some provided parameter scales are zero; expecting non-zero scale for each parameter")));
	
	# determine typical parameter scales, whenever these are not provided
	for(p in fitted_params){
		if(is.na(param_scale[p])){
			if(param_guess[p]!=0){
				param_scale[p] = abs(param_guess[p]);
			}else if((is.finite(param_min[p]) && (param_min[p]!=0)) || (is.finite(param_max[p]) && (param_max[p]!=0))){
				param_scale[p] = mean(abs(c((if(is.finite(param_min[p]) && (param_min[p]!=0)) param_min[p] else NULL), (if(is.finite(param_max[p]) && (param_max[p]!=0)) param_max[p] else NULL))));
			}else{
				param_scale[p] = 1;
			}
		}
	}
	
	return(list(success			= TRUE,
				NP				= NP,
				NFP				= NFP,
				param_names		= param_names,
				param_values	= param_values,
				param_guess		= param_guess,
				param_min		= param_min,
				param_max		= param_max,
				param_scale		= param_scale,
				fitted_params	= fitted_params,
				fixed_params	= fixed_params))
}
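
# Example (illustrative, not executed): fix mu at 0.5, fit lambda with guess 1 and bounds [0,Inf)
#   sanitize_parameters_for_fitting(param_values=c(lambda=NA, mu=0.5), param_guess=c(1,0.5), param_min=0, param_max=Inf)
#   # in the returned list, fitted_params will be 1 (lambda) and fixed_params will be 2 (mu)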
	



# function for reformatting/sanitizing generic fit parameters provided by the user
# used by various fitting routines
# prepare_generic_fit_params = function(	param_values,					# numeric vector of size NP, specifying fixed values for a some or all parameters. For fitted (i.e. non-fixed) parameters, use NaN or NA.
# 										param_guess			= NULL,		# numeric vector of size NP, listing an initial guess for each parameter. For fixed parameters, guess values are ignored.
# 										param_min			= -Inf,		# numeric vector of size NP, specifying lower bounds for the model parameters. For fixed parameters, bounds are ignored. May also be a single scalar, in which case the same lower bound is assumed for all params.
# 										param_max			= +Inf,		# numeric vector of size NP, specifying upper bounds for the model parameters. For fixed parameters, bounds are ignored. May also be a single scalar, in which case the same upper bound is assumed for all params.
# 										param_scale			= NULL){	# numeric vector of size NP, specifying typical scales for the model parameters. For fixed parameters, scales are ignored. If NULL, scales are automatically estimated from other information (such as provided guess and bounds). May also be a single scalar, in which case the same scale is assumed for all params.
# 	NP = length(param_values)
# 	param_names = names(param_values);
# 	if(is.null(param_guess)){
# 		if(any(is.finite(param_values))){
# 			return(list(success=FALSE, error=sprintf("Missing guessed parameter values")))
# 		}else{
# 			param_guess = rep(NA, times=NP);
# 		}
# 	}
# 	if(length(param_guess)!=NP){
# 		return(list(success=FALSE, error=sprintf("Number of guessed parameters (%d) differs from number of model parameters (%d)",length(param_guess),NP)))
# 	}else if(!is.null(param_names)){
# 		names(param_guess) = param_names;
# 	}
# 	if((!is.null(param_names)) && (length(param_names)!=NP)){
# 		return(list(success=FALSE, error=sprintf("Number of parameter names (%d) differs from number of model parameters (%d)",length(param_names),NP)))
# 	}
# 	if(is.null(param_min)){
# 		param_min = rep(-Inf,times=NP);
# 	}else if(length(param_min)==1){
# 		param_min = rep(param_min,times=NP);
# 	}else if(length(param_min)!=NP){
# 		return(list(success=FALSE, error=sprintf("Length of param_min[] (%d) differs from number of model parameters (%d)",length(param_min),NP)))
# 	}
# 	if(is.null(param_max)){
# 		param_max = rep(+Inf,times=NP);
# 	}else if(length(param_max)==1){
# 		param_max = rep(param_max,times=NP);
# 	}else if(length(param_max)!=NP){
# 		return(list(success=FALSE, error=sprintf("Length of param_max[] (%d) differs from number of model parameters (%d)",length(param_max),NP)))
# 	}
# 	if(is.null(param_scale)){
# 		param_scale = rep(NA,times=NP);
# 	}else if(length(param_scale)==1){
# 		param_scale = rep(param_scale,times=NP);
# 	}else if(length(param_scale)!=NP){
# 		return(list(success=FALSE, error=sprintf("Length of param_scale[] (%d) differs from number of model parameters (%d)",length(param_scale),NP)))
# 	}
# 	return(list(success=TRUE, param_values=param_values, param_guess=param_guess, param_min=param_min, param_max=param_max, param_scale))
# }


# given an undirected graph (nodes,edges), find its maximal connected subgraphs
# any two nodes may be connected by zero, one or multiple edges
# edges[] should be a 2D array of size Nedges x 2, listing source & target nodes of the graph
get_connected_subgraphs = function(Nnodes, edges){
	results = split_undirected_graph_CPP(Nnodes=Nnodes, Nedges=nrow(edges), edges = as.vector(t(edges))-1);
	return(list(Nsubgraphs 		= results$Nsubgraphs,
				subgraph2nodes	= lapply(1:results$Nsubgraphs, FUN=function(n){ results$subgraph2nodes[[n]]+1 }),
				subgraph2edges	= lapply(1:results$Nsubgraphs, FUN=function(n){ results$subgraph2edges[[n]]+1 }),
				node2subgraph	= results$node2subgraph+1,
				edge2subgraph	= results$edge2subgraph+1));
}
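
# Example (illustrative, not executed): two connected components, {1,2,3} and {4,5}
#   edges = rbind(c(1,2), c(2,3), c(4,5))
#   get_connected_subgraphs(Nnodes=5, edges=edges)$Nsubgraphs  # expected to be 2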



# Starting from a mapping pool-->group, calculate the reverse mapping group-->member_list
# this function returns a list of length Ngroups, where each element is a 1D vector listing member indices
# values in pool2group[] must be between 1 and Ngroups, or NA or <=0 (in which case the item is not affiliated with any group)
get_member_lists_from_group_assignments = function(Ngroups, pool2group){
	if(Ngroups==0) return(list())
	pool2group[is.na(pool2group)] = -1;
	results = get_member_lists_from_group_assignments_CPP(Ngroups=Ngroups, pool2group=pool2group-1)
	return(lapply(1:Ngroups,FUN=function(g){ results$group2members[[g]]+1 }))
}
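
# Example (illustrative, not executed):
#   get_member_lists_from_group_assignments(Ngroups=2, pool2group=c(1,2,1,NA))
#   # expected to return list(c(1,3), c(2)); item 4 is unaffiliated and hence omitted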



# evaluate a mathematical expression (univariate function of X) for various X-values
# the input X[] can either be a 1D vector or a 2D matrix
evaluate_univariate_expression = function(expression, Xname="x", X){
	if(is.vector(X)){
		results = evaluate_univariate_expression_CPP(expression=expression, Xname=Xname, X=X);
		return(list(success=results$success, error=results$error, Y=results$Y))
	}else if(is.matrix(X)){
		results = evaluate_univariate_expression_CPP(expression=expression, Xname=Xname, X=as.vector(t(X)));
		if(!results$success) return(list(success=FALSE, error=results$error))
		return(list(success=TRUE, Y=matrix(results$Y,ncol=ncol(X),byrow=TRUE)))
	}else{
		return(list(success=FALSE, error="Unknown data format X: Expecting either a vector or a matrix"))
	}
}
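
# Example (illustrative, not executed; assumes the underlying expression parser supports 'sin'):
#   evaluate_univariate_expression("sin(x)+1", Xname="x", X=c(0, pi/2))$Y  # expected c(1, 2)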



# calculate geodesic angle (aka. central angle, in radians) between two geographical locations (assuming the Earth is a sphere)
# based on the Vincenty formula with equal major and minor axis
# geographic coordinates should be given in decimal degrees
geodesic_angle = function(latitude1, longitude1, latitude2, longitude2){
	theta1	= pi*latitude1/180;
	theta2	= pi*latitude2/180;
	phi1 	= pi*longitude1/180.0;
	phi2 	= pi*longitude2/180.0;
	delta	= abs(phi1-phi2);
	angle 	= abs(atan2(sqrt((cos(theta2)*sin(delta))^2 + (cos(theta1)*sin(theta2)-sin(theta1)*cos(theta2)*cos(delta))^2), (sin(theta1)*sin(theta2)+cos(theta1)*cos(theta2)*cos(delta))));
	return(angle);
}
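
# Example (not executed): a quarter of the equator spans a central angle of pi/2
#   geodesic_angle(0, 0, 0, 90)  # returns approximately pi/2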

# calculate geodesic angles (in radians) between pairs of coordinates
# this function returns a 1D vector of size equal to the size of the input lists latitudes1[], latitudes2[] etc
geodesic_angles = function(latitudes1, longitudes1, latitudes2, longitudes2){
	return(geodesic_angles_CPP(latitudes1, longitudes1, latitudes2, longitudes2));
}

# calculate all geodesic angles (in radians) between two sets of coordinates
# This function returns a 2D matrix of size N1 x N2
all_pairwise_geodesic_angles = function(latitudes1, longitudes1, latitudes2, longitudes2){
	angles = get_all_pairwise_geodesic_angles_CPP(latitudes1, longitudes1, latitudes2, longitudes2);
	return(matrix(angles, ncol=length(latitudes2), byrow=TRUE));
}


# calculate expectation of sin^2(omega) across multiple independent contrasts with non-equal time steps, where omega is the central transition angle, for the spherical Brownian Motion
get_expected_SBM_sinsquare = function(time_steps, diffusivity, radius){
	return((2/3)*(1-mean(exp(-6*(diffusivity/(radius^2))*time_steps))))
}

expected_SBM_distance = function(time_step, diffusivity, radius){
	tD = time_step * diffusivity/radius^2;
	return(radius * SBM_get_average_transition_angle_CPP(tD = tD, max_error = 1e-7, max_Legendre_terms = 200))
}


# autocorrelation function ACF(t) of Spherical Brownian Motion with constant diffusivity D
# ACF(t) := E <n(t),n(0)> = E cos(omega(t))
# where n(t) is the unit vector pointing to the particle's random location on the unit sphere at time t, and <,> is the scalar product, and omega is the transition angle.
get_SBM_ACF = function(times, diffusivity, radius){
	return(exp(-2*(diffusivity/radius^2)*times))
}
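
# Example (illustrative, not executed): on the unit sphere with diffusivity 1, the ACF decays as exp(-2t)
#   get_SBM_ACF(times=c(0,1), diffusivity=1, radius=1)  # returns c(1, exp(-2))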



# given some rooted timetree, extract information about various sampling and branching events, as defined by the homogeneous birth-death-sampling model with continuous sampling and concentrated sampling efforts
# monofurcating nodes are interpreted as sampled nodes, i.e. sampled lineages with additional subsequently sampled descendants
# bifurcating and multifurcating nodes are interpreted as branching events, with multifurcations counted multiple times (i.e., as if they are first split into bifurcations)
extract_HBDS_events_from_tree = function(	tree,
											root_age = NULL, # optional numeric, the age of the root. Can be used to define a time offset, e.g. if the last tip was not actually sampled at the present. If NULL, this will be calculated from the tree, and it will be assumed that the last tip was sampled at the present
											CSA_ages = numeric(0), # 1D vector listing ages of concentrated sampling efforts, in ascending order
											age_epsilon = 0){
	Ntips  	= length(tree$tip.label)
	Nnodes 	= tree$Nnode
	NCSA		= length(CSA_ages)
	if((NCSA>=2) && any(diff(CSA_ages)<0)) return(list(success=FALSE, error="CSA_ages must be in ascending order"))
	
	# determine branching ages & tip ages & sampled node ages
	if(is.null(root_age)) root_age = get_tree_span(tree)$max_distance
	node2Nchildren = get_child_count_per_node_CPP(	Ntips		= Ntips,
													Nnodes		= Nnodes,
													Nedges		= nrow(tree$edge),
													tree_edge	= as.vector(t(tree$edge))-1);
	clade_heights = get_distances_from_root_CPP(Ntips		= Ntips,
												Nnodes		= Nnodes,
												Nedges		= nrow(tree$edge),
												tree_edge	= as.vector(t(tree$edge))-1,
												edge_length	= (if(is.null(tree$edge.length)) numeric() else tree$edge.length))
	sampled_nodes		= which(node2Nchildren==1)
	sampled_node_ages 	= pmax(0, root_age - clade_heights[Ntips + sampled_nodes])
	tip_ages 			= pmax(0, root_age - clade_heights[1:Ntips])
	branchings			= which(node2Nchildren>=2) # determine which nodes are branchings
	branching_ages		= pmax(0, root_age - clade_heights[Ntips + branchings])
	branching_ages		= rep(branching_ages,times=node2Nchildren[branchings]-1) # count multifurcations multiple times
	
	if(length(CSA_ages)>0){
		# determine which tips were sampled during concentrated sampling efforts versus due to Poissonian sampling
		sorted2original = order(tip_ages)
		CSAbinning		= place_sorted_values_into_bins_CPP(items=tip_ages[sorted2original], bin_mins=CSA_ages-age_epsilon, bin_maxs=CSA_ages+age_epsilon)
		tip2CSA 		= 1L + CSAbinning$item2bin
		Ptip_ages		= tip_ages[sorted2original[which(CSAbinning$item2bin<0)]]
		CSA2tips		= lapply(1:NCSA, FUN=function(ce){ sorted2original[1L + CSAbinning$bin2items[[ce]]] });
		CSA_tip_counts 	= sapply(1:NCSA, FUN=function(ce){ length(CSA2tips[[ce]]) })

		# determine which sampled nodes were sampled during concentrated sampling efforts versus due to Poissonian sampling
		sorted2original = order(sampled_node_ages)
		CSAbinning 		= place_sorted_values_into_bins_CPP(items=sampled_node_ages[sorted2original], bin_mins=CSA_ages-age_epsilon, bin_maxs=CSA_ages+age_epsilon)
		Pnode_ages		= sampled_node_ages[sorted2original[which(CSAbinning$item2bin<0)]]
		CSA2nodes		= lapply(1:NCSA, FUN=function(ce){ sampled_nodes[sorted2original[1L + CSAbinning$bin2items[[ce]]]] });
		CSA_node_counts = sapply(1:NCSA, FUN=function(ce){ length(CSA2nodes[[ce]]) })
	}else{
		CSA2tips 		= integer(0)
		CSA_tip_counts 	= integer(0)
		CSA2nodes 		= integer(0)
		CSA_node_counts = integer(0)
		Ptip_ages 		= sort(tip_ages)
		Pnode_ages 		= sort(sampled_node_ages)
	}
	
	return(list(concentrated_tips 			= CSA2tips,
				concentrated_tip_counts 	= CSA_tip_counts,
				concentrated_nodes 			= CSA2nodes,
				concentrated_node_counts 	= CSA_node_counts,
				Ptip_ages					= Ptip_ages, 	# ages of Poissonian terminal sampling events (sampled tips), in ascending order
				Pnode_ages					= Pnode_ages,	# ages of Poissonian non-terminal sampling events (sampled nodes), in ascending order
				branching_ages				= sort(branching_ages)))
}
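
# Example (hypothetical usage sketch, not run; assumes a rooted timetree 'tree' and a single concentrated sampling attempt at the present):
if(FALSE){
	events = extract_HBDS_events_from_tree(tree, CSA_ages=c(0), age_epsilon=1e-6)
	# on success, events$concentrated_tip_counts[1] is the number of tips sampled at the present
}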



# create a finer time grid, based on some old grid and a refinement factor, by splitting each interval of the original grid into refinement_factor sub-intervals
# old_grid[] must be non-decreasing
# refinement_factor must be an integer >= 1
refine_time_grid = function(old_grid, refinement_factor){
	old_diffs = diff(old_grid)
	new_diffs = rep(old_diffs/refinement_factor, rep(refinement_factor, length(old_diffs)))
	new_grid  = old_grid[1] + c(0, cumsum(new_diffs)) # include the first point of the old grid, so that the new grid spans the full original range
	return(new_grid)
}
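
# Example (plain R, runnable): split each interval of a non-uniform grid into 2 sub-intervals
if(FALSE){
	refine_time_grid(old_grid=c(0,1,3), refinement_factor=2) # yields 0, 0.5, 1, 2, 3
}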


# smoothen a time series using cubic splines
smoothen_time_series = function(times, values){
	refinement_factor 	= ceiling(max(2,min(100,100*max(abs(diff(values)/sd(values)))))) # must be an integer >=1, as required by refine_time_grid
	refined_times  		= refine_time_grid(old_grid=times, refinement_factor=refinement_factor)
	refined_values 		= evaluate_spline(Xgrid=times, Ygrid=values, splines_degree=3, Xtarget=refined_times, extrapolate="const", derivative=0)
	return(list(times = refined_times, values = refined_values))
}



generate_OU_time_series = function(	times,					# numeric vector of size NT, specifying the times at which to evaluate the OU process. A finer time grid means that the resulting time series will have finer structures, and look more "jagged"
									start_value,			# optional numeric. If NA or NaN or NULL, it will be chosen randomly from the stationary distribution
									stationary_mean,		# numeric
									stationary_std,			# non-negative numeric
									decay_rate,				# strictly positive numeric, in units 1/time
									constrain_min=-Inf,		# optional lower bound for the returned values
									constrain_max=+Inf,		# optional upper bound for the returned values
									smoothen = FALSE){		# logical, specifying whether to smoothen the generated time series using cubic splines onto a refined time grid
	values = get_Ornstein_Uhlenbeck_time_series_CPP(times			= times,
													start_value		= (if((!is.null(start_value)) && is.finite(start_value)) start_value else NaN),
													stationary_mean = stationary_mean,
													stationary_std	= stationary_std,
													decay_rate		= decay_rate)$values
	if(smoothen){
		smoothened = smoothen_time_series(times=times, values=values)
		times  = smoothened$times
		values = smoothened$values
	}
	return(list(times=times, values=pmin(constrain_max,pmax(constrain_min,values))))
}
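
# Example (hypothetical usage sketch, not run; parameter values are illustrative and the series is generated by an internal CPP routine):
if(FALSE){
	ou = generate_OU_time_series(	times			= seq(0, 10, length.out=1000),
									start_value		= NULL,	# draw the start value from the stationary distribution
									stationary_mean	= 1,
									stationary_std	= 0.5,
									decay_rate		= 2)
	plot(ou$times, ou$values, type="l")
}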


# generate a time series according to a bounded Brownian Motion, i.e. a diffusing particle inside a finite interval with reflecting boundaries
generate_bounded_BM_time_series = function(	times,					# numeric vector of size NT
											diffusivity,			# strictly positive numeric, in units value^2/time
											lower,					# the lower bound of the reflected Brownian Motion. Either a single numeric (constant bound), or a numeric vector of the same length as times[] (time-dependent bound)
											upper,					# the upper bound of the reflected Brownian Motion. Either a single numeric (constant bound), or a numeric vector of the same length as times[] (time-dependent bound)
											start_value = NULL){	# optional numeric. If NA or NaN or NULL, it will be chosen randomly from the stationary distribution
	values = get_bounded_BM_time_series_CPP(times		= times,
											start_value	= (if((!is.null(start_value)) && is.finite(start_value)) start_value else NaN),
											diffusivity = diffusivity,
											lower		= lower,
											upper		= upper)$values
	return(list(times=times, values=values))
}
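
# Example (hypothetical usage sketch, not run; a particle diffusing in [0,1] with reflecting boundaries):
if(FALSE){
	bbm = generate_bounded_BM_time_series(times=seq(0,1,length.out=500), diffusivity=0.1, lower=0, upper=1)
	plot(bbm$times, bbm$values, type="l")
}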


# estimate the parameters of a 1-dimensional Ornstein-Uhlenbeck process (proceeding over linear time) via maximum-likelihood, given a single univariate time series
# This is not a phylogenetic routine, i.e., it is not suitable for fitting an OU process evolving & branching along a phylogeny
# The OU process is parameterized through its stationary mean ("mean"), its stationary standard deviation ("std") and its decay_rate (aka. "lambda")
fit_OU1_model = function(	times,	# numeric vector of length NT, listing times in strictly ascending order
							values, # numeric vector of length NT, listing the observed values of the OU process
							min_mean			= -Inf,
							max_mean			= Inf,
							min_std				= 0,
							max_std				= Inf,
							min_decay_rate		= 0,
							max_decay_rate		= Inf,
							fixed_mean			= NA,		# optional numeric, specifying the fixed stationary mean. To fit the stationary mean, set this to NA or NaN.
							fixed_std			= NA,		# optional numeric, specifying the fixed stationary std. To fit the stationary std, set this to NA or NaN.
							fixed_decay_rate	= NA,		# optional numeric, specifying the fixed decay_rate. To fit the decay_rate, set this to NA.
							guess_mean			= NA,		# initial guess for the stationary mean of the OU process. If NA, this will be automatically chosen.
							guess_std			= NA,		# initial guess for the stationary standard deviation of the OU process. If NA, this will be automatically chosen.
							guess_decay_rate 	= NA,		# initial guess for the decay rate ("lambda") of the OU process. If NA, this will be automatically chosen.
							detrend				= TRUE,		# detrend the time series prior to fitting the OU process.
							Ntrials				= 1,		# number of fitting trials to perform, to avoid local non-global likelihood maxima
							max_start_attempts	= 1,		# integer, number of times to attempt finding a valid start point (per trial) before giving up. Randomly chosen start parameters may result in an Inf/undefined objective, so this option allows the algorithm to keep looking for valid starting points.
							Nthreads			= 1,		# number of parallel threads to use when performing multiple fitting trials
							fit_control			= list(),	# a named list containing options for the nlminb fitting routine (e.g. iter.max and rel.tol)
							verbose				= FALSE,	# boolean, specifying whether to print informative messages
							diagnostics			= FALSE,	# boolean, specifying whether to print detailed info (such as log-likelihood) at every iteration of the fitting. For debugging purposes mainly.
							verbose_prefix		= ""){		# string, specifying the line prefix when printing messages. Only relevant if verbose==TRUE.
	NT 		= length(times)
	Ndata 	= NT-1
	if(NT<2) return(list(success=FALSE, error="Not enough data points"))
	time_steps = diff(times)
	
	trend = NA # linear trend of the time series; only estimated when detrend==TRUE
	if(detrend){
		linfit 	= stats::lm(Y ~ X, data=data.frame(X=times, Y=values), na.action=na.omit)
		values	= values - (linfit$coefficients[1] + linfit$coefficients[2] * times) + mean(values, na.rm=TRUE)
		trend 	= linfit$coefficients[2]
	}
	
	# figure out reasonable parameter guesses where needed
	if(!is.finite(guess_mean)) guess_mean = mean(values)
	if(!is.finite(guess_std)) guess_std = sd(values)
	if(!is.finite(guess_decay_rate)) guess_decay_rate = mean(abs(diff(values)/(diff(times) * values[1:(NT-1)])))
	
	# sanitize model parameters
	sanitized_params = sanitize_parameters_for_fitting(	param_values	= c(fixed_mean, fixed_std, fixed_decay_rate),
														param_guess 	= c(guess_mean, guess_std, guess_decay_rate),
														param_min		= c(min_mean, min_std, min_decay_rate),
														param_max 		= c(max_mean, max_std, max_decay_rate),
														param_scale 	= NULL)
	if(!sanitized_params$success) return(list(success=FALSE, error=sanitized_params$error))
	NP				= sanitized_params$NP
	NFP				= sanitized_params$NFP
	param_names		= sanitized_params$param_names
	param_values	= sanitized_params$param_values
	param_guess		= sanitized_params$param_guess
	param_min		= sanitized_params$param_min
	param_max		= sanitized_params$param_max
	param_scale		= sanitized_params$param_scale
	fitted_params	= sanitized_params$fitted_params
	fixed_params	= sanitized_params$fixed_params

	# set fit-control options, unless provided by the caller
	if(is.null(fit_control)) fit_control = list()
	if(is.null(fit_control$step.min)) fit_control$step.min = 0.001
	if(is.null(fit_control$x.tol)) fit_control$x.tol = 1e-8
	if(is.null(fit_control$iter.max)) fit_control$iter.max = 1000
	if(is.null(fit_control$eval.max)) fit_control$eval.max = 2 * fit_control$iter.max * NFP
							
	# objective function: negated log-likelihood
	# input argument is the subset of fitted parameters, rescaled according to param_scale
	objective_function = function(fparam_values,trial){
		params = param_values; params[fitted_params] = fparam_values * param_scale[fitted_params];
		if(any(is.nan(params)) || any(is.infinite(params))) return(Inf); # catch weird cases where params become NaN
		# calculate joint likelihood of transitions, based on the fact that transitions are independent normally distributed variables
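		# specifically, conditional on X(t), an OU process satisfies:
		#   X(t+dt) ~ Normal(mean = m + (X(t)-m)*exp(-lambda*dt), std = s*sqrt(1-exp(-2*lambda*dt)))
		# where m=stationary_mean, s=stationary_std and lambda=decay_rate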
		stationary_mean = params[1]
		stationary_std 	= params[2]
		decay_rate 		= params[3]
		next_stds  		= stationary_std * sqrt(1-exp(-2*time_steps*decay_rate))
		next_means 		= stationary_mean + (values[1:(NT-1)]-stationary_mean) * exp(-time_steps*decay_rate)
		loglikelihood	= -sum(log(next_stds * sqrt(2*pi)) + ((values[2:NT]-next_means)^2)/(2*next_stds^2))
		if(!is.finite(loglikelihood)) loglikelihood = -Inf
		if(diagnostics) cat(sprintf("%s  Trial %s: loglikelihood %.10g\n",verbose_prefix,as.character(trial),loglikelihood))
		return(-loglikelihood)
	}
	
	# fit with various starting points
	fit_single_trial = function(trial){
		scales		 = param_scale[fitted_params]
		lower_bounds = param_min[fitted_params]
		upper_bounds = param_max[fitted_params]
		# randomly choose start values for fitted params (keep trying up to max_start_attempts times)
		Nstart_attempts = 0
		while(Nstart_attempts<max_start_attempts){
			# randomly choose start values for fitted params
			if(trial==1){
				start_values = param_guess[fitted_params]		
			}else{
				start_values = get_random_params(defaults=param_guess[fitted_params], lower_bounds=lower_bounds, upper_bounds=upper_bounds, scales=scales, orders_of_magnitude=4)
			}
			# check if start values yield NaN
			start_objective = objective_function(start_values/scales, trial)
			Nstart_attempts = Nstart_attempts + 1
			if(is.finite(start_objective)) break;
		}
		# run fit with chosen start_values
		if(is.finite(start_objective)){
			fit = stats::nlminb(start		= start_values/scales, 
								objective	= function(pars){ objective_function(pars, trial) },
								lower		= lower_bounds/scales, 
								upper		= upper_bounds/scales, 
								control		= fit_control)
			results = list(objective_value=fit$objective, fparam_values = fit$par*scales, converged=(fit$convergence==0), Niterations=fit$iterations, Nevaluations=fit$evaluations[[1]], Nstart_attempts=Nstart_attempts, start_values=start_values, start_objective=start_objective)
			if(diagnostics) cat(sprintf("%s  Trial %d: Final loglikelihood %.10g, Niterations %d, Nevaluations %d, converged = %d\n",verbose_prefix,trial,-results$objective_value,results$Niterations, results$Nevaluations, results$converged))
		}else{
			results = list(objective_value=NA, fparam_values = NA, converged=FALSE, Niterations=0, Nevaluations=0, Nstart_attempts=Nstart_attempts, start_values=start_values, start_objective=start_objective)
			if(diagnostics) cat(sprintf("%s  Trial %d: Start objective is non-finite. Skipping trial\n",verbose_prefix,trial))
		}
		return(results)
	}

	# run one or more independent fitting trials
	if((Ntrials>1) && (Nthreads>1) && (.Platform$OS.type!="windows")){
		# run trials in parallel using multiple forks
		# Note: Forks (and hence shared memory) are not available on Windows
		if(verbose) cat(sprintf("%sFitting %d model parameters (%d trials, parallelized)..\n",verbose_prefix,NFP,Ntrials))
		fits = parallel::mclapply(	1:Ntrials, 
									FUN = function(trial) fit_single_trial(trial), 
									mc.cores = min(Nthreads, Ntrials), 
									mc.preschedule = FALSE,
									mc.cleanup = TRUE)
	}else{
		# run in serial mode
		if(verbose) cat(sprintf("%sFitting %d model parameters (%s)..\n",verbose_prefix,NFP,(if(Ntrials==1) "1 trial" else sprintf("%d trials",Ntrials))))
		fits = vector(mode="list", Ntrials)
		for(trial in 1:Ntrials){
			fits[[trial]] = fit_single_trial(trial)
		}
	}	

	# extract information from best fit (note that some fits may have LL=NaN or NA)
	objective_values	= unlist_with_nulls(sapply(1:Ntrials, function(trial) fits[[trial]]$objective_value))
	valids				= which(is.finite(objective_values)) # excludes NA, NaN and +-Inf objective values
	if(length(valids)==0) return(list(success=FALSE, error="Fitting failed for all trials"))
	best 				= valids[which.min(objective_values[valids])]
	objective_value		= -fits[[best]]$objective_value
	loglikelihood		= objective_value
	fitted_param_values = param_values; fitted_param_values[fitted_params] = fits[[best]]$fparam_values;
	if(is.null(objective_value) || any(is.na(fitted_param_values)) || any(is.nan(fitted_param_values))) return(list(success=FALSE, error=sprintf("Some fitted parameters are NaN")));

	return(list(success					= TRUE,
				objective_value			= objective_value,
				objective_name			= "loglikelihood",
				loglikelihood			= loglikelihood,
				fitted_mean				= fitted_param_values[1],
				fitted_std				= fitted_param_values[2],
				fitted_decay_rate		= fitted_param_values[3],
				linear_trend			= trend,
				NFP						= NFP,
				Ndata					= Ndata,
				AIC						= 2*NFP - 2*loglikelihood,
				BIC						= log(Ndata)*NFP - 2*loglikelihood))
}
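
# Example (hypothetical usage sketch, not run; uses generate_OU_time_series defined above to simulate test data):
if(FALSE){
	ou  = generate_OU_time_series(times=seq(0,100,length.out=5000), start_value=NULL,
								stationary_mean=1, stationary_std=0.5, decay_rate=2)
	fit = fit_OU1_model(times=ou$times, values=ou$values, Ntrials=5)
	if(fit$success) print(c(mean=fit$fitted_mean, std=fit$fitted_std, decay_rate=fit$fitted_decay_rate))
}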




# fit a homogeneous birth-death model on a grid to a given extant timetree, choosing the "best" grid size according to AIC or BIC
fit_hbd_model_on_best_grid_size = function(	tree, 
											oldest_age			= NULL,		# numeric, the oldest age to consider in the evaluation of the likelihood as well as for defining the age grid. Typically set to the stem age or root age. Can be NULL (equivalent to the root age).
											age0				= 0,		# non-negative numeric, youngest age (time before present) to consider when fitting and with respect to which rho is defined (rho(age0) is the fraction of lineages extant at age0 that are included in the tree)
											grid_sizes			= c(1,10),	# integer vector, listing the grid sizes to consider
											uniform_grid		= TRUE,		# logical, specifying whether the age grid should be uniform (equidistant age intervals). If FALSE, then the grid point density is chosen proportional to the square root of the LTT, hence resulting in a higher-resolution grid near the present.
											criterion			= "AIC",	# character, how to choose the optimal grid point. Options are "AIC" or "BIC".
											exhaustive			= TRUE,		# logical, whether to try all grid sizes for choosing the "best" one. If FALSE, the grid size is gradually increased until the selection criterion (e.g., AIC) starts becoming worse, at which point the search is halted.
											min_lambda			= 0,		# numeric, lower bound for the fitted lambdas (applying to all grid points).
											max_lambda			= +Inf,		# numeric, upper bound for the fitted lambdas (applying to all grid points).
											min_mu				= 0,		# numeric, lower bound for the fitted mus (applying to all grid points).
											max_mu				= +Inf,		# numeric, upper bound for the fitted mus (applying to all grid points).
											min_rho0			= 1e-10,	# numeric, lower bound for the fitted rho. Note that rho is always within (0,1]
											max_rho0			= 1,		# numeric, upper bound for the fitted rho.
											guess_lambda		= NULL,		# initial guess for the lambda. Either NULL (an initial guess will be computed automatically), or a single numeric (guessing a constant lambda at all ages), or a function handle (for generating guesses at each grid point; this function may also return NA at some time points).
											guess_mu			= NULL,		# initial guess for the mu. Either NULL (an initial guess will be computed automatically), or a single numeric (guessing a constant mu at all ages), or a function handle (for generating guesses at each grid point; this function may also return NA at some time points).
											guess_rho0			= 1,		# initial guess for rho. Either NULL (an initial guess will be computed automatically) or a single strictly-positive numeric.
											fixed_lambda		= NULL,		# optional fixed lambda value. Either NULL (none of the lambdas are fixed), or a single scalar (all lambdas are fixed to this value) or a function handle specifying lambda for any arbitrary age (lambdas will be fixed at any age for which this function returns a finite number). The function lambda() need not return finite values for all times.
											fixed_mu			= NULL,		# optional fixed mu value. Either NULL (none of the mus are fixed), or a single scalar (all mus are fixed to this value) or a function handle specifying mu for any arbitrary age (mu will be fixed at any age for which this function returns a finite number). The function mu() need not return finite values for all times.
											fixed_rho0			= NULL,		# optional fixed value for rho. If non-NULL and non-NA, then rho is not fitted. 
											const_lambda		= FALSE,	# logical, whether to enforce a constant (time-independent) fitted speciation rate. Only relevant if lambdas are non-fixed.
											const_mu			= FALSE,	# logical, whether to enforce a constant (time-independent) fitted extinction rate. Only relevant if mus are non-fixed.
											splines_degree		= 1,		# integer, either 0, 1, 2 or 3, specifying the degree for the splines defined by lambda and mu on the age grid.
											condition			= "auto",	# one of "crown" or "stem" or "none" or "auto" or "stem2" (or "stem3" etc) or "crown3" (or "crown4" etc), specifying whether to condition the likelihood on the survival of the stem group or the crown group. It is recommended to use "stem" when oldest_age!=root_age, and "crown" when oldest_age==root_age. This argument is similar to the "cond" argument in the R function RPANDA::likelihood_bd. Note that "crown" really only makes sense when oldest_age==root_age.
											relative_dt			= 1e-3,		# maximum relative time step allowed for integration. Smaller values increase the accuracy of the computed likelihoods, but increase computation time. Typical values are 0.0001-0.001. The default is usually sufficient.
											Ntrials				= 1,
											Nthreads			= 1,
											max_model_runtime	= NULL,		# maximum time (in seconds) to allocate for each likelihood evaluation. Use this to escape from badly parameterized models during fitting (this will likely cause the affected fitting trial to fail). If NULL or <=0, this option is ignored.
											fit_control			= list(),
											verbose				= FALSE,
											verbose_prefix		= ""){
	# basic error checking
	if(verbose) cat(sprintf("%sChecking input parameters..\n",verbose_prefix))
	root_age = get_tree_span(tree)$max_distance
	if(is.null(oldest_age)) oldest_age = root_age
	if(!is.null(guess_lambda)){
		if("function" %in% class(guess_lambda)){
			if(length(guess_lambda)!=1){
				return(list(success=FALSE, error="Expecting either exactly one guess_lambda, or NULL, or a function handle"))
			}else{
				# convert guess_lambda to a function handle
				guess_lambda_value = guess_lambda
				guess_lambda = function(ages){ rep(guess_lambda_value, length(ages)) }
			}
		}
	}else{
		guess_lambda = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(guess_mu)){
		if("function" %in% class(guess_mu)){
			if(length(guess_mu)!=1){
				return(list(success=FALSE, error="Expecting either exactly one guess_mu, or NULL, or a function handle"))
			}else{
				# convert guess_mu to a function handle
				guess_mu_value = guess_mu
				guess_mu = function(ages){ rep(guess_mu_value, length(ages)) }
			}
		}
	}else{
		guess_mu = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(fixed_lambda)){
		if("function" %in% class(fixed_lambda)){
			if(length(fixed_lambda)!=1){
				return(list(success=FALSE, error="Expecting either exactly one fixed_lambda, or NULL, or a function handle"))
			}else{
				# convert fixed_lambda to a function handle
				fixed_lambda_value = fixed_lambda
				fixed_lambda = function(ages){ rep(fixed_lambda_value, length(ages)) }
			}
		}
	}else{
		fixed_lambda = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(fixed_mu)){
		if("function" %in% class(fixed_mu)){
			if(length(fixed_mu)!=1){
				return(list(success=FALSE, error="Expecting either exactly one fixed_mu, or NULL, or a function handle"))
			}else{
				# convert fixed_mu to a function handle
				fixed_mu_value = fixed_mu
				fixed_mu = function(ages){ rep(fixed_mu_value, length(ages)) }
			}
		}
	}else{
		fixed_mu = function(ages){ rep(NA, length(ages)) }
	}
	if(length(min_lambda)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one min_lambda; instead, received %d",length(min_lambda))))
	if(length(max_lambda)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one max_lambda; instead, received %d",length(max_lambda))))
	if(!(criterion %in% c("AIC", "BIC"))) return(list(success=FALSE, error=sprintf("Invalid model selection criterion '%s'. Expected 'AIC' or 'BIC'",criterion)))
	Nmodels  = length(grid_sizes)
	
	# calculate tree LTT if needed
	if(!uniform_grid){
		LTT = count_lineages_through_time(tree=tree, Ntimes = max(100,10*max(grid_sizes)), regular_grid = TRUE, ultrametric=TRUE)
		LTT$ages = root_age - LTT$times
	}
	
	# determine order in which to examine models
	if(exhaustive){
		model_order = seq_len(Nmodels)
	}else{
		# examine models in the order of increasing grid sizes
		model_order = order(grid_sizes)
	}
	
	# fit HBD model on various grid sizes, keeping track of the "best" Ngrid
	if(verbose) cat(sprintf("%sFitting models with %s%d different grid sizes..\n",verbose_prefix,(if(exhaustive) "" else "up to "),Nmodels))
	AICs 		= rep(NA, times=Nmodels)
	BICs 		= rep(NA, times=Nmodels)
	best_fit	= NULL
	for(m in model_order){
		Ngrid = grid_sizes[m]
		if(uniform_grid || (Ngrid==1)){
			if(splines_degree==0){
				age_grid = seq(from=age0, to=oldest_age, length.out=Ngrid+1)[1:Ngrid]
			}else{
				age_grid = seq(from=age0, to=oldest_age, length.out=Ngrid)
			}
		}else{
			if(splines_degree==0){
				age_grid = get_inhomogeneous_grid_1D(Xstart = age0, Xend = oldest_age, Ngrid = Ngrid+1, densityX = rev(LTT$ages), densityY=sqrt(rev(LTT$lineages)), extrapolate=TRUE)[1:Ngrid]
			}else{
				age_grid = get_inhomogeneous_grid_1D(Xstart = age0, Xend = oldest_age, Ngrid = Ngrid, densityX = rev(LTT$ages), densityY=sqrt(rev(LTT$lineages)), extrapolate=TRUE)
			}
		}
		if(verbose) cat(sprintf("%s  Fitting model with grid size %d..\n",verbose_prefix,Ngrid))
		fit = fit_hbd_model_on_grid(tree				= tree, 
									oldest_age			= oldest_age,
									age0				= age0,
									age_grid			= age_grid,
									min_lambda			= min_lambda,
									max_lambda			= max_lambda,
									min_mu				= min_mu,
									max_mu				= max_mu,
									min_rho0			= min_rho0,
									max_rho0			= max_rho0,
									guess_lambda		= guess_lambda(age_grid),
									guess_mu			= guess_mu(age_grid),
									guess_rho0			= guess_rho0,
									fixed_lambda		= fixed_lambda(age_grid),
									fixed_mu			= fixed_mu(age_grid),
									fixed_rho0			= fixed_rho0,
									const_lambda		= const_lambda,
									const_mu			= const_mu,
									splines_degree		= splines_degree,
									condition			= condition,
									relative_dt			= relative_dt,
									Ntrials				= Ntrials,
									Nthreads			= Nthreads,
									max_model_runtime	= max_model_runtime,
									fit_control			= fit_control)
		if(!fit$success) return(list(success=FALSE, error=sprintf("Fitting model with grid size %d failed: %s",Ngrid,fit$error)))
		criterion_value = fit[[criterion]]
		if(is.null(best_fit)){
			best_fit = fit
			worsened = FALSE
		}else if(criterion_value<best_fit[[criterion]]){
			best_fit = fit
			worsened = FALSE
		}else{
			worsened = TRUE
		}
		AICs[m] = fit$AIC
		BICs[m] = fit$BIC
		if(verbose) cat(sprintf("%s  --> %s=%.10g. Best grid size so far: %d\n",verbose_prefix,criterion,criterion_value,length(best_fit$age_grid)))
		if((!exhaustive) && worsened) break; # model selection criterion became worse compared to the previous grid size, so stop search and keep best model found so far
	}
	
	return(list(success	 	= TRUE,
				best_fit 	= best_fit,
				grid_sizes	= grid_sizes,
				AICs		= AICs,
				BICs		= BICs))
}
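
# Example (hypothetical usage sketch, not run; assumes a castor-style timetree, e.g. from generate_random_tree, and that the returned best_fit exposes fitted_lambda as in fit_hbd_model_on_grid):
if(FALSE){
	tree = generate_random_tree(list(birth_rate_intercept=1), max_tips=1000)$tree
	fit  = fit_hbd_model_on_best_grid_size(tree, grid_sizes=c(1,2,5,10), criterion="BIC", Ntrials=5)
	if(fit$success) print(fit$best_fit$fitted_lambda)
}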



# fit a homogeneous birth-death-sampling (HBDS) model on a grid to a given timetree, choosing the "best" grid size according to AIC or BIC
fit_hbds_model_on_best_grid_size = function(tree, 
											root_age 			= NULL,		# optional numeric, the age of the root. Can be used to define a time offset, e.g. if the last tip was not actually sampled at the present. If NULL, this will be calculated from the tree and it will be assumed that the last tip was sampled at the present (age 0).
											oldest_age			= NULL,		# numeric, specifying the oldest age to consider. Can also be NULL, in which case this is set to the root age
											grid_sizes			= c(1,10),	# integer vector, listing the grid sizes to consider
											uniform_grid		= TRUE,		# logical, specifying whether the age grid should be uniform (equidistant age intervals). If FALSE, then the grid point density is chosen proportional to the square root of the LTT
											criterion			= "AIC",	# character, how to choose the optimal grid point. Options are "AIC" or "BIC".
											exhaustive			= TRUE,		# logical, whether to try all grid sizes for choosing the "best" one. If FALSE, the grid size is gradually increased until the selection criterion (e.g., AIC) starts becoming worse, at which point the search is halted.
											min_lambda			= 0,		# numeric, lower bound for the fitted lambdas (applying to all grid points).
											max_lambda			= +Inf,		# numeric, upper bound for the fitted lambdas (applying to all grid points).
											min_mu				= 0,		# numeric, lower bound for the fitted mus (applying to all grid points).
											max_mu				= +Inf,		# numeric, upper bound for the fitted mus (applying to all grid points).
											min_psi				= 0,		# numeric, lower bound for the fitted psis (applying to all grid points).
											max_psi				= +Inf,		# numeric, upper bound for the fitted psis (applying to all grid points).
											min_kappa			= 0,		# numeric, lower bound for the fitted kappas (applying to all grid points).
											max_kappa			= +Inf,		# numeric, upper bound for the fitted kappas (applying to all grid points).
											guess_lambda		= NULL,		# initial guess for the lambda. Either NULL (an initial guess will be computed automatically), or a single numeric (guessing a constant lambda at all ages), or a function handle (for generating guesses at each grid point; this function may also return NA at some time points).
											guess_mu			= NULL,		# initial guess for the mu. Either NULL (an initial guess will be computed automatically), or a single numeric (guessing a constant mu at all ages), or a function handle (for generating guesses at each grid point; this function may also return NA at some time points).
											guess_psi			= NULL,		# initial guess for the psi. Either NULL (an initial guess will be computed automatically), or a single numeric (guessing a constant psi at all ages), or a function handle (for generating guesses at each grid point; this function may also return NA at some time points).
											guess_kappa			= NULL,		# initial guess for the kappa. Either NULL (an initial guess will be computed automatically), or a single numeric (guessing a constant kappa at all ages), or a function handle (for generating guesses at each grid point; this function may also return NA at some time points).
											fixed_lambda		= NULL,		# optional fixed lambda value. Either NULL (none of the lambdas are fixed), or a single scalar (all lambdas are fixed to this value) or a function handle specifying lambda for any arbitrary age (lambdas will be fixed at any age for which this function returns a finite number). The function lambda() need not return finite values for all times.
											fixed_mu			= NULL,		# optional fixed mu value. Either NULL (none of the mus are fixed), or a single scalar (all mus are fixed to this value) or a function handle specifying mu for any arbitrary age (mu will be fixed at any age for which this function returns a finite number). The function mu() need not return finite values for all times.
											fixed_psi			= NULL,		# optional fixed psi value. Either NULL (none of the psis are fixed), or a single scalar (all psis are fixed to this value) or a function handle specifying psi for any arbitrary age (psi will be fixed at any age for which this function returns a finite number). The function psi() need not return finite values for all times.
											fixed_kappa			= NULL,		# optional fixed kappa value. Either NULL (none of the kappas are fixed), or a single scalar (all kappas are fixed to this value) or a function handle specifying kappa for any arbitrary age (kappa will be fixed at any age for which this function returns a finite number). The function kappa() need not return finite values for all times.
											const_lambda		= FALSE,	# logical, whether to enforce a constant (time-independent) fitted speciation rate. Only relevant if lambdas are non-fixed.
											const_mu			= FALSE,	# logical, whether to enforce a constant (time-independent) fitted extinction rate. Only relevant if mus are non-fixed.
											const_psi			= FALSE,	# logical, whether to enforce a constant (time-independent) fitted sampling rate. Only relevant if psis are non-fixed.
											const_kappa			= FALSE,	# logical, whether to enforce a constant (time-independent) fitted retention probability. Only relevant if kappas are non-fixed.
											splines_degree		= 1,		# integer, either 0, 1, 2 or 3, specifying the degree for the splines defined by lambda, mu, psi and kappa on the age grid.
											condition			= "auto",	# one of "crown" or "stem" or "none" or "auto", specifying whether to condition the likelihood on the survival of the stem group or the crown group. It is recommended to use "stem" when oldest_age!=root_age, and "crown" when oldest_age==root_age. This argument is similar to the "cond" argument in the R function RPANDA::likelihood_bd. Note that "crown" really only makes sense when oldest_age==root_age.
											ODE_relative_dt		= 0.001,	# positive unitless number, relative integration time step for the ODE solvers. Relative to the typical time scales of the dynamics, as estimated from the theoretically maximum possible rate of change. Typical values are 0.001 - 0.1.
											ODE_relative_dy		= 1e-3,		# positive unitless number, relative step for interpolating simulated values over time. An ODE_relative_dy of 0.001 means that E is recorded and interpolated between points between which E differs by roughly 0.001. Typical values are 0.0001-0.01. A smaller ODE_relative_dy increases interpolation accuracy, but also increases memory requirements and adds runtime (scaling with the tree's age span, not Ntips).
											Ntrials				= 1,
											max_start_attempts	= 1,		# integer, number of times to attempt finding a valid start point (per trial) before giving up. Randomly chosen start parameters may result in an Inf/undefined objective, so this option allows the algorithm to keep looking for valid starting points.
											Nthreads			= 1,
											max_model_runtime	= NULL,		# maximum time (in seconds) to allocate for each likelihood evaluation. Use this to escape from badly parameterized models during fitting (this will likely cause the affected fitting trial to fail). If NULL or <=0, this option is ignored.
											fit_control			= list(),	# a named list containing options for the nlminb fitting routine (e.g. iter.max and rel.tol)
											verbose				= FALSE,
											verbose_prefix		= ""){
	# basic error checking
	if(verbose) cat(sprintf("%sChecking input parameters..\n",verbose_prefix))
	if(is.null(root_age)) root_age = get_tree_span(tree)$max_distance
	if(is.null(oldest_age)) oldest_age = root_age
	if(!is.null(guess_lambda)){
		if("function" %in% class(guess_lambda)){
			if(length(guess_lambda)!=1){
				return(list(success=FALSE, error="Expecting either exactly one guess_lambda, or NULL, or a function handle"))
			}else{
				# convert guess_lambda to a function handle
				guess_lambda_value = guess_lambda
				guess_lambda = function(ages){ rep(guess_lambda_value, length(ages)) }
			}
		}
	}else{
		guess_lambda = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(guess_mu)){
		if("function" %in% class(guess_mu)){
			if(length(guess_mu)!=1){
				return(list(success=FALSE, error="Expecting either exactly one guess_mu, or NULL, or a function handle"))
			}else{
				# convert guess_mu to a function handle
				guess_mu_value = guess_mu
				guess_mu = function(ages){ rep(guess_mu_value, length(ages)) }
			}
		}
	}else{
		guess_mu = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(guess_psi)){
		if("function" %in% class(guess_psi)){
			if(length(guess_psi)!=1){
				return(list(success=FALSE, error="Expecting either exactly one guess_psi, or NULL, or a function handle"))
			}else{
				# convert guess_psi to a function handle
				guess_psi_value = guess_psi
				guess_psi = function(ages){ rep(guess_psi_value, length(ages)) }
			}
		}
	}else{
		guess_psi = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(guess_kappa)){
		if("function" %in% class(guess_kappa)){
			if(length(guess_kappa)!=1){
				return(list(success=FALSE, error="Expecting either exactly one guess_kappa, or NULL, or a function handle"))
			}else{
				# convert guess_kappa to a function handle
				guess_kappa_value = guess_kappa
				guess_kappa = function(ages){ rep(guess_kappa_value, length(ages)) }
			}
		}
	}else{
		guess_kappa = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(fixed_lambda)){
		if("function" %in% class(fixed_lambda)){
			if(length(fixed_lambda)!=1){
				return(list(success=FALSE, error="Expecting either exactly one fixed_lambda, or NULL, or a function handle"))
			}else{
				# convert fixed_lambda to a function handle
				fixed_lambda_value = fixed_lambda
				fixed_lambda = function(ages){ rep(fixed_lambda_value, length(ages)) }
			}
		}
	}else{
		fixed_lambda = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(fixed_mu)){
		if("function" %in% class(fixed_mu)){
			if(length(fixed_mu)!=1){
				return(list(success=FALSE, error="Expecting either exactly one fixed_mu, or NULL, or a function handle"))
			}else{
				# convert fixed_mu to a function handle
				fixed_mu_value = fixed_mu
				fixed_mu = function(ages){ rep(fixed_mu_value, length(ages)) }
			}
		}
	}else{
		fixed_mu = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(fixed_psi)){
		if("function" %in% class(fixed_psi)){
			if(length(fixed_psi)!=1){
				return(list(success=FALSE, error="Expecting either exactly one fixed_psi, or NULL, or a function handle"))
			}else{
				# convert fixed_psi to a function handle
				fixed_psi_value = fixed_psi
				fixed_psi = function(ages){ rep(fixed_psi_value, length(ages)) }
			}
		}
	}else{
		fixed_psi = function(ages){ rep(NA, length(ages)) }
	}
	if(!is.null(fixed_kappa)){
		if("function" %in% class(fixed_kappa)){
			if(length(fixed_kappa)!=1){
				return(list(success=FALSE, error="Expecting either exactly one fixed_kappa, or NULL, or a function handle"))
			}else{
				# convert fixed_kappa to a function handle
				fixed_kappa_value = fixed_kappa
				fixed_kappa = function(ages){ rep(fixed_kappa_value, length(ages)) }
			}
		}
	}else{
		fixed_kappa = function(ages){ rep(NA, length(ages)) }
	}
	if(length(min_lambda)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one min_lambda; instead, received %d",length(min_lambda))))
	if(length(max_lambda)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one max_lambda; instead, received %d",length(max_lambda))))
	if(length(min_mu)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one min_mu; instead, received %d",length(min_mu))))
	if(length(max_mu)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one max_mu; instead, received %d",length(max_mu))))
	if(length(min_psi)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one min_psi; instead, received %d",length(min_psi))))
	if(length(max_psi)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one max_psi; instead, received %d",length(max_psi))))
	if(length(min_kappa)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one min_kappa; instead, received %d",length(min_kappa))))
	if(length(max_kappa)!=1) return(list(success=FALSE, error=sprintf("Expecting exactly one max_kappa; instead, received %d",length(max_kappa))))
	if(!(criterion %in% c("AIC", "BIC"))) return(list(success=FALSE, error=sprintf("Invalid model selection criterion '%s'. Expected 'AIC' or 'BIC'",criterion)))
	Nmodels  = length(grid_sizes)
	
	# calculate tree LTT if needed
	if(!uniform_grid){
		LTT = count_lineages_through_time(tree=tree, Ntimes = max(100,10*max(grid_sizes)), regular_grid = TRUE)
		LTT$ages = root_age - LTT$times
	}
	
	# determine order in which to examine models
	if(exhaustive){
		model_order = seq_len(Nmodels)
	}else{
		# examine models in the order of increasing grid sizes
		model_order = order(grid_sizes)
	}
	
	# fit HBDS model on various grid sizes, keeping track of the "best" Ngrid
	if(verbose) cat(sprintf("%sFitting models with %s%d different grid sizes..\n",verbose_prefix,(if(exhaustive) "" else "up to "),Nmodels))
	AICs 		= rep(NA, times=Nmodels)
	BICs 		= rep(NA, times=Nmodels)
	best_fit	= NULL
	for(m in model_order){
		Ngrid = grid_sizes[m]
		if(splines_degree==0){
			if(uniform_grid || (Ngrid==1)){
				#age_grid = c(seq(from=0,to=root_age*(Ngrid-1)/Ngrid,length.out=Ngrid),root_age*2)
				age_grid = seq(from=0,to=root_age*(Ngrid-1)/Ngrid,length.out=Ngrid)
			}else{
				#age_grid = c(get_inhomogeneous_grid_1D(Xstart=0, Xend=root_age*(Ngrid-1)/Ngrid, Ngrid=Ngrid, densityX = rev(LTT$ages), densityY=rev(sqrt(LTT$lineages)), extrapolate=TRUE),root_age*2)
				age_grid = get_inhomogeneous_grid_1D(Xstart=0, Xend=root_age*(Ngrid-1)/Ngrid, Ngrid=Ngrid, densityX = rev(LTT$ages), densityY=rev(sqrt(LTT$lineages)), extrapolate=TRUE)
			}
		}else{
			if(uniform_grid || (Ngrid==1)){
				age_grid = seq(from=0,to=root_age,length.out=Ngrid)
			}else{
				age_grid = get_inhomogeneous_grid_1D(Xstart=0, Xend=root_age, Ngrid=Ngrid, densityX = rev(LTT$ages), densityY=rev(sqrt(LTT$lineages)), extrapolate=TRUE)
			}
		}
		
		if(verbose) cat(sprintf("%s  Fitting model with grid size %d..\n",verbose_prefix,Ngrid))
		fit = fit_hbds_model_on_grid(	tree				= tree, 
										root_age			= root_age,
										oldest_age			= oldest_age,
										age_grid			= age_grid,
										min_lambda			= min_lambda,
										max_lambda			= max_lambda,
										min_mu				= min_mu,
										max_mu				= max_mu,
										min_psi				= min_psi,
										max_psi				= max_psi,
										min_kappa			= min_kappa,
										max_kappa			= max_kappa,
										guess_lambda		= guess_lambda(age_grid),
										guess_mu			= guess_mu(age_grid),
										guess_psi			= guess_psi(age_grid),
										guess_kappa			= guess_kappa(age_grid),
										fixed_lambda		= fixed_lambda(age_grid),
										fixed_mu			= fixed_mu(age_grid),
										fixed_psi			= fixed_psi(age_grid),
										fixed_kappa			= fixed_kappa(age_grid),
										const_lambda		= const_lambda,
										const_mu			= const_mu,
										const_psi			= const_psi,
										const_kappa			= const_kappa,
										splines_degree		= splines_degree,
										condition			= condition,
										ODE_relative_dt		= ODE_relative_dt,
										ODE_relative_dy		= ODE_relative_dy,
										Ntrials				= Ntrials,
										max_start_attempts	= max_start_attempts,
										Nthreads			= Nthreads,
										max_model_runtime	= max_model_runtime,
										fit_control			= fit_control,
										verbose				= FALSE,
										verbose_prefix		= paste0(verbose_prefix,"    "))
		if(!fit$success) return(list(success=FALSE, error=sprintf("Fitting model with grid size %d failed: %s",Ngrid,fit$error)))
		criterion_value = fit[[criterion]]
		if(is.null(best_fit)){
			best_fit = fit
			worsened = FALSE
		}else if(criterion_value<best_fit[[criterion]]){
			best_fit = fit
			worsened = FALSE
		}else{
			worsened = TRUE
		}
		AICs[m] = fit$AIC
		BICs[m] = fit$BIC
		if(verbose) cat(sprintf("%s  --> %s=%.10g. Best grid size so far: %d\n",verbose_prefix,criterion,criterion_value,length(best_fit$age_grid)))
		if((!exhaustive) && worsened) break; # model selection criterion became worse compared to the previous grid size, so stop search and keep best model found so far
	}
	
	return(list(success	 	= TRUE,
				best_fit 	= best_fit,
				grid_sizes	= grid_sizes,
				AICs		= AICs,
				BICs		= BICs))
}
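
# Example (hypothetical usage sketch, not run; analogous to fit_hbd_model_on_best_grid_size above, but for a timetree 'tree' with sampled tips & nodes through time):
if(FALSE){
	fit = fit_hbds_model_on_best_grid_size(tree, grid_sizes=c(1,2,5), criterion="AIC", Ntrials=5)
	if(fit$success) print(fit$AICs)
}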




# given a list of objects, unpack every object that is a pure list (non-recursively)
# "pure list" means that class(object) = c("list")
# For example, list(a=3, b=list(c=4,d=5), e=list(f=list(6,7), g=8, h=phylo_object, i=list())) will become list(a=3, c=4, d=5, f=list(6,7), g=8, h=phylo_object)
# This function may be needed when returning a large list of lists from an Rcpp function (which may necessitate list nesting due to limitations of Rcpp) and you want to unpack the first-level lists.
flatten_list_first_level = function(L){
	N  = length(L)
	NF = sum(sapply(seq_len(N), FUN=function(i){ (if(all(class(L[[i]])=="list")) length(L[[i]]) else 1) })) # predicted length of the flattened list
	LF = vector(mode="list", NF)
	Lnames = names(L)
	LFnames = character(NF)
	f  = 1 # next index in flattened list at which to place item
	for(i in seq_len(N)){
		if(all(class(L[[i]])=="list")){
			if(length(L[[i]])>0){
				LF[f:(f+length(L[[i]])-1)] = L[[i]]
				if((!is.null(Lnames)) && (!is.null(names(L[[i]])))) LFnames[f:(f+length(L[[i]])-1)] = names(L[[i]])
			}
			f = f + length(L[[i]])
		}else{
			if(!is.null(Lnames)) LFnames[f] = Lnames[i]
			LF[[f]] = L[[i]]
			f = f + 1
		}
	}
	names(LF) = LFnames
	return(LF)
}
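
# Example (plain R, runnable as-is):
if(FALSE){
	L = list(a=3, b=list(c=4,d=5), e=list(f=list(6,7), g=8))
	flatten_list_first_level(L) # yields list(a=3, c=4, d=5, f=list(6,7), g=8); f stays nested, since only the first level is unpacked
}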



# given a list of birth events and death events, calculate the corresponding LTT
# Note that birth & death events need not be consistent with a tree topology, i.e. the LTT simply decreases (over increasing time) with every death event and increases with every birth event
count_lineages_through_time_BD = function(	birth_times, 	# numeric vector of arbitrary length, listing times of birth events. Need not necessarily be sorted.
											death_times, 	# numeric vector of arbitrary length, listing times of death events. Need not necessarily be sorted.
											time_grid, 		# times at which lineage counts are requested. Need not necessarily be sorted.
											initial_count){ # integer, the number of lineages assumed at -infinity
	if(length(time_grid)==0) return(numeric())
	if(any(diff(time_grid)<0)){
		# time grid is not sorted in ascending order, so figure out proper order
		time_grid_order = order(time_grid)
	}else{
		time_grid_order = c(1:length(time_grid))
	}
	lineages = numeric(length(time_grid))
	lineages[time_grid_order] = initial_count + get_LTT_BD_CPP(birth_times = sort(birth_times), death_times = sort(death_times), time_grid=time_grid[time_grid_order])
	return(lineages)
}
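
# Equivalent plain-R formulation (illustration only; the CPP routine above is used for speed).
# The lineage count at time t is initial_count + #(births at or before t) - #(deaths at or before t),
# up to the convention used for events falling exactly on a grid time.
if(FALSE){
	LTT_BD_reference = function(birth_times, death_times, time_grid, initial_count){
		sapply(time_grid, function(t) initial_count + sum(birth_times<=t) - sum(death_times<=t))
	}
}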



# extract independent contrasts from a bifurcating tree, for fitting SBM diffusion models
get_SBM_independent_contrasts = function(	tree,
											tip_latitudes, 						# numeric vector of size Ntips, listing geographical latitudes of the tips (in decimal degrees)
											tip_longitudes, 					# numeric vector of size Ntips, listing geographical longitudes of the tips (in decimal degrees)
											radius,								# numeric, radius to assume for the sphere (e.g. Earth). Use this e.g. if you want to change the units in which diffusivity is estimated. Earth's mean radius is about 6371e3 m.
											clade_states			= NULL,		# optional, either an integer vector of length Ntips+Nnodes (if trees[] is a single tree) or a list of 1D vectors (if trees[] is a list of trees), specifying the discrete "state" of each tip and node in each tree. This can be used to limit independent contrasts to tip pairs whose total number of state-transitions (along their shortest path) is zero.
											planar_approximation	= FALSE,	# logical, specifying whether the estimation formula should be based on a planar approximation of Earth's surface, i.e. geodesic angles are converted to distances and then treated as if they were Euclidean distances on a 2D plane. This approximation substantially increases the speed of computations.
											only_basal_tip_pairs	= FALSE,	# logical, specifying whether only immediate sister tips should be considered, i.e. tip pairs with at most 2 edges between the two tips
											only_distant_tip_pairs	= FALSE,	# logical, whether to only consider tip pairs located at distinct geographic locations
											min_MRCA_time			= 0,		# numeric, specifying the minimum allowed height (distance from root) of the MRCA of sister tips considered in the fitting. In other words, an independent contrast is only considered if the two sister tips' MRCA has at least this distance from the root. Set min_MRCA_time=0 to disable this filter.
											max_MRCA_age			= Inf,		# numeric, specifying the maximum allowed age (distance from youngest tip) of the MRCA of sister tips considered in the fitting. In other words, an independent contrast is only considered if the two sister tips' MRCA has at most this age (time to present). Set max_MRCA_age=Inf to disable this filter.
											max_phylodistance		= Inf,		# numeric, maximum allowed phylodistance for an independent contrast to be included
											no_state_transitions	= FALSE,	# if TRUE, only tip pairs without state transitions along their shortest paths are considered. In particular, only tips in the same state are considered. Requires that clade_states[] is provided.
											only_state				= NULL){	# optional integer, specifying the state in which tip pairs (and their connecting ancestors) must be in order to be considered. Requires that clade_states[] is provided.
	Ntips = length(tree$tip.label)
	if(("list" %in% class(tip_latitudes)) && (length(tip_latitudes)==Ntips)){
		tip_latitudes = unlist(tip_latitudes)
	}
	if(("list" %in% class(tip_latitudes)) && (length(tip_longitudes)==Ntips)){
		tip_longitudes = unlist(tip_longitudes)
	}
	if((!is.null(clade_states)) && ("list" %in% class(clade_states)) && (length(clade_states)==Ntips+tree$Nnode)){
		clade_states = unlist(clade_states)
	}
	if((!is.null(only_state)) && is.null(clade_states)) return(list(success=FALSE, error="Missing clade_states[], needed when only_state is specified"))
	if(no_state_transitions && is.null(clade_states)) return(list(success=FALSE, error="Missing clade_states[], needed when no_state_transitions=TRUE"))
		
	# make sure tree does not have multifurcations
	# note that this does not change the tip & node indices, so the returned tip_pairs[] will still refer to the correct tips in the original tree
	if(tree_has_multifurcations_CPP(Ntips = Ntips, Nnodes = tree$Nnode, Nedges = nrow(tree$edge), tree_edge = as.vector(t(tree$edge)) - 1)){
		tree = multifurcations_to_bifurcations(tree)$tree
	}

	# extract independent pairs of sister tips
	tip_pairs = get_independent_sister_tips(tree)
	if(only_basal_tip_pairs){
		# calculate number of nodes between tip pairs
		edge_counts = get_pairwise_distances(tree, A=tip_pairs[,1], B=tip_pairs[,2], as_edge_counts=TRUE, check_input=FALSE)
		# only keep tip pairs with at most 2 edges connecting them
		keep_pairs 	= which(edge_counts<=2)
		tip_pairs 	= tip_pairs[keep_pairs,,drop=FALSE]
	}
	
	# calculate MRCAs
	MRCAs = get_pairwise_mrcas(tree, tip_pairs[,1], tip_pairs[,2], check_input=FALSE)
	
	# calculate clade times
	clade_times = castor::get_all_distances_to_root(tree)
	
	# filter tip pairs based on MRCA time & age, if needed
	if((min_MRCA_time>0) || (max_MRCA_age<Inf)){
		tree_span 	= max(clade_times)
		keep_pairs	= which((clade_times[MRCAs]>=min_MRCA_time) & (tree_span-clade_times[MRCAs]<=max_MRCA_age))
		tip_pairs	= tip_pairs[keep_pairs,,drop=FALSE]
		MRCAs		= MRCAs[keep_pairs]
	}
	if(nrow(tip_pairs)==0) return(list(success=FALSE, error="No valid tip pairs left for extracting independent contrasts"))
		
	# calculate phylogenetic divergences and geodesic distances between sister tips
	phylodistances 	= get_pairwise_distances(tree, A=tip_pairs[,1], B=tip_pairs[,2], check_input=FALSE)
	geodistances 	= radius * sapply(1:nrow(tip_pairs), FUN=function(p){ geodesic_angle(tip_latitudes[tip_pairs[p,1]],tip_longitudes[tip_pairs[p,1]],tip_latitudes[tip_pairs[p,2]],tip_longitudes[tip_pairs[p,2]]) })

	# omit tip pairs with zero phylogenetic distance, because in that case the likelihood density is pathological
	# also omit tip pairs located at the same geographic location, if requested
	keep_pair = (phylodistances>0)
	if(only_distant_tip_pairs) keep_pair = keep_pair & (geodistances>0)
	if(max_phylodistance<Inf) keep_pair = keep_pair & (phylodistances<=max_phylodistance)
	if(no_state_transitions || (!is.null(only_state))){
		Ntransitions = count_transitions_between_clades(tree=tree, A=tip_pairs[,1], B=tip_pairs[,2], states=clade_states, check_input=TRUE)
		if(no_state_transitions) keep_pair = keep_pair & (Ntransitions==0)
		if(!is.null(only_state)) keep_pair = keep_pair & (Ntransitions==0) & (clade_states[tip_pairs[,1]]==only_state) & (clade_states[tip_pairs[,2]]==only_state)
	}
	tip_pairs		= tip_pairs[keep_pair,,drop=FALSE]
	MRCAs			= MRCAs[keep_pair]
	phylodistances 	= phylodistances[keep_pair]
	geodistances 	= geodistances[keep_pair]
	NC 				= length(phylodistances)
	if(NC==0) return(list(success=FALSE, error="No valid tip pairs left for extracting independent contrasts"))

	# determine MRCA and tip times
	MRCA_times = clade_times[MRCAs]
	tip_times1 = clade_times[tip_pairs[,1]]
	tip_times2 = clade_times[tip_pairs[,2]]

	return(list(success			= TRUE,
				NC				= NC,
				tip_pairs		= tip_pairs,		# numeric matrix of size NC x 2, listing the tip indices used to define the independent contrasts
				phylodistances	= phylodistances,
				geodistances	= geodistances,
				MRCA_times		= MRCA_times,
				child_times1	= tip_times1,
				child_times2	= tip_times2))
}
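
# Example (hypothetical usage sketch, not run; assumes tree, tip_latitudes and tip_longitudes are already defined, with coordinates in decimal degrees):
if(FALSE){
	ic = get_SBM_independent_contrasts(	tree					= tree,
										tip_latitudes			= tip_latitudes,
										tip_longitudes			= tip_longitudes,
										radius					= 6371e3,	# Earth's mean radius, in meters
										only_basal_tip_pairs	= TRUE)
	if(ic$success) plot(ic$phylodistances, ic$geodistances)
}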



# fit a spherical Brownian motion (SBM) model with time-dependent diffusivity on a grid to a given timetree and tip coordinates, choosing the "best" grid size according to AIC or BIC
fit_sbm_on_best_grid_size = function(	tree, 
										tip_latitudes, 						# numeric vector of size Ntips, listing geographical latitudes of the tips (in decimal degrees)
										tip_longitudes, 					# numeric vector of size Ntips, listing geographical longitudes of the tips (in decimal degrees)
										radius,								# numeric, radius to assume for the sphere (e.g. Earth). Use this e.g. if you want to change the units in which diffusivity is estimated. Earth's mean radius is about 6371e3 m.
										clade_states			= NULL,		# optional, either an integer vector of length Ntips+Nnodes (if trees[] is a single tree) or a list of 1D vectors (if trees[] is a list of trees), specifying the discrete "state" of each tip and node in each tree. This can be used to limit independent contrasts to tip pairs whose total number of state-transitions (along their shortest path) is zero.
										planar_approximation	= FALSE,	# logical, specifying whether the estimation formula should be based on a planar approximation of Earth's surface, i.e. geodesic angles are converted to distances and then treated as if they were Euclidean distances on a 2D plane. This approximation substantially increases the speed of computations.
										only_basal_tip_pairs	= FALSE,	# logical, specifying whether only immediate sister tips should be considered, i.e. tip pairs with at most 2 edges between the two tips
										only_distant_tip_pairs	= FALSE,	# logical, whether to only consider tip pairs located at distinct geographic locations
										min_MRCA_time			= 0,		# numeric, specifying the minimum allowed height (distance from root) of the MRCA of sister tips considered in the fitting. In other words, an independent contrast is only considered if the two sister tips' MRCA has at least this distance from the root. Set min_MRCA_time=0 to disable this filter.
										max_MRCA_age			= Inf,		# numeric, specifying the maximum allowed age (distance from youngest tip) of the MRCA of sister tips considered in the fitting. In other words, an independent contrast is only considered if the two sister tips' MRCA has at most this age (time to present). Set max_MRCA_age=Inf to disable this filter.
										max_phylodistance		= Inf,		# numeric, maximum allowed phylodistance for an independent contrast to be included in the SBM fitting
										no_state_transitions	= FALSE,	# if TRUE, only tip pairs without state transitions along their shortest paths are considered. In particular, only tips in the same state are considered. Requires that clade_states[] is provided.
										only_state				= NULL,		# optional integer, specifying the state in which tip pairs (and their connecting ancestors) must be in order to be considered. Requires that clade_states[] is provided.
										grid_sizes				= c(1,10),	# integer vector, listing the grid sizes to consider
										uniform_grid			= TRUE,		# logical, specifying whether the age grid should be uniform (equidistant age intervals). If FALSE, then the grid point density is chosen proportional to the square root of the independent contrasts density, hence resulting in higher resolution grid in areas where there is more data available.
										guess_diffusivity		= NULL,		# optional numeric, first guess for the diffusivity (at all grid points). If NULL, this will be automatically chosen.
										min_diffusivity			= NULL,		# numeric, lower bound for the fitted diffusivity (applying to all grid points). If NULL, this is automatically chosen.
										max_diffusivity			= Inf,		# numeric, upper bound for the fitted diffusivity (applying to all grid points).
										criterion				= "AIC",	# character, how to choose the optimal grid point. Options are "AIC" or "BIC".
										exhaustive				= TRUE,		# logical, whether to try all grid sizes for choosing the "best" one. If FALSE, the grid size is gradually increased until the selection criterion (e.g., AIC) starts becoming worse, at which point the search is halted.
										Ntrials					= 1,
										Nthreads				= 1,
										Nbootstraps				= 0,		# (integer) optional number of parametric-bootstrap samples for estimating confidence intervals of fitted parameters. If 0, no parametric bootstrapping is performed. Typical values are 10-100.
										Ntrials_per_bootstrap	= NULL,		# (integer) optional number of fitting trials for each bootstrap sampling. If NULL, this is set equal to Ntrials. A smaller Ntrials_per_bootstrap will reduce computation, at the expense of increasing the estimated confidence intervals (i.e. yielding more conservative estimates of confidence).
										NQQ						= 0,		# (integer) optional number of simulations to perform for creating Q-Q plots of the theoretically expected distribution of geodistances vs the empirical distribution of geodistances (across independent contrasts). The resolution of the returned QQ plot will be equal to the number of independent contrasts used for fitting.
										fit_control				= list(),
										SBM_PD_functor			= NULL,		# internally used SBM probability density functor
										verbose					= FALSE,
										verbose_prefix			= ""){
	# basic error checking
	if(verbose) cat(sprintf("%sChecking input parameters..\n",verbose_prefix))
	if((!is.null(guess_diffusivity)) && (length(guess_diffusivity)!=1)) return(list(success=FALSE, error="Expecting either exactly one guess_diffusivity, or NULL"))									
	if((!is.null(min_diffusivity)) && (length(min_diffusivity)!=1)) return(list(success=FALSE, error="Expecting either exactly one min_diffusivity, or NULL"))									
	if((!is.null(max_diffusivity)) && (length(max_diffusivity)!=1)) return(list(success=FALSE, error="Expecting either exactly one max_diffusivity, or NULL"))
	if(!(criterion %in% c("AIC", "BIC"))) return(list(success=FALSE, error=sprintf("Invalid model selection criterion '%s'. Expected 'AIC' or 'BIC'",criterion)))
	root_age = get_tree_span(tree)$max_distance
	Nmodels  = length(grid_sizes)
		
	# determine order in which to examine models
	if(exhaustive){
		model_order = c(1:Nmodels)
	}else{
		# examine models in the order of increasing grid sizes
		model_order = order(grid_sizes)
	}
	
	if(!uniform_grid){
		# get independent contrasts density through time, needed for defining non-uniform grid
		if(verbose) cat(sprintf("%sDetermining independent contrasts density..\n",verbose_prefix))
		ICs = get_SBM_independent_contrasts(tree					= tree,
											tip_latitudes			= tip_latitudes,
											tip_longitudes			= tip_longitudes,
											radius					= radius,
											clade_states			= clade_states,
											planar_approximation	= planar_approximation,
											only_basal_tip_pairs	= only_basal_tip_pairs,
											only_distant_tip_pairs	= only_distant_tip_pairs,
											min_MRCA_time			= min_MRCA_time,
											max_MRCA_age			= max_MRCA_age,
											max_phylodistance		= max_phylodistance,
											no_state_transitions	= no_state_transitions,
											only_state				= only_state)
		if(!ICs$success) return(list(success=FALSE, error=sprintf("Failed to acquire independent contrasts: %s",ICs$error))) # check success before using the independent contrasts below
		ICdensity = list(times = seq(from=root_age/1000, to=root_age, length.out=200))
		ICdensity$lineages = count_lineages_through_time_BD(birth_times		= c(ICs$MRCA_times,ICs$MRCA_times),
															death_times		= c(ICs$child_times1,ICs$child_times2),
															time_grid		= ICdensity$times, 
															initial_count	= 0)
	}
	
	# fit SBM model on various grid sizes, keeping track of the "best" Ngrid
	if(verbose) cat(sprintf("%sFitting SBM models with %s%d different grid sizes..\n",verbose_prefix,(if(exhaustive) "" else "up to "),Nmodels))
	AICs 		= rep(NA, times=Nmodels)
	BICs 		= rep(NA, times=Nmodels)
	best_fit	= NULL
	for(m in model_order){
		Ngrid = grid_sizes[m]
		if(uniform_grid || (Ngrid==1)){
			time_grid = seq(from=0, to=root_age, length.out=Ngrid)
		}else{
			time_grid = get_inhomogeneous_grid_1D(Xstart = 0, Xend = root_age, Ngrid = Ngrid, densityX = ICdensity$times, densityY=sqrt(ICdensity$lineages), extrapolate=TRUE)
		}
		if(verbose) cat(sprintf("%s  Fitting SBM model with grid size %d..\n",verbose_prefix,Ngrid))
		fit = fit_sbm_on_grid(	tree					= tree, 
								tip_latitudes			= tip_latitudes,
								tip_longitudes			= tip_longitudes,
								radius					= radius,
								clade_states			= clade_states,
								planar_approximation	= planar_approximation,
								only_basal_tip_pairs	= only_basal_tip_pairs,
								only_distant_tip_pairs	= only_distant_tip_pairs,
								min_MRCA_time			= min_MRCA_time,
								max_MRCA_age			= max_MRCA_age,
								max_phylodistance		= max_phylodistance,
								no_state_transitions	= no_state_transitions,
								only_state				= only_state,
								time_grid				= time_grid,
								guess_diffusivity		= guess_diffusivity,
								min_diffusivity			= min_diffusivity,
								max_diffusivity			= max_diffusivity,
								Ntrials					= Ntrials,
								Nthreads				= Nthreads,
								Nbootstraps				= 0,
								Ntrials_per_bootstrap	= Ntrials_per_bootstrap,
								NQQ						= NQQ,
								fit_control				= fit_control,
								SBM_PD_functor			= (if(!is.null(best_fit)) best_fit$SBM_PD_functor else SBM_PD_functor),
								verbose					= verbose,
								verbose_prefix			= paste0(verbose_prefix,"    "))
		if(!fit$success) return(list(success=FALSE, error=sprintf("Fitting SBM model with grid size %d failed: %s",Ngrid,fit$error)))
		criterion_value = fit[[criterion]]
		if(is.null(best_fit)){
			best_fit = fit
			worsened = FALSE
		}else if(criterion_value<best_fit[[criterion]]){
			best_fit = fit
			worsened = FALSE
		}else{
			worsened = TRUE
		}
		AICs[m] = fit$AIC
		BICs[m] = fit$BIC
		if(verbose) cat(sprintf("%s  --> %s=%.10g. Best grid size so far: %d\n",verbose_prefix,criterion,criterion_value,length(best_fit$time_grid)))
		if((!exhaustive) && worsened) break; # model selection criterion became worse compared to the previous grid size, so stop search and keep best model found so far
	}
	
	if(Nbootstraps>0){
		if(verbose) cat(sprintf("%sRepeating fit of model with grid size %d, for bootstrapping..\n",verbose_prefix,length(best_fit$time_grid)))
		best_fit = fit_sbm_on_grid(	tree					= tree, 
									tip_latitudes			= tip_latitudes,
									tip_longitudes			= tip_longitudes,
									radius					= radius,
									clade_states			= clade_states,
									planar_approximation	= planar_approximation,
									only_basal_tip_pairs	= only_basal_tip_pairs,
									only_distant_tip_pairs	= only_distant_tip_pairs,
									min_MRCA_time			= min_MRCA_time,
									max_MRCA_age			= max_MRCA_age,
									max_phylodistance		= max_phylodistance,
									no_state_transitions	= no_state_transitions,
									only_state				= only_state,
									time_grid				= best_fit$time_grid,
									guess_diffusivity		= guess_diffusivity,
									min_diffusivity			= min_diffusivity,
									max_diffusivity			= max_diffusivity,
									Ntrials					= Ntrials,
									Nthreads				= Nthreads,
									Nbootstraps				= Nbootstraps,
									Ntrials_per_bootstrap	= Ntrials_per_bootstrap,
									NQQ						= NQQ,
									fit_control				= fit_control,
									SBM_PD_functor			= best_fit$SBM_PD_functor,
									verbose					= verbose,
									verbose_prefix			= paste0(verbose_prefix,"  "))

	}
	
	return(list(success	 	= TRUE,
				best_fit 	= best_fit,
				grid_sizes	= grid_sizes,
				AICs		= AICs,
				BICs		= BICs))
}
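
# Example usage (a minimal sketch, never executed thanks to the if(FALSE) guard): simulate a tree and a
# constant-diffusivity SBM using the simulators referenced elsewhere in this file (generate_tree_hbd_reverse
# & simulate_sbm), then fit an SBM choosing the best grid size. The radius 6371 corresponds to Earth's
# mean radius in km, so the diffusivity is in km^2 per time unit.
if(FALSE){
	tree_sim = generate_tree_hbd_reverse(Ntips=100, lambda=1, mu=0.5, rho=1, crown_age=10)
	tree 	 = tree_sim$trees[[1]]
	sbm_sim  = simulate_sbm(tree=tree, radius=6371, diffusivity=100, root_latitude=NULL, root_longitude=NULL)
	fit 	 = fit_sbm_on_best_grid_size(	tree			= tree,
											tip_latitudes	= sbm_sim$tip_latitudes,
											tip_longitudes	= sbm_sim$tip_longitudes,
											radius			= 6371,
											grid_sizes		= c(1,2,5),
											criterion		= "AIC")
	if(fit$success) cat(sprintf("Best grid size (by AIC): %d\n", length(fit$best_fit$time_grid)))
}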


read_fasta = function(	file,
						include_headers		= TRUE,
						include_sequences	= TRUE,
						truncate_headers_at	= NULL){ # optional needle string, at which to truncate headers (i.e. remove everything at and after the first instance of the needle)
	results = read_fasta_from_file_CPP(	fasta_path			= file,
										include_headers		= include_headers,
										include_sequences	= include_sequences)
	if(!results$success) return(list("success"=FALSE, error=results$error))
	if(include_headers && (!is.null(truncate_headers_at))){
		results$headers = sapply(seq_len(length(results$headers)), FUN=function(h){ strsplit(results$headers[h],split=truncate_headers_at,fixed=TRUE)[[1]][1] })
	}
	return(list(headers		= (if(include_headers) results$headers else NULL),
				sequences	= (if(include_sequences) results$sequences else NULL),
				Nlines		= results$Nlines,
				Nsequences	= results$Nsequences))
}
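
# Example usage (a minimal sketch): read a FASTA file and truncate headers at the first space.
# "alignment.fasta" is a hypothetical file path.
if(FALSE){
	fasta = read_fasta(file="alignment.fasta", truncate_headers_at=" ")
	cat(sprintf("Read %d sequences\n", fasta$Nsequences))
}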





# make a time series monotonically increasing or decreasing, by setting problematic values to NaN
monotonize_series_by_pruning = function(values,				# numeric vector of size N, listing the scalar time series values. May include NaN.
										increasing, 		# logical, specifying whether the resulting time series should be monotonically increasing (rather than decreasing)
										prefer_later_data){	# logical, specifying whether later data (rather than earlier data) should be kept when resolving monotonicity conflicts
	results = monotonize_series_by_pruning_CPP(values = values, increasing = increasing, prefer_later_data = prefer_later_data)
	return(results)
}


# make a time series monotonically increasing or decreasing, by replacing problematic values with linear interpolations between valid values.
monotonize_time_series_via_interpolation = function(times,				# numeric vector of size N, listing the "times" of the time series, in ascending order. May not include NaN.
													values,				# numeric vector of size N, listing the scalar time series values. May include NaN.
													increasing, 		# logical, specifying whether the resulting time series should be monotonically increasing (rather than decreasing)
													prefer_later_data){	# logical, specifying whether later data (rather than earlier data) should be kept when resolving monotonicity conflicts
	results = monotonize_series_via_interpolation_CPP(times = times, values = values, increasing = increasing, prefer_later_data = prefer_later_data)
	return(results)
}
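
# Example usage (a minimal sketch) for the two monotonizing routines above; the exact structure
# of the returned objects is defined by the corresponding internal C++ routines.
if(FALSE){
	values = c(1, 3, 2.5, 4, 3.8, 5) # non-monotonic time series
	pruned = monotonize_series_by_pruning(values=values, increasing=TRUE, prefer_later_data=FALSE)
	interp = monotonize_time_series_via_interpolation(times=1:6, values=values, increasing=TRUE, prefer_later_data=FALSE)
}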



bootstraps_to_confidence_intervals = function(bootstrap_samples){		# 2D numeric matrix of size NB x NP, where NB is the number of bootstraps and NP is the number of distinct parameters fitted at each bootstrap
	NP = ncol(bootstrap_samples)
	NB = nrow(bootstrap_samples)
	if(NP==0){
		return(list(mean		= numeric(0),
					median		= numeric(0),
					CI50lower	= numeric(0),
					CI50upper	= numeric(0),
					CI95lower	= numeric(0),
					CI95upper	= numeric(0)))
	}
	quantiles 	= sapply(seq_len(NP), FUN=function(k){ quantile(bootstrap_samples[,k], probs=c(0.5, 0.25, 0.75, 0.025, 0.975), na.rm=TRUE, type=8) })
	if(NP==1) quantiles = matrix(quantiles, ncol=NP) # make sure quantiles[,] is a 2D matrix, even if only a single parameter was bootstrapped
	return(list(mean 		= colMeans(bootstrap_samples, na.rm=TRUE),
				median 		= quantiles[1,],
				CI50lower 	= quantiles[2,],
				CI50upper 	= quantiles[3,],
				CI95lower 	= quantiles[4,],
				CI95upper 	= quantiles[5,]))
}
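
# Example usage (a minimal self-contained sketch): compute CIs from 1000 synthetic bootstrap
# samples of two hypothetical parameters.
if(FALSE){
	bootstrap_samples = cbind(rnorm(1000, mean=1, sd=0.1), rnorm(1000, mean=5, sd=0.5)) # 1000 x 2 matrix
	CIs = bootstraps_to_confidence_intervals(bootstrap_samples)
	cat(sprintf("Parameter 1: 95%%-CI = [%g, %g]\n", CIs$CI95lower[1], CIs$CI95upper[1]))
}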




# Locally estimate the exponential growth rate of a time series in a sliding window
fit_local_exponential_rate = function(	X,					# 1D numeric vector of length N, listing X values in ascending order
										Y,					# 1D numeric vector of length N, listing non-negative Y-values corresponding to X[]
										ref						= NULL,			# optional numeric vector of length N, listing reference values for judging the adequacy of data in a sliding window.
										window_size				= 2,			# strictly positive integer, specifying the size of the sliding window (number of data points per fitting)
										trim_window_at_bounds 	= TRUE,			# logical, specifying whether to trim the sliding window when hitting the data's X-bounds. If false, then the sliding window always has the specified size, but may not always be centered around the point of evaluation, and toward the left & right bound the fitted params will be constant. If TRUE, the window becomes smaller towards the edges, hence estimates towards the edges become more noisy.
										normalize_by_Xstep		= FALSE,		# logical, specifying whether to divide each Y value by the corresponding X-step (this is only relevant if X is a non-regular grid). For example, if X are times and Y are new infections since the last time point, then the exponential growth rate of the disease should be calculated using the normalized Y.
										model 					= "lognormal",	# character, specifying the stochastic model to assume for the Y values. Available options are "lognormal" and "Poisson" (only suitable for count data).
										min_Npoints 			= 2,			# integer, specifying the minimum number of valid data points for fitting, per sliding window. If a sliding window covers fewer usable points than this threshold, the corresponding estimate is set to NA.
										min_Npositives 			= 2,			# integer, specifying the minimum number of valid positive (Y>0) data points for fitting, per sliding window. If a sliding window covers fewer usable points than this threshold, the corresponding estimate is set to NA. Not relevant for the lognormal model, since zeros are always ignored for that model.
										min_Nref				= 0,			# integer, specifying the minimum sum of reference values in a sliding window, needed for fitting. If the sum of reference values in a sliding window is below this threshold, the corresponding estimate is set to NA.
										Nbootstraps 			= 0,			# integer, specifying the optional number of bootstraps for estimating confidence intervals
										Nthreads				= 1){			# integer, number of parallel threads to use for bootstrapping
	NX 				= length(X)
	window_size 	= max(1,window_size)
	min_Npoints 	= min(min_Npoints,window_size)
	min_Npositives 	= min(min_Npositives,window_size)
	
	Ymodified = Y # may be further modified below, prior to estimation of exponential rates
	if(model == "lognormal"){
		if(normalize_by_Xstep && (NX>=2)){
			# divide Y values by the X-step lengths. For Y[1] we don't know the X-step, so we can't normalize it, hence we set Y[1]=NA
			Ymodified = c(NA,Y[2:NX]/diff(X))
		}
		fit = fit_exp_LeastLogSquares_moving_window_CPP(X = X, Y = Ymodified, window_size = window_size, trim_window_at_bounds = trim_window_at_bounds)
		fit$rate[fit$Npoints<min_Npoints] = NA
		if(!is.null(ref)){
			Nrefs_per_window = sapply(seq_len(length(fit$window_starts)), FUN=function(w) sum(ref[c(fit$window_starts[w]:fit$window_ends[w])],na.rm=TRUE))
			fit$rate[Nrefs_per_window<min_Nref] = NA
		}
		valid_points_for_bootstrap = which(is.finite(fit$predicted_logY) & is.finite(fit$log_variance) & is.finite(X) & is.finite(Y) & (Y>0))
	}else if(model=="Poisson"){
		if(normalize_by_Xstep && (NX>=2)){
			# the appropriate scalings must be passed to the max-likelihood fitting routine, i.e. we can't just rescale the Y-values
			scalings = c(1, diff(X))
			Ymodified = c(NA, Y[2:NX])
		}else{
			scalings = rep(1, NX)
		}
		fit = fit_exp_Poisson_moving_window_CPP(X = X, Y = Ymodified, scalings = scalings, window_size = window_size, trim_window_at_bounds = trim_window_at_bounds)	
		fit$rate[(fit$Npoints<min_Npoints) | (fit$Npositives<min_Npositives)] = NA
		if(!is.null(ref)){
			Nrefs_per_window = sapply(seq_len(length(fit$window_starts)), FUN=function(w) sum(ref[c(fit$window_starts[w]:fit$window_ends[w])],na.rm=TRUE))
			fit$rate[Nrefs_per_window<min_Nref] = NA
		}
		valid_points_for_bootstrap = which(is.finite(X) & is.finite(Y) & (Y>=0))
	}else{
		stop(sprintf("Invalid model '%s'",model))
	}
	
	if(Nbootstraps>0){
		bootstrap_rates = matrix(NA, nrow=Nbootstraps, ncol=NX)
		if(length(valid_points_for_bootstrap)>0){
			aux_bootstrap = function(b){
				# generate random time series bootstrap_Y, according to the fitted model params 
				bootstrap_Y = Y
				if(model=="lognormal"){
					# assuming a log-normal model, i.e. where log(Y) ~ normal(mean=fit$predicted_logY, variance=fit$log_variance)
					bootstrap_Y[valid_points_for_bootstrap] = exp(rnorm(n=length(valid_points_for_bootstrap), mean=fit$predicted_logY[valid_points_for_bootstrap], sd=sqrt(pmax(0,fit$log_variance[valid_points_for_bootstrap]))))
				}else if(model=="Poisson"){
					bootstrap_Y[valid_points_for_bootstrap] = rpois(n=length(valid_points_for_bootstrap), lambda=Y[valid_points_for_bootstrap])				
				}
				bfit = fit_local_exponential_rate(	X						= X,
													Y						= bootstrap_Y,
													ref						= ref,
													window_size				= window_size,
													trim_window_at_bounds	= trim_window_at_bounds,
													model					= model,
													min_Npoints				= min_Npoints,
													min_Npositives			= min_Npositives,
													min_Nref				= min_Nref,
													Nbootstraps				= 0,
													Nthreads				= 1)
				return(bfit$rate)
			}
			if((Nthreads>1) && (Nbootstraps>1) && (.Platform$OS.type!="windows")){
				bootstrap_rates = matrix(unlist(parallel::mclapply(seq_len(Nbootstraps), FUN = function(b) aux_bootstrap(b), mc.cores=min(Nthreads, Nbootstraps), mc.preschedule=TRUE, mc.cleanup=TRUE)), nrow=Nbootstraps, byrow=TRUE)
			}else{
				bootstrap_rates = t(sapply(seq_len(Nbootstraps), FUN = function(b) aux_bootstrap(b)))
			}
		}
		rate_CI = bootstraps_to_confidence_intervals(bootstrap_samples=bootstrap_rates)
		for(i in seq_len(length(rate_CI))) rate_CI[[i]][!is.finite(fit$rate)] = NA
	}
	
	return(list(rate 	= fit$rate, 
				rate_CI = (if(Nbootstraps>0) rate_CI else NULL)))
}
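
# Example usage (a minimal sketch): estimate the local exponential growth rate of synthetic
# count data, Poisson-distributed around an exponential trend with true rate 0.5.
if(FALSE){
	X   = seq(from=0, to=10, by=0.1)
	Y   = rpois(n=length(X), lambda=100*exp(0.5*X))
	fit = fit_local_exponential_rate(X=X, Y=Y, window_size=10, model="Poisson", Nbootstraps=100)
	plot(X, fit$rate, type="l") # estimates should fluctuate around the true rate 0.5
}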


# create a latitude x longitude grid, such that every grid cell has the same surface area
# returns a vector of latitudes (of size Nlat+1) and a vector of longitudes (of size Nlon+1), such that latitudes[1]=-90, latitudes[end]=+90, longitudes[1]=-180 and longitudes[end]=+180
split_sphere_into_equisized_tiles = function(Nlat, Nlon){
	latitudes  = asin(2*seq(from=0,to=1,length.out=(Nlat+1)) - 1) * 180/pi # split sphere into Nlat latitudinal segments ("onion rings")
	longitudes = seq(from=-pi,to=pi,length.out=(Nlon+1)) * 180/pi
	return(list(latitudes=latitudes, longitudes=longitudes))
}


assign_points_to_tiles_on_sphere = function(point_latitudes, 
											point_longitudes,
											tile_latitudes,		# numeric vector of size Nlat+1, listing latitudes in ascending order from -90 up to 90.
											tile_longitudes){	# numeric vector of size Nlon+1, listing longitudes in ascending order from -180 to 180.
	Nlat = length(tile_latitudes)-1
	Nlon = length(tile_longitudes)-1
	NP	 = length(point_latitudes)
	
	# first bin by latitude
	# point2lat[p] will be the latitudinal tile index (from 1 to Nlat) for point p
	lat_order = order(point_latitudes)
	lat_binning = place_sorted_values_into_bins_CPP(items		= point_latitudes[lat_order],
													bin_mins 	= tile_latitudes[1:Nlat],
													bin_maxs	= tile_latitudes[2:(Nlat+1)])
	point2lat = integer(NP)
	point2lat[lat_order] = lat_binning$item2bin+1
	
	# next bin by longitude
	# point2lon[p] will be the longitudinal tile index (from 1 to Nlon) for point p
	lon_order = order(point_longitudes)
	lon_binning = place_sorted_values_into_bins_CPP(items		= point_longitudes[lon_order],
													bin_mins 	= tile_longitudes[1:Nlon],
													bin_maxs	= tile_longitudes[2:(Nlon+1)])
	point2lon = integer(NP)
	point2lon[lon_order] = lon_binning$item2bin+1
	
	return(list(point2lat=point2lat, point2lon=point2lon))
}
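
# Example usage (a minimal sketch) for the two routines above: split the sphere into 4 x 8
# equal-area tiles and assign 100 uniformly distributed random points to tiles.
if(FALSE){
	tiles = split_sphere_into_equisized_tiles(Nlat=4, Nlon=8)
	# sanity check of the equal-area property: the area of a latitudinal band is proportional
	# to the difference of the sines of its bounding latitudes, which is constant by construction
	print(diff(sin(tiles$latitudes*pi/180))) # all entries should be equal (here 0.5)
	point_latitudes  = (180/pi) * asin(runif(100, min=-1, max=1)) # uniform sampling on the sphere requires sin(latitude) to be uniform
	point_longitudes = runif(100, min=-180, max=180)
	assignment = assign_points_to_tiles_on_sphere(point_latitudes, point_longitudes, tiles$latitudes, tiles$longitudes)
	print(table(assignment$point2lat, assignment$point2lon)) # counts should be roughly uniform across tiles
}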



# generate random birth-death trees, simulate a constant-diffusivity SBM on each tree, and subsample each tree with geographic biases
# The geographically variable sampling probability can either be provided (see tile_counts[]) or determined based on a set of provided points on the sphere, by splitting the sphere into a number of equally sized tiles
# Note that the returned trees might have somewhat different tip counts than requested, depending on whether some simulated tips land in zero-probability tiles
simulate_geobiased_sbm = function(	Nsims,
									Ntips,
									radius,
									diffusivity,
									lambda					= 1,
									mu						= 0,
									rarefaction 			= 1,
									crown_age				= NULL,
									stem_age				= NULL,
									root_latitude			= NULL,
									root_longitude			= NULL,
									Nthreads				= 1,
									omit_failed_sims		= FALSE,
									Ntiles					= NULL,	# number of tiles into which to split sphere, for calculating sampling density. Only needed if tile_counts==NULL.
									reference_latitudes		= NULL,	# optional numeric vector of size NR, specifying the latitudes of the reference points on the sphere, i.e. based on which the sampling probability density will be determined. Only needed if tile_counts==NULL.
									reference_longitudes	= NULL,	# optional numeric vector of size NR, specifying the longitudes of the reference points on the sphere, i.e. based on which the sampling probability density will be determined. Only needed if tile_counts==NULL.
									tile_counts				= NULL,	# optional 2D numeric matrix listing reference counts (~density) for each spherical tile. The normalization of tile_counts[] does not matter, i.e. only relative values matter.
									tile_latitudes			= NULL,	# optional numeric vector of size nrow(tile_counts)+1, listing tile latitudes, ranging from -90 to +90. Must be provided iff tile_counts is provided.
									tile_longitudes			= NULL){# optional numeric vector of size ncol(tile_counts)+1, listing tile longitudes, ranging from -180 to +180. Must be provided iff tile_counts is provided.

	# determine spherical tiles
	if(!is.null(tile_counts)){
		if(is.null(tile_latitudes)) return(list(success=FALSE, error="Missing tile_latitudes"))
		if(is.null(tile_longitudes)) return(list(success=FALSE, error="Missing tile_longitudes"))
		Ntiles = nrow(tile_counts) * ncol(tile_counts)
		Nlat   = nrow(tile_counts)
		Nlon   = ncol(tile_counts)
		if(length(tile_latitudes)!=Nlat+1) return(list(success=FALSE, error=sprintf("Number of tile_latitudes (%d) differs from expectation (%d)",length(tile_latitudes),Nlat+1)))
		if(length(tile_longitudes)!=Nlon+1) return(list(success=FALSE, error=sprintf("Number of tile_longitudes (%d) differs from expectation (%d)",length(tile_longitudes),Nlon+1)))
	}else{
		if(is.null(reference_latitudes)) return(list(success=FALSE, error="Missing reference_latitudes"))
		if(is.null(reference_longitudes)) return(list(success=FALSE, error="Missing reference_longitudes"))
		if(is.null(Ntiles)) Ntiles = max(8,length(reference_latitudes)/10)
		Nlat 	= max(1,as.integer(round(sqrt(Ntiles/2))))
		Nlon 	= max(1,as.integer(round(Ntiles/Nlat)))
		tiles 	= split_sphere_into_equisized_tiles(Nlat=Nlat, Nlon=Nlon)
		tile_latitudes  = tiles$latitudes
		tile_longitudes = tiles$longitudes

		# determine number of reference points in each tile (i.e. geographic sampling density)
		tile_counts = matrix(0, nrow=Nlat, ncol=Nlon)
		for(r in seq_len(Nlat)){
			for(k in seq_len(Nlon)){
				tile_counts[r,k] = sum((reference_latitudes<tile_latitudes[r+1]) & (reference_latitudes>=tile_latitudes[r]) & (reference_longitudes<tile_longitudes[k+1]) & (reference_longitudes>=tile_longitudes[k]))
			}
		}
	}
		
	# core function for running a single simulation
	single_simulation = function(r){
		# generate random tree
		tree_sim = generate_tree_hbd_reverse(	Ntips		= Ntips/rarefaction,
												lambda		= lambda,
												mu			= mu,
												rho			= 1,
												crown_age	= crown_age,
												stem_age	= stem_age,
												relative_dt	= 0.01)
		if(!tree_sim$success) return(list(success=FALSE, error=sprintf("Simulation #%d failed: %s",r,tree_sim$error)))
		tree = tree_sim$trees[[1]]
		# simulate SBM on tree
		sbm_sim = simulate_sbm(	tree			= tree, 
								radius			= radius,
								diffusivity		= diffusivity,
								root_latitude	= root_latitude,
								root_longitude	= root_longitude)
		if(!sbm_sim$success) return(list(success=FALSE, error=sprintf("SBM simulation #%d failed: %s",r,sbm_sim$error)))
		# rarefy tree, picking tips according to the previously determined spherical probability density
		tip2tile 		= assign_points_to_tiles_on_sphere(point_latitudes = sbm_sim$tip_latitudes, point_longitudes = sbm_sim$tip_longitudes, tile_latitudes = tile_latitudes, tile_longitudes = tile_longitudes)
		tip2tile_count  = tile_counts[cbind(tip2tile$point2lat,tip2tile$point2lon)] # tip2tile_count[i] will be the tile_count value for tip i, i.e. the number of reference points found in the same tile as this tip
		if(sum(tip2tile_count>0)<2) return(list(success=FALSE, error=sprintf("SBM simulation #%d failed: Nearly all simulated tips are in empty tiles",r)))
		tip2prob 		= tip2tile_count/sum(tip2tile_count)
		keep_tips		= sample(x=seq_len(length(tip2prob)), size=min(Ntips,sum(tip2prob>0)), replace=FALSE, prob=tip2prob)
		if(length(keep_tips)<2) return(list(success=FALSE, error=sprintf("SBM simulation #%d failed: Insufficient number of tips sampled",r)))
		rarefying		 = get_subtree_with_tips(tree, only_tips=keep_tips)
		return(list(success=TRUE, tree=rarefying$subtree, latitudes=sbm_sim$tip_latitudes[rarefying$new2old_tip], longitudes=sbm_sim$tip_longitudes[rarefying$new2old_tip]))
	}

	
	# simulate multiple trees & SBMs
	if((Nsims>1) && (Nthreads>1) && (.Platform$OS.type!="windows")){
		# simulate trees in parallel using multiple forks
		# Note: Forks (and hence shared memory) are not available on Windows
		sims = parallel::mclapply(	seq_len(Nsims), 
									FUN = function(r){ single_simulation(r) }, 
									mc.cores = min(Nthreads, Nsims), 
									mc.preschedule = TRUE, 
									mc.cleanup = TRUE)
	}else{
		# run in serial mode
		sims = vector(mode="list", Nsims)
		for(r in 1:Nsims){
			sims[[r]] = single_simulation(r)
		}
	}

	# remove failed sims if requested
	if(omit_failed_sims){
		sims = sims[which(sapply(seq_len(Nsims), FUN=function(r) sims[[r]]$success))]
	}

	return(list(success			= TRUE,
				sims			= sims,
				Nlat			= Nlat,
				Nlon			= Nlon,
				tile_latitudes 	= tile_latitudes,
				tile_longitudes	= tile_longitudes,
				tile_counts		= tile_counts))
}
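
# Example usage (a minimal sketch): generate 2 geobiased SBM datasets, biasing the sampling
# toward the northern hemisphere via synthetic reference points.
if(FALSE){
	sims = simulate_geobiased_sbm(	Nsims					= 2,
									Ntips					= 50,
									radius					= 6371,
									diffusivity				= 1,
									lambda					= 1,
									rarefaction				= 0.5,
									crown_age				= 10,
									reference_latitudes		= runif(1000, min=0, max=90),
									reference_longitudes	= runif(1000, min=-180, max=180),
									omit_failed_sims		= TRUE)
	if(sims$success) cat(sprintf("Obtained %d simulations\n", length(sims$sims)))
}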


# remove values in the left- and right-tail of a distribution of numbers
# For example, if outlier_prob=0.05, then only values between the 0.025 and 0.975 percentiles will be kept.
remove_outliers = function(X, outlier_prob){
	Q = quantile(X, probs=c(outlier_prob/2,1-outlier_prob/2), na.rm=TRUE, type=8)
	return(X[(X>Q[1]) & (X<Q[2])])
}



# simulate a single trajectory on a sphere according to Spherical Brownian Motion
simulate_SBM_trajectory = function(	times,						# numeric vector, listing times in ascending order, at which to evaluate the SBM trajectory
									radius, 					# numeric, radius of the sphere
									diffusivity,				# diffusivity, in distance units^2 per time
									start_latitude	= NULL,		# initial latitude in decimal degrees, from -90 to 90. If NULL, it will be randomly generated
									start_longitude	= NULL){	# initial longitude in decimal degrees, from -180 to 180. If NULL, it will be randomly generated
	# determine initial coordinates if needed
	if(is.null(start_latitude) && is.null(start_longitude)){
		# randomly choose root latitude & longitude
		start_longitude = runif(1, min=-180, max=180)
		start_latitude  = (180/pi) * asin(2*runif(1, min=0, max=1)-1) # randomly drawn latitude = arcsin(2*U-1), where U is uniformly distributed in [0,1]
	}else if(is.null(start_longitude)){
		# root latitude is given, so randomly choose root longitude
		start_longitude = runif(1, min=-180, max=180)
	}else if(is.null(start_latitude)){
		# root longitude is given, so randomly choose root latitude
		start_latitude = (180/pi) * asin(2*runif(1, min=0, max=1)-1) # randomly drawn latitude = arcsin(2*U-1), where U is uniformly distributed in [0,1]
	}

	# simulate trajectory on sphere
	sim = simulate_SBM_trajectory_CPP(	times		= times,
										radius		= radius,
										diffusivity	= diffusivity,
										start_theta	= pi*start_latitude/180,
										start_phi	= pi*start_longitude/180)
	return(list(success		= TRUE,
				latitudes 	= sim$thetas * 180/pi,
				longitudes	= sim$phis * 180/pi,
				distances	= sim$omegas * radius)) # geodesic distances between successive time points. So distances[t] is the geodesic distance between times[t-1] and times[t].
}
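
# Example usage (a minimal sketch): simulate a single SBM trajectory on an Earth-sized sphere
# (radius in meters, diffusivity in m^2 per time unit) and plot it.
if(FALSE){
	sim = simulate_SBM_trajectory(times=seq(from=0, to=10, length.out=100), radius=6371e3, diffusivity=1e10)
	plot(sim$longitudes, sim$latitudes, type="l", xlab="longitude", ylab="latitude")
}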


# estimate empirical cumulative distribution function from random samples
# this function can accommodate discrete as well as continuous distributions
CDF_from_samples = function(samplesX){
	samplesX 	= sort(samplesX[is.finite(samplesX)])
	N 			= length(samplesX)
	
	unique_samples	= 1 + N - rev(c(1,1+which(diff(rev(samplesX))<0))) # indices of unique values in samplesX, keeping always the last instance of each value. Assuming that samplesX is monotonically increasing.
	CDF_grid 		= samplesX[unique_samples]
	CDF_values		= unique_samples/N

	return(list(CDF_grid=CDF_grid, CDF_values=CDF_values))
}
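
# Example usage (a minimal sketch): the empirical CDF of samples from a discrete (Poisson)
# distribution; the returned CDF is a step function defined on the unique sampled values.
if(FALSE){
	CDF = CDF_from_samples(rpois(1000, lambda=3))
	plot(CDF$CDF_grid, CDF$CDF_values, type="s", xlab="value", ylab="CDF")
}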


# run a Kolmogorov-Smirnov test to check whether an empirical set of samples was generated by some null model (or some fitted model), using parametric bootstraps of that model
# For example, empirical[] may be branch lengths of some empirical tree, while bootstraps[] are branch lengths of trees simulated under a fitted birth-death model
bootstrap_Kolmogorov_Smirnov_test = function(	bootstraps,	# list of length NB, listing samples from NB bootstraps (e.g., simulations of some stochastic null model or fitted model). Hence, bootstraps[[b]] is a numeric vector, listing values generated during the b-th bootstrap.
												empirical){	# numeric vector of arbitrary size, listing samples whose deviation from the null/fitted model is to be evaluated
	NB = length(bootstraps)
	
	# calculate empirical CDF
	empirical_CDF 		 = CDF_from_samples(empirical)
	empirical_CDF_grid 	 = empirical_CDF$CDF_grid
	empirical_CDF_values = empirical_CDF$CDF_values

	# calculate bootstrap CDFs and mean bootstrap CDF (will be used as "reference"), evaluated at the same points as the empirical CDF
	bootstrap_CDF_values = matrix(NA, nrow=NB, ncol=length(empirical_CDF_grid))
	for(b in seq_len(NB)){
		bootstrap_CDF = CDF_from_samples(bootstraps[[b]])
		bootstrap_CDF_values[b,] = evaluate_spline(Xgrid=bootstrap_CDF$CDF_grid, Ygrid=bootstrap_CDF$CDF_values, splines_degree=0, Xtarget=empirical_CDF_grid, extrapolate="const", derivative=0)
	}
	reference_CDF_values = colMeans(bootstrap_CDF_values)	

	# calculate KS distance (i.e. the Kolmogorov-Smirnov test statistic) of empirical data from the reference_CDF
	empirical_KS = max(abs(empirical_CDF_values-reference_CDF_values))
	
	# calculate KS distance of each bootstrap to the reference CDF
	bootstrap_KSs = rep(NA, NB)
	for(b in seq_len(NB)){
		bootstrap_KSs[b] = max(abs(bootstrap_CDF_values[b,]-reference_CDF_values))
	}

	# compare empirical_KS to bootstrap_KSs
	mean_bootstrap_KS 	= mean(bootstrap_KSs)
	std_bootstrap_KS 	= sd(bootstrap_KSs)
	median_bootstrap_KS = median(bootstrap_KSs)
	Pvalue 				= mean(bootstrap_KSs>=empirical_KS)
	return(list(empirical_KS		= empirical_KS,
				mean_bootstrap_KS	= mean_bootstrap_KS, 
				std_bootstrap_KS	= std_bootstrap_KS, 
				median_bootstrap_KS	= median_bootstrap_KS, 
				Pvalue				= Pvalue,
				RES					= (empirical_KS-mean_bootstrap_KS)/abs(mean_bootstrap_KS), # relative effect size of the empirical KS
				SES					= (empirical_KS-mean_bootstrap_KS)/std_bootstrap_KS, # standardized effect size of the empirical KS
				CDF_grid			= empirical_CDF_grid,
				reference_CDF_values= reference_CDF_values,
				empirical_CDF_values= empirical_CDF_values))
}
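
# Example usage (a minimal sketch): test whether a set of samples is consistent with a
# standard normal null model, using 100 parametric bootstraps of that model.
if(FALSE){
	empirical  = rnorm(200, mean=0.2) # slightly shifted relative to the null model
	bootstraps = lapply(seq_len(100), FUN=function(b) rnorm(200))
	test = bootstrap_Kolmogorov_Smirnov_test(bootstraps=bootstraps, empirical=empirical)
	cat(sprintf("P-value = %g\n", test$Pvalue))
}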



# calculate equal-tailed credible intervals of a collection of curves defined on a time grid of size NG (or more abstractly, of NG-dimensional numeric vectors)
# each curve is considered a "replicate", and the CIs are computed at each grid point based on all replicates
calculate_equal_tailed_CIs_of_curves = function(curves){ 	# 2D numeric matrix of size NB x NG, listing the values of NB replicate curves on NG grid points
	NB 			= nrow(curves)
	NG			= ncol(curves)
	CI50lower 	= numeric(NG)
	CI50upper 	= numeric(NG)
	CI95lower 	= numeric(NG)
	CI95upper 	= numeric(NG)
	medians 	= numeric(NG)
	means	 	= colMeans(curves, na.rm=TRUE)
	for(g in seq_len(NG)){
		quantiles = quantile(curves[,g], probs=c(0.25, 0.75, 0.025, 0.975, 0.5), na.rm=TRUE)
		CI50lower[g] = quantiles[1]
		CI50upper[g] = quantiles[2]
		CI95lower[g] = quantiles[3]
		CI95upper[g] = quantiles[4]
		medians[g] 	 = quantiles[5]	
	}
	return(list(means		= means,
				medians 	= medians,
				CI50lower 	= CI50lower,
				CI50upper 	= CI50upper,
				CI95lower	= CI95lower,
				CI95upper	= CI95upper))
}



# draw random numbers from the 4-parameter beta distribution, i.e. a beta distribution scaled and shifted to an arbitrary interval [minx, maxx]
# The parameterization used here is in terms of the mean (mu), standard deviation (sigma) and the interval bounds minx & maxx
# Note that mu & sigma must satisfy the condition sigma^2 < (mu-minx)*(maxx-mu).
rbeta4 = function(n, mu, sigma, minx=0, maxx=1){
	# determine the corresponding parameterization to use for the standard (2-parameter) beta distribution in [0,1]
	mu0 	= (mu-minx)/(maxx-minx)
	sigma0 	= sigma/(maxx-minx) # note that sigma0 and mean0 must satisfy the condition sigma0^2 < mu0*(1-mu0), which is equivalent to sigma^2 < (mu-minx)*(maxx-mu)
	nu0		= mu0*(1-mu0)/(sigma0^2) - 1
	alpha0	= mu0 * nu0
	beta0	= (1-mu0)*nu0
	# draw from the standard beta distribution, then rescale & shift
	x0 = rbeta(n=n, shape1=alpha0, shape2=beta0)
	return(minx + (maxx-minx)*x0)
}


# calculate the probability density of the 4-parameter beta distribution, i.e. a beta distribution scaled and shifted to an arbitrary interval [minx, maxx]
# The parameterization used here is in terms of the mean (mu), standard deviation (sigma) and the interval bounds minx & maxx
# Note that mu & sigma must satisfy the condition sigma^2 < (mu-minx)*(maxx-mu).
dbeta4 = function(x, mu, sigma, minx=0, maxx=1){
	# determine the corresponding parameterization to use for the standard (2-parameter) beta distribution in [0,1]
	mu0 	= (mu-minx)/(maxx-minx)
	sigma0 	= sigma/(maxx-minx) # note that sigma0 and mean0 must satisfy the condition sigma0^2 < mu0*(1-mu0), which is equivalent to sigma^2 < (mu-minx)*(maxx-mu)
	nu0		= mu0*(1-mu0)/(sigma0^2) - 1
	alpha0	= mu0 * nu0
	beta0	= (1-mu0)*nu0
	# get density of standard beta distribution
	x0	= (x-minx)/(maxx-minx)
	rho0 = dbeta(x=x0, shape1=alpha0, shape2=beta0)
	return(rho0/(maxx-minx))
}
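
# Example usage (a minimal sanity check) for rbeta4 & dbeta4: random draws should reproduce
# the requested mean & standard deviation, and the density should integrate to ~1.
# Note that mu=2, sigma=0.5, minx=0, maxx=5 satisfies the condition sigma^2 < (mu-minx)*(maxx-mu).
if(FALSE){
	x = rbeta4(n=100000, mu=2, sigma=0.5, minx=0, maxx=5)
	print(c(mean(x), sd(x))) # should be close to (2, 0.5)
	print(integrate(dbeta4, lower=0, upper=5, mu=2, sigma=0.5, minx=0, maxx=5)$value) # should be ~1
}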


# calculate the variance of entries in each row, thus returning a numeric vector of length nrow(A)
rowVars = function(A, d=1, na.rm=FALSE){
	means = rowMeans(A, na.rm=na.rm)
	vars  = rowSums((A-means)^2, na.rm=na.rm)/(ncol(A)-d)
	return(vars)
}


# calculate the covariance of entries in each row between two equally-sized matrices A & B, thus returning a numeric vector of length nrow(A) = nrow(B)
rowCovs = function(A, B, d=1, na.rm=FALSE){
	meansA = rowMeans(A, na.rm=na.rm)
	meansB = rowMeans(B, na.rm=na.rm)
	covs   = rowSums((A-meansA)*(B-meansB), na.rm=na.rm)/(ncol(A)-d)
	return(covs)
}
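
# Example usage (a minimal sanity check) for rowVars & rowCovs, comparing against base R's var & cov.
if(FALSE){
	A = matrix(rnorm(30), nrow=3)
	B = matrix(rnorm(30), nrow=3)
	print(rowVars(A) - apply(A, 1, var)) # should be ~0
	print(rowCovs(A,B) - sapply(seq_len(3), FUN=function(r) cov(A[r,],B[r,]))) # should be ~0
}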



# a basic ABC-MCMC Metropolis-Hastings sampler for Bayesian parameter inference when the likelihood is intractable but the model itself can be efficiently simulated to generate random samples for any given choice of parameters
# ABC-MCMC: Approximate Bayesian Computation Markov Chain Monte Carlo
# Literature:
#   First introduced by: Marjoram et al. (2003). Markov chain Monte Carlo without likelihoods. PNAS. 100:15324-15328.
#   Reviewed by: Beaumont (2019). Approximate Bayesian Computation. Annual Review of Statistics and Its Application. 6:379-403. Section 4.1.
ABC_MCMC = function(observation,					# a data structure representing the original observations to which the model should be fitted, or summary statistics thereof. Must be of the same type as generated by the model.
					model,							# functional, generating data (e.g., a gene tree in the case of a Multispecies Coalescent model) according to specific input parameters, or returning low-dimensional summary statistics of generated data. Thus, model(theta) returns data or a low-dimensional summary statistic thereof. May also occasionally return NULL (e.g. model failure for some parameter values).
					metric,							# functional, returning the distance between two data points (e.g., between two gene trees), or between low-dimensional summary statistics of two data points. Hence, metric(observation, model(theta)) should return a numeric scalar.
					prior_sampler,					# functional, generating random draws from the prior parameter distribution. prior_sampler() should return a random choice of parameters. If numeric_theta==TRUE, then prior_sampler() must not generate thetas outside of the box [min_theta,max_theta].
					prior_density,					# functional, returning the prior probability density at a specific location in parameter space (need not be normalized). prior_density(theta) should be a numeric scalar.
					proposal_sampler		= "uniform",# functional, generating new proposals conditional on a specific current location in parameter space. proposal_sampler(theta_old) should return a new random choice of parameters. If numeric_theta==TRUE, then proposal_sampler() must not generate thetas outside of the box [min_theta,max_theta]. May also be "beta" or "uniform" (if numeric_theta==TRUE).
					proposal_density		= NULL,		# functional, returning the density of new proposals conditional on a specific current location in parameter space. proposal_density(theta_new, theta_old) should be a numeric scalar, corresponding to the probability density P(theta_new | theta_old). May also be NULL (if numeric_theta==TRUE), in which case a default proposal is used.
					bandwidth				= NULL,		# numeric, acceptance threshold for generated samples ("epsilon"). Either bandwidth or relative_bandwidth must be provided.
					relative_bandwidth		= NULL,		# numeric, specifying the relative acceptance threshold for generated samples (epsilon relative to the typical distances of samples drawn from the prior). Typical suitable values are 0.01 - 0.1, but this also strongly depends on your metric.
					numeric_theta			= TRUE,		# logical, specifying whether the parameters theta can be assumed to always be a numeric vector of fixed length. If FALSE, theta is considered an abstract data structure, and thus computation is somewhat less efficient (also, statistical analyses of theta will be omitted).
					min_theta				= -Inf,		# either a single numeric, or a numeric vector of length NP, specifying the lower bounds for the parameters theta. Only relevant if numeric_theta==TRUE. May include -Inf.
					max_theta				= +Inf,		# either a single numeric, or a numeric vector of length NP, specifying the upper bounds for the parameters theta. Only relevant if numeric_theta==TRUE. May include +Inf.
					proposal_rescaling		= 1,		# single numeric or a numeric vector of size NP, specifying a factor for the scales (e.g., standard deviation) of the proposal. Only relevant if numeric_theta==TRUE and one of the default proposal samplers ("beta" or "uniform") is used.
					chain_length			= 10000,	# integer, the length of each MCMC chain (including burnin and prior to thinning)
					burnin					= 1000,		# integer, how many initial samples should be discarded as burnin from each MCMC chain
					thinning_step			= 1,		# integer, thinning step. A thinning step of 10 means that every 10-th sample is kept from the chain (after burnin)
					max_ACF_lag				= 1000,		# integer, maximum time lag to consider when calculating ACFs. Only relevant if numeric_theta=TRUE.
					Nchains					= 1,		# integer, number of independent MCMC chains to run. If Nthreads>1, each chain is run on a separate thread.
					Nthreads				= 1,		# integer, number of parallel threads to use for computing. Only relevant if Nchains>1.
					max_start_attempts		= 100,		# integer, maximum number of attempts to start a chain (i.e., initiate a jump)
					start_bandwidth_factor	= NULL,		# optional numeric, specifying how much larger the initial bandwidth of each chain should be relative to the final bandwidth, during the early phase of the burnin. If NULL, this is automatically chosen.
					verbose_prefix			= "",		# character, line prefix to use before any console output
					verbose					= FALSE,	# logical, whether to print progress reports to standard output
					diagnostics				= FALSE){	# logical, whether to print technical details at each iteration. For debugging purposes mainly.
	# basic error checking and preparations
	if(chain_length<=burnin) return(list(success=FALSE, error="The chain length cannot be shorter than the burnin"))
	if(chain_length-burnin<=thinning_step) return(list(success=FALSE, error="The chain length minus the burnin cannot be shorter than the thinning_step"))
	thinning_step = max(1,thinning_step)
	adaptive_proposal = FALSE
	if(is.null(bandwidth) && is.null(relative_bandwidth)) return(list(success=FALSE, error="Either bandwidth or relative_bandwidth must be specified"))
	if((!is.null(bandwidth)) && (!is.null(relative_bandwidth))) return(list(success=FALSE, error="Only one of bandwidth or relative_bandwidth must be specified"))

	# check if prior sampler and density are functional
	if(verbose) cat(sprintf("%sChecking functionality of prior..\n",verbose_prefix))
	example_thetas = lapply(seq_len(10), FUN=function(k) prior_sampler())
	example_prior_densities = unlist(lapply(example_thetas, FUN=function(theta) prior_density(theta)))
	if(any(!is.finite(example_prior_densities))) return(list(success=FALSE, error=sprintf("%d out of %d randomly drawn prior thetas have a non-finite probability density",sum(!is.finite(example_prior_densities)),length(example_prior_densities))))

	if(numeric_theta){
		example_theta = prior_sampler()
		if(!is.numeric(example_theta)) return(list(success=FALSE, error="Example theta (drawn from the prior) is not numeric. Consider setting numeric_theta=FALSE"))
		NP = length(example_theta) # dimensionality of parameter space
		if(length(min_theta)==1) min_theta = rep(min_theta, NP)
		if(length(min_theta)!=NP) return(list(success=FALSE, error=sprintf("min_theta must either be of length 1 or %d (NP); instead, got length %d",NP,length(min_theta))))
		if(length(max_theta)==1) max_theta = rep(max_theta, NP)
		if(length(max_theta)!=NP) return(list(success=FALSE, error=sprintf("max_theta must either be of length 1 or %d (NP); instead, got length %d",NP,length(max_theta))))
		if(is.character(proposal_sampler) && (!is.null(proposal_density))) return(list(success=FALSE, error=sprintf("Since proposal sampler is set to '%s', the proposal density must be left at NULL.",proposal_sampler)))
		if((!is.character(proposal_sampler)) && is.null(proposal_density)) return(list(success=FALSE, error=sprintf("Missing proposal density")))
		if(is.character(proposal_sampler)){
			# define a reasonable proposal sampler and its corresponding probability density, accounting for potential lower & upper bounds
			if(length(proposal_rescaling)==1) proposal_rescaling = rep(proposal_rescaling,NP)
			if(length(proposal_rescaling)!=NP) return(list(success=FALSE, error=sprintf("proposal_rescaling must either be of length 1 or %d (NP); instead, got length %d",NP,length(proposal_rescaling))))
			typical_thetas 	  = matrix(sapply(seq_len(1000), FUN=function(k) prior_sampler()), nrow=NP) # NP x 1000 matrix; the matrix() wrapper guards against NP==1, where sapply() would return a plain vector and rowMeans() below would fail
			proposal_scales	  = proposal_rescaling*0.1*rowMeans(abs(typical_thetas), na.rm=TRUE)
			adaptive_proposal = TRUE
			if(proposal_sampler=="beta"){
				# Beta distribution
				if(verbose) cat(sprintf("%sSetting proposal sampler to beta distribution\n",verbose_prefix))
				proposal_sampler = function(old, scales){ rbeta4(n=NP, mu=old, sigma=pmin(scales,0.5*(old-min_theta),0.5*(max_theta-old)), minx=min_theta, maxx=max_theta) }
				proposal_density = function(theta, old, scales){ dbeta4(x=theta, mu=old, sigma=pmin(scales,0.5*(old-min_theta),0.5*(max_theta-old)), minx=min_theta, maxx=max_theta) } # note that the density must be consistent with proposal_sampler()
			}else if(proposal_sampler=="uniform"){
				# uniform distribution within a small box
				if(verbose) cat(sprintf("%sSetting proposal sampler to uniform distribution in a box\n",verbose_prefix))
				proposal_sampler = function(old, scales){ return(runif(n=NP, min=pmax(min_theta,old-scales), max=pmin(max_theta,old+scales))) }
				proposal_density = function(theta, old, scales){
					box_mins = pmax(min_theta,old-scales)
					box_maxs = pmin(max_theta,old+scales)
					return(1.0/prod(box_maxs-box_mins))
				}
			}else{
				return(list(success=FALSE, error=sprintf("Invalid proposal_sampler '%s'",proposal_sampler)))
			}
		}
	}else{
		if(is.null(proposal_sampler)) return(list(success=FALSE, error="Proposal sampler must be non-NULL when numeric_theta=FALSE"))
		if(is.null(proposal_density)) return(list(success=FALSE, error="Proposal density must be non-NULL when numeric_theta=FALSE"))
	}
	
	# determine bandwidths if needed
	if(is.null(bandwidth) || is.null(start_bandwidth_factor)){
		# determine typical distances based on prior
		if(verbose) cat(sprintf("%sDetermining typical distances from observation based on prior models..\n",verbose_prefix))
		Ncalibrations = 1000
		get_typical_distance = function(k){ theta = prior_sampler(); sam = model(theta); return(if(is.null(sam)) NA else metric(observation, sam)); }
		if((Ncalibrations>1) && (Nthreads>1) && (.Platform$OS.type!="windows")){
			typical_distances = parallel::mcmapply(seq_len(Ncalibrations), FUN=get_typical_distance, mc.cores=min(Nthreads, Ncalibrations), mc.preschedule=TRUE, mc.cleanup=TRUE, SIMPLIFY=TRUE, USE.NAMES=FALSE)
		}else{
			typical_distances = sapply(seq_len(Ncalibrations), FUN=get_typical_distance)
		}
		mean_typical_distance = mean(typical_distances, na.rm=TRUE)
		if(verbose) cat(sprintf("%s  Note: Mean prior distance = %g\n",verbose_prefix,mean_typical_distance))
	}
	if(is.null(bandwidth)){
		bandwidth = relative_bandwidth * mean_typical_distance
		if(verbose) cat(sprintf("%sSetting long-term bandwidth to %g (corresponding to quantile %g among prior distances)\n",verbose_prefix,bandwidth,mean(typical_distances<=bandwidth,na.rm=TRUE)))
	}
	if(is.null(start_bandwidth_factor) || (!is.finite(start_bandwidth_factor))){
		if(burnin<=1){
			start_bandwidth = bandwidth
		}else{
			# figure out a reasonable start bandwidth, based on the typical distances of randomly generated data from the observation
			start_bandwidth = quantile(typical_distances, probs=0.5, na.rm=TRUE) # choose initial bandwidth so that half of the typical samples would be kept
			if(verbose) cat(sprintf("%sSetting start (early burnin) bandwidth to %g (median of prior distances)\n",verbose_prefix,start_bandwidth))
		}
	}else{
		start_bandwidth = start_bandwidth_factor * bandwidth
	}	
	
	aux_single_ABC_MCMC_chain = function(n){
		if(verbose) cat(sprintf("%s  Running chain %d..\n",verbose_prefix,n))
		if(numeric_theta){
			thetas = matrix(NA, nrow=NP, ncol=floor((chain_length-burnin)/thinning_step))
		}else{
			thetas = vector(mode="list", floor((chain_length-burnin)/thinning_step)) # pre-allocate space for storing samples of this chain (after burnin and thinning)
		}
		if(numeric_theta && adaptive_proposal){
			# reserve space for storing burnin thetas whose sample ended up within the desired bandwidth. Only store up to 1000 thetas.
			burnin_good_thetas  = matrix(NA, nrow=NP, ncol=min(burnin,1000))
			Nburnin_good_thetas = 0
		}
		distances				= rep(NA, floor((chain_length-burnin)/thinning_step)) # note: thetas may be a list (when numeric_theta=FALSE), so compute the length explicitly rather than via ncol(thetas)
		Naccepts				= 0
		Naccepts_after_burnin 	= 0
		Nproposals_after_burnin	= 0
		Nstart_attempts 		= 1
		next_slot 				= 1 # next slot in thetas[] for storing a sample
		theta_old 				= prior_sampler() # draw random starting point from prior
		k						= 1
		while(k<=chain_length){
			current_bandwidth = (if(k>burnin) bandwidth else (bandwidth*(k/burnin) + start_bandwidth*(1-k/burnin))) # if within the burnin phase, gradually reduce the bandwidth to its final value
			if(numeric_theta && adaptive_proposal && (k==burnin+1) && (Nburnin_good_thetas>=10)){
				# finalize proposal scales based on good thetas seen so far falling within the bandwidth
				burnin_theta_stds = sqrt(rowVars(burnin_good_thetas[,seq_len(Nburnin_good_thetas)], na.rm=TRUE))
				if(diagnostics) cat(sprintf("%s  Chain %d, end of burnin: Standard deviations of thetas within bandwidth (N=%d): %s\n",verbose_prefix,n,Nburnin_good_thetas,paste(sprintf("%.3g",burnin_theta_stds),collapse=", ")))
				proposal_scales = proposal_rescaling * burnin_theta_stds # For classical Metropolis-Hastings MCMC, [Yang and Rodriguez, 2013, PNAS. 110:19307-19312] show that for a Gaussian proposal and a Gaussian target distribution, the optimal proposal standard deviation should be 2.5 x the target standard deviation. A similar result is reported by [Thawornwattana et al. 2018. Bayesian Analysis. 13:1033-1059]
			}
			theta_new = (if(adaptive_proposal) proposal_sampler(theta_old, proposal_scales) else proposal_sampler(theta_old))
			if(k>burnin) Nproposals_after_burnin = Nproposals_after_burnin + 1
			#  decide whether to accept this proposal
			acceptance_probability = min(1, (prior_density(theta_new)/prior_density(theta_old)) * (if(adaptive_proposal) (proposal_density(theta_old, theta_new, proposal_scales)/proposal_density(theta_new, theta_old, proposal_scales)) else (proposal_density(theta_old, theta_new)/proposal_density(theta_new, theta_old))))
			accept 	 = TRUE
			distance = NA
			if(runif(n=1, min=0, max=1)>acceptance_probability){
				accept = FALSE
			}else{
				if(diagnostics) cat(sprintf("%s  Chain %d, iteration %d: simulating model for theta=%s..\n",verbose_prefix,n,k,paste(sprintf("%.3g",theta_new),collapse=", ")))
				sample_new = model(theta_new) # simulate data from the model based on the proposed parameters
				if(is.null(sample_new)){
					# the model failed to generate data for this specific choice of parameters, so don't accept it
					accept = FALSE
				}else{
					if(diagnostics) cat(sprintf("%s  Chain %d, iteration %d: Calculating distance to observation..\n",verbose_prefix,n,k))
					distance = metric(observation, sample_new)
					if(is.null(distance) || (!is.finite(distance)) || (distance>current_bandwidth)){
						accept = FALSE
					}
					# keep record of this theta if it is a "good" one (i.e., within the desired bandwidth) and if we are in the burnin phase
					if(numeric_theta && (k<=burnin) && (Nburnin_good_thetas<ncol(burnin_good_thetas)) && (!is.null(distance)) && is.finite(distance) && (distance<=bandwidth)){
						Nburnin_good_thetas = Nburnin_good_thetas + 1
						burnin_good_thetas[,Nburnin_good_thetas] = theta_new
					}
				}
			}
			if(diagnostics) cat(sprintf("%s  Chain %d, iteration %d: distance=%g, accept=%d (proposed theta=%s), Naccepts=%d\n",verbose_prefix,n,k,distance,accept,paste(sprintf("%.3g",theta_new),collapse=", "),Naccepts))
			if(accept){
				Naccepts = Naccepts + 1
				if(k>burnin) Naccepts_after_burnin = Naccepts_after_burnin + 1
			}else{
				theta_new = theta_old
				if((Naccepts==0) && (Nstart_attempts<max_start_attempts)){
					# try to start the chain again
					Nstart_attempts = Nstart_attempts + 1
					if(diagnostics) cat(sprintf("%s  Chain %d: Attempting restart (attempt %d)\n",verbose_prefix,n,Nstart_attempts))
					theta_old = prior_sampler()
					next
				}
			}
			if((k>burnin) && (((k-burnin-1) %% thinning_step) == 0)){
				# store this sample
				if(numeric_theta){
					thetas[,next_slot] = theta_new
				}else{
					thetas[[next_slot]] = theta_new
				}
				distances[next_slot] = distance
				next_slot = next_slot + 1
			}
			theta_old = theta_new
			k = k + 1
			if((k %% 100) == 0) gc()
		}
		if(verbose) cat(sprintf("%s  Finished chain %d\n",verbose_prefix,n))
		return(list(success=TRUE, thetas=thetas, Naccepts=Naccepts, Naccepts_after_burnin=Naccepts_after_burnin, acceptance_rate_after_burnin=Naccepts_after_burnin/Nproposals_after_burnin, distances=distances, Nstart_attempts=Nstart_attempts))
	}
	if((Nchains>1) && (Nthreads>1) && (.Platform$OS.type!="windows")){
		if(verbose) cat(sprintf("%sRunning %d MCMC chains (parallelized)..\n",verbose_prefix,Nchains))
		chains = parallel::mclapply(seq_len(Nchains), 
									FUN 			= function(n){ aux_single_ABC_MCMC_chain(n) }, 
									mc.cores 		= min(Nthreads, Nchains), 
									mc.preschedule 	= FALSE, 
									mc.cleanup 		= TRUE)
	}else{
		if(verbose) cat(sprintf("%sRunning %d MCMC chains (sequential)..\n",verbose_prefix,Nchains))
		chains = vector(mode="list", Nchains)
		for(n in seq_len(Nchains)){
			chains[[n]] = aux_single_ABC_MCMC_chain(n)
		}
	}
	
	# discard failed or stuck chains
	valid_chains = which(sapply(seq_len(Nchains), FUN=function(n) chains[[n]]$success))
	if(length(valid_chains)<Nchains){
		if(length(valid_chains)==0) return(list(success=FALSE, error=sprintf("All MCMC chains failed: %s",chains[[1]]$error)))
		chains  = chains[valid_chains]
		Nchains = length(valid_chains)
	}
	stuck_chains = which(sapply(seq_len(Nchains), FUN=function(n) (chains[[n]]$Naccepts_after_burnin==0)))
	if(length(stuck_chains)==Nchains) return(list(success=FALSE, error=sprintf("All MCMC chains got stuck at a single value after burnin")))
	if(length(stuck_chains)>0){
		if(verbose) cat(sprintf("%sWARNING: %d out of %d MCMC chains stayed at a single value after burnin. These chains will be discarded.\n",verbose_prefix,length(stuck_chains),Nchains))
		chains  = chains[get_complement(Nchains,stuck_chains)]
		Nchains = length(chains)
	}
		
	if(numeric_theta){
		# calculate autocorrelation function (ACF) and other statistics, separately for each chain and each component of theta
		# ACFs[[n]] will be a 2D numeric matrix listing the ACF for the n-th chain, with ACFs[[n]][p,l] being the autocorrelation of the p-th component at time-lag l in chain n
		if(verbose) cat(sprintf("%sCalculating autocorrelations..\n",verbose_prefix))
		ACFs 	= vector(mode="list", Nchains)
		ESS 	= rep(NA, Nchains) # effective sample size per chain
		for(n in seq_len(Nchains)){
			thetas		= chains[[n]]$thetas
			max_lag		= min(max_ACF_lag, ncol(thetas)-2)
			ACFs[[n]] 	= matrix(NA, nrow=NP, ncol=max_lag)
			theta_mean 	= rowMeans(thetas)
			theta_var	= rowMeans((thetas-theta_mean)**2)
			for(l in seq_len(max_lag)){
				ACFs[[n]][,l] = rowMeans((thetas[,1:(ncol(thetas)-l),drop=FALSE]-theta_mean)*(thetas[,(1+l):ncol(thetas),drop=FALSE]-theta_mean),na.rm=TRUE)/theta_var
			}
			# determine "effective" number of samples, based on how much lag is needed between two samples to effectively become uncorrelated
			# we use a definition similar to that used by BEAST: https://jrnold.github.io/bayesian_notes/mcmc-diagnostics.html
			decoherence_lags = sapply(seq_len(NP), FUN=function(p) which(ACFs[[n]][p,]<=0)[1])
			if(all(is.finite(decoherence_lags))){
				ESS[n] = min(sapply(seq_len(NP), FUN=function(p) ncol(thetas)/(1 + 2*sum(ACFs[[n]][p,seq_len(max(1,decoherence_lags[p]-1))],na.rm=TRUE))))
			}else{
				# for at least one component of theta the ACF never drops to zero no matter how large the time-lag, so effectively we only have one sample (being conservative here)
				ESS[n] = 1
			}
		}
		
		# calculate the (original) Gelman-Rubin diagnostic, otherwise known as potential scale reduction factor (PSRF), denoted \hat{R}, separately for each parameter component
		# A PSRF < 1.1 is often used as a rough criterion for terminating the chain, although for critical applications the threshold should be closer to 1 (Vats and Knudson, 2020, Revisiting the Gelman-Rubin Diagnostic)
		# Because the commonly used PSRF formula (Brooks and Gelman, 1998) assumes equal-length chains, we only use the last N samples from every chain, where N is the minimum chain length available
		# The formula used here is also the same as used by the R function coda::gelman.diag.
		# Reference:
		#	Gelman and Rubin (1992). Inference from iterative simulation using multiple sequences. Statistical science. 7:457-472.
		#	Brooks and Gelman (1998). General methods for monitoring convergence of iterative simulations. Journal of Computational and Graphical Statistics. 7:434-455.
		if(verbose) cat(sprintf("%sCalculating Gelman-Rubin diagnostic R (for each model parameter)..\n",verbose_prefix))
		chain_means = matrix(NA,nrow=NP, ncol=Nchains)
		chain_vars  = matrix(NA,nrow=NP, ncol=Nchains)
		min_chain_length = min(sapply(seq_len(Nchains), FUN=function(n) ncol(chains[[n]]$thetas)))
		for(n in seq_len(Nchains)){
			N = ncol(chains[[n]]$thetas)
			chain_means[,n] = rowMeans(chains[[n]]$thetas[,(N-min_chain_length+1):N], na.rm=TRUE)
			chain_vars[,n]  = rowSums((chains[[n]]$thetas[,(N-min_chain_length+1):N]-chain_means[,n])^2, na.rm=TRUE)/(min_chain_length-1)
		}
		pooled_means = rowMeans(chain_means) # average of the chain means, separately for each parameter component
		Bn 			 = rowVars(chain_means,d=1)
		W  			 = rowMeans(chain_vars)
		sigma2 		 = ((min_chain_length-1)/min_chain_length) * W + Bn # Equation (3) in Gelman and Rubin (1992)
		Vhat		 = sigma2 + Bn/Nchains
		varVhat		 = ((min_chain_length-1)/min_chain_length)^2 * (1/Nchains) * rowVars(chain_vars,d=1) + ((Nchains+1)/Nchains)^2*(2/(Nchains-1))*(Bn^2) + (2*(Nchains+1)*(min_chain_length-1)/(Nchains*Nchains*min_chain_length)) * (rowCovs(chain_vars,chain_means^2,d=1) - 2*pooled_means*rowCovs(chain_vars,chain_means,d=1)) # Equation 4 in Gelman and Rubin (1992).
		degrees		 = 2*(Vhat^2)/varVhat
		PSRF 		 = sqrt(((degrees+3)/(degrees+1)) * Vhat/W) # \hat{R}_c on page 438 in Brooks and Gelman (1998)
		if(verbose) cat(sprintf("%s  Note: max R = %g..\n",verbose_prefix,max(PSRF)))
		
		# calculate equal-tailed credible intervals of parameters (using the pooled chains)
		if(verbose) cat(sprintf("%sCalculating posteror credible intervals..\n",verbose_prefix))
		thetas 			= sapply(seq_len(Nchains), FUN=function(n) chains[[n]]$thetas)
		theta_CI50lower = numeric(NP)
		theta_CI50upper = numeric(NP)
		theta_CI95lower = numeric(NP)
		theta_CI95upper = numeric(NP)
		theta_median	= numeric(NP)
		theta_mean 		= rowMeans(thetas, na.rm=TRUE)
		for(p in seq_len(NP)){
			quantiles 			= quantile(thetas[p,], probs=c(0.25, 0.75, 0.025, 0.975, 0.5), na.rm=TRUE)
			theta_CI50lower[p] 	= quantiles[1]
			theta_CI50upper[p] 	= quantiles[2]
			theta_CI95lower[p] 	= quantiles[3]
			theta_CI95upper[p] 	= quantiles[4]
			theta_median[p] 	= quantiles[5]	
		}
	}
	
	return(list(success					= TRUE,
				Nchains					= Nchains,
				chain_thetas			= lapply(seq_len(Nchains), FUN=function(n) chains[[n]]$thetas),
				chain_distances			= lapply(seq_len(Nchains), FUN=function(n) chains[[n]]$distances),
				acceptance_rate			= sapply(seq_len(Nchains), FUN=function(n) chains[[n]]$acceptance_rate_after_burnin),
				ESS 					= (if(numeric_theta) ESS else NULL), # numeric vector of size Nchains
				PSRF					= (if(numeric_theta) PSRF else NULL), # numeric vector of size NP, Gelman-Rubin convergence diagnostic, aka. potential scale reduction factor, per parameter.
				Nstart_attempts			= sapply(seq_len(Nchains), FUN=function(n) chains[[n]]$Nstart_attempts),
				ACF						= (if(numeric_theta) ACFs else NULL),
				theta_mean				= (if(numeric_theta) theta_mean else NULL),
				theta_median			= (if(numeric_theta) theta_median else NULL),
				theta_CI50lower			= (if(numeric_theta) theta_CI50lower else NULL),
				theta_CI50upper			= (if(numeric_theta) theta_CI50upper else NULL),
				theta_CI95lower			= (if(numeric_theta) theta_CI95lower else NULL),
				theta_CI95upper			= (if(numeric_theta) theta_CI95upper else NULL)))
	
}
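

# Toy sketch (illustration only, not part of castor's API): the chain diagnostics above,
# reproduced for a single scalar parameter across Nchains equal-length chains.
# The ESS uses the same initial-positive-ACF truncation as above, and the PSRF omits the
# degrees-of-freedom correction of Brooks & Gelman (1998) for brevity.
#	N        = 1000
#	Nchains  = 4
#	chains   = lapply(seq_len(Nchains), function(n) as.vector(arima.sim(list(ar=0.5), n=N))) # hypothetical AR(1) "chains"
#	ACF      = acf(chains[[1]], lag.max=N-2, plot=FALSE)$acf[-1]	# autocorrelations of chain 1 at lags 1,2,..
#	cutoff   = which(ACF<=0)[1]										# first lag at which the ACF drops to or below zero
#	ESS1     = N/(1 + 2*sum(ACF[seq_len(cutoff-1)]))				# effective sample size of chain 1
#	means    = sapply(chains, mean)
#	W        = mean(sapply(chains, var))	# mean within-chain variance
#	B_over_N = var(means)					# between-chain variance of the chain means (B/N)
#	sigma2   = ((N-1)/N)*W + B_over_N		# Equation (3) in Gelman and Rubin (1992)
#	PSRF     = sqrt((sigma2 + B_over_N/Nchains)/W)	# should be close to 1 for these well-mixed chains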


# calculate the first Wasserstein distance between two sets of numbers, X & Y
# This is the 1-Wasserstein distance between two probability distributions, each consisting of a sum of Dirac distributions
# As shown by Ramdas et al. (2017), the p-Wasserstein distance between two probability distributions can be written as:
#	D(X,Y) = (\int_0^1 |g(u) - f(u)|^p du)^{1/p}
# where g and f are the quantile functions of the two distributions. For the special case p=1 (the first Wasserstein distance), this is equivalent to:
#	D(X,Y) = \int_{-\infty}^\infty |G(t) - F(t)| dt
# where G and F are the cumulative distribution functions (CDFs). For finite sets of numbers X and Y, generated by the two distributions, one replaces G and F by their empirical CDFs.
# The same approach is used by Python's scipy.stats.wasserstein_distance.
# References:
#   Ramdas et al. (2017). On Wasserstein two-sample testing and related families of nonparametric tests. Entropy. 19(2):47. Proposition 1.
first_Wasserstein_distance = function(X, Y){
	return(first_Wasserstein_distance_CPP(sort(X), sort(Y)))
}
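

# For reference, the same computation in plain R (a sketch only; the CPP routine above remains authoritative):
# integrate |G(t) - F(t)| between consecutive points of the pooled sorted sample,
# where F and G are the empirical CDFs of X and Y.
first_Wasserstein_distance_R_sketch = function(X, Y){
	pooled = sort(c(X, Y))
	deltas = diff(pooled) # widths of the intervals between consecutive pooled points
	# empirical CDFs of X & Y, evaluated inside each interval (i.e., just after each pooled point except the last)
	Fx = findInterval(pooled[-length(pooled)], sort(X))/length(X)
	Fy = findInterval(pooled[-length(pooled)], sort(Y))/length(Y)
	return(sum(abs(Fx - Fy)*deltas))
}
# e.g., first_Wasserstein_distance_R_sketch(rnorm(10000), rnorm(10000, mean=1)) should be close to 1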


# calculate the weighted Graph Laplacian of a phylogenetic tree (Lewitus and Morlon, 2016, Systematic Biology. 65:495-507)
# If normalized=TRUE, then the Laplacian is ensured to have eigenvalues in [0,2], see for example: http://www2.cs.cas.cz/semincm/lectures/2010-04-13-Hall.pdf
weighted_graph_Laplacian_of_tree = function(tree, normalized=FALSE, sparse=FALSE){
	Nclades = length(tree$tip.label) + tree$Nnode
	edge_length_sums = get_sum_of_edge_lengths_per_clade_CPP(	Ntips 		= length(tree$tip.label),
																Nnodes		= tree$Nnode,
																Nedges		= nrow(tree$edge),
																tree_edge	= as.vector(t(tree$edge)) - 1,
																edge_length	= (if(is.null(tree$edge.length)) numeric() else tree$edge.length))
	bidirectional_edges   = rbind(tree$edge,tree$edge[,c(2,1)]) # duplicate list of edges, to include both directions
	edge_lengths          = (if(is.null(tree$edge.length)) rep(1,nrow(tree$edge)) else tree$edge.length) # if edge lengths are missing, treat each edge as having length 1
	bidirectional_lengths = c(edge_lengths, edge_lengths)
	if(normalized){
		diagonal = rep(1, Nclades)
		bidirectional_weights = -bidirectional_lengths/sqrt(edge_length_sums[bidirectional_edges[,1]]*edge_length_sums[bidirectional_edges[,2]])
	}else{
		diagonal = edge_length_sums
		bidirectional_weights = -bidirectional_lengths
	}
	if(sparse){
		# construct matrix in sparse symmetric format; providing one triangle suffices (here the lower triangle, i.e. row index >= column index)
		lower_triangulars = which(bidirectional_edges[,1]>=bidirectional_edges[,2])
		L = Matrix::sparseMatrix(i=c(seq_len(Nclades),bidirectional_edges[lower_triangulars,1]), j=c(seq_len(Nclades),bidirectional_edges[lower_triangulars,2]), x=c(diagonal,bidirectional_weights[lower_triangulars]), dims=c(Nclades,Nclades), symmetric=TRUE)
	}else{
		L 			 = matrix(0, ncol=Nclades, nrow=Nclades)
		diag(L) 	 = diagonal
		L[bidirectional_edges] = bidirectional_weights
	}
	return(L)
}
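

# Hedged usage sketch: the typical downstream use is the Laplacian spectral density
# of Lewitus & Morlon (2016). Assuming 'tree' is a phylo object with edge lengths:
#	L       = weighted_graph_Laplacian_of_tree(tree, normalized=TRUE)
#	lambdas = eigen(L, symmetric=TRUE, only.values=TRUE)$values	# eigenvalues; within [0,2] since normalized=TRUE
#	plot(stats::density(lambdas), main="Laplacian spectral density")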


# Given a phylogenetic tree, extract a subset of pairwise tip distances (with replacement)
sample_pairwise_tip_distances = function(	tree, 
											Npairs,	# number of pairwise distances to consider
											as_edge_counts	= FALSE){
	Ntips  = length(tree$tip.label)
	Nnodes = tree$Nnode
	distances = get_distances_between_clades_CPP(	Ntips 			= Ntips,
													Nnodes 			= Nnodes,
													Nedges 			= nrow(tree$edge),
													tree_edge 		= as.vector(t(tree$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
													edge_length		= (if(as_edge_counts || is.null(tree$edge.length)) numeric() else tree$edge.length),
													cladesA			= sample.int(n=Ntips,size=Npairs, replace=TRUE)-1,
													cladesB			= sample.int(n=Ntips,size=Npairs, replace=TRUE)-1,
													verbose			= FALSE,
													verbose_prefix	= "")
	return(distances)
}
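

# Hedged usage sketch, combining the routines above: compare the pairwise tip-distance
# distributions of two hypothetical trees (tree1 & tree2) via the first Wasserstein distance:
#	sample1 = sample_pairwise_tip_distances(tree1, Npairs=10000)
#	sample2 = sample_pairwise_tip_distances(tree2, Npairs=10000)
#	D       = first_Wasserstein_distance(sample1, sample2)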




eliminate_bifurcating_root = function(tree){
	Ntips  = length(tree$tip.label)
	Nnodes = tree$Nnode
	if(Ntips<=2) return(list(success=FALSE, error="Input tree must have at least 3 tips"))
	results = eliminate_bifurcating_root_CPP(	Ntips 			= Ntips,
												Nnodes 			= Nnodes,
												Nedges 			= nrow(tree$edge),
												tree_edge 		= as.vector(t(tree$edge)) - 1, # flatten in row-major format and adjust clade indices to 0-based
												edge_length		= (if(is.null(tree$edge.length)) numeric() else tree$edge.length));
	if(!results$changed){
		return(list(success=TRUE, changed=FALSE, tree=tree))
	}else{
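		# update clade indices from 0-based (CPP) to 1-based (R)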
		new2old_clade = results$new2old_clade+1
		new_tree = list(Nnode 		= results$Nnodes,
						tip.label 	= (if(is.null(tree$tip.label)) NULL else tree$tip.label[new2old_clade[1:Ntips]]),
						node.label 	= (if(is.null(tree$node.label)) NULL else tree$node.label[new2old_clade[(Ntips+1):results$Nclades]-Ntips]),
						edge 		= matrix(as.integer(results$new_tree_edge),ncol=2,byrow=TRUE) + 1L,
						edge.length = results$new_edge_length)
		class(new_tree) = "phylo"
		attr(new_tree,"order") = NULL
		return(list(success=TRUE, changed=TRUE, tree=new_tree, new2old_clade=new2old_clade))
	}
}
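

# Hedged usage sketch, assuming 'tree' is a rooted phylo object that may have a bifurcating root:
#	result = eliminate_bifurcating_root(tree)
#	if(result$success && result$changed){
#		tree = result$tree	# result$new2old_clade maps the new clade indices to those in the original tree
#	}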



read_distances_list = function(	file_path,
								delimiter			= "\t",
								comment_prefix		= "#",
								distances_column	= 3,
								min_distance		= -Inf, # omit edges shorter than this number
								max_distance		= Inf, # omit edges longer than this number
								max_Nedges			= -1, # optional maximum number of edges to load. If negative, this filter is ignored.
								verbose_interval	= 100000, # number of lines loaded between successive progress messages
								verbose_prefix		= ""){
	dlist = read_distances_list_CPP(file_path			= file_path,
									delimiter			= delimiter,
									comment_prefix		= comment_prefix,
									distances_column	= distances_column-1,
									min_distance		= min_distance,
									max_distance		= max_distance,
									max_Nedges			= max_Nedges,
									verbose_interval	= verbose_interval,
									verbose_prefix		= verbose_prefix)
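	# unflatten the returned edge list (row-major) and update endpoint indices from 0-based (CPP) to 1-based (R)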
	dlist$edges = matrix(1 + dlist$edges, ncol=2, byrow=TRUE)
	return(dlist)
}
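

# Hedged usage sketch, assuming a hypothetical tab-delimited file "distances.tsv" whose
# 3rd column lists distances, keeping only distances below 0.1:
#	dlist = read_distances_list("distances.tsv", distances_column=3, max_distance=0.1)
#	# dlist$edges is a 2-column matrix of 1-based endpoint indices, one row per loaded edge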


# extract a subtree with each tip label represented only once, i.e., remove tips with duplicate labels
keep_unique_tip_labels = function(tree){
	keep_tips = which(!duplicated(tree$tip.label))
	return(get_subtree_with_tips(tree, only_tips=keep_tips)$subtree)
}
